In [1]:
# https://www.kaggle.com/tunguz/bow-meta-text-and-dense-features-lgbm-clone?scriptVersionId=3540839

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
import time, gc
import pandas as pd
import numpy as np
from sklearn import preprocessing
from nltk.corpus import stopwords 
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
import pymorphy2
import nltk, re
from nltk.tokenize import ToktokTokenizer
from multiprocessing import cpu_count, Pool
from sklearn.linear_model import Ridge


In [2]:
#path = '../input/'
path = "/home/darragh/avito/data/"
#path = '/Users/dhanley2/Documents/avito/data/'
path = '/home/ubuntu/avito/data/'
start_time = time.time()
full = False

print('[{}] Load Train/Test'.format(time.time() - start_time))
traindf = pd.read_csv(path + 'train.csv.zip', index_col = "item_id", parse_dates = ["activation_date"], compression = 'zip')
traindex = traindf.index
testdf = pd.read_csv(path + 'test.csv.zip', index_col = "item_id", parse_dates = ["activation_date"])
testdex = testdf.index
y = traindf.deal_probability.copy()
traindf.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*traindf.shape))
print('Test shape: {} Rows, {} Columns'.format(*testdf.shape))
traindf['activation_date'].value_counts()

(traindf['image_top_1'] == traindf['image_top_1']).value_counts()
(testdf['image_top_1'] == testdf['image_top_1']).value_counts()

[6.723403930664062e-05] Load Train/Test
Train shape: 1503424 Rows, 16 Columns
Test shape: 508438 Rows, 16 Columns


True     465829
False     42609
Name: image_top_1, dtype: int64

In [3]:
print('[{}] Create Validation Index'.format(time.time() - start_time))
if full:
    trnidx = (traindf.activation_date<=pd.to_datetime('2017-03-28')).values
    validx = (traindf.activation_date>=pd.to_datetime('2017-03-29')).values
else:
    trnidx = (traindf.activation_date<=pd.to_datetime('2017-03-26')).values
    validx = (traindf.activation_date>=pd.to_datetime('2017-03-27')).values

print('[{}] Combine Train and Test'.format(time.time() - start_time))
df = pd.concat([traindf,testdf],axis=0)
del traindf,testdf
gc.collect()
df['idx'] = range(df.shape[0])
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

#print('[{}] Count NA row wise'.format(time.time() - start_time))
#df['NA_count_rows'] = df.isnull().sum(axis=1)




[27.567209720611572] Create Validation Index
[27.574214935302734] Combine Train and Test

All Data shape: 2011862 Rows, 17 Columns


In [4]:
print('[{}] Load meta image engineered features'.format(time.time() - start_time))
featimgmeta = pd.concat([pd.read_csv(path + '../features/img_features_%s.csv.gz'%(i)) for i in range(6)])
featimgmeta.rename(columns = {'name':'image'}, inplace = True)
featimgmeta['image'] = featimgmeta['image'].str.replace('.jpg', '')
df = df.reset_index('item_id').merge(featimgmeta, on = ['image'], how = 'left').set_index('item_id')
for col in featimgmeta.columns.values[1:]:
    df[col].fillna(-1, inplace = True)
    df[col].astype(np.float32, inplace = True)
    
print('[{}] Load translated image engineered features'.format(time.time() - start_time))
feattrlten = pd.concat([pd.read_csv(path + '../features/translate_trn_en.csv.gz', compression = 'gzip'),
                        pd.read_csv(path + '../features/translate_tst_en.csv.gz', compression = 'gzip')])
# feattrlten = pd.concat([pd.read_pickle(path + '../features/translate_trn_en.pkl'),
#                        pd.read_pickle(path + '../features/translate_tst_en.pkl')])
feattrlten.fillna('', inplace = True)
feattrlten['translation'] = feattrlten['title_translated'] + ' ' + feattrlten['param_1_translated'] + ' ' \
            + feattrlten['param_2_translated'] + ' ' + feattrlten['param_3_translated'] + ' '  \
            + feattrlten['category_name_translated'] + ' ' + feattrlten['parent_category_name_translated']
feattrlten = feattrlten.set_index('item_id')[['translation']]
feattrlten.head()
df = pd.merge(df, feattrlten, left_index=True, right_index=True, how='left')
del feattrlten
gc.collect()

[28.53509545326233] Load meta image engineered features
[48.18177270889282] Load translated image engineered features


48

In [5]:
print('[{}] Load other engineered features'.format(time.time() - start_time))
featlatlon = pd.read_csv(path + '../features/avito_region_city_features.csv') # https://www.kaggle.com/frankherfert/region-and-city-details-with-lat-lon-and-clusters
featlatlon.drop(['city_region', 'city_region_id', 'region_id'], 1, inplace = True)
featpop    = pd.read_csv(path + '../features/city_population_wiki_v3.csv') # https://www.kaggle.com/stecasasso/russian-city-population-from-wikipedia/comments
featusrttl = pd.read_csv(path + '../features/user_agg.csv.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featusrcat = pd.read_csv(path + '../features/usercat_agg.csv.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featusrprd = pd.read_csv(path + '../features/user_activ_period_stats.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featrdgtxt = pd.read_csv(path + '../features/ridgeText5CV.csv.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
#featrdgtxts = pd.read_csv(path + '../features/ridgeTextStr5CV.csv.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featrdgimg = pd.read_csv(path + '../features/ridgeImg5CV.csv.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
#featrdgprc = pd.read_csv(path + '../features/price_category_ratios.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featrdgprc = pd.read_csv(path + '../features/price_seq_category_ratios.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featrdgprc.fillna(-1, inplace = True)
featrdgrnk = pd.read_csv(path + '../features/price_rank_ratios0906.gz', compression = 'gzip') # created with R script and stemmer
featrdgrnk.isnull().sum()
featnumf = pd.read_csv(path + '../features/numericFeats.gz', compression = 'gzip') 
featnumf.fillna(0, inplace = True)
#featencfst = pd.read_csv(path + '../features/alldf_bayes_fest_1206.gz', compression = 'gzip')
featprtfst = pd.read_csv(path + '../features/pratios_fest_1206.gz', compression = 'gzip')

#featprmenc = pd.read_csv(path + '../features/alldf_bayes_mean_param_1006.gz', compression = 'gzip') 
featprmtro = pd.read_csv(path + '../features/price_param_ratios1006.gz', compression = 'gzip') 
#featimgnet = pd.read_csv(path + '../features/imgnet_decode_feats.csv.gz', compression = 'gzip')
featldlag  = pd.read_csv(path + '../features/pseq_leadlag_festivities_1906.gz', compression = 'gzip')

featimgprc = pd.read_csv(path + '../features/price_imagetop1_ratios.gz', compression = 'gzip') # created with features/make/priceImgRatios2705.R
featenc = pd.read_csv(path + '../features/alldf_bayes_mean.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featct  = pd.read_csv(path + '../features/alldf_count.gz', compression = 'gzip') # created with features/make/user_actagg_1705.py
featusrttl.rename(columns={'title': 'all_titles'}, inplace = True)
df = df.reset_index().merge(featpop, on = 'city', how = 'left')
df = df.merge(featlatlon, on = ['city', 'region'], how = 'left')
df['population'].fillna(-1, inplace = True)
df = df.set_index('item_id')
keep = ['user_id', 'all_titles', 'user_avg_price', 'user_ad_ct']
df = df.reset_index().merge(featusrttl[keep], on = 'user_id').set_index('item_id')
keep = ['user_id', 'parent_category_name', 'usercat_avg_price', 'usercat_ad_ct']
gc.collect()
df = df.reset_index().merge(featusrcat[keep], on = ['user_id', 'parent_category_name']).set_index('item_id')
keep = ['user_id', 'user_activ_sum', 'user_activ_mean', 'user_activ_var']
gc.collect()
df = df.reset_index().merge(featusrprd[keep], on = ['user_id'], how = 'left').set_index('item_id')
#df = df.reset_index().merge(featimgnet, on = ['item_id'], how = 'left').set_index('item_id')

print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))  


[76.77756810188293] Load other engineered features

All Data shape: 2011862 Rows, 44 Columns


In [6]:
print('[{}] Resort data correctly'.format(time.time() - start_time))
df.sort_values('idx', inplace = True)
df.drop(['idx'], axis=1,inplace=True)
df.reset_index(inplace = True)
df.head()
df = pd.concat([df.reset_index(),featenc, featct, featrdgtxt, featrdgprc, featimgprc, featrdgrnk, featnumf, featprmtro, featprtfst, featldlag],axis=1)

print('[{}] Create folds'.format(time.time() - start_time))
foldls = [["2017-03-15", "2017-03-16", "2017-03-17"], \
       ["2017-03-18", "2017-03-19", "2017-03-20"], \
       ["2017-03-21", "2017-03-22", "2017-03-23"], \
       ["2017-03-24", "2017-03-25", "2017-03-26"], \
        ["2017-03-27", "2017-03-28", "2017-03-29", \
            "2017-03-30", "2017-03-31", "2017-04-01", \
            "2017-04-02", "2017-04-03","2017-04-07"]]
foldls = [[pd.to_datetime(d) for d in f] for f in foldls]
df['fold'] = -1
for t, fold in enumerate(foldls):
    df['fold'][df.activation_date.isin(fold)] = t
df['fold'].value_counts()
df.head()

[198.05890202522278] Resort data correctly
[203.28921937942505] Create folds


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,index,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,...,seq_seqsort_lead,seq_seqsort_lag_diff,seq_seqsort_lead_diff,item_seq_repeat_postings,category_repeat_postings,price_min_sequence,price_min_sequence_diff,seq_min_sequence,seq_min_sequence_diff,fold
0,0,b912c3c6a6ad,e00f8ff2eaf9,Свердловская область,Екатеринбург,Личные вещи,Товары для детей и игрушки,Постельные принадлежности,,,...,-1,1,-999,1,3,4000.0,-3600.0,1,1,4
1,1,2dac0150717d,39aeb48f0017,Самарская область,Самара,Для дома и дачи,Мебель и интерьер,Другое,,,...,-1,-999,-999,1,1,3000.0,0.0,19,0,3
2,2,ba83aefab5dc,91e2f88dd6e3,Ростовская область,Ростов-на-Дону,Бытовая электроника,Аудио и видео,"Видео, DVD и Blu-ray плееры",,,...,10,1,-1,1,1,2000.0,2000.0,1,8,1
3,3,02996f1dd2ea,bf5cccea572d,Татарстан,Набережные Челны,Личные вещи,Товары для детей и игрушки,Автомобильные кресла,,,...,-1,3,-999,1,7,0.0,2200.0,20,266,3
4,4,7c90be56d2ab,ef50846afc0b,Волгоградская область,Волгоград,Транспорт,Автомобили,С пробегом,ВАЗ (LADA),2110.0,...,-1,-999,-999,1,1,40000.0,0.0,3,0,0


In [7]:
df['ridge_img'] = featrdgimg['ridge_img_preds'].values
df = df.set_index('item_id')
df.drop(['index'], axis=1,inplace=True)
df.columns
del featusrttl, featusrcat, featusrprd, featenc, featrdgprc, featimgprc, featnumf, featprmtro,  featprtfst, featldlag
# del featusrttl, featusrcat, featusrprd, featenc, featrdgtxts
gc.collect()

190

In [8]:
print('[{}] Feature Engineering'.format(time.time() - start_time))
for col in df.columns:
    if 'price' in col:
        print(f'Fill {col}')
        df[col].fillna(-999,inplace=True)

for col in df.columns:
    if 'user_activ' in col:
        print(f'fill {col}')
        df[col].fillna(-9,inplace=True)
df["image_top_1"].fillna(-999,inplace=True)

del featct, featlatlon, featimgmeta, featpop, featrdgimg, featrdgtxt
gc.collect()

print('[{}] Manage Memory'.format(time.time() - start_time))
for col in df.columns:
    if np.float64 == df[col].dtype:
        df[col] = df[col].astype(np.float32)
    if np.int64 == df[col].dtype:
        df[col] = df[col].astype(np.int32)
    gc.collect()
df.dtypes

[212.0873899459839] Feature Engineering
Fill price
Fill user_avg_price
Fill usercat_avg_price
Fill pcat_price_rratio
Fill cat_price_rratio
Fill ttl_price_rratio
Fill dscr_price_rratio
Fill pcat_log_price_rratio
Fill user_log_price_rratio
Fill cat_price_iratio
Fill reg_price_iratio
Fill reg_price_gratio
Fill cty_price_gratio
Fill ttlst_price_rratio
Fill ttlst_city_price_rratio
Fill ttlst_prm_price_rratio
Fill par1cty_price_prratio
Fill par2cty_price_prratio
Fill par1utyp_price_prratio
Fill par2utyp_price_prratio
Fill rmean_price_byseq3_1
Fill rmean_price_byseq3_2
Fill rmean_price_byseq3_3
Fill rmean_price_byseq8_1
Fill rmean_price_byseq8_2
Fill rmean_price_byseq8_3
Fill price_datesort_lag
Fill price_datesort_lead
Fill price_seqsort_lag
Fill price_seqsort_lead
Fill price_min_sequence
Fill price_min_sequence_diff
fill user_activ_sum
fill user_activ_mean
fill user_activ_var
[212.31325244903564] Manage Memory


user_id                                                                                        object
region                                                                                         object
city                                                                                           object
parent_category_name                                                                           object
category_name                                                                                  object
param_1                                                                                        object
param_2                                                                                        object
param_3                                                                                        object
title                                                                                          object
description                                                                       

In [9]:
print('[{}] Text Features'.format(time.time() - start_time))
df['text_feat'] = df.apply(lambda row: ' '.join([
    str(row['param_1']), 
    str(row['param_2']), 
    str(row['param_3'])]),axis=1) # Group Param Features
df.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

print('[{}] Text Features'.format(time.time() - start_time))
df['description'].fillna('unknowndescription', inplace=True)
df['title'].fillna('unknowntitle', inplace=True)
df['text']      = (df['description'].fillna('') + ' ' + df['title'] + ' ' + 
  df['parent_category_name'].fillna('').astype(str) + ' ' + df['category_name'].fillna('').astype(str) )



[256.2147834300995] Text Features
[544.9010486602783] Text Features


In [10]:
print('[{}] Create Time Variables'.format(time.time() - start_time))
df["Weekday"] = df['activation_date'].dt.weekday
df.drop(["activation_date","image"],axis=1,inplace=True)

print('[{}] Make Item Seq number as contiuous also'.format(time.time() - start_time))
df["item_seq_number_cont"] = df["item_seq_number"]
df['city'] = df['region'].fillna('').astype(str) + '_' + df['city'].fillna('').astype(str)
df.columns
print('[{}] Encode Variables'.format(time.time() - start_time))
df.drop(['user_id'], 1, inplace = True)
categorical = ["region","parent_category_name","user_type", 'city', 'category_name', "item_seq_number", 'image_top_1']
print("Encoding :",categorical)
# Encoder:
lbl = preprocessing.LabelEncoder()
for col in categorical:
    df[col] = lbl.fit_transform(df[col].astype(str))

[550.6073706150055] Create Time Variables
[552.7413213253021] Make Item Seq number as contiuous also
[553.9189920425415] Encode Variables
Encoding : ['region', 'parent_category_name', 'user_type', 'city', 'category_name', 'item_seq_number', 'image_top_1']


In [11]:
print('[{}] Meta Text Features'.format(time.time() - start_time))
textfeats = ["description","text_feat", "title"]
for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('nicapotato') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words
    gc.collect()
df.info()
for cols in ['translation']:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('nicapotato') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently


[586.5582447052002] Meta Text Features
<class 'pandas.core.frame.DataFrame'>
Index: 2011862 entries, b912c3c6a6ad to d374d332992f
Columns: 173 entries, region to title_words_vs_unique
dtypes: float32(89), float64(3), int32(58), int64(17), object(6)
memory usage: 1.5+ GB


In [12]:
print('[{}] Manage Memory'.format(time.time() - start_time))
for col in df.columns:
    if np.float64 == df[col].dtype:
        df[col] = df[col].astype(np.float32)
    if np.int64 == df[col].dtype:
        df[col] = df[col].astype(np.int32)
    gc.collect()
df.info()

[635.4091246128082] Manage Memory
<class 'pandas.core.frame.DataFrame'>
Index: 2011862 entries, b912c3c6a6ad to d374d332992f
Columns: 173 entries, region to title_words_vs_unique
dtypes: float32(92), int32(75), object(6)
memory usage: 1.4+ GB


In [13]:

print('[{}] Clean text and tokenize'.format(time.time() - start_time))
toktok = ToktokTokenizer()
tokSentMap = {}
morpher = pymorphy2.MorphAnalyzer()
def tokSent(sent):
    sent = sent.replace('/', ' ')
    return " ".join(morpher.parse(word)[0].normal_form for word in toktok.tokenize(rgx.sub(' ', sent)))
def tokCol(var):
    return [tokSent(s) for s in var.tolist()]
rgx = re.compile('[%s]' % '!"#%&()*,-./:;<=>?@[\\]^_`{|}~\t\n')   

partitions = 4 
def parallelize(data, func):
    data_split = np.array_split(data.values, partitions)
    pool = Pool(partitions)
    data = pd.concat([pd.Series(l) for l in pool.map(tokCol, data_split)]).values
    pool.close()
    pool.join()
    return data

load_text = True
text_cols = ['description', 'text', 'text_feat', 'title', 'translation']
if load_text:
    dftxt = pd.read_csv(path + '../features/text_features_morphed.csv.gz', compression = 'gzip')
    for col in text_cols:
        print(col + ' load tokenised [{}]'.format(time.time() - start_time))
        df[col] = dftxt[col].values
        df.fillna(' ', inplace = True)
    del dftxt
else:
    for col in text_cols:
        print(col + ' tokenise [{}]'.format(time.time() - start_time))
        df[col] = parallelize(df[col], tokCol)
    df[text_cols].to_csv(path + '../features/text_features_morphed.csv.gz', compression = 'gzip')
gc.collect()

print('[{}] Add some more test processing...'.format(time.time() - start_time))
from itertools import combinations
def create_bigrams(text):
    try:
        text = np.unique( [ w for w in text.split() ] )
        lst_bi = []
        for combo in combinations(text, 2):
            cb1=combo[0]+combo[1]
            cb2=combo[1]+combo[0]
            in_dict=False
            if cb1 in word_count_dict_one:
                new_word = cb1
                in_dict=True
            if cb2 in word_count_dict_one:
                new_word = cb2
                in_dict=True
            if not in_dict:
                new_word = combo[0]+'___'+combo[1]
            if len(cb1)>=0:
                lst_bi.append(new_word)
        return ' '.join( lst_bi )
    except:
        return ' '
    
def create_bigrams_df(df):
    return df.apply( create_bigrams )
def word_count(text, dc):
    text = set( text.split(' ') ) 
    for w in text:
        dc[w]+=1
def remove_low_freq(text, dc):
    return ' '.join( [w for w in text.split() if w in dc] )

def parallelize_dataframe(df, func, cores = 4):
    df_split = np.array_split(df, cores)
    pool = Pool(cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df


[643.9637570381165] Clean text and tokenize
description load tokenised [679.4236042499542]
text load tokenised [681.4867608547211]
text_feat load tokenised [683.5818066596985]
title load tokenised [685.4153831005096]
translation load tokenised [687.3237187862396]
[689.4930799007416] Add some more test processing...


In [None]:
print('[{}] [TF-IDF] Term Frequency Inverse Document Frequency Stage'.format(time.time() - start_time))
russian_stop = set(stopwords.words('russian'))
tfidf_para = {
    "stop_words": russian_stop,
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "smooth_idf":False
}
countv_para = {
    #"stop_words": russian_stop,
    #"analyzer": 'word',
    #"token_pattern": r'\w{1,}',
    "lowercase": True,
    "min_df": 1 #False
}
def get_col(col_name): return lambda x: x[col_name]
vectorizer = FeatureUnion([
        ('text',TfidfVectorizer(
            ngram_range=(1),
            max_features=800000,
            **tfidf_para,
            preprocessor=get_col('text'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=50000,
            **tfidf_para,
            preprocessor=get_col('title'))),
        ('text_feat',CountVectorizer(
            **countv_para,
            preprocessor=get_col('text_feat'))),
    ])
    

In [16]:
df.loc[traindex,:].head().to_dict('records')

[{'Weekday': 1,
  'all_titles': 'Кокоби(кокон для сна) Коляска 2 в 1 Tako jumperX Коляска 2 в 1 Tako jumperX',
  'average_blue': 0.6123271584510803,
  'average_green': 0.3197510838508606,
  'average_red': 0.3604223430156708,
  'blurness': 656.5001831054688,
  'cat_itseq_rratio': 0.007451982703059912,
  'cat_price_iratio': 0.1877954602241516,
  'cat_price_rratio': 0.0063537778332829475,
  'category_name': 42,
  'category_name_ct': 135280,
  'category_name_enc': 0.19382381439208984,
  'category_repeat_postings': 3,
  'city': 1313,
  'city_ct': 85993,
  'city_enc': 0.1260206699371338,
  'cont_adultclothes_sizeb': 0.0,
  'cont_adultclothes_sizel': 0.0,
  'cont_adultclothes_sizev': 0.0,
  'cont_appt_apFl': 0.0,
  'cont_appt_rooms': 0.0,
  'cont_appt_sqM': 0.0,
  'cont_appt_totFl': 0.0,
  'cont_car_year': 0,
  'cont_kidsclothes_mthb': 0,
  'cont_kidsclothes_nosz': 0,
  'cont_kidsclothes_yrb': 0,
  'cont_kidsclothes_yrsb': 0,
  'cty_price_gratio': 0.11322347074747086,
  'description': 'кокон 

In [14]:
start_vect=time.time()
vectorizer.fit(df.loc[traindex,:].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
tfvocab[:50]
print('[{}] Vectorisation completed'.format(time.time() - start_time))
# Drop Text Cols
df.drop(textfeats+['text', 'all_titles', 'translation'], axis=1,inplace=True)
#drop_cols= [c for c in textfeats+['text', 'all_titles', 'translation'] if c in df.columns]
#df.drop(drop_cols, axis=1,inplace=True)

[689.5389709472656] [TF-IDF] Term Frequency Inverse Document Frequency Stage


TypeError: 'int' object is not iterable

In [None]:
gc.collect()

print('[{}] Drop all the categorical'.format(time.time() - start_time))
df.drop(categorical, axis=1,inplace=True)

In [None]:
import tensorflow as tf
import os, random
os.environ['PYTHONHASHSEED'] = '10000'
np.random.seed(10001)
random.seed(10002)
session_conf = tf.ConfigProto(intra_op_parallelism_threads=5, inter_op_parallelism_threads=1)
from keras import backend
tf.set_random_seed(10003)
backend.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation, BatchNormalization, PReLU
from keras.initializers import he_uniform
from keras.layers import Conv1D
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import Adam, SGD
from keras.models import Model

In [None]:
f = 0
print('Fold %s'%(f) + ' [{}] Modeling Stage'.format(time.time() - start_time))
trnidx = (df['fold'].loc[traindex] != f).values
trndf = df.drop('fold', 1).loc[traindex,:][trnidx].copy()
trndf[trndf>10000] = 10000
trndf[trndf<0] = 0
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
trndf = scaler.fit_transform(np.log1p(trndf.values))
#X_train = [trndf,ready_df[0:traindex.shape[0]][trnidx]]
X_train = csr_matrix(hstack([csr_matrix(trndf),ready_df[0:traindex.shape[0]][trnidx]]))
y_train = y[trnidx]
tstdf =  df.drop('fold', 1).loc[traindex,:][~trnidx].copy()
tstdf[tstdf>10000] = 10000
tstdf[tstdf<0] = 0
tstdf = scaler.transform(np.log1p(tstdf.values))
#X_test =  [tstdf, ready_df[0:traindex.shape[0]][~trnidx]]
X_test =  csr_matrix(hstack([csr_matrix(tstdf), ready_df[0:traindex.shape[0]][~trnidx]]))
y_test  = y[~trnidx]

In [None]:
#print('[{}] Load Densenet image features'.format(time.time() - start_time))
#dnimgtrn = np.load(path+'../features/densenet_pool_array_train.npy')
#dnimgtrn = dnimgtrn
#scaler = preprocessing.StandardScaler()
#dnimgtrn = scaler.fit_transform(dnimgtrn)
#gc.collect()
#dnimgtst = np.load(path+'../features/densenet_pool_array_test.npy')
#dnimgtst = scaler.transform(dnimgtst)
#gc.collect()

In [None]:
!free -m

In [None]:
from keras.layers import BatchNormalization, PReLU

def sparseNN():                                             
    sparse_data = Input( shape=[ready_df.shape[1]], 
        dtype = 'float32',   sparse = True, name='sparse_data')  

    #dense = Input(shape=[trndf.shape[1]], name="dense")

    #x = concatenate( [sparse_data, dense] ) 
    x = Dense(200 , kernel_initializer=he_uniform(seed=0) )(sparse)    
    x = PReLU()(x)
    #x = BatchNormalization()(x)
    x = Dense(200 , kernel_initializer=he_uniform(seed=0) )(x)
    x = PReLU()(x)
    x = Dense(100 , kernel_initializer=he_uniform(seed=0) )(x)
    x = PReLU()(x)
    x= Dense(1)(x)
    
    model = Model([sparse_data],x)
    
    optimizer = Adam(.0011)
    model.compile(loss="mse", optimizer=optimizer)
    return model

In [None]:
BATCH_SIZE = 1024*20
nnbags = 1
print("Fitting SPARSE NN model ...")
gc.collect()
y_predls = []
sparse_nn = sparseNN()
sparse_nn.fit(X_train, y_train, \
            batch_size=BATCH_SIZE, \
            validation_data = (X_test, y_test), \
            epochs=1, verbose=1 )
gc.collect()

In [None]:
BATCH_SIZE = 1024*20
sparse_nn.fit(X_train, y_train, \
            batch_size=BATCH_SIZE, \
            validation_data = (X_test, y_test), \
            epochs=1, verbose=1 )
gc.collect()

In [None]:
y_predls.append(sparse_nn.predict(X_test, batch_size=1024*128))
print('RMSE fold %s:'%(f), np.sqrt(metrics.mean_squared_error(y_test, sum(y_predls)/len(y_predls))))

In [None]:
# Placeholder for predictions
df['fold'].value_counts()
y_pred_trn = pd.Series(-np.zeros(df.loc[traindex,:].shape[0]), index = traindex)
y_pred_tst = pd.Series(-np.zeros(df.loc[testdex ,:].shape[0]), index = testdex)
best_iters = []
bags       = 1
BATCH_SIZE = 5000
nnbags     = 3
bags       = 1
for bag in range(bags):
    for f in range(6):
        print('Fold %s'%(f) + ' [{}] Modeling Stage'.format(time.time() - start_time))
        trnidx = (df['fold'].loc[traindex] != f).values
        trndf = df.drop('fold', 1).loc[traindex,:][trnidx].copy()
        trndf[trndf>10000] = 10000
        trndf[trndf<0] = 0
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        trndf = scaler.fit_transform(trndf.values)
        X_train = csr_matrix(hstack([csr_matrix(trndf),ready_df[0:traindex.shape[0]][trnidx]]))
        y_train = y[trnidx]
        # 5 is the test fold
        if f == 5:
            tstdf = df.drop('fold', 1).loc[testdex,:].copy()
            tstdf[tstdf>10000] = 10000
            tstdf[tstdf<-0] = 0
            tstdf = scaler.transform(tstdf.values)
            X_test = csr_matrix(hstack([csr_matrix(tstdf),ready_df[traindex.shape[0]:]]))
        else:
            tstdf =  df.drop('fold', 1).loc[traindex,:][~trnidx].copy()
            tstdf[tstdf>10000] = 10000
            tstdf[tstdf<0] = 0
            tstdf = scaler.transform(tstdf.values)
            X_test = csr_matrix(hstack([csr_matrix(tstdf), ready_df[0:traindex.shape[0]][~trnidx]]))
            y_test  = y[~trnidx]
        #tfvocab = df.drop('fold', 1).columns.tolist() + vectorizer.get_feature_names()
        del trndf
        gc.collect()
        for shape in [X_train, X_test]:
            print("Fold {} : {} Rows and {} Cols".format(f, *shape.shape))
        gc.collect();
        gc.collect()
    
        if f==5:
            best_iter = 0
            print('avg best iter: %s'%(best_iter))
            # ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, normalize=False, tol=0.01)
            # ridge.fit(X_train, y_train)
            y_predls = []
            for ep in range(nnbags):
                gc.collect()
                sparse_nn = sparseNN()
                sparse_nn.fit(X_train, y_train, \
                              batch_size=BATCH_SIZE, \
                              epochs=1, verbose=1 )
                y_predls.append(sparse_nn.predict(X_test, batch_size=20000))
            y_pred = sum(y_predls)/len(y_predls)
            
            
        else:
            best_iter = 0
            print('avg best iter: %s'%(best_iter))
            # ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, normalize=False, tol=0.01)
            # ridge.fit(X_train, y_train)
            y_predls = []
            for ep in range(nnbags):
                gc.collect()
                sparse_nn = sparseNN()
                sparse_nn.fit(X_train, y_train, \
                              batch_size=BATCH_SIZE, \
                              validation_data = (X_test, y_test), \
                              epochs=1, verbose=1 )
                y_predls.append(sparse_nn.predict(X_test, batch_size=20000))
                print('RMSE fold %s bag %s:'%(f, ep), np.sqrt(metrics.mean_squared_error(y_test, sum(y_predls)/len(y_predls))))
            y_pred = sum(y_predls)/len(y_predls)
            
        print("Model Evaluation Stage")
        if f == 5:
            y_pred_tst[:] += y_pred.flatten() # ridge.predict(X_test)
        else:
            y_pred_trn[~trnidx] += y_pred.flatten() # ridge.predict(X_test)
            print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_trn[~trnidx])))
        del X_test
        gc.collect()
        y_pred_trn.to_csv("rmlp5CV_2306_trn.csv",index=True)
        y_pred_tst.to_csv("rmlp5CV_2306_tst.csv",index=True)    



In [None]:
lgsub = pd.concat([y_pred_trn, y_pred_tst]).reset_index()
lgsub.rename(columns = {0 : 'deal_probability'}, inplace=True)
lgsub['deal_probability'] = lgsub['deal_probability']/(bag+1)
lgsub.set_index('item_id', inplace = True)
print('RMSE for all :', np.sqrt(metrics.mean_squared_error(y, lgsub.loc[traindex])))
lgsub.to_csv("rmlp5CV_2306.csv.gz",index=True,header=True, compression = 'gzip')

lgsub.to_csv(path + "../sub/rmlp5CV_2306.csv.gz",index=True,header=True, compression = 'gzip')


In [None]:
gc.collect()

In [None]:
# Placeholder for predictions
df['fold'].value_counts()
y_pred_trn = pd.Series(-np.zeros(df.loc[traindex,:].shape[0]), index = traindex)
y_pred_tst = pd.Series(-np.zeros(df.loc[testdex ,:].shape[0]), index = testdex)
best_iters = []
bags       = 1
BATCH_SIZE = 5000
nnbags     = 3
bags       = 1
for bag in range(bags):
    for f in range(6):
        print('Fold %s'%(f) + ' [{}] Modeling Stage'.format(time.time() - start_time))
        scaler = StandardScaler()
        trnidx = (df['fold'].loc[traindex] != f).values
        trndf = df.drop('fold', 1).loc[traindex,:][trnidx].copy()
        trndf[trndf>1000000] = 1000000
        trndf[trndf<0] = 0
        skew = trndf.skew()
        for col in trndf.columns:
            if skew[col]>1:
                trndf[col] = np.log1p(trndf[col].values)
        trndf = scaler.fit_transform(trndf.values)
        X_train = csr_matrix(hstack([csr_matrix(trndf),ready_df[0:traindex.shape[0]][trnidx]]))
        y_train = y[trnidx]   
        if f == 5:
            tstdf =  df.drop('fold', 1).loc[testdex,:].copy()
            tstdf[tstdf>1000000] = 1000000
            tstdf[tstdf<0] = 0
            for col in tstdf.columns:
                if skew[col]>1:
                    tstdf[col] = np.log1p(tstdf[col].values)
            tstdf = scaler.transform(tstdf.values)
            X_test = csr_matrix(hstack([csr_matrix(tstdf),ready_df[traindex.shape[0]:]]))
        else:
            tstdf =  df.drop('fold', 1).loc[traindex,:][~trnidx].copy()
            tstdf[tstdf>1000000] = 1000000
            tstdf[tstdf<0] = 0
            for col in tstdf.columns:
                if skew[col]>1:
                    tstdf[col] = np.log1p(tstdf[col].values)
            tstdf = scaler.transform(tstdf.values)
            X_test = csr_matrix(hstack([csr_matrix(tstdf), ready_df[0:traindex.shape[0]][~trnidx]]))
            y_test  = y[~trnidx]
        #tfvocab = df.drop('fold', 1).columns.tolist() + vectorizer.get_feature_names()
        del trndf
        gc.collect()
        for shape in [X_train, X_test]:
            print("Fold {} : {} Rows and {} Cols".format(f, *shape.shape))
        gc.collect();
        gc.collect()
    
        if f==5:
            best_iter = 0
            print('avg best iter: %s'%(best_iter))
            # ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, normalize=False, tol=0.01)
            # ridge.fit(X_train, y_train)
            y_predls = []
            for ep in range(nnbags):
                gc.collect()
                sparse_nn = sparseNN()
                sparse_nn.fit(X_train, y_train, \
                              batch_size=BATCH_SIZE, \
                              epochs=1, verbose=1 )
                y_predls.append(np.clip(sparse_nn.predict(X_test, batch_size=20000),-0.1, 1.1))
            y_pred = sum(y_predls)/len(y_predls)
        else:
            best_iter = 0
            print('avg best iter: %s'%(best_iter))
            # ridge = Ridge(solver='auto', fit_intercept=True, alpha=0.4, max_iter=200, normalize=False, tol=0.01)
            # ridge.fit(X_train, y_train)
            y_predls = []
            for ep in range(nnbags):
                gc.collect()
                sparse_nn = sparseNN()
                sparse_nn.fit(X_train, y_train, \
                              batch_size=BATCH_SIZE, \
                              validation_data = (X_test, y_test), \
                              epochs=1, verbose=1 )
                y_predls.append(np.clip(sparse_nn.predict(X_test, batch_size=20000),-0.1, 1.1))
                print('RMSE fold %s bag %s:'%(f, ep), np.sqrt(metrics.mean_squared_error(y_test, sum(y_predls)/len(y_predls))))
            y_pred = sum(y_predls)/len(y_predls)
            
        print("Model Evaluation Stage")
        if f == 5:
            y_pred_tst[:] += y_pred.flatten() # ridge.predict(X_test)
        else:
            y_pred_trn[~trnidx] += y_pred.flatten() # ridge.predict(X_test)
            print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_trn[~trnidx])))
        del X_test
        gc.collect()
        y_pred_trn.to_csv("rmlp5CV_2306B_trn.csv",index=True)
        y_pred_tst.to_csv("rmlp5CV_2306B_tst.csv",index=True)  

In [None]:
lgsub = pd.concat([y_pred_trn, y_pred_tst]).reset_index()
lgsub.rename(columns = {0 : 'deal_probability'}, inplace=True)
lgsub['deal_probability'] = lgsub['deal_probability']/(bag+1)
lgsub.set_index('item_id', inplace = True)
print('RMSE for all :', np.sqrt(metrics.mean_squared_error(y, lgsub.loc[traindex])))
lgsub.to_csv("rmlp5CV_2306B.csv.gz",index=True,header=True, compression = 'gzip')

lgsub.to_csv(path + "../sub/rmlp5CV_2306B.csv.gz",index=True,header=True, compression = 'gzip')