In [1]:
# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from math import sqrt
import time, gc
import pandas as pd
import numpy as np
from sklearn import preprocessing
from nltk.corpus import stopwords 
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, save_npz, load_npz
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import backend as K
import pickle, collections

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#path = '../input/'
path = "/home/darragh/avito/data/"
#path = '/Users/dhanley2/Documents/avito/data/'

path = '/home/ubuntu/avito/data/'
start_time = time.time()
full = False
validation = False

print('[{}] Load Train/Test'.format(time.time() - start_time))
usecols = ['activation_date', 'item_id', 'user_type', 'parent_category_name', 'category_name', 'price', \
           'deal_probability']
traindf = pd.read_csv(path + 'train.csv.zip', index_col = "item_id", usecols = usecols, compression = 'zip') # , parse_dates = ["activation_date"]
traindex = traindf.index
testdf = pd.read_csv(path + 'test.csv.zip', index_col = "item_id", usecols = usecols[:-1]) #, parse_dates = ["activation_date"]
testdex = testdf.index
y = traindf.deal_probability.copy()
traindf.drop("deal_probability",axis=1, inplace=True)
print('Train shape: {} Rows, {} Columns'.format(*traindf.shape))
print('Test shape: {} Rows, {} Columns'.format(*testdf.shape))
traindf['activation_date'].value_counts()

[5.984306335449219e-05] Load Train/Test
Train shape: 1503424 Rows, 5 Columns
Test shape: 508438 Rows, 5 Columns


2017-03-20    115190
2017-03-27    114863
2017-03-19    114416
2017-03-26    113513
2017-03-28    112885
2017-03-21    110535
2017-03-22    109813
2017-03-15    108615
2017-03-23    106544
2017-03-16    106168
2017-03-17     98773
2017-03-18     97554
2017-03-24     97351
2017-03-25     97104
2017-03-29        87
2017-04-02         3
2017-04-01         3
2017-03-30         3
2017-04-03         2
2017-04-07         1
2017-03-31         1
Name: activation_date, dtype: int64

In [3]:
print('[{}] Load Densenet image features'.format(time.time() - start_time))
dnimgtrn = np.load(path+'../features/vgg19_pool_array_train_float16.npy')
dnimgtst = np.load(path+'../features/vgg19_pool_array_test_float16.npy')

[14.993220329284668] Load Densenet image features


In [None]:
print('[{}] Combine Train and Test'.format(time.time() - start_time))
df = pd.concat([traindf,testdf],axis=0)
del traindf,testdf
gc.collect()
df['idx'] = range(df.shape[0])
print('\nAll Data shape: {} Rows, {} Columns'.format(*df.shape))

In [6]:
print('[{}] Combine Train and Test'.format(time.time() - start_time))
df["price_bins"] = pd.qcut(df['price'].fillna(-1), q = 50, labels = False, duplicates = 'drop').astype(str)

[476.52983498573303] Combine Train and Test


In [7]:
hotcols = ['user_type', 'parent_category_name', 'category_name', "price_bins"]
dfhot = pd.get_dummies(df[hotcols])

In [9]:
dfhot.head()

Unnamed: 0_level_0,user_type_Company,user_type_Private,user_type_Shop,parent_category_name_Бытовая электроника,parent_category_name_Для бизнеса,parent_category_name_Для дома и дачи,parent_category_name_Животные,parent_category_name_Личные вещи,parent_category_name_Недвижимость,parent_category_name_Транспорт,...,price_bins_37,price_bins_38,price_bins_39,price_bins_4,price_bins_40,price_bins_5,price_bins_6,price_bins_7,price_bins_8,price_bins_9
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b912c3c6a6ad,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2dac0150717d,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ba83aefab5dc,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02996f1dd2ea,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7c90be56d2ab,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
dnimgtrn = np.hstack([dfhot.loc[traindex].values, dnimgtrn])
dnimgtst = np.hstack([dfhot.loc[testdex].values, dnimgtst])


print('[{}] Set up folds'.format(time.time() - start_time))
foldls = [["2017-03-15", "2017-03-16", "2017-03-17"], \
           ["2017-03-18", "2017-03-19", "2017-03-20"], \
           ["2017-03-21", "2017-03-22", "2017-03-23"], \
           ["2017-03-24", "2017-03-25", "2017-03-26"], \
            ["2017-03-27", "2017-03-28", "2017-03-29", \
                "2017-03-30", "2017-03-31", "2017-04-01", \
                "2017-04-02", "2017-04-03","2017-04-07"]]
folds = [df.loc[traindex,:]['activation_date'].isin(f) for f in foldls]

[526.9494519233704] Set up folds


In [26]:
print('[{}] Modified sklearn wrapper'.format(time.time() - start_time))
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None, seed_bool = True):
        if(seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

print('[{}] Start nnet'.format(time.time() - start_time))
ntrain  = df.loc[traindex,:].shape[0]
ntest   = df.loc[testdex,:].shape[0]

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.regularizers import l2

def get_model(shape, l2_val = 0):
    model = Sequential()
    model.add(Dense(1024*4, input_dim=shape, kernel_regularizer=l2(l2_val)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    #model.add(Dropout(0.5))
    #model.add(Dense(1024, kernel_regularizer=l2(l2_val)))
    #model.add(BatchNormalization())
    #model.add(Activation('relu'))
    #model.add(Dropout(0.5))
    model.add(Dense(32, kernel_regularizer=l2(l2_val)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(1, activation='linear'))
    optim = keras.optimizers.Adam(lr=0.008)
    model.compile(loss=root_mean_squared_error, optimizer=optim)
    return model
gc.collect()

[6733.279489517212] Modified sklearn wrapper
[6733.27990937233] Start nnet


6599

In [27]:
NFOLDS = 5  
bsize = 1024
from sklearn import preprocessing
gc.collect()

from keras.regularizers import l2

oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
for i, f in enumerate(folds):
    train_index, test_index = np.where(f==False), np.where(f)
    x_tr = dnimgtrn[train_index]
    y_tr = y.values[train_index]
    x_te = dnimgtrn[test_index]
    y_te = y.values[test_index]
    ooftr = []
    oofte = []
    gc.collect()
    clf = get_model(dnimgtrn.shape[1])
    for i in range(5):
        clf.fit(x_tr, y_tr, 
          epochs=1,
          batch_size=bsize,
          validation_data = (x_te, y_te), 
          verbose=1)
        if i == 0:
            continue
        ooftr.append(clf.predict(x_te, batch_size=bsize*4).flatten())
        oofte.append(clf.predict(dnimgtst, batch_size=bsize*4).flatten())
        rms = sqrt(mean_squared_error(y_te, sum(ooftr)/len(ooftr)))
        print('Ridge OOF RMSE fold: {}'.format(rms))
    del x_tr, y_tr
    gc.collect()
    oof_train[test_index] = sum(ooftr)/len(ooftr)
    oof_test_skf[i, :] =    sum(oofte)/len(oofte)
    del x_te, train_index, test_index
    gc.collect()
oof_test[:] = oof_test_skf.mean(axis=0)
oof_train_out, oof_test_out = oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

Train on 1189868 samples, validate on 313556 samples
Epoch 1/1
Train on 1189868 samples, validate on 313556 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.2389804074494495
Train on 1189868 samples, validate on 313556 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.23729017797656682
Train on 1189868 samples, validate on 313556 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.23688960413853133
Train on 1189868 samples, validate on 313556 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.2366666015265686
Train on 1176264 samples, validate on 327160 samples
Epoch 1/1
Train on 1176264 samples, validate on 327160 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.2382398845590967
Train on 1176264 samples, validate on 327160 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.23716396954344718
Train on 1176264 samples, validate on 327160 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.2372483307216842
Train on 1176264 samples, validate on 327160 samples
Epoch 1/1
Ridge OOF RMSE fold: 0.23662219366308487
Train on 1176532 samples, validate on 326892 s

In [36]:
np.concatenate([oof_train.flatten(), oof_test_out.flatten()])

array([0.0295782 , 0.27220583, 0.21238807, ..., 0.00108205, 0.071224  ,
       0.00609451])

In [48]:
rms = sqrt(mean_squared_error(y, oof_train_out))
print('Ridge OOF RMSE: {}'.format(rms))
   
nnet_preds = np.concatenate([oof_train.flatten(), oof_test_out.flatten()])         
df['deal_probability'] = nnet_preds
df[['deal_probability']].to_csv(path + '../sub/nnetImgV5CV.csv.gz', compression = 'gzip')    

Ridge OOF RMSE: 0.2356712803522193


In [49]:
df[['deal_probability']].head()

Unnamed: 0_level_0,deal_probability
item_id,Unnamed: 1_level_1
b912c3c6a6ad,0.029578
2dac0150717d,0.272206
ba83aefab5dc,0.212388
02996f1dd2ea,0.108923
7c90be56d2ab,0.289879
