In [1]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.linear_model import LinearRegression

from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
import sklearn
from sklearn import pipeline, preprocessing, feature_extraction
from sklearn.metrics import f1_score
#import xgboost as xgb
from sklearn.metrics import mean_absolute_error
import time
import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, auc, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from scipy import sparse
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
#import lightgbm as lgbm
import scipy
from sklearn.preprocessing import StandardScaler

%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib




In [2]:
def f1_score_single(y_true, y_pred):
    y_true = set(y_true)
    y_pred = set(y_pred)
    cross_size = len(y_true & y_pred)
    if cross_size == 0: return 0.
    p = 1. * cross_size / len(y_pred)
    r = 1. * cross_size / len(y_true)
    return 2 * p * r / (p + r)
    
def f1_score(y_true, y_pred):
    return np.mean([f1_score_single(x, y) for x, y in zip(y_true, y_pred)])

In [3]:
def load_data(path_data):
    '''
    --------------------------------order_product--------------------------------
    * Unique in order_id + product_id
    '''
    priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    train = pd.read_csv(path_data + 'order_products__train.csv', 
                    dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    '''
    --------------------------------order--------------------------------
    * This file tells us which set (prior, train, test) an order belongs
    * Unique in order_id
    * order_id in train, prior, test has no intersection
    * this is the #order_number order of this user
    '''
    orders = pd.read_csv(path_data + 'orders.csv', 
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})

    #  order in prior, train, test has no duplicate
    #  order_ids_pri = priors.order_id.unique()
    #  order_ids_trn = train.order_id.unique()
    #  order_ids_tst = orders[orders.eval_set == 'test']['order_id'].unique()
    #  print(set(order_ids_pri).intersection(set(order_ids_trn)))
    #  print(set(order_ids_pri).intersection(set(order_ids_tst)))
    #  print(set(order_ids_trn).intersection(set(order_ids_tst)))

    '''
    --------------------------------product--------------------------------
    * Unique in product_id
    '''
    products = pd.read_csv(path_data + 'products.csv')
    aisles = pd.read_csv(path_data + "aisles.csv")
    departments = pd.read_csv(path_data + "departments.csv")
    sample_submission = pd.read_csv(path_data + "sample_submission.csv")
    
    return priors, train, orders, products, aisles, departments, sample_submission

class tick_tock:
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
            self.begin_time = time.time()
    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(end_time - self.begin_time))
            
def ka_add_groupby_features_1_vs_n(df, group_columns_list, agg_dict, only_new_feature=True):
    '''Create statistical columns, group by [N columns] and compute stats on [N column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       agg_dict: python dictionary

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       {real_column_name: {your_specified_new_column_name : method}}
       agg_dict = {'user_id':{'prod_tot_cnts':'count'},
                   'reordered':{'reorder_tot_cnts_of_this_prod':'sum'},
                   'user_buy_product_times': {'prod_order_once':lambda x: sum(x==1),
                                              'prod_order_more_than_once':lambda x: sum(x==2)}}
       ka_add_stats_features_1_vs_n(train, ['product_id'], agg_dict)
    '''
    with tick_tock("add stats features"):
        try:
            if type(group_columns_list) == list:
                pass
            else:
                raise TypeError(k + "should be a list")
        except TypeError as e:
            print(e)
            raise

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped.agg(agg_dict)
        the_stats.columns = the_stats.columns.droplevel(0)
        the_stats.reset_index(inplace=True)
        if only_new_feature:
            df_new = the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')

    return df_new

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
    '''Create statistical columns, group by [N columns] and compute stats on [1 column]

       Parameters
       ----------
       df: pandas dataframe
          Features matrix
       group_columns_list: list_like
          List of columns you want to group with, could be multiple columns
       target_columns_list: list_like
          column you want to compute stats, need to be a list with only one element
       methods_list: list_like
          methods that you want to use, all methods that supported by groupby in Pandas

       Return
       ------
       new pandas dataframe with original columns and new added columns

       Example
       -------
       ka_add_stats_features_n_vs_1(train, group_columns_list=['x0'], target_columns_list=['x10'])
    '''
    with tick_tock("add stats features", verbose):
        dicts = {"group_columns_list": group_columns_list , "target_columns_list": target_columns_list, "methods_list" :methods_list}

        for k, v in dicts.items():
            try:
                if type(v) == list:
                    pass
                else:
                    raise TypeError(k + "should be a list")
            except TypeError as e:
                print(e)
                raise

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        combine_name = [[grouped_name] + [method_name] + [target_name] for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        the_stats.columns = [grouped_name] + \
                            ['_%s_%s_by_%s' % (grouped_name, method_name, target_name) \
                             for (grouped_name, method_name, target_name) in combine_name]
        if keep_only_stats:
            return the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')
        return df_new

In [4]:
path_data = 'data/'
priors, train, orders, products, aisles, departments, sample_submission = load_data(path_data)

In [10]:
prior_base = pd.merge(priors, orders, on=['order_id'], how='left')

In [11]:
tmp = pd.DataFrame(prior_base.groupby(['user_id', 'product_id'])['order_number'].apply(lambda x: x.tolist()))

In [7]:
tmp[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,order_number
user_id,product_id,Unnamed: 2_level_1
1,196,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]"
1,10258,"[5, 3, 7, 4, 9, 2, 10, 8, 6]"
1,10326,[5]
1,12427,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]"
1,13032,"[7, 2, 10]"


In [12]:
tmp.shape

(13307953, 1)

In [13]:
max_ord = pd.DataFrame(prior_base.groupby(['user_id'])['order_number'].max())

In [14]:
max_ord.reset_index(inplace=True)
tmp.reset_index(inplace=True)

In [15]:
tmp.columns = ['user_id', 'product_id', 'orders']

In [16]:
max_ord.columns = ['user_id', 'max_ord']

In [17]:
tmp = pd.merge(tmp, max_ord, on=['user_id'], how='left')


In [19]:
def foo(x):
    t = []
    for i in range(1, x[3]+1):
        if i in x[2]:
            t += [1]
        else:
            t += [0]
    return t

In [20]:
tmp['seq'] = tmp.apply(foo,axis=1)

In [35]:
tmp.to_csv('sec.csv')

In [82]:
tmp = pd.read_csv('sec.csv')

In [83]:
tmp[:10]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user_id,product_id,orders,max_ord,seq
0,0,0,1,196,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]",10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,1,1,1,10258,"[5, 3, 7, 4, 9, 2, 10, 8, 6]",10,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,2,2,1,10326,[5],10,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,3,3,1,12427,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]",10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,4,4,1,13032,"[7, 2, 10]",10,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 1]"
5,5,5,1,13176,"[5, 2]",10,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
6,6,6,1,14084,[1],10,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,7,7,1,17122,[5],10,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
8,8,8,1,25133,"[5, 3, 7, 4, 9, 10, 8, 6]",10,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1]"
9,9,9,1,26088,"[2, 1]",10,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [85]:
tmp[:10]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user_id,product_id,orders,max_ord,seq
0,0,0,1,196,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]",10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
1,1,1,1,10258,"[5, 3, 7, 4, 9, 2, 10, 8, 6]",10,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
2,2,2,1,10326,[5],10,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,3,3,1,12427,"[5, 3, 7, 4, 9, 2, 1, 10, 8, 6]",10,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,4,4,1,13032,"[7, 2, 10]",10,"[0, 1, 0, 0, 0, 0, 1, 0, 0, 1]"
5,5,5,1,13176,"[5, 2]",10,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
6,6,6,1,14084,[1],10,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,7,7,1,17122,[5],10,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
8,8,8,1,25133,"[5, 3, 7, 4, 9, 10, 8, 6]",10,"[0, 0, 1, 1, 1, 1, 1, 1, 1, 1]"
9,9,9,1,26088,"[2, 1]",10,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [84]:
tmp['seq'] = tmp['seq'].apply(lambda x: list(map(int, x[1:][:-1].split(', '))))

In [51]:
    batch_size = 250
    cnt = 0
    while(1):
        #shuffle(tr)
        curr_batch_size = 0
        x_train = []
        y_train = []
        for i in tmp.seq:
            x = np.zeros(55)
            idx = 55
            for j in reversed(i):
                if idx == 55:
                    y_train.append(j)
                else:
                    x[idx] = j
                idx -= 1
                if idx == -1:
                    break
            x_train.append(x)
            curr_batch_size += 1
            if curr_batch_size == batch_size:
                break
        #y_train = np.array(y_train, np.uint8)
        x_train = np.array(x_train, np.uint8)
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
        #y_train = np.reshape(y_train, (y_train.shape[0], y_train.shape[1],1))
        #print(x_train, y_train)
        cnt += 1
        if cnt == 10:
            break

In [10]:
x_train

array([[[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]],

       [[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]],

       [[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [0],
        [0],
        [0],
        [0]],

       [[0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]]], dtype=uint8)

In [176]:
y_train

[1, 1, 0, 1]

In [86]:
def make_data(tr, batch_size):
        #np.random.shuffle(tr)
        num_start = 0
        x_train = []
        y_train = []
        curr_batch_size = 0
        for i in tr:
            num_start += 1
            x = np.zeros(55)
            idx = 55
            for j in reversed(i):
                if idx == 55:
                    y_train.append(j)
                else:
                    x[idx] = j
                idx -= 1
                if idx == -1:
                    break
            x_train.append(x)
            curr_batch_size += 1
            if curr_batch_size % 1000000 == 0:
                print(curr_batch_size)
            if curr_batch_size == batch_size:
                break
        x_train = np.array(x_train, np.uint8)
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
        curr_batch_size = 0
        return (x_train, y_train)

In [87]:
def make_tr_data(tr, batch_size):
        #np.random.shuffle(tr)
        num_start = 0
        x_train = []
        y_train = []
        curr_batch_size = 0
        for i in tr:
            num_start += 1
            x = np.zeros(55)
            idx = 54
            for j in reversed(i):
                if idx == 55:
                    y_train.append(j)
                else:
                    x[idx] = j
                idx -= 1
                if idx == -1:
                    break
            x_train.append(x)
            curr_batch_size += 1
            if curr_batch_size % 1000000 == 0:
                print(curr_batch_size)
            if curr_batch_size == batch_size:
                break
        x_train = np.array(x_train, np.uint8)
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
        curr_batch_size = 0
        return (x_train, y_train)

In [38]:
tmp.shape

(13307953, 7)

In [88]:
x, y = make_tr_data(tmp.seq[:], 13307953)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000


In [58]:
x, y = make_data(tmp.seq[:], 13307953)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000


In [11]:
def all_generator(tr, batch_size):
    while(1):
        np.random.shuffle(tr)
        num_start = 0
        x_train = []
        y_train = []
        curr_batch_size = 0
        for i in tr[num_start:]:
            num_start += 1
            x = np.zeros(15)
            idx = 15
            for j in reversed(i):
                if idx == 15:
                    y_train.append(j)
                else:
                    x[idx] = j
                idx -= 1
                if idx == -1:
                    break
            x_train.append(x)
            curr_batch_size += 1
            if curr_batch_size == batch_size:
                break
        x_train = np.array(x_train, np.uint8)
        x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
        curr_batch_size = 0
        yield (x_train, y_train)

In [12]:
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential


Using TensorFlow backend.


In [47]:
model = Sequential()

model.add(LSTM(input_shape=(55,1), output_dim=55, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(55,return_sequences=False))
model.add(Dropout(0.2))

model.add(Dense(output_dim=1))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [65]:
model.fit(x, y, batch_size=128,
          nb_epoch=10, verbose=1,
          callbacks=None,
          validation_split=0.2,
          #validation_data=None,
          shuffle=True)

Train on 10646362 samples, validate on 2661591 samples
Epoch 1/10
  636288/10646362 [>.............................] - ETA: 11543s - loss: 0.3557 - acc: 0.8643

KeyboardInterrupt: 

In [43]:
model.save_weights('lstm_15.h5')

In [14]:
batch_size = 256
model.fit_generator(all_generator(tmp.seq[:100000], batch_size),
                    samples_per_epoch=10000,
                    validation_data=all_generator(tmp.seq[:100000], batch_size),
                    nb_val_samples=5000,
                    nb_epoch=40,
                    verbose=1)

Epoch 1/40
 2048/10000 [=====>........................] - ETA: 114s - loss: 2.6837 - acc: 0.8335

KeyboardInterrupt: 

In [None]:
pred = model.predict(x)

In [None]:
pred

In [91]:
tmp['lstm_55'] = pred

ValueError: Length of values does not match length of index

In [92]:
len(pred)

250

In [10]:
import keras
keras.__version__

'1.2.2'

In [13]:
tmp = pd.DataFrame(prior_base.groupby(['user_id', 'order_number'])['product_id'].apply(lambda x: x.tolist()))

In [15]:
tmp[:3]

Unnamed: 0_level_0,Unnamed: 1_level_0,product_id
user_id,order_number,Unnamed: 2_level_1
1,1,"[196, 14084, 12427, 26088, 26405]"
1,2,"[196, 10258, 12427, 13176, 26088, 13032]"
1,3,"[196, 12427, 10258, 25133, 30450]"


In [24]:
tmp['product_id'] = tmp['product_id'].apply(lambda x: ''.join([str(i) + ' ' for i in x]))

In [28]:
hw = HashingVectorizer(n_features=20).fit(tmp['product_id'])

In [29]:
pd.DataFrame(hw.transform(tmp['product_id']).todense())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.447214,0.000000,0.447214,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.447214,0.000000,0.000000,0.447214,0.447214
1,0.000000,0.000000,0.000000,0.408248,0.408248,0.408248,0.000000,0.408248,-0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.755929,0.377964,0.000000,0.000000,0.000000,0.000000,-0.377964,0.000000,0.000000,0.000000,0.000000,0.377964,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.755929,0.377964,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.377964,0.000000,0.000000,0.000000,0.377964
4,0.000000,-0.316228,0.316228,0.000000,0.632456,0.316228,0.000000,0.000000,-0.316228,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.316228,0.316228,0.000000,0.000000,0.000000
5,0.000000,0.000000,0.000000,0.000000,0.816497,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000
6,0.000000,0.000000,0.000000,0.377964,0.755929,0.377964,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.377964,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.000000,0.000000,0.000000,0.707107,0.353553,0.000000,0.000000,0.000000,0.000000,0.000000,0.353553,0.000000,0.000000,0.000000,0.353553,-0.353553,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.707107,0.353553,0.000000,0.000000,0.000000,0.000000,0.000000,0.353553,0.000000,0.000000,0.000000,0.353553,-0.353553,0.000000,0.000000,0.000000
9,0.000000,0.000000,0.000000,0.301511,0.603023,0.301511,0.000000,0.000000,-0.301511,0.000000,0.000000,0.301511,0.000000,0.000000,0.000000,0.301511,-0.301511,0.301511,0.000000,0.000000


In [5]:
X_train = pd.merge(train, orders, on=['order_id'], how='left')

In [8]:
pd.DataFrame(X_train.groupby(['user_id', 'order_number'])['product_id'].max())

Unnamed: 0_level_0,Unnamed: 1_level_0,product_id
user_id,order_number,Unnamed: 2_level_1
1,11,49235
2,15,48821
5,5,48204
7,21,47272
8,4,48230
9,4,42828
10,6,48720
13,13,47078
14,14,42284
17,41,43352
