In [1]:
import numpy as np
import scipy as sp
import pandas as pd

In [2]:
# Directory that holds the CSV inputs; every file below is read relative to it.
path = 'data/'

# Small lookup tables are read with pandas' inferred dtypes.
df_aisles = pd.read_csv(path + 'aisles.csv')
df_departments = pd.read_csv(path + 'departments.csv')
# The prior-order line items are read with explicit narrow integer dtypes
# (presumably to reduce memory use -- this table is by far the largest one).
df_data = pd.read_csv(path + 'order_products__prior.csv', dtype={
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
})
# Line items of the "train" orders; these are later used as prediction targets.
df_train_target = pd.read_csv(path + 'order_products__train.csv')
# NOTE(review): int8 tops out at 127 -- assumes order_number, order_dow and
# order_hour_of_day never exceed that; confirm against the data.
# days_since_prior_order is left to dtype inference.
df_orders = pd.read_csv(path + 'orders.csv', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'order_number': np.int8,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
})
df_products = pd.read_csv(path + 'products.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [3]:
# merge arguments we will be using often
merge_arguments = {
    'left_index': True,
    'right_index': True,
    'how': 'outer',
}

In [4]:
# setting index for easier mapping
df_orders = df_orders.set_index('order_id')
df_products = df_products.set_index('product_id')

In [5]:
%%time
# enrich every prior-order line item with the attributes of the order it belongs to
# (values repeat heavily: a single user places many orders, each with many products)
for order_column in ['user_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']:
    df_data[order_column] = df_data.order_id.map(df_orders[order_column])

Wall time: 22.3 s


In [6]:
# user ids that belong to each evaluation split (indexed by order_id)
users_train = df_orders.user_id.loc[df_orders.eval_set == 'train']
users_test = df_orders.user_id.loc[df_orders.eval_set == 'test']

# convenience lookups
# highest order_number each user reaches within df_data
user_id_to_last_order_number = df_data.groupby('user_id').order_number.max()
# order_id of the last row each user has in df_orders
user_id_to_last_order_id = (
    df_orders
    .reset_index()
    .drop_duplicates(subset='user_id', keep='last')
    .set_index('user_id')
    .order_id
)
# df_orders is indexed by order_id, so its user_id column already is the mapping
order_id_to_user_id = df_orders.user_id

In [66]:
# split data into two groups
df_train = df_data.loc[df_data.user_id.isin(users_train)]
df_test = df_data.loc[df_data.user_id.isin(users_test)]

In [67]:
df_train_target['user_id'] = df_train_target.order_id.map(order_id_to_user_id)

In [68]:
# a quick look at the sizes
print("# of data: {:,}".format(df_data.shape[0]))
print("# of train data: {:,}".format(df_train.shape[0]))
print("# of test data: {:,}".format(df_test.shape[0]))

# of data: 32,434,489
# of train data: 20,641,991
# of test data: 11,792,498


## Create bestsellers list

In [69]:
def get_best_sellers(df, quantile):
    """Return the product ids whose sales count is at or above a quantile.

    Parameters
    ----------
    df : DataFrame with ``product_id`` and ``order_id`` columns, one row per
        product sold.
    quantile : float in [0, 1]; the quantile of the per-product sales counts
        used as the inclusive cutoff.

    Returns
    -------
    numpy array of product ids, in ascending product_id order (groupby order).
    """
    amount_sold = df.groupby('product_id').order_id.count()
    cutoff = amount_sold.quantile(quantile)
    top_products = amount_sold[amount_sold >= cutoff]
    return top_products.index.values

In [70]:
# as we have seen in part 02,
# top 20% of most sold products account for more than 90% of all items sold
# there are approximately 50,000 different products sold
# this should cut down the number of features down to about 10,000
# as we shall be using one-hot encoded data for our model
quantile = 0.8
bestsellers = get_best_sellers(df_data, quantile)

## Take only bestsellers

In [71]:
df_train = df_train.loc[df_train.product_id.isin(bestsellers)]
df_test = df_test.loc[df_test.product_id.isin(bestsellers)]

In [72]:
print("# of bestsellers in train data: {:,}".format(df_train.shape[0]))
print("# of bestsellers in test data: {:,}".format(df_test.shape[0]))

# of bestsellers in train data: 18,785,336
# of bestsellers in test data: 10,723,251


## Take only last 3 orders

In [73]:
# the minimum number of orders made by a user is 3
# we cut off all orders prior to the last three
# this is mainly done to have constant number of features across all users
# after one-hot encoding all (products, order number) pairs
# hopefully the last 3 orders are relevant enough for predicting reorders

In [74]:
# if not, we may do more feature engineering later on

In [75]:
def get_history(df):
    """Return the rows that are NOT part of each user's last three orders.

    "Last" is measured against user_id_to_last_order_number; this is the exact
    complement of get_last_orders.
    """
    oldest_excluded = df.user_id.map(user_id_to_last_order_number) - 3
    is_history = df.order_number <= oldest_excluded
    return df.loc[is_history]

def get_last_orders(df):
    """Return the rows that belong to each user's last three orders.

    "Last" is measured against user_id_to_last_order_number; this is the exact
    complement of get_history.
    """
    oldest_excluded = df.user_id.map(user_id_to_last_order_number) - 3
    is_recent = df.order_number > oldest_excluded
    return df.loc[is_recent]

In [76]:
# might be used later at some point
df_train_hist = get_history(df_train)
df_test_hist = get_history(df_test)

In [77]:
df_train = get_last_orders(df_train)
df_test = get_last_orders(df_test)

In [78]:
print("# of bestsellers not in the last 3 train data: {:,}".format(df_train_hist.shape[0]))
print("# of bestsellers not in the last 3 test data: {:,}".format(df_test_hist.shape[0]))
print('')
print("# of bestsellers in the last 3 train data: {:,}".format(df_train.shape[0]))
print("# of bestsellers in the last 3 test data: {:,}".format(df_test.shape[0]))

# of bestsellers not in the last 3 train data: 15,162,662
# of bestsellers not in the last 3 test data: 8,647,422

# of bestsellers in the last 3 train data: 3,622,674
# of bestsellers in the last 3 test data: 2,075,829


In [79]:
# here, we are throwing away about 80% of orders
# perhaps this is too much

## Set aside a validation set

In [82]:
def get_sample_users(df, n, random_state=None):
    """Draw up to ``n`` distinct user ids from ``df.user_id``.

    Parameters
    ----------
    df : DataFrame with a ``user_id`` column.
    n : int, number of users requested.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. When None (the default)
        the global numpy random state is used, exactly as before, so results
        are only reproducible if the caller seeded numpy.

    Returns
    -------
    pandas Series of user ids. If ``n`` exceeds the number of distinct users,
    all distinct users are returned in order of first appearance; otherwise
    ``n`` users sampled without replacement.
    """
    unique_users = df.user_id.unique()
    if n > unique_users.size:
        return pd.Series(unique_users)

    # fall back to the module-level generator so np.random.seed() still applies
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    return pd.Series(rng.choice(unique_users, n, replace=False))

In [83]:
validation_size = 2048

In [84]:
validation_users = get_sample_users(df_train, validation_size)

In [85]:
df_validation = df_train.loc[df_train.user_id.isin(validation_users)]
df_validation_target = df_train_target.loc[df_train_target.user_id.isin(validation_users)]

df_train = df_train.loc[~df_train.user_id.isin(validation_users)]
df_train_target = df_train_target.loc[~df_train_target.user_id.isin(validation_users)]

## Data preprocessing functions and class

In [86]:
# this function reduces order number down to 0, 1, or 2
# these numbers are "relative" to the last order number made by a user
# this is done so that we may one-hot encode the feature
def standardize_order_number(df):
    """Rewrite ``order_number`` relative to the user's last order, in place.

    The user's last order (per user_id_to_last_order_number) becomes 2, the one
    before it 1, and so on. The input frame is modified and also returned.
    """
    first_kept_order = df.user_id.map(user_id_to_last_order_number) - 2
    df.order_number = df.order_number - first_kept_order
    return df

In [87]:
def get_product_history_ohe_grouped_by_user(df):
    """Placeholder, not implemented: the body is a no-op and returns None.

    NOTE(review): presumably meant to encode purchases older than the last
    three orders (see get_history) -- confirm intent before implementing.
    """
    pass

In [88]:
def get_product_ohe_grouped_by_user(df):
    """One-hot encode purchased / reordered products, one row per user.

    Returns a frame indexed by user_id whose columns are named
    ``<order_number>_prod_<product_id>`` and ``<order_number>_re_<value>``.
    """
    # start from the grouping keys and attach the dummy columns by row index
    encoded = df[['user_id', 'order_number']].copy()

    # product_id * reordered is the product id for reorders and 0 otherwise,
    # so the 're' dummies also contain a 're_0' column for non-reordered rows
    product_features = (
        (df.product_id, 'prod'),
        (df.product_id * df.reordered, 're'),
    )
    for values, prefix in product_features:
        encoded = encoded.merge(pd.get_dummies(values, prefix=prefix), **merge_arguments)

    # collapse line items into one row per (user, order); the position of a
    # product within the cart is lost here, which may not be relevant anyway
    per_order = encoded.groupby(['user_id', 'order_number']).sum()

    # spread the orders across columns so each user ends up on a single row;
    # fill_value covers users without any row for one of the order numbers
    per_user = per_order.unstack(fill_value=0)
    per_user.columns = ['_'.join([str(order), str(feature)]) for feature, order in per_user.columns]

    return per_user

In [89]:
def get_time_ohe_grouped_by_user(df):
    """One-hot encode the time-related order attributes, one row per user.

    Returns a frame indexed by user_id whose columns are named
    ``<order_number>_dow_<d>``, ``<order_number>_hour_<h>`` and
    ``<order_number>_days_<n>`` (including a ``days_nan`` indicator).
    """
    # start from the grouping keys and attach the dummy columns by row index
    encoded = df[['user_id', 'order_number']].copy()

    time_features = [
        (df.order_dow, {'prefix': 'dow'}),
        (df.order_hour_of_day, {'prefix': 'hour'}),
        (df.days_since_prior_order, {'prefix': 'days', 'dummy_na': True}),
    ]
    for values, dummy_options in time_features:
        encoded = encoded.merge(pd.get_dummies(values, **dummy_options), **merge_arguments)

    # one row per (user, order), then one row per user with orders spread across columns
    per_order = encoded.groupby(['user_id', 'order_number']).max()
    per_user = per_order.unstack(fill_value=0)
    per_user.columns = ['_'.join([str(order), str(feature)]) for feature, order in per_user.columns]

    return per_user

In [90]:
# a function that returns column names in a predefined format
# to make sure that we have a well defined format of a full list of features
# otherwise, when using a partial set of users for our input,
# we might be missing certain products
def get_product_ohe_columns():
    """Return the full, fixed list of product feature column names.

    For each of the three relative order numbers (0, 1, 2) and each of the two
    infixes ('_prod_', '_re_') one name per bestseller is produced, e.g.
    ``'0_prod_<product_id>'``. The names come from the global ``bestsellers``
    array, so the layout does not depend on which users are being encoded.

    Returns
    -------
    1-D numpy array of strings.
    """
    product_ids = bestsellers.astype(str)
    # np.char.add is the public spelling of np.core.defchararray.add;
    # the np.core namespace is private / deprecated as of NumPy 2.0
    return np.concatenate([
        np.char.add(str(i) + infix, product_ids)
        for i in range(3) for infix in ['_prod_', '_re_']
    ])

In [91]:
def get_time_ohe_columns():
    """Return the full, fixed list of time-related feature column names.

    Covers day of week, hour of day and days since prior order for each of the
    three relative order numbers, followed by the 'last_<bucket>' columns that
    are derived from ceil(log(last order number)).
    """
    order_slots = range(3)

    dow_columns = ['_'.join([str(i), 'dow', str(dow)]) for dow in range(7) for i in order_slots]
    hour_columns = ['_'.join([str(i), 'hour', str(hour)]) for hour in range(24) for i in order_slots]

    distinct_days = np.sort(df_orders.days_since_prior_order.unique())
    days_columns = ['_'.join([str(i), 'days', str(days)]) for days in distinct_days for i in order_slots]

    last_buckets = np.ceil(np.log(user_id_to_last_order_number.sort_values())).unique()
    last_columns = ['last_' + str(last) for last in last_buckets]

    return np.concatenate([dow_columns, hour_columns, days_columns, last_columns])

In [92]:
def get_last_ohe_grouped_by_user(df):
    """One-hot encode ceil(log(last order number)) per user.

    Returns a frame indexed by user_id with one 'last_<bucket>' indicator
    column per bucket that occurs in ``df``.
    """
    log_bucket = np.ceil(np.log(df.user_id.map(user_id_to_last_order_number)))

    encoded = df[['user_id']].copy()
    encoded = encoded.merge(pd.get_dummies(log_bucket, prefix='last'), **merge_arguments)

    # every row of a user carries the same bucket, so max() just collapses them
    return encoded.groupby('user_id').max()

In [93]:
def get_ohe_columns():
    """Return every feature column name: time-related columns first, then product columns."""
    time_columns = get_time_ohe_columns()
    product_columns = get_product_ohe_columns()
    return np.concatenate([time_columns, product_columns])

In [108]:
# a simple function to combine one-hot encoded product features and time related features
def get_ohe_features(df, users):
    """Assemble the full one-hot feature frame for ``users``.

    An all-zero frame with the fixed column layout from get_ohe_columns is
    created first; product, time and last-order encodings are then written
    into the columns they share with it. Rows are returned sorted by user id.
    """
    df_ohe = pd.DataFrame(data=0, index=users, columns=get_ohe_columns(), dtype=np.uint8)

    encoders = (
        get_product_ohe_grouped_by_user,
        get_time_ohe_grouped_by_user,
        get_last_ohe_grouped_by_user,
    )
    for encode in encoders:
        encoded = encode(df)
        # only columns that exist in the fixed layout are taken over
        shared_columns = df_ohe.columns.isin(encoded.columns)
        df_ohe.loc[:, shared_columns] = encoded

    return df_ohe.sort_index()

In [109]:
def get_product_ohe_target(df, users):
    """One-hot encode the reordered bestsellers of the target orders per user.

    Returns a frame indexed by user_id with one column per product id that was
    reordered by at least one of ``users``; users without any reordered
    bestseller do not appear in the result.
    """
    targets = df.copy()
    targets['user_id'] = targets.order_id.map(order_id_to_user_id)

    # keep only requested users, and only reorders of bestsellers
    wanted = (
        targets.user_id.isin(users)
        & (targets.reordered == 1)
        & targets.product_id.isin(bestsellers)
    )
    targets = targets.loc[wanted]

    targets = targets.merge(pd.get_dummies(targets.product_id), **merge_arguments)

    # the raw order columns are no longer needed once the dummies exist
    raw_columns = ['order_id', 'product_id', 'add_to_cart_order', 'reordered']
    targets = targets.drop(raw_columns, axis=1)

    # one row per user
    return targets.groupby('user_id').sum()

In [110]:
def get_ohe_target(df, users):
    """Return the one-hot reorder targets for ``users`` over all bestsellers.

    Parameters
    ----------
    df : target order line items (passed on to get_product_ohe_target).
    users : pandas Series (or array-like) of user ids; one output row each.

    Returns
    -------
    DataFrame of dtype uint8, indexed by user id (sorted) with one column per
    entry of the global ``bestsellers`` array. Users without any reordered
    bestseller, and bestsellers nobody reordered, are filled with 0.
    """
    df_temp = get_product_ohe_target(df, users)

    # A single reindex adds the missing users and the missing product columns.
    # It replaces DataFrame.append (removed in pandas 2.0) followed by an
    # alignment-by-assignment, and also copes with an empty df_temp.
    df_ohe = df_temp.reindex(index=pd.Index(users), columns=bestsellers, fill_value=0)

    return df_ohe.astype(np.uint8).sort_index()

In [111]:
    # NOTE(review): this cell holds a method-indented definition outside of any
    # class body, so it cannot run as a standalone cell. It looks like an
    # earlier draft of DF_ohe.set_predict -- confirm and delete if so.
    def set_predict(self, model):
        """Draft: predict per user batch and stack the results into self.sparse_predict."""
        sparse_predict_temp = []
        for batch in self.user_batches:
            # NOTE(review): model.predict receives the full self.sparse_features on
            # every iteration; `batch` is only used as the frame index, so the
            # shapes will not line up unless there is a single batch.
            predict_temp = pd.DataFrame(data=model.predict(self.sparse_features).astype(np.uint8).todense(),
                                        index=batch, columns=bestsellers, dtype=np.uint8)
            sparse_predict_temp.append(sp.sparse.csr_matrix(predict_temp))
        self.sparse_predict = sp.sparse.vstack(sparse_predict_temp)

In [112]:
# a complete overhaul might be needed
# somehow this code has grown to accommodate more than was originally intended
class DF_ohe:
    """One-hot encoded features, targets and predictions for a set of users.

    Features are built batch by batch (to limit peak memory) and stored as one
    sparse matrix whose rows follow the sorted user ids.

    Attributes set by the constructor:
        df              copy of the input rows of the requested users, with
                        order_number rewritten by standardize_order_number
        users           sorted user ids with a fresh 0..n-1 index
        user_batches    consecutive slices of ``users``
        sparse_features vertically stacked CSR feature matrix

    Attributes set later: ``sparse_target`` (set_target), ``sparse_predict``
    (set_predict) and ``submission`` (set_submission).
    """

    def __init__(self, df, users, batch_size=1024):
        """Encode the features of ``users`` found in ``df``.

        Parameters
        ----------
        df : order line items with at least user_id and order_number columns.
        users : pandas Series of user ids.
        batch_size : int, number of users encoded at a time (default 1024).
        """
        if batch_size < 1:
            raise ValueError('batch_size must be a positive integer')

        self.df = df.loc[df.user_id.isin(users)].copy()
        self.df = standardize_order_number(self.df)

        # create small batches of users to save memory
        self.users = users.sort_values()
        self.users.index = range(self.users.size)
        self.user_batches = [
            self.users.iloc[start:start + batch_size]
            for start in range(0, self.users.size, batch_size)
        ]

        # NOTE: users that have no rows in df are not handled specially here;
        # the sample is expected to be drawn from df after it has been reduced

        # preprocess features in batches
        # otherwise this will take up too much memory
        sparse_features_temp = []
        for batch in self.user_batches:
            features_temp = get_ohe_features(self.df.loc[self.df.user_id.isin(batch)], batch)
            sparse_features_temp.append(sp.sparse.csr_matrix(features_temp))

        self.sparse_features = sp.sparse.vstack(sparse_features_temp)

    def set_target(self, df):
        """Encode the targets in ``df`` batch by batch into self.sparse_target."""
        # process transforming targets in batches
        # otherwise this will take up too much memory
        sparse_target_temp = []
        for batch in self.user_batches:
            target_temp = get_ohe_target(df.loc[df.user_id.isin(batch)], batch)
            sparse_target_temp.append(sp.sparse.csr_matrix(target_temp))
        self.sparse_target = sp.sparse.vstack(sparse_target_temp)

    def set_predict(self, model):
        """Store ``model.predict`` on the feature matrix as self.sparse_predict."""
        self.sparse_predict = model.predict(self.sparse_features)

    def set_submission(self):
        """Build self.submission with columns ['order_id', 'products'].

        ``products`` is the space-separated list of predicted bestseller ids of
        a user, or the string 'None' when nothing is predicted. ``order_id`` is
        looked up in user_id_to_last_order_id by user id.
        """
        dict_temp = dict()

        for i in range(self.sparse_predict.shape[0]):
            reordered = bestsellers[self.sparse_predict[i].nonzero()[1]]
            # an empty join yields '', which is replaced by the 'None' marker
            str_temp = ' '.join(str(product) for product in reordered) or 'None'
            dict_temp[str(self.users[i])] = str_temp

        self.submission = pd.DataFrame.from_dict(dict_temp, orient='index')
        self.submission.index = self.submission.index.astype(np.uint32)
        self.submission['order_id'] = user_id_to_last_order_id
        self.submission.columns = ['products', 'order_id']
        self.submission = self.submission.loc[:, ['order_id', 'products']]

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def print_results(self, verbose=False):
        """Print precision and a pseudo F1 score of the stored predictions.

        Requires set_target and set_predict to have been called. The pseudo F1
        adds an estimated 10% of the relevant reorders as false negatives to
        account for non-bestsellers. Scores whose denominator is zero are
        reported as 0.000 instead of nan.
        """
        # number of predicted reorders
        PT = self.sparse_predict.sum()
        # number of relevant reorders
        RT = self.sparse_target.sum()
        # true positive of reorders
        TP = self.sparse_predict.multiply(self.sparse_target).sum()
        # number of false negative among bestsellers
        FN1 = RT - TP
        # estimation of false negative among non-bestsellers
        FN2 = 0.1 * RT
        # false positive of reorders
        FP = PT - TP

        # print detailed result if verbose is true
        if verbose:
            print('predicted true: {}'.format(PT))
            print('relevant true: {}'.format(RT))
            print('true positive: {}'.format(TP))
            print('false negative 1: {}'.format(FN1))
            print('false negative 2: {}'.format(int(FN2)))
            print('false positive: {}'.format(FP))
            print('')

        precision = self._ratio(TP, TP + FP)
        pseudo_f1 = self._ratio(2 * TP, (2 * TP) + FN1 + FN2 + FP)
        print('precision score: {:.3f}'.format(precision))
        print('pseudo f1 score: {:.3f}'.format(pseudo_f1))

## Build a model

In [113]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.externals import joblib

In [114]:
# take a sample of users
sample_size = 1024
sample_users = get_sample_users(df_train, sample_size)

In [115]:
model = OneVsRestClassifier(DecisionTreeClassifier(criterion='entropy'), n_jobs=4)
model_name = 'TS{:05d}_OVRDT'.format(sample_size)

In [116]:
%%time
df_ohe_sample = DF_ohe(df_train, sample_users)
df_ohe_sample.set_target(df_train_target)

Wall time: 20.4 s


In [117]:
%%time
%%capture --no-stdout
model.fit(df_ohe_sample.sparse_features, df_ohe_sample.sparse_target)

Wall time: 56.1 s


In [118]:
# fit/predict precision/score #1 #2 #3

# for a sample size of 256
# OvR + DecisionTreeClassifier:
#   13s/0.9s 0.191/0.118 0.183/0.105 0.198/0.112
# OvR + DecisionTreeClassifier with min leaf 5:
#   13s/0.9s 0.381/0.052 0.460/0.067 0.477/0.059
# OvR + DecisionTreeClassifier with min split 5:
#   14s/0.9s 0.162/0.100 0.168/0.095 0.149/0.089
# OvR + SGDClassifier:
#   7s/0.8s 0.119/0.052 0.127/0.048 0.170/0.065
# OvR + AdaBoostClassifier:
#   97s/3s  0.110/0.046 0.177/0.081 0.195/0.080

# for a sample size of 512
# OvR + DecisionTreeClassifier:
#   44s/2s 0.167/0.117 0.177/0.126 0.167/0.118
# OvR + DecisionTreeClassifier with min leaf 5:
#   45s/2s 0.407/0.109 0.443/0.113 0.381/0.099
# OvR + DecisionTreeClassifier with min split 5:
#   46s/2s 0.171/0.126 0.175/0.126 0.171/0.126
# OvR + SGDClassifier:
#   9s/1s 0.225/0.076 0.184/0.070 0.185/0.058
# OvR + AdaBoostClassifier:
#   464s/10s 0.196/0.110 0.194/0.108 0.227/0.127

# for a sample size of 1024
# OvR + DecisionTreeClassifier:
#   74s/4s 0.190/0.140 0.179/0.133 0.190/0.139
# OvR + DecisionTreeClassifier with min leaf 5:
#   74s/4s 0.412/0.140 0.399/0.126 0.437/0.138
# OvR + DecisionTreeClassifier with min split 5:
#   82s/4s 0.194/0.152 0.185/0.144 0.185/0.145
# OvR + SGDClassifier:
#   14s/3s 0.256/0.093 0.257/0.091 0.294/0.101
# OvR + AdaBoostClassifier:
#   1159s/31s 0.162/0.070 0.205/0.086 0.175/0.076

# comment: note that although "OvR + DecisionTreeClassifier with min leaf 5" has
# comparable score to other models, it is distinguished by having high precision rate
# this means that the model is more cautious in marking a product as reordered

# comment: there doesn't seem to be any difference in regards to calculation time
# between different parameters

# OvR + DecisionTreeClassifier with max depth n: might need further investigation
# OvR + DecisionTreeClassifier with max features n: very low score
# OvR + SGDClassifier: fast but less accurate than decision trees
# OvR + AdaBoostClassifier: quite slow compared to decision trees
# OvR + Perceptron: fast but less accurate than decision trees
#   at sample size of 1024, there is a huge dip in its score for some reason

# OvR + SGDClassifier: the model itself takes up too much memory
# OvR + SVC: takes too long
# OvR + LinearSVC: does not predict
# OvR + XGBClassifier: takes too long
# OvR + KNeighborsClassifier: does not predict
# OvR + RandomForestClassifier: does not predict
# OvR + MLPClassifier: takes too long
# OvR + BernoulliNB: does not predict
# OvR + GradientBoostingClassifier: does not work with sparse form
# OvR + ExtraTreesClassifier: low score
# KNeighborsClassifier: does not work in sparse form
# DecisionTreeClassifier: low score, does not work with sparse form
# RandomForestClassifier: does not work with sparse form
# MLPClassifier: does not predict
# MO + anything: does not work with sparse form

In [119]:
%%time
df_ohe_sample.set_predict(model)
df_ohe_sample.print_results(verbose=True)

predicted true: 5719
relevant true: 5719
true positive: 5719
false negative 1: 0
false negative 2: 571
false positive: 0

precision score: 1.000
pseudo f1 score: 0.952
Wall time: 2.53 s


## Validation

In [138]:
%%time
# set validation data
df_ohe_validation = DF_ohe(df_validation, validation_users)
df_ohe_validation.set_target(df_validation_target)

Wall time: 38 s


In [139]:
%%time
df_ohe_validation.set_predict(model)
df_ohe_validation.print_results(verbose=True)

predicted true: 3036
relevant true: 12359
true positive: 277
false negative 1: 12082
false negative 2: 1235
false positive: 2759

precision score: 0.091
pseudo f1 score: 0.033
Wall time: 467 ms


## Make predictions on test data

In [1444]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that were produced locally by a trusted run.
# The file name matches the 'TS{:05d}_' + model_name + '.pkl' pattern that
# model_test writes when pickle=True -- presumably that is where it came from.
model = joblib.load('TS32768_OVRDTSLSSMD5.pkl')
model_name = 'TS32768_OVRDTSLSSMD5'

In [122]:
# getting ready
test_users = pd.Series(df_test.user_id.unique())
all_test_users = df_orders.loc[df_orders.eval_set == 'test', 'user_id']
missing_test_users = all_test_users.loc[~all_test_users.isin(test_users)]

In [123]:
# create user batches
batch_size = 1024

# slice by position instead of consuming test_users in a while-loop:
# test_users stays intact, so this cell can be re-run safely
user_batches = [
    test_users.iloc[start:start + batch_size]
    for start in range(0, test_users.size, batch_size)
]

In [124]:
# there may be some missing users who did not purchase
# any product from the bestsellers list
# we need to keep track of them to add them back in
# in the final result at the end
missing_test_users_submission = pd.DataFrame(index=missing_test_users)
missing_test_users_submission['order_id'] = user_id_to_last_order_id
missing_test_users_submission['products'] = 'None'

# matching index data type to DF_ohe.submission.index
missing_test_users_submission.index = missing_test_users_submission.index.astype(np.uint64)

In [125]:
%%time
# Collect the per-batch frames and concatenate them once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
batch_submissions = []

for i, batch in enumerate(user_batches):
    print(str(i), end=' ')
    df_ohe_test = DF_ohe(df_test, batch)
    df_ohe_test.set_predict(model)
    df_ohe_test.set_submission()
    batch_submissions.append(df_ohe_test.submission)

print('')

if batch_submissions:
    submission = pd.concat(batch_submissions)
else:
    # no batches at all: keep the empty, correctly shaped frame
    submission = pd.DataFrame(columns=['order_id', 'products'])
    submission.index = submission.index.astype(np.uint64)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 
Wall time: 26min 53s


In [126]:
# adding in missing users and sort the result by order_id
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
submission = pd.concat([submission, missing_test_users_submission])
submission = submission.sort_values(by='order_id')

In [127]:
# export to a file
submission.to_csv(model_name + '.csv', index=False)

## Automated model testing

In [140]:
# for multiple trials, if we wanted to be more rigorous,
# on each trial, we should be taking new samples for fitting
# however, we do not implement this as this would take much more time to process
def model_test(model, model_name, sample_size=128, verbose=False, pickle=False):
    """Fit ``model`` on a fresh sample of training users and score it on the validation set.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``; it is fitted in place.
    model_name : str, label used in the printed header and in the pickle file name.
    sample_size : int, number of users drawn from the global ``df_train``.
    verbose : bool, print progress markers and the detailed result counts.
    pickle : bool, when true the fitted model is dumped with joblib to
        'TS<sample_size, 5 digits>_<model_name>.pkl' in the working directory.

    Relies on the notebook globals ``df_train``, ``df_train_target`` and
    ``df_ohe_validation``; the latter's ``sparse_predict`` is overwritten.
    Uses the IPython ``%time`` magic, so it only runs inside IPython/Jupyter.
    """
    print('================================')
    print('TS{:05d}_'.format(sample_size) + model_name)
    print('================================')
    
    # get a sample and a fit
    sample_users = get_sample_users(df_train, sample_size)
    df_ohe_sample = DF_ohe(df_train, sample_users)
    df_ohe_sample.set_target(df_train_target)
    
    if verbose:
        print('Fitting')
    %time model.fit(df_ohe_sample.sparse_features, df_ohe_sample.sparse_target)
    if verbose:
        print('--------------------------------')
    
    # pickle the model for later use if pickle is true
    # (the `pickle` parameter shadows the stdlib module name inside this function)
    if pickle:
        joblib.dump(model, 'TS{:05d}_'.format(sample_size) + model_name + '.pkl')
    
    # sample_validation_users = get_sample_users(df_train, sample_size)
    if verbose:
        print('Validating')
    # predictions are stored on the shared, pre-built validation object
    %time df_ohe_validation.set_predict(model)
    if verbose:
        print('--------------------------------')

    # print results
    df_ohe_validation.print_results(verbose)
    print('')

In [141]:
%%capture --no-stdout
# setting test arguments
verbose = True
pickle = False
sample_sizes = [32]
models = [
#    (OneVsRestClassifier(DecisionTreeClassifier()), 'OVR_DT'),
#    (OneVsRestClassifier(DecisionTreeClassifier(min_samples_leaf=5)), 'OVR_DT_SL5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(min_samples_split=5)), 'OVR_DT_SS5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(max_depth=5)), 'OVR_DT_MD5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(min_samples_leaf=5, min_samples_split=5, max_depth=5)), 'OVR_DT_SLSSMD5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(criterion='entropy', class_weight={0:1, 1:3})), 'OVRDT'),
#    (OneVsRestClassifier(DecisionTreeClassifier(criterion='entropy', class_weight={0:1, 1:3}, min_samples_leaf=5)), 'OVRDTSL5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(criterion='entropy', class_weight={0:1, 1:3}, min_samples_split=5)), 'OVRDTSS5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(criterion='entropy', class_weight={0:1, 1:3}, max_depth=5)), 'OVRDTMD5'),
    (OneVsRestClassifier(DecisionTreeClassifier()), 'OVRDT'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:2})), 'OVRDT1_2'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:3})), 'OVRDT1_3'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:4})), 'OVRDT1_4'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:5})), 'OVRDT1_5'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:6})), 'OVRDT1_6'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:7})), 'OVRDT1_7'),
#    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:8})), 'OVRDT1_8'),
    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:8})), 'OVRDT1_8'),
    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:16})), 'OVRDT1_16'),
    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:24})), 'OVRDT1_24'),
    (OneVsRestClassifier(DecisionTreeClassifier(class_weight={0:1, 1:32})), 'OVRDT1_32'),
]

for sample_size in sample_sizes:
    for model, model_name in models:
        model_test(model, model_name, sample_size, verbose, pickle)

TS00032_OVRDT
Fitting
Wall time: 7.12 s
--------------------------------
Validating
Wall time: 536 ms
--------------------------------
predicted true: 3636
relevant true: 12359
true positive: 331
false negative 1: 12028
false negative 2: 1235
false positive: 3305

precision score: 0.091
pseudo f1 score: 0.038

TS00032_OVRDT1_8
Fitting
Wall time: 7.23 s
--------------------------------
Validating
Wall time: 513 ms
--------------------------------
predicted true: 5491
relevant true: 12359
true positive: 423
false negative 1: 11936
false negative 2: 1235
false positive: 5068

precision score: 0.077
pseudo f1 score: 0.044

TS00032_OVRDT1_16
Fitting
Wall time: 5.67 s
--------------------------------
Validating
Wall time: 458 ms
--------------------------------
predicted true: 3156
relevant true: 12359
true positive: 342
false negative 1: 12017
false negative 2: 1235
false positive: 2814

precision score: 0.108
pseudo f1 score: 0.041

TS00032_OVRDT1_24
Fitting
Wall time: 6.45 s
-------------

In [1304]:
import glob
saved_model_files = glob.glob('*.pkl')

## Things to do
----
* add more features (order history)
* optimize parameters
* append missing users
* try with more products

## Test Incremental Models

## One-hot-encode the whole data

## Neural Network Models