In [1]:
%matplotlib inline
path = 'data/'

In [2]:
# tables read with pandas' inferred dtypes
df_aisles = pd.read_csv(path + 'aisles.csv')
df_departments = pd.read_csv(path + 'departments.csv')
df_products = pd.read_csv(path + 'products.csv')
df_train_target = pd.read_csv(path + 'order_products__train.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv')

# tables read with explicit narrow integer dtypes
# (columns not listed here, e.g. days_since_prior_order, keep the inferred dtype)
df_data = pd.read_csv(
    path + 'order_products__prior.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.int32,
        add_to_cart_order=np.int16,
        reordered=np.int8,
    ),
)
df_orders = pd.read_csv(
    path + 'orders.csv',
    dtype=dict(
        order_id=np.int32,
        user_id=np.int32,
        order_number=np.int8,
        order_dow=np.int8,
        order_hour_of_day=np.int8,
    ),
)

In [3]:
# Keyword arguments shared by the DataFrame.merge calls below:
# join on the index of both frames and keep every row from either side.
merge_arguments = dict(left_index=True, right_index=True, how='outer')

In [4]:
# setting index for easier mapping (order_id -> order row, product_id -> product row)
# Guarded so that re-running this cell is a no-op: calling set_index again after
# the id column has already become the index would raise a KeyError.
if df_orders.index.name != 'order_id':
    df_orders = df_orders.set_index('order_id')
if df_products.index.name != 'product_id':
    df_products = df_products.set_index('product_id')

In [5]:
%%time
# Copy the per-order attributes from df_orders (indexed by order_id) onto every
# product row of df_data. The values repeat heavily because a single user
# places many orders, each containing many products.
for _order_column in ['user_id', 'order_number', 'order_dow',
                      'order_hour_of_day', 'days_since_prior_order']:
    df_data[_order_column] = df_data.order_id.map(df_orders[_order_column])

Wall time: 23.6 s


In [6]:
# user ids of each evaluation group (Series indexed by order_id)
users_train = df_orders.user_id[df_orders.eval_set == 'train']
users_test = df_orders.user_id[df_orders.eval_set == 'test']

# lookup Series used throughout the notebook
# user_id -> highest order_number found in the prior-order rows
user_to_last_order = df_data.groupby('user_id').order_number.max()
# user_id -> order_id of that user's last row in df_orders
user_to_order = (
    df_orders
    .reset_index()
    .drop_duplicates(subset='user_id', keep='last')
    .set_index('user_id')
    .order_id
)
# order_id -> user_id
order_to_user = df_orders.user_id

In [7]:
# partition the prior-order rows by the evaluation group of their user
df_train = df_data[df_data.user_id.isin(users_train)]
df_test = df_data[df_data.user_id.isin(users_test)]

In [8]:
# report the (rows, columns) shape of the full data and of each split
print(
    "# of data: {}".format(df_data.shape),
    "# of train data: {}".format(df_train.shape),
    "# of test data: {}".format(df_test.shape),
    sep='\n',
)

# of data: (32434489, 9)
# of train data: (20641991, 9)
# of test data: (11792498, 9)


## Create bestsellers list

In [9]:
def get_best_sellers(df, quantile):
    """Return the ids of the products sold at least as often as a quantile cut-off.

    Parameters
    ----------
    df : DataFrame with ``product_id`` and ``order_id`` columns, one row per
        product placed in an order.
    quantile : float in [0, 1]; products whose row count is greater than or
        equal to this quantile of all per-product counts are kept.

    Returns
    -------
    numpy array of product ids, in the (sorted) order produced by the groupby.
    """
    amount_sold = df.groupby('product_id').order_id.count()
    cutoff = amount_sold.quantile(quantile)
    return amount_sold[amount_sold >= cutoff].index.values

In [10]:
# Part 02 showed that the top 20% most-sold products account for more than
# 90% of all items sold. With roughly 50,000 distinct products, keeping only
# that top 20% should leave about 10,000 products, which keeps the number of
# one-hot encoded features used by the model manageable.
quantile = 0.8
bestsellers = get_best_sellers(df_data, quantile=quantile)

## Take only bestsellers

In [11]:
# drop every row whose product is not in the bestsellers list
df_train = df_train[df_train.product_id.isin(bestsellers)]
df_test = df_test[df_test.product_id.isin(bestsellers)]

In [12]:
# shapes after restricting both splits to bestsellers
print(
    "# of bestsellers in train data: {}".format(df_train.shape),
    "# of bestsellers in test data: {}".format(df_test.shape),
    sep='\n',
)

# of bestsellers in train data: (18785336, 9)
# of bestsellers in test data: (10723251, 9)


## Take only last 3 orders

In [13]:
# the minimum number of orders made by a user is 3
# we cut off all orders prior to the last three
# this is mainly done to have constant number of features across all users
# after one-hot encoding all (products, order number) pairs
# hopefully the last 3 orders are relevant enough for predicting reorders

In [14]:
# if not, we may do more feature engineering later on

In [15]:
def get_last_orders(df):
    """Keep only the rows belonging to each user's three most recent orders.

    An order is kept when its ``order_number`` is fewer than 3 below the
    user's highest order number, looked up in the global ``user_to_last_order``.

    Parameters
    ----------
    df : DataFrame with ``user_id`` and ``order_number`` columns.

    Returns
    -------
    The matching subset of ``df``.
    """
    last_order_number = df.user_id.map(user_to_last_order)
    orders_before_last = last_order_number - df.order_number
    return df.loc[orders_before_last < 3]

In [16]:
# restrict both splits to each user's last three orders
df_train, df_test = (get_last_orders(frame) for frame in (df_train, df_test))

In [17]:
# shapes after keeping only the last three orders per user
print(
    "# of bestsellers in the last 3 train data: {}".format(df_train.shape),
    "# of bestsellers in the last 3 test data: {}".format(df_test.shape),
    sep='\n',
)

# of bestsellers in the last 3 train data: (3622674, 9)
# of bestsellers in the last 3 test data: (2075829, 9)


## Data preprocessing functions and class

In [18]:
def standardize_order_number(df):
    """Rewrite ``order_number`` relative to each user's last order.

    The user's highest order number (from the global ``user_to_last_order``)
    becomes 2, the one before it 1, and the one before that 0, so the feature
    can be one-hot encoded with a fixed set of values.

    Note: the ``order_number`` column of ``df`` is overwritten in place; the
    same object is returned for convenience.
    """
    first_kept_order = df.user_id.map(user_to_last_order) - 2
    df['order_number'] = df.order_number - first_kept_order
    return df

In [19]:
def get_product_history_ohe_grouped_by_user(df):
    """Unimplemented placeholder: ignores ``df`` and returns ``None``.

    NOTE(review): presumably reserved for the "order history" features listed
    under "Things to do" -- confirm before relying on it.
    """
    return None

In [20]:
def get_product_ohe_grouped_by_user(df):
    """One-hot encode ordered / reordered products, one row per user.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``order_number``, ``product_id`` and
        ``reordered`` columns (one row per product in an order).

    Returns
    -------
    DataFrame indexed by ``user_id`` whose columns are named
    ``<order_number>_prod_<product_id>`` and ``<order_number>_re_<value>``,
    holding per-order sums of the one-hot indicators.
    """
    # key columns every indicator row is grouped by
    keys = df[['user_id', 'order_number']].copy()

    # indicators for "product was ordered" and "product was reordered";
    # product_id * reordered is 0 for rows that are not reorders, so those rows
    # end up in a single 're_0' column
    ordered = pd.get_dummies(df.product_id, prefix='prod')
    reordered = pd.get_dummies(df.product_id * df.reordered, prefix='re')
    per_row = keys.merge(ordered, **merge_arguments).merge(reordered, **merge_arguments)

    # collapse to one row per (user, order): which products that order contained.
    # The position at which a product was added to the cart is lost here,
    # but that information may not be relevant.
    per_order = per_row.groupby(['user_id', 'order_number']).sum()

    # move order_number into the columns so each user occupies a single row;
    # fill_value covers users without any listed product in one of the orders
    per_user = per_order.unstack(fill_value=0)
    per_user.columns = ['{}_{}'.format(order, feature) for feature, order in per_user.columns]

    return per_user

In [21]:
def get_time_ohe_grouped_by_user(df):
    """One-hot encode the time-related order features, one row per user.

    Parameters
    ----------
    df : DataFrame with ``user_id``, ``order_number``, ``order_dow``,
        ``order_hour_of_day`` and ``days_since_prior_order`` columns.

    Returns
    -------
    DataFrame indexed by ``user_id`` with columns named
    ``<order_number>_dow_<d>``, ``<order_number>_hour_<h>`` and
    ``<order_number>_days_<n>`` (missing day gaps get their own ``days_nan``
    indicator).
    """
    per_row = df[['user_id', 'order_number']].copy()

    # attach one block of indicator columns per time feature
    indicator_blocks = [
        pd.get_dummies(df.order_dow, prefix='dow'),
        pd.get_dummies(df.order_hour_of_day, prefix='hour'),
        pd.get_dummies(df.days_since_prior_order, prefix='days', dummy_na=True),
    ]
    for block in indicator_blocks:
        per_row = per_row.merge(block, **merge_arguments)

    # every row of an order carries the same time values, so max() collapses
    # them to one row per (user, order) before spreading orders over columns
    per_order = per_row.groupby(['user_id', 'order_number']).max()
    per_user = per_order.unstack(fill_value=0)
    per_user.columns = ['{}_{}'.format(order, feature) for feature, order in per_user.columns]

    return per_user

In [22]:
def get_product_ohe_columns():
    """Return the full, fixed list of product one-hot column names.

    Having a predefined list guarantees the same feature layout even when the
    input covers only a subset of users (and therefore only some products).

    Names follow ``<order>_prod_<product_id>`` and ``<order>_re_<product_id>``
    for order 0, 1, 2 and every id in the global ``bestsellers``; for each
    order the ``_prod_`` block comes before the ``_re_`` block.

    Returns
    -------
    1-D numpy array of strings.
    """
    product_ids = np.asarray(bestsellers).astype(str)
    # np.char.add is the public name of the element-wise string concatenation;
    # the former np.core.defchararray path is private and deprecated in NumPy 2
    return np.concatenate([
        np.char.add(str(i) + infix, product_ids)
        for i in range(3) for infix in ['_prod_', '_re_']
    ])

In [23]:
def get_time_ohe_columns():
    """Return the fixed list of time-related one-hot column names.

    Covers, for each of the three order slots 0-2: day of week 0-6, hour 0-23
    and every distinct ``days_since_prior_order`` value found in the global
    ``df_orders``; followed by one ``last_<k>`` name per distinct value of
    ceil(log(last order number)) taken from the global ``user_to_last_order``.

    Returns
    -------
    1-D numpy array of strings.
    """
    def names_per_order(infix, values):
        # value-major, order-minor: 0_x_v, 1_x_v, 2_x_v, then the next value
        return [str(i) + infix + str(value) for value in values for i in range(3)]

    day_gaps = np.sort(df_orders.days_since_prior_order.unique())
    last_order_bins = np.ceil(np.log(user_to_last_order.sort_values())).unique()

    return np.concatenate([
        names_per_order('_dow_', range(7)),
        names_per_order('_hour_', range(24)),
        names_per_order('_days_', day_gaps),
        ['last_' + str(last) for last in last_order_bins],
    ])

In [24]:
def get_last_ohe_grouped_by_user(df):
    """One-hot encode a log-scale bucket of each user's last order number.

    The bucket is ceil(log(n)) where n is the user's entry in the global
    ``user_to_last_order``; columns are named ``last_<bucket>``.

    Parameters
    ----------
    df : DataFrame with a ``user_id`` column.

    Returns
    -------
    DataFrame indexed by ``user_id`` with one indicator column per bucket
    present in ``df``.
    """
    last_order_number = df.user_id.map(user_to_last_order)
    bucket = np.ceil(np.log(last_order_number))

    per_row = df[['user_id']].copy()
    per_row = per_row.merge(pd.get_dummies(bucket, prefix='last'), **merge_arguments)

    # all rows of a user share the same bucket; max() reduces them to one row
    return per_row.groupby('user_id').max()

In [25]:
def get_ohe_columns():
    """Return every feature column name: time-related names first, then product names."""
    time_columns = get_time_ohe_columns()
    product_columns = get_product_ohe_columns()
    return np.concatenate((time_columns, product_columns))

In [26]:
def get_ohe_features(df, users):
    """Combine the product, time and last-order one-hot features per user.

    Parameters
    ----------
    df : DataFrame of order rows, passed unchanged to the three
        ``get_*_ohe_grouped_by_user`` builders.
    users : list-like of user ids; defines the rows of the result.

    Returns
    -------
    uint8 DataFrame with one row per entry of ``users`` (sorted by index) and
    the fixed columns from ``get_ohe_columns()``. Columns a builder does not
    produce stay 0; columns a builder produces that are not in the fixed list
    are ignored.

    Users that have no rows in ``df`` get an all-zero feature row. Assigning
    the builder output directly used to leave NaN for such users and upcast
    the frame to float.
    """
    df_ohe = pd.DataFrame(data=0, index=users, columns=get_ohe_columns(), dtype=np.uint8)

    builders = (
        get_product_ohe_grouped_by_user,
        get_time_ohe_grouped_by_user,
        get_last_ohe_grouped_by_user,
    )
    for build in builders:
        df_temp = build(df)
        shared = df_ohe.columns.intersection(df_temp.columns)
        if len(shared) == 0:
            continue
        # align explicitly: users absent from the builder output become 0
        aligned = df_temp.reindex(index=df_ohe.index, columns=shared, fill_value=0)
        df_ohe.loc[:, shared] = aligned.astype(np.uint8).values

    return df_ohe.sort_index()

In [27]:
def get_product_ohe_target(df, users):
    """Build per-user counts of reordered bestsellers from an order-products table.

    Parameters
    ----------
    df : DataFrame with ``order_id``, ``product_id``, ``add_to_cart_order``
        and ``reordered`` columns.
    users : collection of user ids to keep.

    Returns
    -------
    DataFrame indexed by ``user_id`` with one column per product id that
    survives the filtering. Users without any reordered bestseller are absent.
    """
    # resolve each row's user through the global order_id -> user_id map
    with_user = df.assign(user_id=df.order_id.map(order_to_user))

    # keep rows of the requested users that are reorders of a bestseller
    keep = (
        with_user.user_id.isin(users)
        & (with_user.reordered == 1)
        & with_user.product_id.isin(bestsellers)
    )
    selected = with_user.loc[keep]

    encoded = selected.merge(pd.get_dummies(selected.product_id), **merge_arguments)

    # only user_id and the indicator columns are needed from here on
    encoded = encoded.drop(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], axis=1)

    return encoded.groupby('user_id').sum()

In [28]:
def get_ohe_target(df, users):
    """Build the full target matrix: one row per user, one column per bestseller.

    Parameters
    ----------
    df : order-products table passed on to ``get_product_ohe_target``.
    users : list-like of user ids; defines the rows of the result.

    Returns
    -------
    uint8 DataFrame indexed by ``users`` (sorted) with the global
    ``bestsellers`` as columns. Users without any reordered bestseller, and
    bestsellers nobody reordered, are all-zero.
    """
    df_ohe = pd.DataFrame(data=0, index=users, columns=bestsellers, dtype=np.uint8)
    df_temp = get_product_ohe_target(df, users)

    shared = df_ohe.columns.intersection(df_temp.columns)
    if len(shared) > 0:
        # reindex adds the missing users back as zero rows; this replaces the
        # former DataFrame.append call, which no longer exists in pandas 2.x
        aligned = df_temp.reindex(index=df_ohe.index, columns=shared, fill_value=0)
        df_ohe.loc[:, shared] = aligned.astype(np.uint8).values

    return df_ohe.sort_index()

In [66]:
class DF_ohe:
    """One-hot encoded features, targets, predictions and submission for a set of users.

    Attributes set by the methods:
      df, features, sparse_features   -- by ``__init__``
      target, sparse_target           -- by ``set_target``
      predict                         -- by ``set_predict``
      submission                      -- by ``set_submission``
    """

    def __init__(self, df, users):
        """Select the rows of ``users`` from ``df`` and build their feature matrix.

        Parameters
        ----------
        df : DataFrame of order rows with at least ``user_id`` and
            ``order_number`` plus the columns the feature builders read.
        users : list-like of user ids.
        """
        # copy: standardize_order_number overwrites order_number in place
        self.df = df.loc[df.user_id.isin(users)].copy()
        self.df = standardize_order_number(self.df)

        # Possible missing users: the sample is taken after the dataframe has
        # been reduced, so the full ``users`` list is handed to
        # get_ohe_features, which defines the rows of the feature matrix.
        self.features = get_ohe_features(self.df, users)
        self.sparse_features = sp.sparse.csc_matrix(self.features)

    def set_target(self, df, users):
        """Build the dense and CSR-sparse target matrices for ``users`` from ``df``."""
        self.target = get_ohe_target(df, users)
        self.sparse_target = sp.sparse.csr_matrix(self.target)

    def set_predict(self, model):
        """Store ``model``'s predictions as a uint8 DataFrame (users x bestsellers).

        Accepts both sparse and dense return values from ``model.predict``.
        """
        predicted = model.predict(self.sparse_features)
        if sp.sparse.issparse(predicted):
            predicted = predicted.toarray()
        self.predict = pd.DataFrame(data=np.asarray(predicted).astype(np.uint8),
                                    index=self.features.index, columns=bestsellers, dtype=np.uint8)

    def set_submission(self):
        """Turn ``self.predict`` into a submission frame with ``order_id`` and ``products``.

        ``products`` is the space-separated list of predicted product ids, or
        the string 'None' when nothing is predicted for the user. ``order_id``
        is looked up per user in the global ``user_to_order``.
        """
        user_ids = self.predict.index

        products = []
        for flags in self.predict.values:
            # positions of the non-zero predictions -> product ids
            picked = bestsellers[np.flatnonzero(flags)]
            products.append(' '.join(str(p) for p in picked) if len(picked) else 'None')

        self.submission = pd.DataFrame(
            {
                'order_id': user_to_order.reindex(user_ids).values,
                'products': products,
            },
            # unsigned, unnamed index as produced before
            index=user_ids.astype(np.uint32).rename(None),
            columns=['order_id', 'products'],
        )

    def print_results(self):
        """Print confusion counts, precision and a pseudo F1 score.

        The pseudo F1 adds an estimated 10% of the relevant reorders as false
        negatives to account for products outside the bestsellers list.
        Scores whose denominator is zero are printed as ``nan``.
        """
        # number of predicted reorders
        PT = int(self.predict.sum().sum())
        # number of relevant reorders
        RT = int(self.target.sum().sum())
        # true positive of reorders
        TP = int(self.predict.multiply(self.target).sum().sum())
        # number of false negative among bestsellers
        FN1 = RT - TP
        # estimation of false negative among non-bestsellers
        FN2 = 0.1 * RT
        # false positive of reorders
        FP = PT - TP

        # guard the ratios: a model that predicts nothing gives PT == 0
        undefined = float('nan')
        precision = TP / (TP + FP) if (TP + FP) else undefined
        f1_denominator = (2 * TP) + FN1 + FN2 + FP
        pseudo_f1 = (2 * TP) / f1_denominator if f1_denominator else undefined

        print('predicted true: {}'.format(PT))
        print('relevant true: {}'.format(RT))
        print('true positive: {}'.format(TP))
        print('false negative 1: {}'.format(FN1))
        print('false negative 2: {}'.format(int(FN2)))
        print('false positive: {}'.format(FP))
        print('')
        print('precision score: {:.3f}'.format(precision))
        print('pseudo f1 score: {:.3f}'.format(pseudo_f1))

## Take a sample of users

In [67]:
def get_sample_users(df, n, seed=None):
    """Draw ``n`` distinct user ids at random from ``df.user_id``.

    Parameters
    ----------
    df : DataFrame with a ``user_id`` column.
    n : number of users to draw (without replacement).
    seed : optional int; when given, the draw is reproducible. When omitted,
        NumPy's global random state is used, exactly as before.

    Returns
    -------
    pandas Series holding the sampled user ids.

    Raises
    ------
    ValueError if ``n`` exceeds the number of distinct users (from numpy).
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return pd.Series(rng.choice(df.user_id.unique(), n, replace=False))

In [128]:
# draw a random subset of training users to fit the model on
sample_size = 256
sample_users = get_sample_users(df=df_train, n=sample_size)

In [129]:
%%time
# build the one-hot feature matrix of the sampled users from their prior orders,
# then attach their targets taken from the train order-products table
df_ohe_sample = DF_ohe(df_train, sample_users)
df_ohe_sample.set_target(df_train_target, sample_users)

Wall time: 3.3 s


In [130]:
# bytes used by the dense feature frame, printed with thousands separators
print(format(df_ohe_sample.features.memory_usage().sum(), ','))

15,320,832


In [131]:
# if time and memory used are in linear relation to the number of users,
# as there are about 130,000 users in the training data,
# this should take little more than an hour
# and should take up about 60GB of RAM

In [132]:
# sparsifying the dataframes takes more than 2 minutes
# for a sample of 100 users
# although this reduces the memory usage down to 1/50 of
# its dense form, this would take more than 2 days to process

In [133]:
# besides, due to pandas not being able to merge sparse data properly
# (i.e., even with sparse dataframe as its input, its output is always a dense dataframe)
# memory usage is still an issue

## Append missing users
--------
There may be missing users after the whole process. Namely, those who did not order any products from bestsellers list.

## Build a model

In [148]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import Perceptron
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [182]:
# one-vs-rest wrapper around a decision tree with default parameters
# NOTE(review): no random_state is passed, so fitted trees may differ between runs
model = OneVsRestClassifier(DecisionTreeClassifier())

In [183]:
%%time
%%capture --no-stdout
# fit on the sparse feature / target matrices of the sampled users
model.fit(df_ohe_sample.sparse_features, df_ohe_sample.sparse_target)

Wall time: 27.3 s


In [184]:
# fit/predict precision/score #1 #2 #3

# for a sample size of 256
# OvR + DecisionTreeClassifier:
#   13s/24s 0.166/0.106 0.144/0.083 0.169/0.107
# OvR + DecisionTreeClassifier with min leaf 5:
#   11s/22s 0.433/0.075 0.548/0.078 0.436/0.066
# OvR + DecisionTreeClassifier with min split 5:
#   14s/25s 0.119/0.075 0.144/0.084 0.169/0.107

# for a sample size of 512
# OvR + DecisionTreeClassifier:
#   27s/81s 0.156/0.121 0.191/0.137 0.168/0.122
# OvR + DecisionTreeClassifier with min leaf 5:
#   27s/82s 0.440/0.111 0.378/0.103 0.426/0.119
# OvR + DecisionTreeClassifier with min split 5:
#   27s/81s 0.187/0.132 0.155/0.113 0.172/0.124

# for a sample size of 1024
# OvR + DecisionTreeClassifier:
#   76s/405s 0.178/0.142 0.168/0.133 0.168/0.135
# OvR + DecisionTreeClassifier with min leaf 5:
#   68s/391s 0.382/0.131 0.437/0.150 0.440/0.150
# OvR + DecisionTreeClassifier with min split 5:
#   72s/394s 0.187/0.149 0.181/0.143 0.182/0.140

# comment: note that although "OvR + DecisionTreeClassifier with min leaf 5" has
# comparable score to other models, it is distinguished by having high precision rate
# this means that the model is more cautious in marking a product as reordered

# comment: there doesn't seem to be any difference in regards to calculation time
# between different parameters

# OvR + SGDClassifier: ???
# OvR + AdaBoostClassifier: quite slow compared to decision trees

# OvR + SVC: takes too long
# OvR + LinearSVC: does not predict
# OvR + XGBClassifier: takes too long
# OvR + KNeighborsClassifier: does not predict
# OvR + RandomForestClassifier: does not predict
# OvR + MLPClassifier: takes too long
# OvR + BernoulliNB: does not predict
# KNeighborsClassifier: does not work in sparse form
# DecisionTreeClassifier: low score, does not work with sparse form
# RandomForestClassifier: does not work with sparse form
# MLPClassifier: does not predict
# MO + anything: does not work with sparse form

In [185]:
# for a sample size of 100
# OvR + SGDClassifier:
#   fit: 64s predict: 33s score: 0.072 0.063 0.050
# OvR + AdaBoostClassifier:
#   fit: 126s predict: 38s score: 0.070, 0.112, 0.066
# OvR + DecisionTreeClassifier:
#   fit: 41s predict: 7s precision/score: 0.149/0.074 0.215/0.091 0.144/0.061
# DecisionTreeClassifier:
#   fit: 30s predict: 0s score: 0.055 0.036 0.080

In [186]:
%%time
# NOTE: this scores the model on the very users it was fitted on, so the
# numbers below measure fit, not generalisation (see the validation section)
df_ohe_sample.set_predict(model)
df_ohe_sample.print_results()

predicted true: 1505
relevant true: 1505
true positive: 1505
false negative 1: 0
false negative 2: 150
false positive: 0

precision score: 1.000
pseudo f1 score: 0.952
Wall time: 1.05 s


In [187]:
# joblib.dump(model, 'model.pkl')

In [188]:
# model = joblib.load('model.pkl')

## Validate against another sample data

In [189]:
# take a second, independently drawn sample of training users for validation
# NOTE(review): it is drawn from the same df_train pool without excluding
# sample_users, so the two samples may overlap
validation_size = 256
sample_validation_users = get_sample_users(df_train, validation_size)

In [190]:
%%time
# build features and targets for the validation users, predict with the
# already fitted model and print the scores
df_ohe_validation = DF_ohe(df_train, sample_validation_users)
df_ohe_validation.set_target(df_train_target, sample_validation_users)
df_ohe_validation.set_predict(model)
df_ohe_validation.print_results()

predicted true: 758
relevant true: 1456
true positive: 119
false negative 1: 1337
false negative 2: 145
false positive: 639

precision score: 0.157
pseudo f1 score: 0.101
Wall time: 5.55 s


In [191]:
# build the submission-format frame from the validation predictions
df_ohe_validation.set_submission()

## Automated model testing

In [64]:
def model_test(model, model_name, sample_size=100, trials=3):
    """Fit ``model`` on one random user sample and score it on fresh samples.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``; it is fitted in place.
    model_name : label printed in the report header.
    sample_size : number of users drawn for the fit and for each trial.
    trials : number of validation samples to score.

    Prints timings (via the IPython ``%time`` magic, so this function only
    runs inside IPython/Jupyter) and the output of ``DF_ohe.print_results``
    for each trial. Returns nothing.

    NOTE(review): every sample is drawn independently from ``df_train``, so a
    validation sample may share users with the fitting sample.
    """
    print('================================================================')
    print(model_name + ' with sample size {}'.format(sample_size))
    print('================================================================')
    # get a sample and fit
    sample_users = get_sample_users(df_train, sample_size)
    df_ohe_sample = DF_ohe(df_train, sample_users)
    df_ohe_sample.set_target(df_train_target, sample_users)
    %time model.fit(df_ohe_sample.sparse_features, df_ohe_sample.sparse_target)
    
    for i in range(trials):
        print('Trial #{}'.format(i))
        print('--------------------------------')
        # validate model on a new sample
        sample_validation_users = get_sample_users(df_train, sample_size)
        df_ohe_validation = DF_ohe(df_train, sample_validation_users)
        df_ohe_validation.set_target(df_train_target, sample_validation_users)
        %time df_ohe_validation.set_predict(model)

        # print results
        df_ohe_validation.print_results()
        print('')

In [65]:
%%capture --no-stdout
# Run the three decision-tree variants at each sample size, in the order
# plain / min_samples_leaf=5 / min_samples_split=5.
# A fresh, unfitted model is constructed for every call.
for sample_size in (32, 64):
    for _label, _tree_params in [
        ('OvR + DTC', {}),
        ('OvR + DTC.SL(5)', {'min_samples_leaf': 5}),
        ('OvR + DTC.SS(5)', {'min_samples_split': 5}),
    ]:
        model_test(OneVsRestClassifier(DecisionTreeClassifier(**_tree_params)),
                   _label, sample_size, trials=3)

OvR + DTC with sample size 32
Wall time: 5.88 s
Trial #0
--------------------------------
Wall time: 1.24 s
predicted true: 24
relevant true: 148
true positive: 3
false negative 1: 145
false negative 2: 14
false positive: 21

precision score: 0.125
pseudo f1 score: 0.032

Trial #1
--------------------------------
Wall time: 1.17 s
predicted true: 34
relevant true: 159
true positive: 2
false negative 1: 157
false negative 2: 15
false positive: 32

precision score: 0.059
pseudo f1 score: 0.019

Trial #2
--------------------------------
Wall time: 1.16 s
predicted true: 52
relevant true: 218
true positive: 5
false negative 1: 213
false negative 2: 21
false positive: 47

precision score: 0.096
pseudo f1 score: 0.034

OvR + DTC.SL(5) with sample size 32
Wall time: 6.94 s
Trial #0
--------------------------------
Wall time: 1.11 s
predicted true: 15
relevant true: 160
true positive: 8
false negative 1: 152
false negative 2: 16
false positive: 7

precision score: 0.533
pseudo f1 score: 0.084


## Split the test data into batches

In [355]:
# test users that still have rows in df_test after the filtering above
test_users = df_test.user_id.unique()
# every user whose last order is in the test set, and those of them with no rows left
all_test_users = df_orders.user_id[df_orders.eval_set == 'test']
missing_test_users = all_test_users[~all_test_users.isin(test_users)]

In [357]:
# intended number of test users per batch
batch_size = 100

In [358]:
# list of per-batch arrays of test user ids
user_batches = []

In [359]:
# Split test_users into consecutive chunks of batch_size users.
# Unlike the former while-loop this honours batch_size (100 was hard-coded)
# and leaves test_users intact, so the cell can be re-run safely.
user_batches = [
    test_users[start:start + batch_size]
    for start in range(0, test_users.size, batch_size)
]

In [428]:
# test users with no remaining rows get a 'None' prediction;
# order_id is aligned on user id through the user_to_order map
missing_test_users_submission = (
    pd.DataFrame(index=missing_test_users)
    .assign(order_id=user_to_order, products='None')
)

# matching index data type to DF_ohe.submission.index
missing_test_users_submission.index = missing_test_users_submission.index.astype(np.uint64)

In [441]:
%%time
# Predict batch by batch and collect the per-batch submission frames.
# The frames are gathered in a list and concatenated once: DataFrame.append
# no longer exists in pandas 2.x and re-copied the whole frame on every step.
batch_submissions = []
# NOTE(review): only the first 10 batches are processed here, so the resulting
# submission does not cover all test users -- drop the [:10] for a full run.
for batch in user_batches[:10]:
    df_ohe_test = DF_ohe(df_test, batch)
    df_ohe_test.set_predict(model)
    df_ohe_test.set_submission()
    batch_submissions.append(df_ohe_test.submission)

if batch_submissions:
    submission = pd.concat(batch_submissions)
else:
    # nothing to predict: keep the expected empty layout
    submission = pd.DataFrame(columns=['order_id', 'products'])
submission.index = submission.index.astype(np.uint64)

Wall time: 1min 12s


In [443]:
# add the 'None' rows of the users that had no data left to predict from
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
# NOTE: re-running this cell adds the same rows again
submission = pd.concat([submission, missing_test_users_submission])

In [446]:
# order rows by order_id and write them without the user-id index
submission = submission.sort_values(by='order_id')
submission.to_csv('submission.csv', index=False)

## Things to do
----
* sparsify data
* add more features (order history)
* optimize parameters

## Test Incremental Models

## One-hot-encode the whole data

## Neural Network Models