In [17]:
import numpy as np
import pandas as pd

import sys
from os import path
import pickle
import re

from sklearn.model_selection import train_test_split

In [18]:
# Add the parent directory to the module search path so that the
# project-local `src` package imported on the next line can be resolved
# (presumably '..' is the repository root relative to this notebook).
sys.path.append('..')
import src.my_helper as my_helper

### Load Raw Data

In [5]:
# Directory holding the raw CSV files, expressed as path components.
fd = ['..','data','raw']

# Raw tables keyed by file name without the '.csv' extension
# (e.g. 'orders.csv' is stored under data['orders']).
data = {}
fn_list = ['orders.csv', 'products.csv', 'order_products__prior.csv', 'order_products__train.csv', 'departments.csv', 'aisles.csv']

for fn in fn_list:
    fp = path.join(*fd, fn)

    with open(file=fp, mode='r', encoding='utf8') as file:
        # Raw string literal: '\.' inside a plain string is an invalid escape
        # sequence and emits a warning on current Python versions.
        label = re.sub(r'\.csv$', '', fn)
        data[label] = pd.read_csv(file, encoding='utf8')

### Separating 'prior' & 'train'

As discussed during EDA, the orders dataset contains three 'eval_set' categories: prior, train, and test. The test set contains no actual order product details, as it is intended for evaluation in the original Kaggle competition, so we will need to resort to using the prior/train sets as our train/test sets.

In [37]:
# Boolean row selectors over the orders table: the 'prior' orders serve as
# our training data and the 'train' orders serve as our test data.
train_mask = data['orders']['eval_set'].eq('prior')
test_mask = data['orders']['eval_set'].eq('train')

In [38]:
# Materialise independent copies so later column changes never touch the
# raw orders table kept in `data`.
orders_train = data['orders'].loc[train_mask].copy()
orders_test = data['orders'].loc[test_mask].copy()

In [39]:
# Preview the first five training orders.
orders_train.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [40]:
# Preview the first five test orders.
orders_test.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
10,1187899,1,train,11,4,8,14.0
25,1492625,2,train,15,1,11,30.0
49,2196797,5,train,5,0,11,6.0
74,525192,7,train,21,2,11,6.0
78,880375,8,train,4,1,14,10.0


In [41]:
# Sanity check: list the distinct eval_set labels left in each frame.
tuple(frame['eval_set'].unique() for frame in (orders_train, orders_test))

(array(['prior'], dtype=object), array(['train'], dtype=object))

With the eval_set column now being redundant, we will drop it from both dataframes:

In [42]:
# Rebind instead of mutating in place, and tolerate an already-removed
# column, so this cell can be re-run without raising KeyError.
orders_train = orders_train.drop(columns='eval_set', errors='ignore')
orders_test = orders_test.drop(columns='eval_set', errors='ignore')

### Joining Orders with Products

Most analysis and calculations will involve working with the products in each order, so combining these two into a single dataframe for reuse is preferable:

In [43]:
# Inner join on order_id: one row per (order, product) for the 'prior' orders.
order_products_train = orders_train.merge(
    data['order_products__prior'],
    how='inner',
    on='order_id',
)
order_products_train.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,196,1,0
1,2539329,1,1,2,8,,14084,2,0
2,2539329,1,1,2,8,,12427,3,0
3,2539329,1,1,2,8,,26088,4,0
4,2539329,1,1,2,8,,26405,5,0


In [44]:
# Inner join on order_id: one row per (order, product) for the 'train' orders.
order_products_test = orders_test.merge(
    data['order_products__train'],
    how='inner',
    on='order_id',
)
order_products_test.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,1187899,1,11,4,8,14.0,196,1,1
1,1187899,1,11,4,8,14.0,25133,2,1
2,1187899,1,11,4,8,14.0,38928,3,1
3,1187899,1,11,4,8,14.0,26405,4,1
4,1187899,1,11,4,8,14.0,39657,5,1


In addition to getting products per order, it may be practical to include the respective aisle and department IDs, since we are likely going to work with these in clustering and recommendations:

In [49]:
# Attach the product attributes (name, aisle_id, department_id) to every
# order line via an inner join on product_id.
order_products_train = pd.merge(order_products_train, data['products'], how='inner', on='product_id')
order_products_test = pd.merge(order_products_test, data['products'], how='inner', on='product_id')

In [50]:
# Preview the enriched training frame.
order_products_train.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,2539329,1,1,2,8,,196,1,0,Soda,77,7
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7


### Memory usage

Having merged most of our order and product data into a single dataframe, one of the major concerns is memory usage:

In [51]:
# Project helper; per the recorded output it returns a formatted size string
# (e.g. '5778.09 MB') for the given dataframe.
my_helper.mem_size(order_products_train)

'5778.09 MB'

Our training dataframe consumes over 5GB, which is less than ideal.

In [56]:
# Per-column footprint in bytes; deep=True also measures the contents of
# object (string) columns rather than just their pointers.
order_products_train.memory_usage(index=True, deep=True)

Index                      259475912
order_id                   259475912
user_id                    259475912
order_number               259475912
order_dow                  259475912
order_hour_of_day          259475912
days_since_prior_order     259475912
product_id                 259475912
add_to_cart_order          259475912
reordered                  259475912
product_name              2664383146
aisle_id                   259475912
department_id              259475912
dtype: int64

The inclusion of product name is an unnecessary burden, seeing as we can look up product names when needed and work with the ID during analysis. Hopefully dropping this column frees up a considerable amount of memory:

In [58]:
# Product names can be looked up from the products table by product_id, so
# the string column is dropped here to save memory. Rebinding (instead of
# inplace=True) plus errors='ignore' keeps the cell safe to re-run.
order_products_train = order_products_train.drop(columns='product_name', errors='ignore')
order_products_test = order_products_test.drop(columns='product_name', errors='ignore')

In [59]:
# Re-measure the training frame with the project helper after dropping
# product_name (returns a formatted size string per the recorded output).
my_helper.mem_size(order_products_train)

'3113.71 MB'

A considerable improvement! It would be desirable to reduce this memory consumption further, but for the sake of convenience and with 32GB available we will leave it as is. The need for data such as day of week, hour of day, and such is not immediately clear, but given our observations during EDA it may be useful to split recommendations based off these factors. Thus keeping them readily available is preferable for the time being.

### Sampling the Dataset

With some initial insight in working with the full dataset in basic recommenders & model evaluation, regularly processing the entire dataset (from a volume perspective vs. memory consumption) is simply not going to be feasible within the constraints of working with one machine (8 cores). Taking a look at our data:

In [65]:
# Distinct users and orders in the full training frame.
print('Training Set')
for template, key in (('# of users: {}', 'user_id'), ('# of orders: {}', 'order_id')):
    print(template.format(order_products_train[key].nunique()))

Training Set
# of users: 206209
# of orders: 3214874


In [66]:
# Distinct users and orders in the full test frame.
print('Test Set')
for template, key in (('# of users: {}', 'user_id'), ('# of orders: {}', 'order_id')):
    print(template.format(order_products_test[key].nunique()))

Test Set
# of users: 131209
# of orders: 131209


Working with and regularly evaluating performance of recommendations for 200k users and 3.2M orders is simply not feasible given the time it takes to process. Even through the utilization of multi-core processing via Dask (which will still be implemented) evaluating a single, basic model takes upwards of 1 minute. Given we would like to explore model optimization and compare performance, we are forced to reduce the quantity of data being handled.

The first possible method of reduction is eliminating users via their number of orders:

In [73]:
# Distribution of each user's highest order_number in the training frame.
(
    order_products_train
    .groupby('user_id')['order_number']
    .agg('max')
    .describe()
)

count    206209.000000
mean         15.590367
std          16.654774
min           3.000000
25%           5.000000
50%           9.000000
75%          19.000000
max          99.000000
Name: order_number, dtype: float64

25% of users fall at or below a total of 5 orders placed, and on the opposite end of the spectrum we have another 25% at or above 19 orders (all the way up to a maximum of 99). We can trim both of these sides (the mask below keeps users whose highest order number is between 5 and 20, inclusive) to see how that impacts our volume of data:

In [84]:
# Row-level boolean mask: True for every order line belonging to a user whose
# highest order_number lies in [5, 20] (both ends inclusive).
# The built-in 'max' aggregation plus Series.between is vectorized, unlike a
# Python lambda evaluated once per user, and always yields a boolean Series.
order_number_threshold_mask = (
    order_products_train
    .groupby('user_id')['order_number']
    .transform('max')
    .between(5, 20)
)

In [118]:
# Keep only the order lines of users inside the order-count thresholds.
temp = order_products_train.loc[order_number_threshold_mask]

In [119]:
# Baseline user/order counts of the unreduced training frame, used as the
# denominators for the percentages printed after each reduction step.
n_users_old, n_orders_old = (
    order_products_train[key].nunique() for key in ('user_id', 'order_id')
)

In [120]:
# Remaining users/orders after the order-count filter, with their share of
# the unreduced training frame.
print('Train Set')
n_users_new = temp['user_id'].nunique()
n_orders_new = temp['order_id'].nunique()
for template, new, old in (
    ('# of users: {} ({:.0f}%)', n_users_new, n_users_old),
    ('# of orders: {} ({:.0f}%)', n_orders_new, n_orders_old),
):
    print(template.format(new, 100*new/old))

Train Set
# of users: 114823 (56%)
# of orders: 1164301 (36%)


We have cut down our order count significantly, but 100k users and 1M orders is still a lot to be working with given our resources.

Given our set of test data, we can limit our training data to just those users present in the test set:

In [122]:
# Restrict the training candidates to users that also appear in the test set.
temp = temp.loc[
    temp['user_id'].isin(order_products_test['user_id'].unique())
]

In [123]:
# Remaining users/orders after keeping only users present in the test set.
print('Train Set')
n_users_new = temp['user_id'].nunique()
n_orders_new = temp['order_id'].nunique()
for template, new, old in (
    ('# of users: {} ({:.0f}%)', n_users_new, n_users_old),
    ('# of orders: {} ({:.0f}%)', n_orders_new, n_orders_old),
):
    print(template.format(new, 100*new/old))

Train Set
# of users: 72975 (35%)
# of orders: 740003 (23%)


Having done this, every user present in the train set now has a corresponding test record. The time needed to evaluate one order per user for performance measurement is still going to be considerable with 72k users. Consequently we will proceed with random sampling of users to reduce our data to approximately 10% of our original user base:

In [124]:
# Fixed seed so the same 20,000 users are drawn on every run.
np.random.seed(42)
user_sample = np.random.choice(
    a=temp['user_id'].unique(),
    size=20000,
    replace=False,  # each user may be picked at most once
)

In [125]:
# Keep only the order lines of the sampled users.
temp = temp.loc[temp['user_id'].isin(user_sample)]

In [126]:
# Remaining users/orders after random user sampling.
print('Train Set')
n_users_new = temp['user_id'].nunique()
n_orders_new = temp['order_id'].nunique()
for template, new, old in (
    ('# of users: {} ({:.0f}%)', n_users_new, n_users_old),
    ('# of orders: {} ({:.0f}%)', n_orders_new, n_orders_old),
):
    print(template.format(new, 100*new/old))

Train Set
# of users: 20000 (10%)
# of orders: 202234 (6%)


In [128]:
# Freeze the reduced frame as an independent (deep) copy of the working slice.
train = temp.copy(deep=True)

Since our reduced training set no longer contains all users in the test set, we will want to make necessary adjustments there as well:

In [129]:
# Test rows for exactly the sampled users, as an independent copy.
test = order_products_test.loc[
    order_products_test['user_id'].isin(user_sample)
].copy()

In [130]:
# Distinct users and orders in the reduced test frame.
print('Test Set')
for template, key in (('# of users: {}', 'user_id'), ('# of orders: {}', 'order_id')):
    print(template.format(test[key].nunique()))

Test Set
# of users: 20000
# of orders: 20000


We could have opted to leave the test set as-is to emphasize potential cold-start performance issues with our models, but for the sake of this project we will simply focus on providing accurate recommendations for established customers.

### Train & Validation Split

Since we have discarded the original test data (no actual order products available for those records), we will want to split our current training set into a new train and validation set. This validation set will be used in performance optimization of recommender systems, whilst leaving our dedicated test set untouched until final performance evaluation of our various methods.

Rather than taking a random sample of orders from users, we will skim the most recent order off for each user, akin to our current test set which has 1 order per user for evaluation:

In [131]:
# Keep a handle on the complete reduced training frame before splitting.
# This is an alias, not a copy. NOTE: `train` is rebound by the split in the
# following cell, so re-running this cell afterwards would point `train_full`
# at the already-split frame.
train_full = train

In [132]:
# Highest order_number per user, broadcast back onto every order line.
# The string alias 'max' selects the built-in groupby aggregation; passing the
# Python builtin `max` is deprecated in recent pandas releases.
max_order_numbers = train_full.groupby('user_id')['order_number'].transform('max')

# A line belongs to the user's most recent order exactly when its
# order_number equals that user's maximum.
last_order_mask = train_full['order_number'] == max_order_numbers

# Independent copies, so later modifications of either frame neither touch
# train_full nor trigger SettingWithCopyWarning.
val = train_full[last_order_mask].copy()
train = train_full[~last_order_mask].copy()

In [137]:
# Largest number of distinct orders any single user has in the validation set
val.groupby('user_id')['order_id'].agg('nunique').max()

1

In [138]:
# Smallest number of distinct orders any single user has left in train
train.groupby('user_id')['order_id'].agg('nunique').min()

4

In [140]:
# Distinct user counts in train, val and the unsplit frame (should all match)
tuple(frame['user_id'].nunique() for frame in (train, val, train_full))

(20000, 20000, 20000)

In [141]:
# Share of order lines that ended up in the validation set
len(val) / (len(train) + len(val))

0.10229688299171301

Splitting via this method has yielded a validation set comprising 10% of our reduced training set. We could potentially skim a 2nd round of orders from users to approach 20%, but this would not only reduce the amount of data to train off of but increase the processing times for model evaluation, which has been the focus of our earlier data volume reduction efforts. Thus, for the sake of time we will settle with this 10%.

### Save Data

In [154]:
# Train: pickle the training frame into the processed-data folder
fd = ['..','data','processed']
fn = 'train.p'
fp = path.join(*fd, fn)

with open(file=fp, mode='wb') as file:
    pickle.dump(train, file)

In [143]:
# Test: pickle the test frame into the processed-data folder
fd = ['..','data','processed']
fn = 'test.p'
fp = path.join(*fd, fn)

with open(file=fp, mode='wb') as file:
    pickle.dump(test, file)

In [144]:
# Validation: pickle the validation frame into the processed-data folder
fd = ['..','data','processed']
fn = 'val.p'
fp = path.join(*fd, fn)

with open(file=fp, mode='wb') as file:
    pickle.dump(val, file)

We will also save copies of the original data sets to our processed folder for consistent referencing/retrieval, since items such as product names will need to be regularly accessed when ultimately converting IDs into meaningful values.

In [153]:
fd = ['..','data','processed']

# Write every raw table to '<table name>.p' in the processed-data folder.
for label, df in data.items():
    fn = '{}.p'.format(label)
    fp = path.join(*fd, fn)
    with open(fp, 'wb') as file:
        pickle.dump(df, file)