In [1]:
%matplotlib inline
path = 'data/'

In [2]:
df_aisles = pd.read_csv(path + 'aisles.csv')
df_departments = pd.read_csv(path + 'departments.csv')
df_data = pd.read_csv(path + 'order_products__prior.csv', dtype={
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
})
df_train_target = pd.read_csv(path + 'order_products__train.csv')
df_orders = pd.read_csv(path + 'orders.csv', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'order_number': np.int8,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
})
df_products = pd.read_csv(path + 'products.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv')

In [3]:
# merge arguments we will be using often
merge_arguments = {
    'left_index': True,
    'right_index': True,
    'how': 'outer',
}

In [4]:
# setting index for easier mapping
df_orders = df_orders.set_index('order_id')
df_products = df_products.set_index('product_id')

In [5]:
%%time
# adding features to data to make it more complete
# many of the values are repeated as a single user places many orders with many products
df_data['user_id'] = df_data.order_id.map(df_orders.user_id)
df_data['order_number'] = df_data.order_id.map(df_orders.order_number)
df_data['order_dow'] = df_data.order_id.map(df_orders.order_dow)
df_data['order_hour_of_day'] = df_data.order_id.map(df_orders.order_hour_of_day)
df_data['days_since_prior_order'] = df_data.order_id.map(df_orders.days_since_prior_order)

Wall time: 24.5 s


In [6]:
# create a list of users in each group
users_train = df_orders.loc[df_orders.eval_set == 'train', 'user_id']
users_test = df_orders.loc[df_orders.eval_set == 'test', 'user_id']

# create maps for convenience
user_to_last_order = df_data.groupby('user_id').agg({'order_number': 'max'}).order_number
order_to_user = df_orders.user_id

In [7]:
# split data into two groups
df_train = df_data.loc[df_data.user_id.isin(users_train)]
df_test = df_data.loc[df_data.user_id.isin(users_test)]

In [8]:
# a quick look at the sizes
print("# of data: {}".format(df_data.shape))
print("# of train data: {}".format(df_train.shape))
print("# of test data: {}".format(df_test.shape))

# of data: (32434489, 9)
# of train data: (20641991, 9)
# of test data: (11792498, 9)


## Create bestsellers list

In [9]:
def get_best_sellers(df, quantile):
    df_temp = df.groupby('product_id').agg({'order_id': 'count'}).rename(columns={'order_id': 'amount_sold'})
    return df_temp.loc[df_temp.amount_sold >= df_temp.amount_sold.quantile(quantile)].index.values

In [10]:
# as we have seen in part 02,
# top 20% of most sold products account for more than 90% of all items sold
# there are approximately 50,000 different products sold
# this should cut down the number of features down to about 10,000
# as we shall be using one-hot encoded data for our model
quantile = 0.8

bestsellers = get_best_sellers(df_data, quantile)

## Take only bestsellers

In [11]:
df_train = df_train.loc[df_train.product_id.isin(bestsellers)]
df_test = df_test.loc[df_test.product_id.isin(bestsellers)]

In [12]:
print("# of bestsellers in train data: {}".format(df_train.shape))
print("# of bestsellers in test data: {}".format(df_test.shape))

# of bestsellers in train data: (18785336, 9)
# of bestsellers in test data: (10723251, 9)


## Take only last 3 orders

In [13]:
# the minimum number of orders made by a user is 3
# we cut off all orders prior to the last three
# this is mainly done to have constant number of features across all users
# after one-hot encoding all (products, order number) pairs
# hopefully last 3 orders are relavent enough in predicting reorders

In [14]:
# if not, we may do more feature engineering later on

In [15]:
def get_last_orders(df):
    return df.loc[df.user_id.map(user_to_last_order) - df.order_number < 3]

In [16]:
df_train = get_last_orders(df_train)
df_test = get_last_orders(df_test)

In [17]:
print("# of bestsellers in the last 3 train data: {}".format(df_train.shape))
print("# of bestsellers in the last 3 test data: {}".format(df_test.shape))

# of bestsellers in the last 3 train data: (3622674, 9)
# of bestsellers in the last 3 test data: (2075829, 9)


## Data preprocessing functions

In [19]:
# this function reduces order number down to 0, 1, or 2
# these numbers are "relative" to the last order number made by a user
# this is done so that we may one-hot encode the feature
def standardize_order_number(df):
    df.order_number = df.order_number - (df.user_id.map(user_to_last_order) - 2)
    return df

In [20]:
def get_product_ohe_grouped_by_user(df):
    # setting up an "empty dataframe" to merge one-hot encoded features
    df_temp = pd.DataFrame(index=df.index)
    df_temp['user_id'] = df.user_id
    df_temp['order_number'] = df.order_number
    
    # merge one-hot encoded product feature and reordered feature
    df_temp = df_temp.merge(pd.get_dummies(df.product_id, prefix='prod'), **merge_arguments)
    df_temp = df_temp.merge(pd.get_dummies(df.product_id * df.reordered, prefix='re'), **merge_arguments)
    
    # group by order_number so that each row contains all the information
    # on which products are ordered in that particular order
    # we lose the information on when the products are added to the cart on that particular order
    # but this information may not be relavent
    df_temp = df_temp.groupby(['user_id', 'order_number']).sum()
    
    # unstack order_numbers so that each row now contains all the information
    # on which products are ordered by a user in the last 3 orders
    # fill_value is needed as there may be some users
    # who did not order one of the bestsellers in a particular order among the last 3 orders
    df_temp = df_temp.unstack(fill_value=0)
    df_temp.columns = ['_'.join([str(col[1]), str(col[0])]) for col in df_temp.columns]
    
    return df_temp

In [21]:
def get_time_ohe_grouped_by_user(df):
    # setting up an "empty dataframe" to merge one-hot encoded features
    df_temp = pd.DataFrame(index=df.index)
    df_temp['user_id'] = df.user_id
    df_temp['order_number'] = df.order_number
    
    # merge one-hot encoded time related features
    df_temp = df_temp.merge(pd.get_dummies(df.order_dow, prefix='dow'), **merge_arguments)
    df_temp = df_temp.merge(pd.get_dummies(df.order_hour_of_day, prefix='hour'), **merge_arguments)
    df_temp = df_temp.merge(pd.get_dummies(df.days_since_prior_order, prefix='days', dummy_na=True), **merge_arguments)
    
    # similar as above
    df_temp = df_temp.groupby(['user_id', 'order_number']).max()
    df_temp = df_temp.unstack(fill_value=0)
    df_temp.columns = ['_'.join([str(col[1]), str(col[0])]) for col in df_temp.columns]
    
    return df_temp

In [22]:
# a function that returns column names in a predefined format
# to make sure that we have a well defined format of a full list of features
# otherwise, when using a partial set of users for our input,
# we might be missing certain products
def get_product_ohe_columns():
    return np.concatenate([
        np.core.defchararray.add(str(i) + infix, bestsellers.astype(str))
        for i in range(3) for infix in ['_prod_', '_re_']
    ])

In [23]:
def get_time_ohe_columns():
    return np.concatenate([
        [str(i) + '_dow_' + str(dow) for dow in range(7) for i in range(3)],
        [str(i) + '_hour_' + str(hour) for hour in range(24) for i in range(3)],
        [str(i) + '_days_' + str(days) for days in np.sort(df_orders.days_since_prior_order.unique()) for i in range(3)],
    ])

In [24]:
def get_ohe_columns():
    return np.concatenate([get_time_ohe_columns(), get_product_ohe_columns()])

In [25]:
# a simple function to combine one-hot encoded product features and time related features
def get_ohe_features(df, users):
    df_ohe = pd.DataFrame(data=0, index=users, columns=get_ohe_columns(), dtype=np.uint8)
    
    df_temp = get_product_ohe_grouped_by_user(df)
    df_ohe.loc[:, df_ohe.columns.isin(df_temp)] = df_temp
    
    df_temp = get_time_ohe_grouped_by_user(df)
    df_ohe.loc[:, df_ohe.columns.isin(df_temp)] = df_temp
    
    # add one last feature
    # we use log for this part (see part 02)
    df_ohe['last_order'] = np.log(user_to_last_order)
    
    return df_ohe.sort_index()

In [26]:
def get_product_ohe_target(df, users):
    df_temp = df.copy()
    df_temp['user_id'] = df_temp.order_id.map(order_to_user)
    
    df_temp = df_temp.loc[df_temp.user_id.isin(users)]
    
    # take only those that are reorders of bestsellers
    df_temp = df_temp.loc[df_temp.reordered == 1]
    df_temp = df_temp.loc[df_temp.product_id.isin(bestsellers)]
    
    df_temp = df_temp.merge(pd.get_dummies(df_temp.product_id), **merge_arguments)
    
    # drop unnecessary columns
    df_temp = df_temp.drop(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], axis=1)
    
    # group by users
    df_temp = df_temp.groupby('user_id').sum()
    
    return df_temp

In [27]:
def get_ohe_target(df, users):
    df_ohe = pd.DataFrame(data=0, index=users, columns=bestsellers, dtype=np.uint8)
    df_temp = get_product_ohe_target(df, users)
    
    # add possible missing users back in to the target
    missing_users = users.loc[~users.isin(df_temp.index)]
    df_temp = df_temp.append([pd.DataFrame(data=0, index=missing_users, columns=df_temp.columns, dtype=np.uint8)])
    
    df_ohe.loc[:, df_ohe.columns.isin(df_temp.columns)] = df_temp
    
    return df_ohe.sort_index()

## Take a sample of users

In [403]:
sample_size = 200

In [404]:
def get_sample_users(df, n):
    return pd.Series(np.random.choice(df.user_id.unique(), n, replace=False))

In [405]:
# take a sample of users
sample_users = get_sample_users(df_train, sample_size)

In [406]:
class DF_ohe:
    def __init__(self, df, users):
        self.df = df.loc[df.user_id.isin(users)].copy()
        self.df = standardize_order_number(self.df)
        
        # possible missing users
        # sample is take after dataframe reduction
        # we need to account for that
        self.features = get_ohe_features(self.df, users)
        
    def set_target(self, df, users):
        self.target = get_ohe_target(df, users)
    
    def set_predict(self, df):
        self.predict = df
    
    def to_submission_form(self):
        pass

In [407]:
%%time
df_ohe_sample = DF_ohe(df_train, sample_users)
df_ohe_sample.set_target(df_train_target, sample_users)

Wall time: 2.68 s


In [408]:
print('{:,}'.format(df_ohe_sample.features.memory_usage().sum()))

11,969,000


In [409]:
# if time and memory used are in linear relation to ths number of users,
# as there are about 130,000 users in the training data,
# this should take little more than an hour
# and should take up about 60GB of RAM

In [410]:
# sparsifying the dataframes takes more than 2 minutes
# for a sample of 100 users
# although this reduces the memory usage down to 1/50 of
# its dense form, this would take more than 2 days to process

In [411]:
# besides, due to pandas not being able to merge sparse data properly
# (i.e., even with sparse dataframe as its input, its output is always a dense dataframe)
# memory usage is still an issue

## Append missing users
--------
There may be missing users after the whole process. Namely, those who did not order any products from bestsellers list.

## Build a model

In [417]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KDTree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import Perceptron

In [418]:
model = OneVsRestClassifier(SGDClassifier())

In [419]:
%%time
%%capture --no-stdout
model.fit(df_ohe_sample.features, df_ohe_sample.target)

Wall time: 4min 42s


In [420]:
# for a sample size of 100
# OvR + SGDClassifier:
#   fit: 64s predict: 33s score: 0.072 0.063 0.050
# OvR + DecisionTreeClassifier:
#   fit: 29s predict: 22s score: 0.099 0.064 0.096
# OvR + AdaBoostClassifier:
#   fit: 126s predict: 38s score: 0.070, 0.112, 0.066
# DecisionTreeClassifier:
#   fit: 30s predict: 0s score: 0.055 0.036 0.080

# for a sample size of 200
# OvR + SGDClassifier: fit:
#   fit: 221s predict: 130s score: 0.076 0.092 0.131
# OvR + DecisionTreeClassifier:
#   fit: 131s predict: 88s score: 0.150 0.119 0.151
# OvR + AdaBoostClassifier:
#   fit: 534s predict: 149s score: 0.115 0.095 0.090
# DecisionTreeClassifier:
#   fit: 109s predict: 0s score: 0.057 0.041 0.074

# for a sample size of 400
# OvR + DecisionTreeClassifier:
#   fit: 863s predict: 506s score: 0.125 0.084 0.084
#   comment: there seems to be an overfitting

# OvR + MLPClassifier: takes too long
# MLPClassifier: fail
# OvR + LinearSVC: fail
# OvR + RandomForestClassifier: fail
# RandomForestClassifier: fail

In [421]:
%%time
df = pd.DataFrame(data=model.predict(df_ohe_sample.features),
                  index=df_ohe_sample.features.index, columns=bestsellers, dtype=np.uint8)
df_ohe_sample.set_predict(df)

Wall time: 2min 56s


In [422]:
print('# of reorders in target: {}'.format(df_ohe_sample.target.sum().sum()))
print('# of reorders in predict: {}'.format(df_ohe_sample.predict.sum().sum()))

# of reorders in target: 1316
# of reorders in predict: 1317


In [423]:
(df_ohe_sample.predict == df_ohe_sample.target).all().all()

False

## Validate against test data

In [424]:
# take a sample of users
validation_size = 400
sample_validation_users = get_sample_users(df_train, validation_size)

In [425]:
%%time
df_ohe_validation = DF_ohe(df_train, sample_validation_users)
df_ohe_validation.set_target(df_train_target, sample_validation_users)
df = pd.DataFrame(data=model.predict(df_ohe_validation.features),
                  index=df_ohe_validation.features.index, columns=bestsellers, dtype=np.uint8)
df_ohe_validation.set_predict(df)

Wall time: 5min 23s


In [426]:
# number of predicted reorders
PT = df_ohe_validation.predict.sum().sum()
# number of relevant reorders
RT = df_ohe_validation.target.sum().sum()
# number of false negative among bestsellers
FN1 = (df_ohe_validation.predict != df_ohe_validation.target).sum().sum()
# estimation of false negative among non-bestsellers
FN2 = 0.1 * RT
# true positive of reorders
TP = PT + RT - FN1
# false positive of reorders
FP = PT - TP

In [427]:
print('predicted true: {}'.format(PT))
print('relevant true: {}'.format(RT))
print('true positive: {}'.format(TP))
print('false negative 1: {}'.format(FN1))
print('false negative 2: {}'.format(FN2))
print('false positive: {}'.format(FP))

predicted true: 226
relevant true: 2506
true positive: 88
false negative 1: 2644
false negative 2: 250.60000000000002
false positive: 138


In [428]:
print('pseudo f1 score: {:.3f}'.format((2 * TP) / ((2 * TP) + FN1 + FN2 + FP)))

pseudo f1 score: 0.055


## Test Incremental Models

## One-hot-encode the whole data

## Neural Network Models