In [1]:
import numpy as np
import scipy as sp
import pandas as pd

## Import data

In [2]:
%%time
path = './data/'

df_aisles = pd.read_csv(path + 'aisles.csv')
df_departments = pd.read_csv(path + 'departments.csv')
df_data = pd.read_csv(path + 'order_products__prior.csv', dtype={
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
})
df_train_target = pd.read_csv(path + 'order_products__train.csv')
df_orders = pd.read_csv(path + 'orders.csv', dtype={
    'order_id': np.int32,
    'user_id': np.int32,
    'order_number': np.int8,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
})
df_products = pd.read_csv(path + 'products.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv')

Wall time: 11.8 s


## Initialization

In [3]:
# just some merge arguments that'll be used later on
merge_arguments = {
    'left_index': True,
    'right_index': True,
    'how': 'outer',
}

In [4]:
# set an index for easier mapping
df_orders = df_orders.set_index('order_id')
df_products = df_products.set_index('product_id')

In [8]:
# add some features to the main data for convenience
# note that these values are repeated as a single user places many orders with many products
df_data['user_id'] = df_data.order_id.map(df_orders.user_id)
df_data['order_number'] = df_data.order_id.map(df_orders.order_number)
df_train_target['user_id'] = df_train_target.order_id.map(df_orders.user_id)

In [9]:
# creating list of users in each group: train and test
users_train = df_orders.loc[df_orders.eval_set == 'train', 'user_id']
users_test = df_orders.loc[df_orders.eval_set == 'test', 'user_id']

# creating more maps for convenience
user_id_to_last_order_number = df_data.groupby('user_id').agg({'order_number': 'max'}).order_number
user_id_to_last_order_id = df_orders.loc[~df_orders.user_id.duplicated(keep='last')].reset_index().set_index('user_id').order_id
order_id_to_user_id = df_orders.user_id

In [10]:
# split the main data into two groups: train and test
df_train = df_data.loc[df_data.user_id.isin(users_train)]
df_test = df_data.loc[df_data.user_id.isin(users_test)]

In [11]:
# a quick look at the sizes
print("# of samples: {:,}".format(df_data.shape[0]))
print("# of train samples: {:,}".format(df_train.shape[0]))
print("# of test samples: {:,}".format(df_test.shape[0]))

# of samples: 32,434,489
# of train samples: 20,641,991
# of test samples: 11,792,498


In [12]:
# perhaps last days_since_prior_order might be useful
# at first glance, this seems like data leakage, but this information can be obtained
# when a user visits the site.
# if the model is simple enough, the site can use this information for prediction in real time
user_id_to_days_since = df_orders.loc[(df_orders.eval_set == 'train') | (df_orders.eval_set == 'test')]\
                            .set_index('user_id').days_since_prior_order.astype(np.int16)

## Create bestsellers list

In [13]:
# products that are frequently sold

In [14]:
def get_best_sellers(df, quantile):
    df_temp = df.groupby('product_id').agg({'order_id': 'count'}).rename(columns={'order_id': 'amount_sold'})
    return df_temp.loc[df_temp.amount_sold >= df_temp.amount_sold.quantile(quantile)].index.values

In [15]:
quantile = 0.8
bestsellers = get_best_sellers(df_data, quantile)

## Add more features

In [16]:
# last_order refers to the number of times a user have shopped on the site
# rel_order_number is the relative "distance" of the order number to the last order

In [17]:
# warning ahead
# need to be dealt at some point
df_train.loc[:, 'last_order'] = df_train.user_id.map(user_id_to_last_order_number)
df_train.loc[:, 'rel_order_number'] = df_train.last_order - df_train.order_number

df_test.loc[:, 'last_order'] = df_test.user_id.map(user_id_to_last_order_number)
df_test.loc[:, 'rel_order_number'] = df_test.last_order - df_test.order_number

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


## Transform features

In [18]:
# just some helper dictionaries to use when preprocessing the data
feature_aggregation = {
    'order_number': 'min',
    'reordered': 'sum',
    'last_order': 'max',
}

last_order_aggregation = {
    'lo_0': 'sum',
    'lo_1': 'sum',
    'lo_2': 'sum',
    'lo_3': 'sum',
    'lo_4': 'sum',
    'lo_5': 'sum',
}

In [19]:
# this part needs some cleaning
class DF_data:
    def __init__(self, df, df_target=None, set_target=False):
        self.df_data = df.groupby(['user_id', 'product_id']).agg(feature_aggregation)
        self.df_data = self.df_data.rename(columns={'order_number': 'first_order', 'last_order': 'times_shopped'})
        
        df_temp = df.loc[df.rel_order_number < 6].copy()
        df_temp = df_temp.merge(pd.get_dummies(df_temp.rel_order_number, prefix='lo'), **merge_arguments)
        df_temp = df_temp.groupby(['user_id', 'product_id']).agg(last_order_aggregation)
        
        self.df_data = self.df_data.merge(df_temp, left_index=True, right_index=True, how='outer')
        self.df_data = self.df_data.fillna(0)
        
        self.df_data.lo_0 = self.df_data.lo_0.astype(np.int8)
        self.df_data.lo_1 = self.df_data.lo_1.astype(np.int8)
        self.df_data.lo_2 = self.df_data.lo_2.astype(np.int8)
        self.df_data.lo_3 = self.df_data.lo_0.astype(np.int8)
        self.df_data.lo_4 = self.df_data.lo_1.astype(np.int8)
        self.df_data.lo_5 = self.df_data.lo_2.astype(np.int8)
        
        # somewhat normalized reorder probability per order
        self.df_data['reorder_rate'] = ((self.df_data.reordered + 1)
                                        / ((self.df_data.times_shopped - self.df_data.first_order) + 2))
        
        if set_target == True:
            df_temp = df_target.groupby(['user_id', 'product_id']).agg({'reordered': 'sum'})
            df_temp = df_temp.rename(columns={'reordered': 'target'})

            self.df_data = self.df_data.merge(df_temp, left_index=True, right_index=True, how='left')
            self.df_data = self.df_data.fillna(0)
            self.df_data.target = self.df_data.target.astype(np.int8)
        
        self.df_data = self.df_data.reset_index()
        self.df_data['days_since'] = self.df_data.user_id.map(user_id_to_days_since)
        self.df_data['is_bestseller'] = self.df_data.product_id.isin(bestsellers).astype(np.int8)
        self.df_data['long_history'] = (self.df_data.first_order < self.df_data.times_shopped - 5).astype(np.int8)

In [20]:
%%time
train = DF_data(df_train, df_train_target, set_target=True)

Wall time: 49.9 s


In [21]:
# a quick sanity check
train.df_data.head(5)

Unnamed: 0,user_id,product_id,first_order,reordered,times_shopped,lo_0,lo_1,lo_2,lo_3,lo_4,lo_5,reorder_rate,target,days_since,is_bestseller,long_history
0,1,196,1,9,10,1,1,1,1,1,1,0.909091,1,14,1,1
1,1,10258,2,8,10,1,1,1,1,1,1,0.9,1,14,1,1
2,1,10326,5,0,10,0,0,0,0,0,0,0.142857,0,14,1,0
3,1,12427,1,9,10,1,1,1,1,1,1,0.909091,0,14,1,1
4,1,13032,2,2,10,1,0,0,1,0,0,0.3,1,14,1,1


## Model testing

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
# a handy list to flag which features to use when fitting a model
features_to_use = [
    'lo_0',
    'lo_1',
    'lo_2',
    'lo_3',
    'lo_4',
    'lo_5',
#    'times_shopped',
#    'days_since',
    'reorder_rate',
    'is_bestseller',
    'long_history',
]

In [24]:
# if n is larger than the number of users in the training set, just return all users
def get_sample_users(df, n):
    if n > df.user_id.nunique():
        return pd.Series(df.user_id.unique())
    else:
        return pd.Series(np.random.choice(df.user_id.unique(), n, replace=False))

In [25]:
validation_size = 8192
validation_users = get_sample_users(train.df_data, validation_size)

In [26]:
# splitting the training data into two groups for validation
df_X = train.df_data.loc[~train.df_data.user_id.isin(validation_users), features_to_use]
df_y = train.df_data.loc[~train.df_data.user_id.isin(validation_users), 'target']
df_val_X = train.df_data.loc[train.df_data.user_id.isin(validation_users), features_to_use]
df_val_y = train.df_data.loc[train.df_data.user_id.isin(validation_users), 'target']

In [27]:
# create a model and fit
model = DecisionTreeClassifier(class_weight={0: 1, 1: 4.5})

In [28]:
%%time
model.fit(df_X, df_y)

Wall time: 33.3 s


DecisionTreeClassifier(class_weight={0: 1, 1: 4.5}, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [29]:
predict = model.predict(df_val_X)

In [30]:
PT = predict.sum()
RT = df_val_y.sum()
TP = np.dot(predict.astype(np.int32), df_val_y.values.astype(np.int32))
FP = PT - TP
FN = RT - TP
print('predicted true: {:,}'.format(PT))
print('relevant true: {:,}'.format(RT))
print('true positive: {:,}'.format(TP))
print('false positive: {:,}'.format(FP))
print('false negative: {:,}'.format(FN))
print('')
print('precision: {:.3f}'.format(TP / PT))
print('f1 score: {:.3f}'.format((2 * TP) / ((2 * TP) + FN + FP)))

predicted true: 72,166
relevant true: 52,161
true positive: 26,253
false positive: 45,913
false negative: 25,908

precision: 0.364
f1 score: 0.422


In [31]:
# note that the score will be much lower on Kaggle
# as this is not mean f1 among users

## Predict on test set

In [32]:
test = DF_data(df_test, df_target=None, set_target=False)

In [33]:
df_test_X = test.df_data.loc[:, features_to_use]

In [34]:
test.df_data['products'] = model.predict(df_test_X) * test.df_data.product_id

In [35]:
# helper function to transform the result to proper submission format
def get_predict_list(series):
    str_temp = ''
    first_product = True
    for product in series:
        if product > 0:
            if first_product:
                str_temp += str(product)
                first_product = False
            else:
                str_temp += ' '
                str_temp += str(product)
    if str_temp == '':
        return 'None'
    else:
        return str_temp

In [36]:
submission = test.df_data.groupby('user_id').agg({'products': get_predict_list})

In [37]:
submission['order_id'] = user_id_to_last_order_id

In [38]:
submission = submission[['order_id', 'products']]

In [39]:
submission = submission.sort_values(by='order_id')

In [40]:
# a quick sanity check
# the number of rows should be exactly 75,000
submission.shape

(75000, 2)

In [41]:
submission.head(5)

Unnamed: 0_level_0,order_id,products
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
36855,17,13107 21463 38777
35220,34,2596 16083 21137 39475 43504 44663 47766 47792
187107,137,2326 5134 23794 24852 25890 38689 41787
115892,182,5479 9337 11520 13629 16973 32109 33000 39275 ...
35581,257,1025 4605 13870 24838 24852 25659 27104 27966 ...


In [42]:
submission.to_csv('submission.csv', index=False)