In [2]:
import pandas as pd
import statistics
import gc
from tqdm import tqdm
gc.enable()

orders = pd.read_csv('data/orders.csv')
order_products_prior = pd.read_csv('data/order_products__prior.csv')
order_products_train = pd.read_csv('data/order_products__train.csv')
products = pd.read_csv('data/products.csv')
orders['eval_set'] = orders['eval_set'].astype('category')

In [3]:
tqdm.pandas()

In [4]:
order_product = orders.merge(order_products_prior, on='order_id', how='inner')
user_clf = pd.read_csv('customer_classification.csv')
user_clf = user_clf.drop(user_clf.columns[0], axis=1)

order_product = order_product.merge(user_clf, on='user_id', how='inner')
order_product['label'] = order_product['label'].astype('category')
order_product.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,label
0,1737705,17,prior,1,2,13,,7350,1,0,3
1,1737705,17,prior,1,2,13,,47141,2,0,3
2,1737705,17,prior,1,2,13,,17762,3,0,3
3,1681401,17,prior,2,5,10,3.0,7350,1,1,3
4,1681401,17,prior,2,5,10,3.0,18534,2,0,3


In [5]:
# Find the total number of order a user placed

user_total_orders = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'total_orders': max(x['order_number'])})).reset_index()

user_total_orders.head()

100%|██████████| 40143/40143 [00:38<00:00, 1044.72it/s]


Unnamed: 0,user_id,total_orders
0,17,40
1,21,33
2,27,81
3,28,24
4,36,37


In [6]:
# Find out the frequency of user reordered

#       times of user reorder / total order of user

user_reorder = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'reorder_ratio' : statistics.mean(x['reordered'])})).reset_index()

100%|██████████| 40143/40143 [00:50<00:00, 797.10it/s]


In [7]:
# Average day since last order

order_product['days_since_prior_order'] = order_product['days_since_prior_order'].fillna(0)

user_last_order = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'avg_day_since_last_reorder' : statistics.mean(x['days_since_prior_order'])})).reset_index()

user_last_order

100%|██████████| 40143/40143 [00:44<00:00, 900.11it/s]


Unnamed: 0,user_id,avg_day_since_last_reorder
0,17,8.183673
1,21,9.809756
2,27,5.118490
3,28,12.027322
4,36,9.849673
...,...,...
40138,206187,9.528736
40139,206193,4.457726
40140,206201,11.413366
40141,206206,3.985965


In [8]:
# for each user, average reorder item percentage in whole order


def average_perc(x):
    order_reorder_prob = x.groupby('order_id')['reordered'].mean().to_frame('reorder_prob')
    return pd.Series({'reorder_item_ratio_per_order': statistics.mean(order_reorder_prob['reorder_prob'])}) 

average_reorder = order_product.groupby(['user_id']).progress_apply(average_perc).reset_index()
average_reorder.head()

100%|██████████| 40143/40143 [01:33<00:00, 428.48it/s]


Unnamed: 0,user_id,reorder_item_ratio_per_order
0,17,0.722594
1,21,0.554367
2,27,0.66927
3,28,0.409113
4,36,0.708076


In [9]:
user = user_total_orders.merge(user_reorder, on='user_id', how='left')
user = user.merge(user_last_order, on='user_id', how='left')
user = user.merge(average_reorder, on='user_id', how='left')

del user_reorder
del user_last_order
del average_reorder

gc.collect()

user

Unnamed: 0,user_id,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order
0,17,40,0.717687,8.183673,0.722594
1,21,33,0.502439,9.809756,0.554367
2,27,81,0.713542,5.118490,0.669270
3,28,24,0.420765,12.027322,0.409113
4,36,37,0.640523,9.849673,0.708076
...,...,...,...,...,...
40138,206187,34,0.747126,9.528736,0.772777
40139,206193,41,0.527697,4.457726,0.610809
40140,206201,32,0.569307,11.413366,0.603936
40141,206206,67,0.473684,3.985965,0.570676


In [10]:
# get product predictor
# total time of purchase of one product
product_total_purchase = order_product.groupby('product_id').progress_apply(lambda x: pd.Series({'total_purchase': len(x['order_id'])})).reset_index()
product_total_purchase.head()

100%|██████████| 48128/48128 [00:40<00:00, 1192.26it/s]


Unnamed: 0,product_id,total_purchase
0,1,1023
1,2,51
2,3,179
3,4,105
4,5,9


In [11]:
# the probability that a product get reorder
# p_reorder = order_product.groupby('product_id').filter(lambda x: x.shape[0] > 40)

product_reorder_ratio = order_product.groupby('product_id').progress_apply(lambda x: pd.Series({'reorder_prob': statistics.mean(x['reordered'])})).reset_index()
product_reorder_ratio.head()

100%|██████████| 48128/48128 [00:56<00:00, 847.31it/s] 


Unnamed: 0,product_id,reorder_prob
0,1,0.740958
1,2,0.176471
2,3,0.837989
3,4,0.561905
4,5,0.777778


In [12]:
product = product_total_purchase.merge(product_reorder_ratio, on='product_id', how='left')

del product_reorder_ratio
del product_total_purchase

gc.collect()

product.head()

Unnamed: 0,product_id,total_purchase,reorder_prob
0,1,1023,0.740958
1,2,51,0.176471
2,3,179,0.837989
3,4,105,0.561905
4,5,9,0.777778


In [13]:
total_bought = order_product.groupby(['user_id', 'product_id'])['order_id'].count().to_frame('total_bought').reset_index()

total_bought.head()

Unnamed: 0,user_id,product_id,total_bought
0,17,812,1
1,17,1073,2
2,17,1774,5
3,17,2927,3
4,17,4591,1


### Preparing training dataset

In [14]:
data = total_bought.merge(user, on='user_id', how='left')
data = data.merge(product, on='product_id', how='left')

del user
del product
del total_bought

gc.collect()

data['reorder_prob'] = data['reorder_prob'].fillna(0)
data

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob
0,17,812,1,40,0.717687,8.183673,0.722594,2110,0.616588
1,17,1073,2,40,0.717687,8.183673,0.722594,309,0.689320
2,17,1774,5,40,0.717687,8.183673,0.722594,639,0.627543
3,17,2927,3,40,0.717687,8.183673,0.722594,84,0.404762
4,17,4591,1,40,0.717687,8.183673,0.722594,989,0.534884
...,...,...,...,...,...,...,...,...,...
5265943,206208,48364,1,49,0.707533,7.310192,0.700368,10824,0.660200
5265944,206208,48865,1,49,0.707533,7.310192,0.700368,418,0.471292
5265945,206208,49247,1,49,0.707533,7.310192,0.700368,3386,0.750443
5265946,206208,49385,1,49,0.707533,7.310192,0.700368,178,0.241573


In [15]:
orders_future = orders[((orders.eval_set=='train') | (orders.eval_set=='test'))]
orders_future = orders_future[ ['user_id', 'eval_set', 'order_id'] ]
orders_future.head(10)

Unnamed: 0,user_id,eval_set,order_id
10,1,train,1187899
25,2,train,1492625
38,3,test,2774568
44,4,test,329954
49,5,train,2196797
53,6,test,1528013
74,7,train,525192
78,8,train,880375
82,9,train,1094988
88,10,train,1822501


In [16]:
data = data.merge(orders_future, on='user_id', how='left')
data.head(10)

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id
0,17,812,1,40,0.717687,8.183673,0.722594,2110,0.616588,train,2180313
1,17,1073,2,40,0.717687,8.183673,0.722594,309,0.68932,train,2180313
2,17,1774,5,40,0.717687,8.183673,0.722594,639,0.627543,train,2180313
3,17,2927,3,40,0.717687,8.183673,0.722594,84,0.404762,train,2180313
4,17,4591,1,40,0.717687,8.183673,0.722594,989,0.534884,train,2180313
5,17,5077,1,40,0.717687,8.183673,0.722594,33629,0.850902,train,2180313
6,17,5128,5,40,0.717687,8.183673,0.722594,540,0.707407,train,2180313
7,17,5748,1,40,0.717687,8.183673,0.722594,1093,0.639524,train,2180313
8,17,6212,3,40,0.717687,8.183673,0.722594,128,0.695312,train,2180313
9,17,6334,2,40,0.717687,8.183673,0.722594,115,0.704348,train,2180313


In [17]:
data_train = data[data.eval_set=='train']
data_train.head()

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id
0,17,812,1,40,0.717687,8.183673,0.722594,2110,0.616588,train,2180313
1,17,1073,2,40,0.717687,8.183673,0.722594,309,0.68932,train,2180313
2,17,1774,5,40,0.717687,8.183673,0.722594,639,0.627543,train,2180313
3,17,2927,3,40,0.717687,8.183673,0.722594,84,0.404762,train,2180313
4,17,4591,1,40,0.717687,8.183673,0.722594,989,0.534884,train,2180313


In [18]:
data_train = data_train.merge(order_products_train[['product_id','order_id', 'reordered']], on=['product_id','order_id'],how='left' )

data_train.head()

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id,reordered
0,17,812,1,40,0.717687,8.183673,0.722594,2110,0.616588,train,2180313,
1,17,1073,2,40,0.717687,8.183673,0.722594,309,0.68932,train,2180313,
2,17,1774,5,40,0.717687,8.183673,0.722594,639,0.627543,train,2180313,
3,17,2927,3,40,0.717687,8.183673,0.722594,84,0.404762,train,2180313,
4,17,4591,1,40,0.717687,8.183673,0.722594,989,0.534884,train,2180313,


In [19]:
data_train['reordered'] = data_train['reordered'].fillna(0)
data_train = data_train.set_index(['user_id', 'product_id'])

In [20]:
data_train = data_train.drop(['eval_set', 'order_id'], axis=1)
data_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,reordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
17,812,1,40,0.717687,8.183673,0.722594,2110,0.616588,0.0
17,1073,2,40,0.717687,8.183673,0.722594,309,0.68932,0.0
17,1774,5,40,0.717687,8.183673,0.722594,639,0.627543,0.0
17,2927,3,40,0.717687,8.183673,0.722594,84,0.404762,0.0
17,4591,1,40,0.717687,8.183673,0.722594,989,0.534884,0.0


### Create prediction model

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(data_train.drop('reordered', axis=1), data_train.reordered, test_size=0.8, random_state=42)

In [21]:
clf = RandomForestClassifier(n_estimators=10 ,random_state=42)
model = clf.fit(X_train, y_train)

In [22]:
y_pred = (model.predict_proba(X_test)[:,1] >= 0.30).astype(int)

### Get scores on validation set
print("RESULTS ON VALIDATION SET\n====================")
print("F1 Score: ",f1_score(y_test, y_pred, average='binary'), "\n====================")
print("Classification Report\n ", classification_report(y_test, y_pred), "\n====================")
print("Confusion Matrix\n ", confusion_matrix(y_test, y_pred))

RESULTS ON VALIDATION SET
F1 Score:  0.30990535675972064 
Classification Report
                precision    recall  f1-score   support

         0.0       0.95      0.93      0.94   2507866
         1.0       0.27      0.36      0.31    181289

    accuracy                           0.89   2689155
   macro avg       0.61      0.64      0.63   2689155
weighted avg       0.91      0.89      0.90   2689155
 
Confusion Matrix
  [[2334683  173183]
 [ 116291   64998]]


In [23]:
feature_importances_df = pd.DataFrame(model.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances_df)

                            importance
reorder_ratio                 0.194378
avg_day_since_last_reorder    0.193318
reorder_prob                  0.177038
total_purchase                0.168589
total_bought                  0.164626
total_orders                  0.102050


In [22]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'bootstrap': [True, False], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 'n_estimators': [10, 50, 100, 200, 500], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}

clf = RandomForestClassifier(n_jobs=-1)

gsc = RandomizedSearchCV(clf, parameters, scoring='f1', n_jobs=1, verbose=10)
gsc.fit(X_train, y_train)
print(gsc.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True 
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True, score=0.181, total=   7.0s
[CV] n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True 
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s
[CV]  n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True, score=0.184, total=   5.1s
[CV] n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True 
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.0s remaining:    0.0s
[CV]  n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True, score=0.185, total=   4.3s
[CV] n_estimators=10, max_features=5, max_depth=8, criterion=gini, bootstrap=True 
[Parallel(n_jobs=1)]: Done   3 ou

In [23]:
y_pred = (gsc.predict_proba(X_test)[:,1] >= 0.30).astype(int)

### Get scores on validation set
print("RESULTS ON VALIDATION SET\n====================")
print("F1 Score: ",f1_score(y_test, y_pred, average='binary'), "\n====================")
print("Classification Report\n ", classification_report(y_test, y_pred), "\n====================")
print("Confusion Matrix\n ", confusion_matrix(y_test, y_pred))

RESULTS ON VALIDATION SET
F1 Score:  0.32473639814424293 
Classification Report
                precision    recall  f1-score   support

         0.0       0.95      0.98      0.96   2507866
         1.0       0.45      0.25      0.32    181289

    accuracy                           0.93   2689155
   macro avg       0.70      0.62      0.64   2689155
weighted avg       0.91      0.93      0.92   2689155
 
Confusion Matrix
  [[2450832   57034]
 [ 135092   46197]]


In [27]:
clf = RandomForestClassifier(n_estimators=10 ,max_features=5, max_depth=8, criterion='gini', bootstrap=True)
model = clf.fit(X_train, y_train)
y_pred = (model.predict_proba(X_test)[:,1] >= 0.30).astype(int)

In [28]:
### Get scores on validation set
print("RESULTS ON VALIDATION SET\n====================")
print("F1 Score: ",f1_score(y_test, y_pred, average='binary'), "\n====================")
print("Classification Report\n ", classification_report(y_test, y_pred), "\n====================")
print("Confusion Matrix\n ", confusion_matrix(y_test, y_pred), "\n====================")
feature_importances_df = pd.DataFrame(model.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances_df)

RESULTS ON VALIDATION SET
F1 Score:  0.32642274570556346 
Classification Report
                precision    recall  f1-score   support

         0.0       0.95      0.98      0.96   2507866
         1.0       0.45      0.26      0.33    181289

    accuracy                           0.93   2689155
   macro avg       0.70      0.62      0.64   2689155
weighted avg       0.91      0.93      0.92   2689155
 
Confusion Matrix
  [[2450410   57456]
 [ 134723   46566]] 
                              importance
total_bought                    0.788226
total_orders                    0.088440
reorder_prob                    0.057831
avg_day_since_last_reorder      0.028254
total_purchase                  0.013159
reorder_ratio                   0.012538
reorder_item_ratio_per_order    0.011553


In [31]:
feature_importances_df

Unnamed: 0,importance
total_bought,0.788226
total_orders,0.08844
reorder_prob,0.057831
avg_day_since_last_reorder,0.028254
total_purchase,0.013159
reorder_ratio,0.012538
reorder_item_ratio_per_order,0.011553


In [33]:
import pickle

filename = 'rfc_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [37]:
y_pred

array([0, 1, 0, ..., 0, 0, 0])