In [1]:
import pandas as pd
import statistics
import gc
from tqdm import tqdm
gc.enable()

orders = pd.read_csv('data/orders.csv')
order_products_prior = pd.read_csv('data/order_products__prior.csv')
order_products_train = pd.read_csv('data/order_products__train.csv')
products = pd.read_csv('data/products.csv')
orders['eval_set'] = orders['eval_set'].astype('category')

In [2]:
tqdm.pandas()

In [3]:
order_product = orders.merge(order_products_prior, on='order_id', how='inner')
# user_clf = pd.read_csv('customer_classification.csv')
# user_clf = user_clf.drop(user_clf.columns[0], axis=1)

# order_product = order_product.merge(user_clf, on='user_id', how='inner')
# order_product['label'] = order_product['label'].astype('category')
# order_product.head()

In [4]:
# Find the total number of order a user placed

user_total_orders = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'total_orders': max(x['order_number'])})).reset_index()

user_total_orders.head()

100%|██████████| 206209/206209 [02:24<00:00, 1428.98it/s]


Unnamed: 0,user_id,total_orders
0,1,10
1,2,14
2,3,12
3,4,5
4,5,4


In [5]:
# Find out the frequency of user reordered

#       times of user reorder / total order of user

user_reorder = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'reorder_ratio' : statistics.mean(x['reordered'])})).reset_index()

100%|██████████| 206209/206209 [03:01<00:00, 1133.83it/s]


In [6]:
# Average day since last order

order_product['days_since_prior_order'] = order_product['days_since_prior_order'].fillna(0)

user_last_order = order_product.groupby('user_id').progress_apply(lambda x: pd.Series({'avg_day_since_last_reorder' : statistics.mean(x['days_since_prior_order'])})).reset_index()

user_last_order

100%|██████████| 206209/206209 [02:10<00:00, 1585.47it/s]


Unnamed: 0,user_id,avg_day_since_last_reorder
0,1,18.542373
1,2,14.902564
2,3,10.181818
3,4,11.944444
4,5,10.189189
...,...,...
206204,206205,9.687500
206205,206206,3.985965
206206,206207,13.278027
206207,206208,7.310192


In [7]:
# for each user, average reorder item percentage in whole order


def average_perc(x):
    order_reorder_prob = x.groupby('order_id')['reordered'].mean().to_frame('reorder_prob')
    return pd.Series({'reorder_item_ratio_per_order': statistics.mean(order_reorder_prob['reorder_prob'])}) 

average_reorder = order_product.groupby(['user_id']).progress_apply(average_perc).reset_index()
average_reorder.head()

100%|██████████| 206209/206209 [05:46<00:00, 595.82it/s]


Unnamed: 0,user_id,reorder_item_ratio_per_order
0,1,0.705833
1,2,0.447961
2,3,0.658817
3,4,0.028571
4,5,0.377778


In [8]:
user = user_total_orders.merge(user_reorder, on='user_id', how='left')
user = user.merge(user_last_order, on='user_id', how='left')
user = user.merge(average_reorder, on='user_id', how='left')

del user_reorder
del user_last_order
del average_reorder

gc.collect()

user

Unnamed: 0,user_id,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order
0,1,10,0.694915,18.542373,0.705833
1,2,14,0.476923,14.902564,0.447961
2,3,12,0.625000,10.181818,0.658817
3,4,5,0.055556,11.944444,0.028571
4,5,4,0.378378,10.189189,0.377778
...,...,...,...,...,...
206204,206205,3,0.250000,9.687500,0.369048
206205,206206,67,0.473684,3.985965,0.570676
206206,206207,16,0.587444,13.278027,0.637144
206207,206208,49,0.707533,7.310192,0.700368


In [9]:
# get product predictor
# total time of purchase of one product
product_total_purchase = order_product.groupby('product_id').progress_apply(lambda x: pd.Series({'total_purchase': len(x['order_id'])})).reset_index()
product_total_purchase.head()

100%|██████████| 49677/49677 [00:35<00:00, 1410.94it/s]


Unnamed: 0,product_id,total_purchase
0,1,1852
1,2,90
2,3,277
3,4,329
4,5,15


In [10]:
# the probability that a product get reorder
# p_reorder = order_product.groupby('product_id').filter(lambda x: x.shape[0] > 40)

product_reorder_ratio = order_product.groupby('product_id').progress_apply(lambda x: pd.Series({'reorder_prob': statistics.mean(x['reordered'])})).reset_index()
product_reorder_ratio.head()

100%|██████████| 49677/49677 [00:54<00:00, 904.04it/s] 


Unnamed: 0,product_id,reorder_prob
0,1,0.613391
1,2,0.133333
2,3,0.732852
3,4,0.446809
4,5,0.6


In [11]:
product = product_total_purchase.merge(product_reorder_ratio, on='product_id', how='left')

del product_reorder_ratio
del product_total_purchase

gc.collect()

product.head()

Unnamed: 0,product_id,total_purchase,reorder_prob
0,1,1852,0.613391
1,2,90,0.133333
2,3,277,0.732852
3,4,329,0.446809
4,5,15,0.6


In [12]:
total_bought = order_product.groupby(['user_id', 'product_id'])['order_id'].count().to_frame('total_bought').reset_index()

total_bought.head()

Unnamed: 0,user_id,product_id,total_bought
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


### Preparing training dataset

In [13]:
data = total_bought.merge(user, on='user_id', how='left')
data = data.merge(product, on='product_id', how='left')

del user
del product
del total_bought

gc.collect()

data['reorder_prob'] = data['reorder_prob'].fillna(0)
data

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob
0,1,196,10,10,0.694915,18.542373,0.705833,35791,0.776480
1,1,10258,9,10,0.694915,18.542373,0.705833,1946,0.713772
2,1,10326,1,10,0.694915,18.542373,0.705833,5526,0.652009
3,1,12427,10,10,0.694915,18.542373,0.705833,6476,0.740735
4,1,13032,3,10,0.694915,18.542373,0.705833,3751,0.657158
...,...,...,...,...,...,...,...,...,...
13307948,206209,43961,3,13,0.472868,18.232558,0.536752,55371,0.630583
13307949,206209,44325,1,13,0.472868,18.232558,0.536752,3485,0.401148
13307950,206209,48370,1,13,0.472868,18.232558,0.536752,3934,0.699288
13307951,206209,48697,1,13,0.472868,18.232558,0.536752,9783,0.357661


In [14]:
orders_future = orders[((orders.eval_set=='train') | (orders.eval_set=='test'))]
orders_future = orders_future[ ['user_id', 'eval_set', 'order_id'] ]
orders_future.head(10)

Unnamed: 0,user_id,eval_set,order_id
10,1,train,1187899
25,2,train,1492625
38,3,test,2774568
44,4,test,329954
49,5,train,2196797
53,6,test,1528013
74,7,train,525192
78,8,train,880375
82,9,train,1094988
88,10,train,1822501


In [15]:
data = data.merge(orders_future, on='user_id', how='left')
data.head(10)

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id
0,1,196,10,10,0.694915,18.542373,0.705833,35791,0.77648,train,1187899
1,1,10258,9,10,0.694915,18.542373,0.705833,1946,0.713772,train,1187899
2,1,10326,1,10,0.694915,18.542373,0.705833,5526,0.652009,train,1187899
3,1,12427,10,10,0.694915,18.542373,0.705833,6476,0.740735,train,1187899
4,1,13032,3,10,0.694915,18.542373,0.705833,3751,0.657158,train,1187899
5,1,13176,2,10,0.694915,18.542373,0.705833,379450,0.832555,train,1187899
6,1,14084,1,10,0.694915,18.542373,0.705833,15935,0.810982,train,1187899
7,1,17122,1,10,0.694915,18.542373,0.705833,13880,0.675576,train,1187899
8,1,25133,8,10,0.694915,18.542373,0.705833,6196,0.740155,train,1187899
9,1,26088,2,10,0.694915,18.542373,0.705833,2523,0.539041,train,1187899


In [16]:
data_train = data[data.eval_set=='train']
data_train.head()

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id
0,1,196,10,10,0.694915,18.542373,0.705833,35791,0.77648,train,1187899
1,1,10258,9,10,0.694915,18.542373,0.705833,1946,0.713772,train,1187899
2,1,10326,1,10,0.694915,18.542373,0.705833,5526,0.652009,train,1187899
3,1,12427,10,10,0.694915,18.542373,0.705833,6476,0.740735,train,1187899
4,1,13032,3,10,0.694915,18.542373,0.705833,3751,0.657158,train,1187899


In [17]:
data_train = data_train.merge(order_products_train[['product_id','order_id', 'reordered']], on=['product_id','order_id'],how='left' )

data_train.head()

Unnamed: 0,user_id,product_id,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,eval_set,order_id,reordered
0,1,196,10,10,0.694915,18.542373,0.705833,35791,0.77648,train,1187899,1.0
1,1,10258,9,10,0.694915,18.542373,0.705833,1946,0.713772,train,1187899,1.0
2,1,10326,1,10,0.694915,18.542373,0.705833,5526,0.652009,train,1187899,
3,1,12427,10,10,0.694915,18.542373,0.705833,6476,0.740735,train,1187899,
4,1,13032,3,10,0.694915,18.542373,0.705833,3751,0.657158,train,1187899,1.0


In [18]:
data_train['reordered'] = data_train['reordered'].fillna(0)
data_train = data_train.set_index(['user_id', 'product_id'])

In [19]:
data_train = data_train.drop(['eval_set', 'order_id'], axis=1)
data_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bought,total_orders,reorder_ratio,avg_day_since_last_reorder,reorder_item_ratio_per_order,total_purchase,reorder_prob,reordered
user_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,196,10,10,0.694915,18.542373,0.705833,35791,0.77648,1.0
1,10258,9,10,0.694915,18.542373,0.705833,1946,0.713772,1.0
1,10326,1,10,0.694915,18.542373,0.705833,5526,0.652009,0.0
1,12427,10,10,0.694915,18.542373,0.705833,6476,0.740735,0.0
1,13032,3,10,0.694915,18.542373,0.705833,3751,0.657158,1.0


### Create prediction model

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(data_train.drop('reordered', axis=1), data_train.reordered, test_size=0.8, random_state=42)

In [28]:
clf = RandomForestClassifier(n_estimators=10 ,random_state=42)
model = clf.fit(X_train, y_train)

In [29]:
y_pred = (model.predict_proba(X_test)[:,1] >= 0.30).astype(int)

### Get scores on validation set
print("RESULTS ON VALIDATION SET\n====================")
print("F1 Score: ",f1_score(y_test, y_pred, average='binary'), "\n====================")
print("Classification Report\n ", classification_report(y_test, y_pred), "\n====================")
print("Confusion Matrix\n ", confusion_matrix(y_test, y_pred), "\n====================")
feature_importances_df = pd.DataFrame(model.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances_df)

RESULTS ON VALIDATION SET
F1 Score:  0.3356859486862497 
Classification Report
                precision    recall  f1-score   support

         0.0       0.93      0.89      0.91   6116209
         1.0       0.29      0.40      0.34    663520

    accuracy                           0.84   6779729
   macro avg       0.61      0.65      0.62   6779729
weighted avg       0.87      0.84      0.85   6779729
 
Confusion Matrix
  [[5449640  666569]
 [ 395246  268274]] 
                              importance
avg_day_since_last_reorder      0.166329
reorder_prob                    0.165624
reorder_item_ratio_per_order    0.161445
reorder_ratio                   0.158503
total_purchase                  0.152722
total_bought                    0.104400
total_orders                    0.090976


In [25]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 'n_estimators': [10, 50, 100], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8]}

clf = RandomForestClassifier(n_jobs=-1)

gsc = RandomizedSearchCV(clf, parameters, scoring='f1', n_jobs=1, verbose=10)
gsc.fit(X_train, y_train)
print(gsc.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] n_estimators=10, max_features=5, max_depth=2 ....................
[CV]  n_estimators=10, max_features=5, max_depth=2, score=0.000, total=   4.9s
[CV] n_estimators=10, max_features=5, max_depth=2 ....................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.9s remaining:    0.0s
[CV]  n_estimators=10, max_features=5, max_depth=2, score=0.000, total=   4.0s
[CV] n_estimators=10, max_features=5, max_depth=2 ....................
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.8s remaining:    0.0s
[CV]  n_estimators=10, max_features=5, max_depth=2, score=0.000, total=   3.7s
[CV] n_estimators=10, max_features=5, max_depth=2 ....................
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.6s remaining:    0.0s
[CV]  n_estimators=10, max_features=5, max_depth=2, score=0.000, total=   3.4s
[CV] n_estimators=

In [30]:
clf = RandomForestClassifier(n_estimators=50 ,max_features=7, max_depth=8)
model = clf.fit(X_train, y_train)
y_pred = (model.predict_proba(X_test)[:,1] >= 0.30).astype(int)

In [27]:
### Get scores on validation set
print("RESULTS ON VALIDATION SET\n====================")
print("F1 Score: ",f1_score(y_test, y_pred, average='binary'), "\n====================")
print("Classification Report\n ", classification_report(y_test, y_pred), "\n====================")
print("Confusion Matrix\n ", confusion_matrix(y_test, y_pred), "\n====================")
feature_importances_df = pd.DataFrame(model.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance',ascending=False)
print(feature_importances_df)

RESULTS ON VALIDATION SET
F1 Score:  0.3546679614140841 
Classification Report
                precision    recall  f1-score   support

         0.0       0.93      0.96      0.94   6116209
         1.0       0.45      0.29      0.35    663520

    accuracy                           0.90   6779729
   macro avg       0.69      0.63      0.65   6779729
weighted avg       0.88      0.90      0.89   6779729
 
Confusion Matrix
  [[5881715  234494]
 [ 469944  193576]] 
                              importance
total_bought                    0.631492
total_orders                    0.306855
reorder_prob                    0.041544
reorder_ratio                   0.010574
total_purchase                  0.004059
reorder_item_ratio_per_order    0.002940
avg_day_since_last_reorder      0.002536


In [31]:
feature_importances_df

Unnamed: 0,importance
total_bought,0.788226
total_orders,0.08844
reorder_prob,0.057831
avg_day_since_last_reorder,0.028254
total_purchase,0.013159
reorder_ratio,0.012538
reorder_item_ratio_per_order,0.011553


In [33]:
import pickle

filename = 'rfc_model.sav'
pickle.dump(model, open(filename, 'wb'))