In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Raise pandas' "display.max_columns" option to 101 for DataFrames shown in this notebook.
pd.set_option("display.max_columns", 101)

### Picking up where we left off...

In [2]:
# read in pickled DFs generated by query_dfs.py
# NOTE(review): unpickling can execute arbitrary code, so these files should only
# ever come from a trusted local run of query_dfs.py.

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [86]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
# (X is the frame every later cell in this notebook extends with new feature columns)
X = pd.read_pickle("./pickle/X_7.pickle")

In [87]:
# Baseline (rows, columns) of the loaded feature frame, for comparison after later merges.
X.shape


(8474661, 7)

In [88]:
# Preview the first three rows of the feature frame.
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0


In [89]:
# Show one randomly drawn row of df_orders (no random_state is passed, so the row varies per run).
df_orders.sample()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
984636,2550549,59188,prior,2,5,16,30.0


In [90]:
# Number of rows in df_train.
len(df_train)

1384617

In [91]:
# Count the distinct order ids among the rows of df_orders whose eval_set is 'train'.
mask = df_orders["eval_set"] == "train"
len(df_orders.loc[mask, "order_id"].unique())

131209

### Features to Add:

**Product features**
* `prod_prior_sales`: the number of times the product has been purchased in the past (by all users)
* `prod_pct_reorders`: product percent of prior transactions that were reorders
* `prod_avg_atco`: product average add-to-cart order


In [92]:
# Number of prior-order rows per product, most-purchased product first.
# The result is a Series indexed by product_id whose own name is also 'product_id'.
prod_sales = (
    df_prior
    .groupby('product_id')['product_id']
    .count()
    .sort_values(ascending=False)
)

prod_sales

product_id
24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
45893         1
13397         1
30451         1
42464         1
25248         1
Name: product_id, Length: 49677, dtype: int64

In [93]:
# get total sales for each product & add to X
#
# prod_sales is a Series that is both named 'product_id' and indexed by product_id.
# Merged as-is, its name collides with X's own 'product_id' column, which forces pandas
# to emit suffixed 'product_id_x' / 'product_id_y' columns that must then be dropped
# and renamed. Giving the Series its final column name up front keeps the merge
# collision-free and yields the same columns without relying on suffix handling.
X = X.merge(prod_sales.rename("prod_prior_sales"),
            left_on='product_id', right_index=True)
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1852
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1852


In [94]:
# Re-check (rows, columns) after the merge that added prod_prior_sales.
X.shape

(8474661, 8)

In [95]:
# Sum of the 'reordered' flag per product over df_prior; this is the numerator
# for the product reorder percentage computed in the following cells.
reorders = df_prior.groupby("product_id")[["reordered"]].sum()
reorders.head(3)

Unnamed: 0_level_0,reordered
product_id,Unnamed: 1_level_1
1,1136
2,12
3,203


In [96]:
# Attach each product's summed reorder flag to X as 'prod_prior_reorders',
# then release the helper frame.
X = X.merge(
    reorders.rename(columns={"reordered": "prod_prior_reorders"}),
    left_on='product_id',
    right_index=True,
)
del reorders
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_prior_reorders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,1136
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1852,1136
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1852,1136


In [97]:
# Fraction of each product's prior purchases that were reorders.
X["prod_pct_reorders"] = X["prod_prior_reorders"] / X["prod_prior_sales"]
# The raw reorder count was only needed for the ratio. Drop it by reassignment
# instead of inplace=True so the cell reads as a plain "X becomes ..." step.
X = X.drop(columns="prod_prior_reorders")
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1852,0.613391
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1852,0.613391


In [98]:
# Re-check (rows, columns) after adding prod_pct_reorders.
X.shape

(8474661, 9)

In [99]:
# add product avg. add to cart order (prod_avg_atco)
# Mean add_to_cart_order per product over df_prior, already carrying its final column name.
atcos = (
    df_prior
    .groupby('product_id', as_index=False)['add_to_cart_order']
    .mean()
    .rename(columns={'add_to_cart_order': 'prod_avg_atco'})
)

X = X.merge(atcos, on="product_id")
del atcos
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836
1,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1852,0.613391,5.801836
2,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1852,0.613391,5.801836


### Features to Add

**User features**
* `user_total_orders`: total number of orders for the user
* `user_total_reorders`
* `user_pct_reorders`
* `user_avg_ippo`: user average items per order

In [100]:
# Highest order_number per user across all rows of df_orders, stored as
# 'user_total_orders' in a frame indexed by user_id.
user_order_counts = (
    df_orders
    .groupby("user_id")[["order_number"]]
    .max()
    .rename(columns={"order_number": "user_total_orders"})
)

user_order_counts.head(3)

Unnamed: 0_level_0,user_total_orders
user_id,Unnamed: 1_level_1
1,11
2,15
3,13


In [101]:
# add user_total_orders col to X
# Left join of X's user_id column against the user-indexed counts, so every row of X is kept.
# NOTE(review): a later cell drops 'user_total_orders' again right after merging
# user_avg_ippo, so this column never reaches the model. Confirm that is intended.
X = X.join(user_order_counts, on='user_id')

del user_order_counts

X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_total_orders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,33
1,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1852,0.613391,5.801836,6
2,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1852,0.613391,5.801836,27


In [102]:
# Total number of df_prior rows (items purchased) per user; this is the numerator
# for the user's average items per order.
user_total_items = (
    df_prior
    .groupby('user_id', as_index=False)['product_id']
    .count()
    .rename(columns={"product_id": "user_total_items_purchased"})
)

user_total_items.head(3)

Unnamed: 0,user_id,user_total_items_purchased
0,1,59
1,2,195
2,3,88


In [103]:
# Highest order_number each user reaches within df_prior, used as the user's prior-order count.
user_num_prior_orders = (
    df_prior
    .groupby('user_id', as_index=False)['order_number']
    .max()
    .rename(columns={"order_number": "user_num_prior_orders"})
)

# One row per user holding both the prior-order count and the total items purchased.
user_avg_ipos = user_num_prior_orders.merge(user_total_items, on='user_id')

# Average items per prior order = total items purchased / number of prior orders.
user_avg_ipos['user_avg_ippo'] = (
    user_avg_ipos['user_total_items_purchased'] / user_avg_ipos['user_num_prior_orders']
)
user_avg_ipos

Unnamed: 0,user_id,user_num_prior_orders,user_total_items_purchased,user_avg_ippo
0,1,10,59,5.900000
1,2,14,195,13.928571
2,3,12,88,7.333333
3,4,5,18,3.600000
4,5,4,37,9.250000
...,...,...,...,...
206204,206205,3,32,10.666667
206205,206206,67,285,4.253731
206206,206207,16,223,13.937500
206207,206208,49,677,13.816327


In [104]:
# Attach each user's average items per prior order to X.
# The drop is chained onto the merge (instead of a separate inplace=True call) so the
# cell is a single "X becomes ..." expression.
# NOTE(review): 'user_total_orders' was only added a few cells earlier and is removed
# here without being used in between. Confirm it is meant to be excluded from the model.
X = (X.merge(user_avg_ipos[['user_id', 'user_avg_ippo']], on='user_id')
      .drop(columns='user_total_orders'))
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,4.625
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2025,0.554568,3.653333,4.625
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2610,0.408046,9.503448,4.625


### Top Ten Products (reorder percentage)

In [105]:
# One prod_pct_reorders value per product. The column is constant within a product in X,
# so 'min' simply collapses the duplicates.
prod_reorder_pct = X.groupby('product_id', as_index=False)['prod_pct_reorders'].min()

# Join onto the product catalogue and list the ten products with the highest reorder share.
# reset_index() keeps the catalogue's original row label as an 'index' column.
(df_prod_detail
 .merge(prod_reorder_pct, how='left', on='product_id')
 .sort_values(by="prod_pct_reorders", ascending=False)
 .reset_index()
 .head(10))

Unnamed: 0,index,product_id,aisle_id,department_id,product_name,aisle,department,prod_pct_reorders
0,4445,6433,13,20,Raw Veggie Wrappers,prepared meals,deli,0.941176
1,47290,2075,126,11,Serenity Ultimate Extrema Overnight Pads,feminine care,personal care,0.931034
2,22700,43553,64,7,Orange Energy Shots,energy sports drinks,beverages,0.923077
3,16443,27740,45,19,Chocolate Love Bar,candy chocolate,snacks,0.920792
4,2442,13875,6,2,Simply Sleep Nighttime Sleep Aid,other,other,0.911111
5,22697,39992,64,7,"Energy Shot, Grape Flavor",energy sports drinks,beverages,0.909091
6,10700,5868,28,5,Russian River Valley Reserve Pinot Noir,red wines,alcohol,0.9
7,17003,35604,45,19,Maca Buttercups,candy chocolate,snacks,0.9
8,42966,31418,115,7,Sparking Water,water seltzer sparkling water,beverages,0.9
9,30872,36543,88,13,Bars Peanut Butter,spreads,pantry,0.895522


In [106]:
# Spot-check two randomly drawn rows of the finished feature frame (unseeded, so rows vary per run).
X.sample(2)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo
3921244,25510,169212,1,"[48679, 28785, 42768, 17445, 20127, 17902, 300...",0,"[4428, 3935, 21783, 31635, 21709, 40749, 27531...",0,44,0.318182,9.090909,11.363636
7344856,1529,165060,1,"[38739, 35042, 21278, 39485, 5913, 29487, 2295...",0,"[38739, 9387, 22950, 8230, 6244, 28993, 28156,...",0,8736,0.439332,8.816735,9.555556


In [107]:
from collections import OrderedDict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def get_user_split_data(df, test_size=.2, seed=36):
    """Split ``df`` into train/test parts so that no user appears in both.

    A fraction ``test_size`` of the unique ``user_id`` values is drawn without
    replacement using ``np.random.RandomState(seed)``; every row belonging to
    a drawn user goes to the test part and all remaining rows to the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``product_id``, ``user_id``, ``cart``,
        ``in_cart`` and ``last_cart``; all other columns are treated as features.
    test_size : float
        Fraction of unique users (not rows) assigned to the test part. The
        number of users is truncated with ``int``.
    seed : int
        Seed for the random user selection.

    Returns
    -------
    X_tr, X_te, y_tr, y_te
        Feature frames (the five columns listed above removed) and the
        matching ``in_cart`` target Series for the train and test parts.

    Prints the fraction of rows that ended up in the test part.
    """
    # Identifier, list-valued and target columns that must not be fed to the model.
    non_feature_cols = ['product_id', 'user_id', 'cart', 'in_cart', 'last_cart']

    rng = np.random.RandomState(seed)

    # Sample at user level so that one user's rows never straddle the split.
    all_users = df['user_id'].unique()
    n_holdout = int(all_users.shape[0] * test_size)
    holdout_users = rng.choice(all_users, size=n_holdout, replace=False)

    in_holdout = df['user_id'].isin(holdout_users)
    train_part = df[~in_holdout]
    test_part = df[in_holdout]

    y_tr, y_te = train_part['in_cart'], test_part['in_cart']
    X_tr, X_te = (part.drop(non_feature_cols, axis=1)
                  for part in (train_part, test_part))

    print(f"Actual Test Size: {y_te.shape[0] / df.shape[0]:0.4}")

    return X_tr, X_te, y_tr, y_te

In [108]:
# check results
# Baseline: fit a logistic regression on the user-level train split and score it
# with F1 on the held-out users.
X_tr, X_te, y_tr, y_te = get_user_split_data(X)

lr = LogisticRegression(solver='lbfgs')
lr.fit(X_tr, y_tr)
# scikit-learn metrics take (y_true, y_pred) in that order. Binary F1 gives the same
# number either way, but keeping the documented order stays correct if this line is
# later adapted to an asymmetric metric.
f1_score(y_te, lr.predict(X_te))

Actual Test Size: 0.2001


0.13400922991152087

In [109]:
# List the columns currently in X (get_user_split_data removes the non-feature ones before fitting).
X.columns

Index(['product_id', 'user_id', 'num_orders', 'cart', 'in_cart', 'last_cart',
       'in_last_cart', 'prod_prior_sales', 'prod_pct_reorders',
       'prod_avg_atco', 'user_avg_ippo'],
      dtype='object')

In [110]:
# Number of distinct users appearing in df_prior.
len(df_prior.user_id.unique())

206209

In [113]:
# Number of distinct users appearing in df_train.
len(df_train.user_id.unique())

131209

In [114]:
# Number of distinct users appearing in df_orders.
len(df_orders.user_id.unique())

206209

In [115]:
# Look at the first three rows of df_orders whose eval_set is 'test'.
mask = df_orders["eval_set"] == "test"
df_orders.loc[mask].head(3)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0


In [117]:
# 2774568 is the order_id of the first 'test' order displayed in the previous cell;
# check how many rows of df_prior reference that order.
mask = df_prior["order_id"] == 2774568
df_prior.loc[mask].shape

(0, 11)