In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 101)

### Picking up where we left off...

In [2]:
# read in pickled DFs generated by query_dfs.py

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [3]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
X = pd.read_pickle("./pickle/X_10.pickle")

In [4]:
X.shape

(8474661, 10)

In [5]:
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1123,2025,0.554568
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1065,2610,0.408046


In [6]:
df_orders.sample()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
2351190,121821,141538,train,45,5,14,6.0


In [7]:
len(df_train)

1384617

In [8]:
mask = df_orders.eval_set == 'train'
len(df_orders[mask].order_id.unique())

131209

### Features to Add:

**Product features**
* `prod_total_sales`: the number of times the product has been purcased in the past (by all users)
* `prod_pct_reordered`: product percent of prior transactions that were reorders
* `prod_avg_atco`: product average add-to-cart order


In [9]:
prod_sales = (df_prior.groupby('product_id')
              .agg({"product_id": "count"})['product_id']
              .sort_values(ascending=False))

prod_sales

product_id
24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
45893         1
13397         1
30451         1
42464         1
25248         1
Name: product_id, Length: 49677, dtype: int64

In [10]:
# get total sales for each product & add to X

X = (X.merge(prod_sales, left_on='product_id', right_index=True)
    .drop(columns="product_id_x"))
    
X.rename(columns={"product_id_y": "prod_prior_sales"}, inplace=True)
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1136,1852,0.613391,1852
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1136,1852,0.613391,1852


In [11]:
X.shape

(8474661, 11)

In [12]:
# add prod_pct_reordered col
reorders = df_prior.groupby("product_id").agg({"reordered": "sum"})
reorders.head(3)

Unnamed: 0_level_0,reordered
product_id,Unnamed: 1_level_1
1,1136
2,12
3,203


In [13]:
X = (X.merge(reorders, left_on='product_id', right_index=True)
     .rename(columns={"reordered": "prod_prior_reorders"}))
del reorders
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_prior_reorders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,1136
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1136,1852,0.613391,1852,1136
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1136,1852,0.613391,1852,1136


In [14]:
X["prod_pct_reorders"] = X.prod_prior_reorders / X.prod_prior_sales
X.drop(columns="prod_prior_reorders", inplace=True)
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1136,1852,0.613391,1852,0.613391
190,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1136,1852,0.613391,1852,0.613391


In [15]:
X.shape

(8474661, 12)

In [16]:
# add product avg. add to cart order (prod_avg_atco)
atcos = (df_prior.groupby(['product_id'], as_index=False)
                                           .agg({'add_to_cart_order': 'mean'}))

X = X.merge(atcos, on="product_id").rename(columns={'add_to_cart_order': 'prod_avg_atco'})
del atcos
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders,prod_avg_atco
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391,5.801836
1,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1136,1852,0.613391,1852,0.613391,5.801836
2,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1136,1852,0.613391,1852,0.613391,5.801836


### Features to Add

**User features**
* `user_total_orders`: total number of orders for the user
* `user_total_reorders`
* `user_pct_reorders`
* `user_avg_ippo`: user average items per order

In [17]:
# get num. total purchases for each user   
user_order_counts = (df_orders.groupby("user_id")
                     .agg({"order_number": "max"})
                     .rename(columns={"order_number": "user_total_orders"}))
                     
                     
user_order_counts.head(3)

Unnamed: 0_level_0,user_total_orders
user_id,Unnamed: 1_level_1
1,11
2,15
3,13


In [18]:
# add user_total_orders col to X

X = X.merge(user_order_counts, how='left', left_on='user_id', right_index=True)

del user_order_counts

X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_total_orders
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391,5.801836,33
1,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,"[24161, 6046, 49535, 23760, 43611, 21760, 3604...",0,1136,1852,0.613391,1852,0.613391,5.801836,6
2,1,777,1,"[16797, 43352, 6184]",0,"[33493, 16797, 43352, 17122, 29223]",0,1136,1852,0.613391,1852,0.613391,5.801836,27


In [19]:
# add total items per order
user_total_items = (df_prior.groupby('user_id', as_index=False)
 .agg({"product_id": "count"})
 .rename(columns={"product_id": "user_total_items_purchased"}))

user_total_items.head(3)

Unnamed: 0,user_id,user_total_items_purchased
0,1,59
1,2,195
2,3,88


In [20]:
user_num_prior_orders = (df_prior.groupby('user_id', as_index=False)
 .agg({"order_number": "max"})
 .rename(columns={"order_number": "user_num_prior_orders"}))

user_avg_ipos = user_num_prior_orders.merge(user_total_items, on='user_id')

user_avg_ipos['user_avg_ippo'] = user_avg_ipos.user_total_items_purchased / user_avg_ipos.user_num_prior_orders
user_avg_ipos

Unnamed: 0,user_id,user_num_prior_orders,user_total_items_purchased,user_avg_ippo
0,1,10,59,5.900000
1,2,14,195,13.928571
2,3,12,88,7.333333
3,4,5,18,3.600000
4,5,4,37,9.250000
...,...,...,...,...
206204,206205,3,32,10.666667
206205,206206,67,285,4.253731
206206,206207,16,223,13.937500
206207,206208,49,677,13.816327


In [21]:
X = X.merge(user_avg_ipos[['user_id', 'user_avg_ippo']], on='user_id')
X.drop(columns='user_total_orders', inplace=True)
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391,5.801836,4.625
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1123,2025,0.554568,2025,0.554568,3.653333,4.625
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1065,2610,0.408046,2610,0.408046,9.503448,4.625


In [24]:
X.columns

Index(['product_id', 'user_id', 'user_total_prod_orders', 'cart', 'in_cart',
       'last_cart', 'in_last_cart', 'qty_reordered', 'qty_sold',
       'prod_reorder_pct', 'prod_prior_sales', 'prod_pct_reorders',
       'prod_avg_atco', 'user_avg_ippo'],
      dtype='object')

In [25]:
len(df_prior.user_id.unique())

206209

In [26]:
len(df_train.user_id.unique())

131209

In [27]:
len(df_orders.user_id.unique())

206209

In [28]:
mask = df_orders.eval_set == 'test'
df_orders[mask].head(3)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
38,2774568,3,test,13,5,15,11.0
44,329954,4,test,6,3,12,30.0
53,1528013,6,test,4,3,16,22.0


In [29]:
mask = df_prior.user_id == 6
df_prior[mask].shape

(14, 12)

In [30]:
df_train.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered', 'user_id',
       'eval_set', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order'],
      dtype='object')

### More features...
* `days_since_prior_order`: already provided in df_train
* `user_avg_spacing`: average time between orders.

In [31]:
train_feats = (df_train.groupby('user_id', as_index=False)
               .agg({'days_since_prior_order': 'min',
                     'order_hour_of_day':'min'}))

X = X.merge(train_feats, on='user_id')
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,days_since_prior_order,order_hour_of_day
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391,5.801836,4.625,1.0,12
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1123,2025,0.554568,2025,0.554568,3.653333,4.625,1.0,12
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1065,2610,0.408046,2610,0.408046,9.503448,4.625,1.0,12


In [32]:
df_train.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,6129,24852,1,1,38907,train,7,1,14,30.0
1,6129,48364,2,1,38907,train,7,1,14,30.0


In [33]:
# add user_avg_spacing

train_merge_cols = ['user_id', 'days_since_prior_order']
prior_user_stats = (df_prior[train_merge_cols].groupby('user_id', as_index=False)
               .agg('mean'))

X = (X.merge(prior_user_stats, on='user_id', suffixes=[None, '_avg']))

(X.rename(columns={"days_since_prior_order_avg": "user_avg_spacing"},
          inplace=True))

X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,qty_reordered,qty_sold,prod_reorder_pct,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,days_since_prior_order,order_hour_of_day,user_avg_spacing
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1136,1852,0.613391,1852,0.613391,5.801836,4.625,1.0,12,10.4
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1123,2025,0.554568,2025,0.554568,3.653333,4.625,1.0,12,10.4
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1065,2610,0.408046,2610,0.408046,9.503448,4.625,1.0,12,10.4


In [34]:
# # check results
# X_tr, X_te, y_tr, y_te = split_users(X)

# lr = LogisticRegression(solver='lbfgs', max_iter=200)
# lr.fit(X_tr, y_tr)
# f1_score(lr.predict(X_te), y_te)

In [35]:
X.shape

(8474661, 17)

### And more features...

But first, 1switch to feature_engineering_3.ipynb1. We'll pickle our work so we can pick up where we left off.

In [37]:
X.to_pickle("pickle/X_17.pickle")