In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 101)

### Picking up where we left off...

In [2]:
# read in pickled DFs generated by query_dfs.py

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [77]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
X = pd.read_pickle("./pickle/X_7.pickle")

In [78]:
X.shape


(8474661, 5)

In [79]:
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart
0,1,138,2,[42475],0
1,907,138,2,[42475],0
2,1000,138,1,[42475],0


In [80]:
df_orders.sample()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1020898,1912914,61375,prior,16,3,20,5.0


In [81]:
len(df_train)

1384617

In [82]:
mask = df_orders.eval_set == 'train'
len(df_orders[mask].order_id.unique())

131209

### Features to Add:

Product features
* `prod_pct_reordered`: product percent of prior transactions that were reorders
* `prod_rank` rank in terms of products sold
* some combination of the above, perhaps?

User features
* user num orders
* average items per order

#### Product Features

Let's start with prod_prior_sales

In [83]:
prod_sales = (df_prior.groupby('product_id')
              .agg({"product_id": "count"})['product_id']
              .sort_values(ascending=False))

prod_sales

product_id
24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
45893         1
13397         1
30451         1
42464         1
25248         1
Name: product_id, Length: 49677, dtype: int64

In [84]:
# get total sales for each product & add to X

X = (X.merge(prod_sales, left_on='product_id', right_index=True)
    .drop(columns="product_id_x"))
    
X.rename(columns={"product_id_y": "prod_prior_sales"}, inplace=True)
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales
0,1,138,2,[42475],0,1852
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852
190,1,777,1,"[16797, 43352, 6184]",0,1852


In [85]:
X.shape

(8474661, 6)

In [86]:
# add prod_pct_reordered col
reorders = df_prior.groupby("product_id").agg({"reordered": "sum"})
reorders.head(3)

Unnamed: 0_level_0,reordered
product_id,Unnamed: 1_level_1
1,1136
2,12
3,203


In [87]:
X = (X.merge(reorders, left_on='product_id', right_index=True)
     .rename(columns={"reordered": "prod_prior_reorders"}))
del reorders
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders
0,1,138,2,[42475],0,1852,1136
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136


In [88]:
X["prod_pct_reorders"] = X.prod_prior_reorders / X.prod_prior_sales
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders
0,1,138,2,[42475],0,1852,1136,0.613391
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136,0.613391
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136,0.613391


In [89]:
X.shape

(8474661, 8)

#### User Features

In [90]:
user_order_counts = (df_orders.groupby("user_id")
                     .agg({"order_number": "max"})
                     .rename(columns={"order_number": "user_total_orders"}))
                     
                     
user_order_counts.head(3)

Unnamed: 0_level_0,user_total_orders
user_id,Unnamed: 1_level_1
1,11
2,15
3,13


In [91]:
X.shape

(8474661, 8)

In [92]:
# add user_total_orders col

X = X.merge(user_order_counts, how='left', left_on='user_id', right_index=True)

del user_order_counts

X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders,user_total_orders
0,1,138,2,[42475],0,1852,1136,0.613391,33
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136,0.613391,6
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136,0.613391,27
