In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from splits import split_users # contains split_users func

# Raise pandas' display limits: show up to 101 columns and 100 rows before truncating.
for option_name, limit in (("display.max_columns", 101), ("display.max_rows", 100)):
    pd.set_option(option_name, limit)

In [33]:
# Load the four DataFrames that query_dfs.py pickled.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you generated yourself.
pickle_paths = [
    "./pickle/df_orders.pickle",
    "./pickle/df_train.pickle",
    "./pickle/df_prior.pickle",
    "./pickle/df_prod_detail.pickle",
]
df_orders, df_train, df_prior, df_prod_detail = map(pd.read_pickle, pickle_paths)

In [34]:
# Load the feature DataFrame that feature_engineering_1.ipynb pickled.
features_path = "./pickle/X_15.pickle"
X = pd.read_pickle(features_path)

In [35]:
# Preview two randomly chosen rows; the fixed seed makes the displayed rows
# identical on every Restart & Run All instead of changing with each execution.
X.sample(2, random_state=0)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,user_avg_spacing,user_avg_time,order_hour_of_day_avg,days_since_prior_order_avg
5561487,25035,29671,1,"[24407, 2512, 3689, 24114, 16878, 3285, 43262,...",0,"[22610, 7736, 38102, 24492, 23048, 20378, 2907...",0,195,0.461538,10.625641,20.454545,2.0,7,7.435556,2.970732
6871506,20995,145460,2,"[27845, 26165, 13176, 17949, 31343, 39877, 445...",0,"[27845, 13176, 13848, 1559, 22124, 17949, 2616...",0,32887,0.651595,8.869949,20.142857,30.0,22,15.624113,18.008696


### Picking up where we left off...

**More. Features.**

* `streak` (stored in the `order_streak` column): how many times in a row has this user ordered this item?
    * This feature is very computationally expensive to compute.
    * Multiprocessing was used to bring the run time down to ~5 minutes.
    * See `add_streaks.py` for details.
    


In [36]:
# Load the per-(user_id, product_id) order streaks from CSV (see `add_streaks.py`).
df_streaks = pd.read_csv("../data/order_streaks.csv")

# The streak column keeps its CSV name, `order_streak`.
# `DataFrame.rename` returns a new frame rather than modifying in place, so the bare
# `df_streaks.rename(columns={"order_streak": "streak"})` call that used to sit here
# never changed anything. It has been removed instead of assigned because the
# notebook's displayed output and narration refer to the column as `order_streak`.
df_streaks.head(3)

Unnamed: 0,user_id,product_id,order_streak
0,1,196,5.0
1,1,12427,5.0
2,1,10258,5.0
3,1,25133,5.0
4,1,13032,1.0
5,1,46149,3.0
6,1,49235,-1.0
7,1,39657,1.0
8,1,38928,1.0
9,1,35951,1.0


Negative streaks indicate the number of weeks since the user last ordered the given product.

In [60]:
# Attach each (user_id, product_id) pair's streak to the feature frame.
# how="left" keeps every row of X; pairs with no match in df_streaks get NaN.
# Dropping any existing `order_streak` column first makes this cell idempotent:
# re-running a bare `X = X.merge(...)` would otherwise add suffixed duplicates
# (`order_streak_x` / `order_streak_y`).
X = X.drop(columns=["order_streak"], errors="ignore").merge(
    df_streaks, how="left", on=["user_id", "product_id"]
)

# The merged column keeps the name `order_streak`. `DataFrame.rename` returns a new
# frame, so the bare `X.rename(columns={"order_streak": "streak"})` call that used to
# sit here only copied X and threw the copy away; it has been removed.
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,user_avg_spacing,user_avg_time,order_hour_of_day_avg,days_since_prior_order_avg,order_streak
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,4.625,1.0,12,12.689189,10.4,-2.0
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2025,0.554568,3.653333,4.625,1.0,12,12.689189,10.4,
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2610,0.408046,9.503448,4.625,1.0,12,12.689189,10.4,


Awesome! Now we have an `order_streak` column. However, if the user has never purchased the given item in the past, the resulting value is `NaN`. Because negative streaks are meaningful values that may help our model, we can't simply replace those `NaN`s with zeros: a zero would look like just another point on the streak scale. Instead, let's **bin** this feature into separate categories.