In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from splits import split_users # contains split_users func

# Raise the notebook display limits so the wide feature frame is shown in full
# instead of being truncated with "...".
for option_name, option_value in (
    ("display.max_columns", 101),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

In [2]:
# read in pickled DFs generated by query_dfs.py
# NOTE(review): unpickling can execute arbitrary code stored in the file, so these
# must only ever be pickles produced by this project's own query_dfs.py.
# Paths are relative to the notebook's working directory.

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [3]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
# X is the working feature frame; the cells below add columns to it and drop
# others before it is written back out as a new pickle.
X = pd.read_pickle("./pickle/X_25.pickle")

In [4]:
# List the feature columns as loaded, before anything is renamed, added or dropped.
X.columns

Index(['product_id', 'user_id', 'user_total_prod_orders', 'cart', 'in_cart',
       'last_cart', 'in_last_cart', 'prod_prior_sales', 'prod_pct_reorders',
       'prod_avg_atco', 'user_avg_ippo', 'days_since_prior_order',
       'order_hour_of_day', 'user_avg_spacing', 'streak', 'streak_-2.0',
       'streak_-3.0', 'streak_-4.0', 'streak_1.0', 'streak_2.0', 'streak_3.0',
       'streak_4.0', 'streak_5.0', 'streak_nan', 'up_buy_streak',
       'up_n5_n_buys', 'up_n5_buy_ratio', 'up_atco_sum', 'up_atco_avg'],
      dtype='object')

Add features:
* `user_avg_cart_size`: we already have this, just rename col (from 'user_avg_ippo')
* `prod_total_market_share`**\***: the fraction of all prior-order items that the product accounts for (stored as column `prod_total_mkt_share`)
* `prod_aisle_market_share`**\***: same as above, but by aisle
* `prod_dpt_market_share`**\***: same as above, but by department
* `purchased_earlier_today`: (boolean) whether or not the user already purchased the item earlier that day.

**\*** *also adding log feature for these*

In [5]:
# Baseline (rows, columns) of the feature frame before any new features are added.
X.shape

(8474661, 29)

In [6]:
# Peek at the first rows of the loaded feature frame.
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,days_since_prior_order,order_hour_of_day,user_avg_spacing,streak,streak_-2.0,streak_-3.0,streak_-4.0,streak_1.0,streak_2.0,streak_3.0,streak_4.0,streak_5.0,streak_nan,up_buy_streak,up_n5_n_buys,up_n5_buy_ratio,up_atco_sum,up_atco_avg
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,4.625,1.0,12,10.4,-2.0,1,0,0,0,0,0,0,0,0,0,2.0,0.4,6,3.0
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2025,0.554568,3.653333,4.625,1.0,12,10.4,,0,0,0,0,0,0,0,0,1,0,0.0,0.0,5,2.5
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2610,0.408046,9.503448,4.625,1.0,12,10.4,,0,0,0,0,0,0,0,0,1,0,0.0,0.0,5,5.0


In [7]:
# Peek at the train order lines to see which columns are available.
df_train.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,6129,24852,1,1,38907,train,7,1,14,30.0
1,6129,48364,2,1,38907,train,7,1,14,30.0
2,6129,21903,3,1,38907,train,7,1,14,30.0


In [8]:
# Peek at the prior order lines; these are the source of the sales counts below.
df_prior.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cart,in_cart
0,114,24954,1,0,91891,prior,1,0,11,,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",1
1,114,1688,2,0,91891,prior,1,0,11,,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",1


In [9]:
# Spot-check the feature rows of one arbitrary user (id 31628).
mask = X["user_id"].eq(31628)
X.loc[mask].head(10)


Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_ippo,days_since_prior_order,order_hour_of_day,user_avg_spacing,streak,streak_-2.0,streak_-3.0,streak_-4.0,streak_1.0,streak_2.0,streak_3.0,streak_4.0,streak_5.0,streak_nan,up_buy_streak,up_n5_n_buys,up_n5_buy_ratio,up_atco_sum,up_atco_avg
8474659,49070,31628,5,"[33000, 45608, 37496, 39441, 49070]",1,[49070],1,190,0.484211,9.189474,1.0,30.0,13,4.0,5.0,0,0,0,0,0,0,0,1,0,1,5.0,1.0,5,1.0


In [10]:
# The existing 'user_avg_ippo' column already holds the user's average cart
# size, so it only needs a more readable name.
column_renames = {"user_avg_ippo": "user_avg_cart_size"}
X.rename(columns=column_renames, inplace=True)

# Show one random row to confirm the new column name is in place.
X.sample()

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_cart_size,days_since_prior_order,order_hour_of_day,user_avg_spacing,streak,streak_-2.0,streak_-3.0,streak_-4.0,streak_1.0,streak_2.0,streak_3.0,streak_4.0,streak_5.0,streak_nan,up_buy_streak,up_n5_n_buys,up_n5_buy_ratio,up_atco_sum,up_atco_avg
5964178,27521,192513,2,"[2108, 24852, 21137, 79, 30776, 11408, 7021, 2...",0,"[11097, 24964, 31717, 26209, 35939, 13310, 847...",0,43067,0.695103,8.155177,11.714286,6.0,18,10.36802,,0,0,0,0,0,0,0,0,1,0,0.0,0.0,15,7.5


In [11]:
# Attach aisle/department metadata to every prior order line so that sales can
# be aggregated per aisle and per department in the cells below.
#
# The merge is guarded so that re-running this cell is a no-op: merging a second
# time would create suffixed duplicate columns (aisle_id_x / aisle_id_y, ...)
# and break the later groupbys on 'aisle_id' and 'department_id'.
if 'aisle_id' not in df_prior.columns:
    # validate='many_to_one' makes pandas raise a MergeError if df_prod_detail
    # ever contains a duplicated product_id, instead of silently duplicating
    # order lines and inflating every sales count derived from df_prior.
    df_prior = df_prior.merge(
        df_prod_detail,
        how='left',
        on='product_id',
        validate='many_to_one',
    )
df_prior.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,cart,in_cart,aisle_id,department_id,product_name,aisle,department
0,114,24954,1,0,91891,prior,1,0,11,,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",1,120,16,Total 0% with Honey Nonfat Greek Strained Yogurt,yogurt,dairy eggs
1,114,1688,2,0,91891,prior,1,0,11,,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",1,107,19,Sea Salted Reduced Fat Potato Chips,chips pretzels,snacks
2,114,37371,3,0,91891,prior,1,0,11,,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",1,89,13,Homestyle Ranch,salad dressing toppings,pantry


In [12]:
# add prod_market_share cols
#
# df_prior appears to hold one row per ordered item, so its length is used as
# the total number of items sold and the per-product row count as that
# product's sales.
total_sales = len(df_prior)

market_shares = (
    df_prior.groupby('product_id', as_index=False)['order_id']
    .count()
    .rename(columns={'order_id': 'prod_sales'})
)

# Share of all prior sales (a fraction in (0, 1], not a percentage) and its
# natural log.
share = market_shares['prod_sales'].to_numpy() / total_sales
market_shares['prod_total_mkt_share'] = share
market_shares['prod_total_mkt_share_log'] = np.log(share)

# Bring in aisle/department ids and names for the per-aisle and per-department
# shares computed in the following cells.
market_shares = market_shares.merge(df_prod_detail, how='left', on='product_id')

market_shares.head(3)

Unnamed: 0,product_id,prod_sales,prod_total_mkt_share,prod_total_mkt_share_log,aisle_id,department_id,product_name,aisle,department
0,1,1852,5.7e-05,-9.770711,61,19,Chocolate Sandwich Cookies,cookies cakes,snacks
1,2,90,3e-06,-12.794923,104,13,All-Seasons Salt,spices seasonings,pantry
2,3,277,9e-06,-11.670715,94,7,Robust Golden Unsweetened Oolong Tea,tea,beverages


In [13]:
# Total number of prior order lines per aisle (indexed by aisle_id).
aisle_shares = (
    df_prior.groupby('aisle_id')[['order_id']]
    .count()
    .rename(columns={"order_id": "aisle_total_sales"})
)
aisle_shares.head(3)

Unnamed: 0_level_0,aisle_total_sales
aisle_id,Unnamed: 1_level_1
1,71928
2,82491
3,456386


In [14]:
# Give every product the total sales of its aisle; 'aisle_id' is a column on the
# left and the index level name on the right.
market_shares = pd.merge(market_shares, aisle_shares, how='left', on='aisle_id')

In [15]:
# Total number of prior order lines per department (indexed by department_id).
dpt_shares = (
    df_prior.groupby('department_id')[['order_id']]
    .count()
    .rename(columns={"order_id": "dpt_total_sales"})
)
dpt_shares.head(5)

Unnamed: 0_level_0,dpt_total_sales
department_id,Unnamed: 1_level_1
1,2236432
2,36291
3,1176787
4,9479291
5,153696


In [16]:
# Give every product the total sales of its department; 'department_id' is a
# column on the left and the index level name on the right.
market_shares = pd.merge(market_shares, dpt_shares, how='left', on='department_id')

market_shares.head(3)

Unnamed: 0,product_id,prod_sales,prod_total_mkt_share,prod_total_mkt_share_log,aisle_id,department_id,product_name,aisle,department,aisle_total_sales,dpt_total_sales
0,1,1852,5.7e-05,-9.770711,61,19,Chocolate Sandwich Cookies,cookies cakes,snacks,234065,2887550
1,2,90,3e-06,-12.794923,104,13,All-Seasons Salt,spices seasonings,pantry,212092,1875577
2,3,277,9e-06,-11.670715,94,7,Robust Golden Unsweetened Oolong Tea,tea,beverages,249341,2690129


In [17]:
# Each product's share of the sales within its own aisle and department,
# followed by the natural log of each share.
prod_sales = market_shares['prod_sales']
aisle_share = prod_sales / market_shares['aisle_total_sales']
dpt_share = prod_sales / market_shares['dpt_total_sales']

market_shares['prod_aisle_mkt_share'] = aisle_share
market_shares['prod_dpt_mkt_share'] = dpt_share

market_shares['prod_aisle_mkt_share_log'] = np.log(aisle_share)
market_shares['prod_dpt_mkt_share_log'] = np.log(dpt_share)

market_shares.head(3)

Unnamed: 0,product_id,prod_sales,prod_total_mkt_share,prod_total_mkt_share_log,aisle_id,department_id,product_name,aisle,department,aisle_total_sales,dpt_total_sales,prod_aisle_mkt_share,prod_dpt_mkt_share,prod_aisle_mkt_share_log,prod_dpt_mkt_share_log
0,1,1852,5.7e-05,-9.770711,61,19,Chocolate Sandwich Cookies,cookies cakes,snacks,234065,2887550,0.007912,0.000641,-4.839333,-7.351898
1,2,90,3e-06,-12.794923,104,13,All-Seasons Salt,spices seasonings,pantry,212092,1875577,0.000424,4.8e-05,-7.764966,-9.944617
2,3,277,9e-06,-11.670715,94,7,Robust Golden Unsweetened Oolong Tea,tea,beverages,249341,2690129,0.001111,0.000103,-6.802559,-9.181082


In [18]:
# The raw order frames are not needed past this point; drop the references so
# their memory can be reclaimed before the large merge into X.
del df_prior, df_train, df_orders

In [19]:
# List the market-share columns to choose which ones to carry over into X.
market_shares.columns

Index(['product_id', 'prod_sales', 'prod_total_mkt_share',
       'prod_total_mkt_share_log', 'aisle_id', 'department_id', 'product_name',
       'aisle', 'department', 'aisle_total_sales', 'dpt_total_sales',
       'prod_aisle_mkt_share', 'prod_dpt_mkt_share',
       'prod_aisle_mkt_share_log', 'prod_dpt_mkt_share_log'],
      dtype='object')

In [20]:
# One row per product that has at least one prior sale.
market_shares.shape

(49677, 15)

In [21]:
# Shape before merging in the market-share columns; the row count should be
# unchanged after the merge.
X.shape

(8474661, 29)

In [22]:
# Market-share columns to carry over into the feature frame, plus the
# 'product_id' join key.
cols = ['product_id', 'prod_total_mkt_share', 'prod_total_mkt_share_log', 'aisle_total_sales',
        'prod_aisle_mkt_share', 'prod_aisle_mkt_share_log', 'dpt_total_sales',
        'prod_dpt_mkt_share', 'prod_dpt_mkt_share_log']

# Guarded so that re-running this cell does not merge a second time, which would
# silently add suffixed *_x / *_y copies of every market-share column.
if 'prod_total_mkt_share' not in X.columns:
    # validate='many_to_one' makes pandas raise a MergeError if market_shares
    # has a duplicated product_id, which would otherwise silently multiply rows
    # of X. With a left join and unique right keys the row count is preserved.
    X = X.merge(market_shares[cols], how='left', on='product_id',
                validate='many_to_one')
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_cart_size,days_since_prior_order,order_hour_of_day,user_avg_spacing,streak,streak_-2.0,streak_-3.0,streak_-4.0,streak_1.0,streak_2.0,streak_3.0,streak_4.0,streak_5.0,streak_nan,up_buy_streak,up_n5_n_buys,up_n5_buy_ratio,up_atco_sum,up_atco_avg,prod_total_mkt_share,prod_total_mkt_share_log,aisle_total_sales,prod_aisle_mkt_share,prod_aisle_mkt_share_log,dpt_total_sales,prod_dpt_mkt_share,prod_dpt_mkt_share_log
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,4.625,1.0,12,10.4,-2.0,1,0,0,0,0,0,0,0,0,0,2.0,0.4,6,3.0,5.7e-05,-9.770711,234065,0.007912,-4.839333,2887550,0.000641,-7.351898
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2025,0.554568,3.653333,4.625,1.0,12,10.4,,0,0,0,0,0,0,0,0,1,0,0.0,0.0,5,2.5,6.2e-05,-9.681408,305655,0.006625,-5.016887,708931,0.002856,-5.858189
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2610,0.408046,9.503448,4.625,1.0,12,10.4,,0,0,0,0,0,0,0,0,1,0,0.0,0.0,5,5.0,8e-05,-9.427627,17368,0.150276,-1.895279,34573,0.075492,-2.583723


In [23]:
# The needed columns now live in X, so the per-product table can be released.
del market_shares

In [24]:
# Confirm the market-share columns were appended to the feature frame.
X.columns

Index(['product_id', 'user_id', 'user_total_prod_orders', 'cart', 'in_cart',
       'last_cart', 'in_last_cart', 'prod_prior_sales', 'prod_pct_reorders',
       'prod_avg_atco', 'user_avg_cart_size', 'days_since_prior_order',
       'order_hour_of_day', 'user_avg_spacing', 'streak', 'streak_-2.0',
       'streak_-3.0', 'streak_-4.0', 'streak_1.0', 'streak_2.0', 'streak_3.0',
       'streak_4.0', 'streak_5.0', 'streak_nan', 'up_buy_streak',
       'up_n5_n_buys', 'up_n5_buy_ratio', 'up_atco_sum', 'up_atco_avg',
       'prod_total_mkt_share', 'prod_total_mkt_share_log', 'aisle_total_sales',
       'prod_aisle_mkt_share', 'prod_aisle_mkt_share_log', 'dpt_total_sales',
       'prod_dpt_mkt_share', 'prod_dpt_mkt_share_log'],
      dtype='object')

In [25]:
# The row count should equal the pre-merge value; only the column count grows.
X.shape

(8474661, 37)

In [26]:
# Collect the one-hot streak dummy columns, i.e. names containing 'streak' and
# ending in '.0' (such as 'streak_-2.0' or 'streak_5.0'). The plain 'streak',
# 'streak_nan' and 'up_buy_streak' columns do not end in '.0' and are kept.
# `re` is imported in the notebook's top import cell rather than here, so that
# all dependencies are declared in one place.
drop_cols = [col for col in X.columns if re.search(r"streak.*\.0$", col)]

In [27]:
# Check the regex matched only the streak dummy columns before dropping them.
drop_cols

['streak_-2.0',
 'streak_-3.0',
 'streak_-4.0',
 'streak_1.0',
 'streak_2.0',
 'streak_3.0',
 'streak_4.0',
 'streak_5.0']

In [28]:
# Remove the streak dummy columns collected in drop_cols from the feature frame.
X = X.drop(columns=drop_cols)

In [29]:
# Confirm the column count fell by the number of dropped dummy columns.
X.shape

(8474661, 29)

In [30]:
# Make 'streak' numeric (missing values stay NaN) and add its magnitude as a
# separate feature.
streak_values = X['streak'].astype(float)
X['streak'] = streak_values
X['streak_abs'] = streak_values.abs()

In [31]:
# Final column list of the feature frame that is written out below.
X.columns

Index(['product_id', 'user_id', 'user_total_prod_orders', 'cart', 'in_cart',
       'last_cart', 'in_last_cart', 'prod_prior_sales', 'prod_pct_reorders',
       'prod_avg_atco', 'user_avg_cart_size', 'days_since_prior_order',
       'order_hour_of_day', 'user_avg_spacing', 'streak', 'streak_nan',
       'up_buy_streak', 'up_n5_n_buys', 'up_n5_buy_ratio', 'up_atco_sum',
       'up_atco_avg', 'prod_total_mkt_share', 'prod_total_mkt_share_log',
       'aisle_total_sales', 'prod_aisle_mkt_share', 'prod_aisle_mkt_share_log',
       'dpt_total_sales', 'prod_dpt_mkt_share', 'prod_dpt_mkt_share_log',
       'streak_abs'],
      dtype='object')

In [32]:
# Last visual check of the feature frame before it is saved.
X.head(3)

Unnamed: 0,product_id,user_id,user_total_prod_orders,cart,in_cart,last_cart,in_last_cart,prod_prior_sales,prod_pct_reorders,prod_avg_atco,user_avg_cart_size,days_since_prior_order,order_hour_of_day,user_avg_spacing,streak,streak_nan,up_buy_streak,up_n5_n_buys,up_n5_buy_ratio,up_atco_sum,up_atco_avg,prod_total_mkt_share,prod_total_mkt_share_log,aisle_total_sales,prod_aisle_mkt_share,prod_aisle_mkt_share_log,dpt_total_sales,prod_dpt_mkt_share,prod_dpt_mkt_share_log,streak_abs
0,1,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,1852,0.613391,5.801836,4.625,1.0,12,10.4,-2.0,0,0,2.0,0.4,6,3.0,5.7e-05,-9.770711,234065,0.007912,-4.839333,2887550,0.000641,-7.351898,2.0
1,907,138,2,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2025,0.554568,3.653333,4.625,1.0,12,10.4,,1,0,0.0,0.0,5,2.5,6.2e-05,-9.681408,305655,0.006625,-5.016887,708931,0.002856,-5.858189,
2,1000,138,1,[42475],0,"[46802, 22128, 40199, 21573, 26152, 12341]",0,2610,0.408046,9.503448,4.625,1.0,12,10.4,,1,0,0.0,0.0,5,5.0,8e-05,-9.427627,17368,0.150276,-1.895279,34573,0.075492,-2.583723,


In [33]:
# Persist the finished feature frame. The path is relative to the working
# directory, i.e. the same ./pickle folder the inputs were read from.
# NOTE(review): X still contains the list-valued 'cart' and 'last_cart' columns
# at this point; confirm downstream consumers expect them.
X.to_pickle("pickle/X_F.pickle")

In [34]:
# Inspect 'streak' after the float cast; rows with no streak value remain NaN.
X.streak

0         -2.0
1          NaN
2          NaN
3          NaN
4          NaN
          ... 
8474656    3.0
8474657   -1.0
8474658    5.0
8474659    5.0
8474660    4.0
Name: streak, Length: 8474661, dtype: float64