In [169]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings
from sklearn.decomposition import PCA
warnings.filterwarnings("ignore")

In [2]:
df_prior = pickle.load(open("data_pickle/prior_data.pkl",'rb'))

In [3]:
df_prior.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
8,2,17794,6,1,Carrots,83,4,fresh vegetables,produce,202279,prior,3,5,9,8.0
9,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry,202279,prior,3,5,9,8.0
10,2,1819,8,1,All Natural No Stir Creamy Almond Butter,88,13,spreads,pantry,202279,prior,3,5,9,8.0


<h2> dow reorder rate </h2>

In [18]:
# https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/38126
temp = df_prior.groupby('order_dow')['reordered'].agg('sum').reset_index()

In [19]:
temp["dow_rate"] = (temp["reordered"]/df_prior["reordered"].sum())

In [21]:
temp = temp.drop("reordered",axis=1)

In [24]:
temp

Unnamed: 0,order_dow,dow_rate
0,0,0.190017
1,1,0.178876
2,2,0.130057
3,3,0.117831
4,4,0.117019
5,5,0.131056
6,6,0.135144


In [22]:
%%time
pickle.dump(temp,open("new_features/dow_rate.pkl","wb"))

Wall time: 998 µs


<h2> order_hour_of_day rate </h2>

In [25]:
%%time
# similarly we can do for hour of day
temp = df_prior.groupby('order_hour_of_day')['reordered'].agg('sum').reset_index()
temp["order_hour_of_day_rate"] = (temp["reordered"]/df_prior["reordered"].sum())
temp = temp.drop("reordered",axis=1)
pickle.dump(temp,open("new_features/order_hour_of_day_rate.pkl","wb"))

Wall time: 866 ms


In [26]:
temp

Unnamed: 0,order_hour_of_day,order_hour_of_day_rate
0,0,0.006473
1,1,0.003373
2,2,0.002013
3,3,0.001501
4,4,0.001591
5,5,0.002799
6,6,0.009677
7,7,0.030054
8,8,0.056823
9,9,0.079565


<h2> days_since_prior_order rate </h2>

In [27]:
%%time
# similarly we can do days since prior order rate
temp = df_prior.groupby('days_since_prior_order')['reordered'].agg('sum').reset_index()
temp["days_since_prior_order_rate"] = (temp["reordered"]/df_prior["reordered"].sum())
temp = temp.drop("reordered",axis=1)
pickle.dump(temp,open("new_features/days_since_prior_order_rate.pkl","wb"))

Wall time: 1.12 s


In [28]:
temp

Unnamed: 0,days_since_prior_order,days_since_prior_order_rate
0,0.0,0.017331
1,1.0,0.03039
2,2.0,0.049433
3,3.0,0.065646
4,4.0,0.073704
5,5.0,0.074935
6,6.0,0.089204
7,7.0,0.126195
8,8.0,0.067771
9,9.0,0.041491


<h2> getting product features </h2>

In [4]:
%%time
product_features = pd.DataFrame(columns=['product_id'])
product_features['product_id'] = df_prior['product_id'].sort_values().unique()

Wall time: 10.1 s


In [94]:
%%time
groupped_data = df_prior.groupby(['product_id', 'reordered']).agg({'reordered': 'count'})
groupped_data["%"] = groupped_data.groupby(level=0).apply(lambda x:  100*x / x.sum())
groupped_data.head(3)

Wall time: 51.1 s


Unnamed: 0_level_0,Unnamed: 1_level_0,reordered,%
product_id,reordered,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,716,38.660907
1,1,1136,61.339093
2,0,78,86.666667


<h4> product reorder rate </h4>

In [5]:
%%time
# https://towardsdatascience.com/4-useful-tips-of-pandas-groupby-3744eefb1852
df = pd.DataFrame({'reorder_rate': df_prior.groupby(['product_id','reordered'])['reordered'].count().groupby(level=0).apply(lambda x: x / float(x.sum()))}).reset_index()

Wall time: 52.8 s


In [6]:
df.head(3)

Unnamed: 0,product_id,reordered,reorder_rate
0,1,0,0.386609
1,1,1,0.613391
2,2,0,0.866667


In [7]:
#get data of reordered products only
new_df = df[df['reordered']==1]
new_df['reorder_rate'] = new_df['reorder_rate'] * new_df['reordered']
#handling for products which were never reordered, hence reorder_rate = 0.0
new_df_1 = df[(df['reordered']==0) & (df['reorder_rate']==float(1.0))]
new_df_1['reorder_rate'] = new_df_1['reorder_rate'] * new_df_1['reordered']
new_df = new_df.append(new_df_1)
new_df.drop('reordered', axis = 1, inplace = True)
new_df.sort_values(by='product_id', inplace =  True)   
new_df = new_df.reset_index(drop = True)
#add to feat_1 of product_features dataframe
product_features['product_reorder_rate'] = new_df['reorder_rate']

In [9]:
#get mean position of product in the cart, sort by product_name and add to feat_2 of product_features
mean_position = df_prior.groupby('product_id')['add_to_cart_order'].mean().reset_index(name = 'avg_position')
mean_position.sort_values(by = 'product_id', inplace = True)
product_features['avg_pos_incart'] = mean_position['avg_position']

In [10]:
%%time
#Get aisle reorder rate
df = df_prior.groupby(['aisle']).size().reset_index(name='order_count')
aisle_reorder_rate = df_prior[df_prior['reordered']==1].groupby(['aisle']).size().reset_index(name='reorder_rate')
df['aisle_reorder_rate'] = aisle_reorder_rate['reorder_rate']/df['order_count']
df.drop(['order_count'], axis = 1, inplace = True)
new_df = pd.merge(df_prior, df, on = 'aisle')

Wall time: 1min 20s


In [13]:
%%time
#Get dept reorder rate
df = df_prior.groupby(['department']).size().reset_index(name='order_count')
aisle_reorder_rate = df_prior[df_prior['reordered']==1].groupby(['department']).size().reset_index(name='reorder_rate')
df['dept_reorder_rate'] = aisle_reorder_rate['reorder_rate']/df['order_count']
df.drop(['order_count'], axis = 1, inplace = True)
new_df = pd.merge(new_df, df, on = 'department')

Wall time: 52 s


In [14]:
new_df = new_df[['product_id','aisle_id','department_id','aisle_reorder_rate','dept_reorder_rate']]
new_df.drop_duplicates(keep='first', inplace = True)

In [15]:
%%time
#merge dept_reorder_rate and aisle_reorder_rate to existing product features
product_features = pd.merge(product_features, new_df , on='product_id', how = 'inner')

Wall time: 261 ms


In [17]:
product_features.head(3)

Unnamed: 0,product_id,product_reorder_rate,avg_pos_incart,aisle_id,department_id,aisle_reorder_rate,dept_reorder_rate
0,1,0.613391,5.801836,61,19,0.548698,0.57418
1,2,0.133333,9.888889,104,13,0.152391,0.346721
2,3,0.732852,6.415162,94,7,0.527615,0.65346


In [18]:
%%time
pickle.dump(product_features,open("new_features/product_features.pkl","wb"))

Wall time: 22.6 ms


<h2> USER featurers </h2>

In [48]:
user_features = pd.DataFrame(columns=['user_id'])
user_features['user_id'] = df_prior['user_id'].sort_values().unique()

In [37]:
user_features.shape

(206209, 2)

In [43]:
%%time
# user reorder rate
user_reorder_rate = df_prior.groupby(["user_id","reordered"])['reordered'].count().groupby(level = 0).apply(lambda x: x / float(x.sum())).reset_index(name='reorder_rate')

Wall time: 3min 45s


In [44]:
user_reorder_rate.shape

(409373, 3)

In [45]:
user_reorder_rate = user_reorder_rate[user_reorder_rate["reordered"]==1]

In [49]:
user_features = user_features.merge(user_reorder_rate,on="user_id",how = "left").fillna(0)

In [53]:
user_features.head(3)

Unnamed: 0,user_id,reordered,reorder_rate
0,1,1.0,0.694915
1,2,1.0,0.476923
2,3,1.0,0.625


In [58]:
user_features.shape

(206209, 3)

In [56]:
%%time
# this feature is inspired by https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/38100
# no of unique products by user
temp = df_prior.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique_products')

Wall time: 1min 40s


In [59]:
user_features = user_features.merge(temp,on="user_id",how="left")

In [65]:
%%time
temp = df_prior.groupby(["user_id"])['product_name'].size().reset_index(name = 'user_total_products')

Wall time: 1.74 s


In [67]:
user_features = user_features.merge(temp,on="user_id",how="left")

In [69]:
user_features.head(5)

Unnamed: 0,user_id,reordered,reorder_rate,unique_products,user_total_products
0,1,1.0,0.694915,18,59
1,2,1.0,0.476923,102,195
2,3,1.0,0.625,33,88
3,4,1.0,0.055556,17,18
4,5,1.0,0.378378,23,37


In [72]:
%%time
# avg cart size
# 
temp = df_prior.groupby(["user_id","order_id"])['add_to_cart_order'].count().reset_index(name='avg_cart_size')
temp_1 = temp.groupby('user_id')['avg_cart_size'].mean().reset_index()

Wall time: 8.19 s


In [74]:
user_features = user_features.merge(temp_1,on="user_id",how="left")

In [76]:
%%time
# https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/38100
#Get average days between 2 orders for every user
temp = df_prior.groupby(["user_id","order_id"])['days_since_prior_order'].max().reset_index(name='user_avg_days_between_orders')
temp_1 = temp.groupby('user_id')['user_avg_days_between_orders'].mean().reset_index()

Wall time: 9.83 s


In [77]:
user_features = user_features.merge(temp_1,on="user_id",how="left")

In [78]:
user_features.head(5)

Unnamed: 0,user_id,reordered,reorder_rate,unique_products,user_total_products,avg_cart_size,user_avg_days_between_orders
0,1,1.0,0.694915,18,59,5.9,17.6
1,2,1.0,0.476923,102,195,13.928571,14.142857
2,3,1.0,0.625,33,88,7.333333,11.083333
3,4,1.0,0.055556,17,18,3.6,11.0
4,5,1.0,0.378378,23,37,9.25,10.0


In [80]:
# https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/38100
#get user product reorder ratio 
# 
    # number of unique products reordered / number of unique products ordered
df = pd.DataFrame()
df['user_id'] = df_prior['user_id'].sort_values().unique()
df['user_unique_products'] = df_prior.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique')['unique']
df['user_reordered_products'] = df_prior[df_prior['reordered']==1].groupby(["user_id"])['product_name'].nunique().reset_index(name = 'reordered_unique')['reordered_unique']
df.fillna(0, inplace = True)

In [82]:
df["user_reordered_products_ratio"] = df['user_reordered_products'] / df['user_unique_products']

In [84]:
%%time
user_features = user_features.merge(df,on="user_id",how="left")
pickle.dump(user_features,open("new_features/user_features.pkl","wb"))

Wall time: 168 ms


<h2> user product features </h2>

In [95]:
%%time
user_product_features = pd.DataFrame(columns=['user_id','product_id'])

Wall time: 0 ns


In [96]:
%%time
temp = df_prior.groupby(["user_id","product_id"]).size().reset_index()
user_product_features["user_id"] = temp["user_id"]
user_product_features["product_id"] = temp["product_id"]

Wall time: 22.4 s


In [97]:
user_product_features.shape

(13307953, 2)

In [98]:
%%time
# frequently user ordered the product
#times user ordered the product/ #times user placed an order
temp = df_prior.groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_prior.groupby(["user_id"]).size()).reset_index(name = 'user_product_order_rate')


Wall time: 21.5 s


In [99]:
%%time
user_product_features = user_product_features.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 13.3 s


In [100]:
user_product_features

Unnamed: 0,user_id,product_id,user_product_order_rate
0,1,196,0.169492
1,1,10258,0.152542
2,1,10326,0.016949
3,1,12427,0.169492
4,1,13032,0.050847
...,...,...,...
13307948,206209,43961,0.023256
13307949,206209,44325,0.007752
13307950,206209,48370,0.007752
13307951,206209,48697,0.007752


In [102]:
%%time
# How frequently user reordered the product
#times user reordered the product/ #times user ordered the product
temp = df_prior[df_prior["reordered"]==1].groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_prior.groupby(["user_id","product_id"]).size()).reset_index(name = 'user_product_reorder_rate')

Wall time: 1min 9s


In [104]:
%%time
user_product_features = user_product_features.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 13.6 s


In [105]:
%%time
# Average position of product in the cart on orders placed by user
temp = df_prior.groupby(["user_id","product_id"])['add_to_cart_order'].mean().reset_index(name = 'user_product_avg_position')

Wall time: 20.9 s


In [107]:
%%time
user_product_features = user_product_features.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 13.7 s


In [108]:
%%time
#Number of orders placed since the product was last ordered ?
# Get last order_number placed by user , subtract with last order_number with the product in cart 

temp_1 = df_prior.groupby(["user_id","product_id"])['order_number'].max().reset_index()
temp_2 = df_prior.groupby(["user_id"])['order_number'].max().reset_index()
new_df = pd.merge(temp_1, temp_2,  how='outer', left_on=['user_id'], right_on = ['user_id'])        
new_df['user_product_orders_since_last'] = new_df['order_number_y'] - new_df['order_number_x']

Wall time: 28.3 s


In [110]:
new_df = new_df.drop(["order_number_x","order_number_y"],axis=1)
user_product_features = user_product_features.merge(new_df,on=["user_id","product_id"],how="left").fillna(0)

In [113]:
pickle.dump(user_product_features,open("new_features/user_product_features.pkl","wb"))

In [5]:
%%time
# day_reorder_rate
temp = df_prior.groupby(['product_id','order_hour_of_day'])["reordered"].size()
# temp = (df/df_prior.groupby(["product_id"]).size()).reset_index(name = 'hour_reorder_rate')

Wall time: 7.65 s


In [6]:
temp = temp/df_prior.groupby(["product_id"]).size()

In [7]:
temp = temp.reset_index(name = 'hour_reorder_rate')

In [8]:
pickle.dump(temp,open("new_features/hour_reorder_rate.pkl","wb"))

In [4]:
%%time
# product_days_since_prior_order_reorder_rate
temp = df_prior.groupby(['product_id','days_since_prior_order'])["reordered"].size()
temp = (temp/df_prior.groupby(["product_id"]).size())
temp = temp.reset_index(name = 'product_days_since_prior_order_reorder_rate')
pickle.dump(temp,open("new_features/product_days_since_prior_order_reorder_rate.pkl","wb"))

Wall time: 9.93 s


In [9]:
%%time
# product_days_since_prior_order_reorder_rate
temp = df_prior.groupby(['product_id','order_dow'])["reordered"].size()
temp = (temp/df_prior.groupby(["product_id"]).size())
temp = temp.reset_index(name = 'day_reorder_rate')
pickle.dump(temp,open("new_features/day_reorder_rate.pkl","wb"))

Wall time: 9.03 s


In [10]:
%%time
# product_days_since_prior_order_reorder_rate
temp = df_prior.groupby(['user_id','days_since_prior_order'])["reordered"].size()
temp = (temp/df_prior.groupby(["user_id"]).size())
temp = temp.reset_index(name = 'user_days_since_prior_order_reorder_rate')
pickle.dump(temp,open("new_features/user_days_since_prior_order_reorder_rate.pkl","wb"))

Wall time: 8.21 s


In [11]:
%%time
# product_days_since_prior_order_reorder_rate
temp = df_prior.groupby(['user_id',"product_id",'days_since_prior_order'])["reordered"].size()
temp = (temp/df_prior.groupby(["user_id","product_id"]).size())
temp = temp.reset_index(name = 'days_since_prior_reorder_rate')
pickle.dump(temp,open("new_features/days_since_prior_reorder_rate.pkl","wb"))

Wall time: 25min 26s


<h2> user  features for first 15 days of the month </h2>

In [None]:
# https://www.kaggle.com/c/instacart-market-basket-analysis/discussion/38098

In [67]:
df_f15 = df_prior[df_prior["days_since_prior_order"] <=15]

In [68]:
%%time
user_features_f15 = pd.DataFrame(columns=['user_id'])
user_features_f15['user_id'] = df_f15['user_id'].sort_values().unique()

Wall time: 5.93 s


In [69]:
%%time
# user reorder rate
user_reorder_rate = df_f15.groupby(["user_id","reordered"])['reordered'].count().groupby(level = 0).apply(lambda x: x / float(x.sum())).reset_index(name='reorder_rate_f15')

Wall time: 3min 40s


In [70]:
user_reorder_rate = user_reorder_rate[user_reorder_rate["reordered"]==1]

In [71]:
user_reorder_rate = user_reorder_rate.drop("reordered",axis=1)

In [72]:
user_features_f15 = user_features_f15.merge(user_reorder_rate,on="user_id",how = "left").fillna(0)

In [73]:
%%time
# no of unique products and total products  by user
temp = df_f15.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique_products_f15')
user_features_f15 = user_features_f15.merge(temp,on="user_id",how="left")

temp = df_f15.groupby(["user_id"])['product_name'].size().reset_index(name = 'user_total_products_f15')
user_features_f15 = user_features_f15.merge(temp,on="user_id",how="left")


Wall time: 1min 17s


In [74]:
%%time
# avg cart size
temp = df_f15.groupby(["user_id","order_id"])['add_to_cart_order'].count().reset_index(name='avg_cart_size_f15')
temp_1 = temp.groupby('user_id')['avg_cart_size_f15'].mean().reset_index()

Wall time: 6.57 s


In [75]:
user_features_f15 = user_features_f15.merge(temp_1,on="user_id",how="left")

In [76]:
%%time
#Get average days between 2 orders for every user
temp = df_f15.groupby(["user_id","order_id"])['days_since_prior_order'].max().reset_index(name='user_avg_days_between_orders_f15')
temp_1 = temp.groupby('user_id')['user_avg_days_between_orders_f15'].mean().reset_index()

Wall time: 7.21 s


In [77]:
%%time
user_features_f15 = user_features_f15.merge(temp_1,on="user_id",how="left")

Wall time: 148 ms


In [78]:
%%time
#get user product reorder ratio 
    # number of unique products reordered / number of unique products ordered
df = pd.DataFrame()
df['user_id'] = df_f15['user_id'].sort_values().unique()
df['user_unique_products'] = df_f15.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique')['unique']
df['user_reordered_products'] = df_f15[df_prior['reordered']==1].groupby(["user_id"])['product_name'].nunique().reset_index(name = 'reordered_unique')['reordered_unique']
df.fillna(0, inplace = True)

Wall time: 2min 17s


In [79]:
df["user_reordered_products_ratio_f15"] = df['user_reordered_products'] / df['user_unique_products']

In [80]:
%%time
user_features_f15 = user_features_f15.merge(df,on="user_id",how="left")
pickle.dump(user_features_f15,open("new_features/user_features_f15.pkl","wb"))

Wall time: 284 ms


In [87]:
user_features_f15.columns

Index(['user_id', 'reorder_rate_f15', 'unique_products_f15',
       'user_total_products_f15', 'avg_cart_size_f15',
       'user_avg_days_between_orders_f15', 'user_unique_products_ordered_f15',
       'user_reordered_products_f15', 'user_reordered_products_ratio_f15'],
      dtype='object')

In [86]:
user_features_f15 = user_features_f15.rename(columns={"user_unique_products":"user_unique_products_ordered_f15","user_reordered_products":"user_reordered_products_f15"})

In [90]:
user_features_f15 = user_features_f15.drop("user_unique_products_ordered_f15",axis=1)

In [91]:
pickle.dump(user_features_f15,open("new_features/user_features_f15.pkl","wb"))

In [92]:
user_features_f15.head(5)

Unnamed: 0,user_id,reorder_rate_f15,unique_products_f15,user_total_products_f15,avg_cart_size_f15,user_avg_days_between_orders_f15,user_reordered_products_f15,user_reordered_products_ratio_f15
0,1,0.565217,11,23,5.75,7.25,7.0,0.636364
1,2,0.448529,77,136,13.6,8.3,31.0,0.402597
2,3,0.647887,29,71,7.888889,8.333333,19.0,0.655172
3,4,0.0,9,9,3.0,5.0,4.0,0.444444
4,5,0.24,19,25,8.333333,7.0,2.0,0.105263


<h2> user features for last 15 days </h2>

In [116]:
df_l15 = df_prior[df_prior["days_since_prior_order"] >15]

In [117]:
%%time
user_features_l15 = pd.DataFrame(columns=['user_id'])
user_features_l15['user_id'] = df_l15['user_id'].sort_values().unique()

Wall time: 1.11 s


In [118]:
%%time
# user reorder rate
user_reorder_rate = df_l15.groupby(["user_id","reordered"])['reordered'].count().groupby(level = 0).apply(lambda x: x / float(x.sum())).reset_index(name='reorder_rate_l15')

Wall time: 3min 17s


In [122]:
user_reorder_rate = user_reorder_rate[user_reorder_rate["reordered"]==1]
user_reorder_rate = user_reorder_rate.drop("reordered",axis=1)
user_features_l15 = user_features_l15.merge(user_reorder_rate,on="user_id",how = "left").fillna(0)

In [128]:
%%time
# no of unique products and total products  by user
temp = df_l15.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique_products_l15')
user_features_l15 = user_features_l15.merge(temp,on="user_id",how="left")

temp = df_l15.groupby(["user_id"])['product_name'].size().reset_index(name = 'user_total_products_l15')
user_features_l15 = user_features_l15.merge(temp,on="user_id",how="left")


Wall time: 17.5 s


In [133]:
user_features_l15

Unnamed: 0,user_id,reorder_rate_l15,unique_products_l15,user_total_products_l15,avg_cart_size_l15,user_avg_days_between_orders_l15
0,1,0.777778,15,36,6.000000,24.500000
1,2,0.542373,47,59,14.750000,28.750000
2,3,0.529412,15,17,5.666667,19.333333
3,4,0.111111,9,9,4.500000,20.000000
4,5,0.666667,12,12,12.000000,19.000000
...,...,...,...,...,...,...
181850,206204,0.294118,17,17,17.000000,25.000000
181851,206205,0.250000,8,8,8.000000,30.000000
181852,206207,0.580000,67,100,14.285714,25.285714
181853,206208,0.659091,37,44,14.666667,18.333333


In [130]:
%%time
# avg cart size
temp = df_l15.groupby(["user_id","order_id"])['add_to_cart_order'].count().reset_index(name='avg_cart_size_l15')
temp_1 = temp.groupby('user_id')['avg_cart_size_l15'].mean().reset_index()
user_features_l15 = user_features_l15.merge(temp_1,on="user_id",how="left")

Wall time: 1.91 s


In [132]:
%%time
#Get average days between 2 orders for every user
temp = df_l15.groupby(["user_id","order_id"])['days_since_prior_order'].max().reset_index(name='user_avg_days_between_orders_l15')
temp_1 = temp.groupby('user_id')['user_avg_days_between_orders_l15'].mean().reset_index()


user_features_l15 = user_features_l15.merge(temp_1,on="user_id",how="left")

Wall time: 1.98 s


In [134]:
%%time
#get user product reorder ratio 
    # number of unique products reordered / number of unique products ordered
df = pd.DataFrame()
df['user_id'] = df_l15['user_id'].sort_values().unique()
df['user_unique_products'] = df_l15.groupby(["user_id"])['product_name'].nunique().reset_index(name = 'unique')['unique']
df['user_reordered_products'] = df_l15[df_prior['reordered']==1].groupby(["user_id"])['product_name'].nunique().reset_index(name = 'reordered_unique')['reordered_unique']
df.fillna(0, inplace = True)

Wall time: 27.2 s


In [135]:
df["user_reordered_products_ratio_l15"] = df['user_reordered_products'] / df['user_unique_products']

In [137]:
df = df.drop("user_reordered_products",axis=1)
df = df.rename(columns = {"user_reordered_products":"user_reordered_products_l15"})

In [138]:
%%time
user_features_l15 = user_features_l15.merge(df,on="user_id",how="left")
pickle.dump(user_features_l15,open("new_features/user_features_l15.pkl","wb"))

Wall time: 117 ms


<h2> user product features for first 15 days of a month </h2>

In [142]:
%%time
user_product_features_f15 = pd.DataFrame(columns=['user_id','product_id'])

Wall time: 0 ns


In [144]:
%%time
temp = df_f15.groupby(["user_id","product_id"]).size().reset_index()
user_product_features_f15["user_id"] = temp["user_id"]
user_product_features_f15["product_id"] = temp["product_id"]

Wall time: 16.1 s


In [145]:
%%time
# frequently user ordered the product
#times user ordered the product/ #times user placed an order
temp = df_f15.groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_f15.groupby(["user_id"]).size()).reset_index(name = 'user_product_order_rate_f15')


Wall time: 16.2 s


In [146]:
%%time
user_product_features_f15 = user_product_features_f15.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 10.1 s


In [148]:
%%time
# How frequently user reordered the product
#times user reordered the product/ #times user ordered the product
temp = df_f15[df_f15["reordered"]==1].groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_f15.groupby(["user_id","product_id"]).size()).reset_index(name = 'user_product_reorder_rate_f15')
user_product_features_f15 = user_product_features_f15.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 1min 11s


In [151]:
pickle.dump(user_product_features_f15,open("new_features/user_product_features_f15.pkl","wb"))

In [156]:
%%time
user_product_features_l15 = pd.DataFrame(columns=['user_id','product_id'])

Wall time: 0 ns


In [157]:
%%time
temp = df_l15.groupby(["user_id","product_id"]).size().reset_index()
user_product_features_l15["user_id"] = temp["user_id"]
user_product_features_l15["product_id"] = temp["product_id"]

Wall time: 5.92 s


In [158]:
%%time
# frequently user ordered the product
#times user ordered the product/ #times user placed an order
temp = df_l15.groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_l15.groupby(["user_id"]).size()).reset_index(name = 'user_product_order_rate_l15')
user_product_features_l15 = user_product_features_l15.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 10.1 s


In [159]:
%%time
# How frequently user reordered the product
#times user reordered the product/ #times user ordered the product
temp = df_l15[df_l15["reordered"]==1].groupby(["user_id","product_id"])["reordered"].size()
temp = (temp/df_l15.groupby(["user_id","product_id"]).size()).reset_index(name = 'user_product_reorder_rate_f15')
user_product_features_l15 = user_product_features_l15.merge(temp,on=["user_id","product_id"],how="left").fillna(0)

Wall time: 26.6 s


In [161]:
user_product_features_l15 = user_product_features_l15.rename(columns={"user_product_reorder_rate_f15":"user_product_reorder_rate_l15"})

In [162]:
pickle.dump(user_product_features_l15,open("new_features/user_product_features_l15.pkl","wb"))

<h2> implementing word2vec </h2>

In [163]:
# https://www.kaggle.com/omarito/word2vec-for-products-analysis-0-01-lb
# https://github.com/jacquespeeters/instacart-market-basket-analysis/blob/master/word2vector.py
aisles = pd.read_csv("data\\aisles.csv")
departments = pd.read_csv("data\\departments.csv")
product     = pd.read_csv("data\\products.csv")

In [164]:
t = product.merge(departments,on='department_id').merge(aisles,on='aisle_id')
# t = product.merge(departments,on='department_id').merge(aisles,on='aisle_id')
t['concat'] = t.product_name+' '+t.aisle+' '+t.department

In [165]:
import spacy
from tqdm import tqdm
nlp = spacy.load("en_core_web_sm")

In [166]:
embedings = []
for i in tqdm(t.concat):
    embedings.append(nlp(i).vector.tolist())

100%|████████████████████████████████████████████████████████████████████████████| 49688/49688 [12:28<00:00, 66.39it/s]


In [167]:
embedings = np.array(embedings)

In [168]:
embedings

array([[ 1.59712982,  3.68936539,  0.42697558, ..., -0.27670634,
         0.43996802, -0.19875298],
       [ 2.14573312,  2.89356637,  0.88513601, ..., -0.25646234,
        -0.01654181, -0.05958762],
       [ 1.63370895,  3.77954602,  0.94395179, ..., -0.48379385,
        -0.37982002,  0.68874878],
       ...,
       [ 2.11418772,  1.16476929,  0.81052685, ...,  0.15621345,
         0.11892312,  0.84416145],
       [ 2.17515397,  1.54050148,  0.93751061, ..., -0.42087144,
        -0.2340377 ,  0.20871983],
       [ 2.08332491,  1.74270678,  0.09583858, ...,  0.97301716,
         0.29398429, -0.25241169]])

In [171]:
pca = PCA(random_state=42)
pca.n_components = 30
pca_data = pca.fit_transform(pd.DataFrame(embedings))
pca_data.shape

(49688, 30)

In [172]:
embedd = pd.DataFrame(pca_data)
embedd.columns = ["pca"+str(i) for i in embedd.columns]
embedd['product_id'] = t.product_id
embedd.head(3)

Unnamed: 0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca21,pca22,pca23,pca24,pca25,pca26,pca27,pca28,pca29,product_id
0,5.685825,-3.038605,-2.744053,1.108631,-0.089354,-1.691685,-0.707914,-1.055036,-0.589785,0.678092,...,-0.721664,0.342295,-0.537946,-0.004651,-0.516881,-1.288119,0.045901,0.725184,-1.23947,1
1,1.764361,-1.80515,-1.872392,0.87902,0.346604,-1.049471,-1.054171,0.272769,-0.37564,-0.753306,...,-0.034965,-0.739605,-0.457462,-0.490952,-0.241052,-1.005393,0.264379,1.137681,-0.230692,78
2,5.245373,-3.529498,-1.309787,-0.370892,0.542056,-1.2273,-0.511927,-1.581759,-0.059841,0.076461,...,-0.724696,-0.387842,-0.70741,0.064645,-0.189325,-1.461408,-0.198345,-0.509135,-0.511021,102


In [173]:
pickle.dump(embedd,open('new_features/products_pca.pkl','wb'))