In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Load the merged order-line table and discard the listed columns in one
# chained expression. 'Unnamed: 0' is presumably a row index written out by
# an earlier to_csv call -- TODO confirm.
df = (
    pd.read_csv('data/dfmerged.csv')
      .drop(['Unnamed: 0', 'eval_set', 'aisle', 'department', 'product_name'],
            axis='columns')
)
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,aisle_id,department_id
0,2539329,1,1,2,8,,196,1,0,77,7
1,2398795,1,2,3,7,15.0,196,1,1,77,7
2,473747,1,3,3,12,21.0,196,1,1,77,7
3,2254736,1,4,4,7,29.0,196,1,1,77,7
4,431534,1,5,4,15,28.0,196,1,1,77,7


## 1.1 Create table of predictor variables 

In [4]:
# Load the per-user summary table from disk. The cells visible in this
# section only read its 'user_id' and 'total_orders' columns.
user = pd.read_csv('data/user.csv')
# Preview the first rows.
user.head()

Unnamed: 0,user_id,reordered,total_orders,total_products,average_order_size,average_reorder
0,1,51,11,70,6.36,0.73
1,2,105,15,226,15.07,0.46
2,3,55,12,88,7.33,0.62
3,4,1,5,18,3.6,0.06
4,5,18,5,46,9.2,0.39


In [5]:
# Load the per-product table, drop the listed name/aisle/department columns,
# and order the rows by product_id.
# reset_index(drop=True) renumbers the rows 0..n-1 directly, so no throwaway
# 'index' column is created that would then need a second drop.
product = (
    pd.read_csv('data/prodpred.csv')
      .drop(['Unnamed: 0', 'product_name', 'aisle_id', 'department_id', 'department'], axis=1)
      .sort_values(by='product_id')
      .reset_index(drop=True)
)
# Preview the first rows.
product.head()

Unnamed: 0,product_id,p_total_reorders,p_total_purchases,percent_reorder
0,1,1185,1928,0.614627
1,2,13,94,0.138298
2,3,209,283,0.738516
3,4,161,351,0.458689
4,5,10,16,0.625


In [6]:
# For every (user_id, product_id) pair, count the rows of df with a non-null
# order_id and expose that count as the 'total_bought' column.
user_prod = (
    df.groupby(['user_id', 'product_id'])['order_id']
      .count()
      .to_frame(name='total_bought')
      .reset_index()
)
# Preview the first rows.
user_prod.head()

Unnamed: 0,user_id,product_id,total_bought
0,1,196,11
1,1,10258,10
2,1,10326,1
3,1,12427,10
4,1,13032,4


In [7]:
# For every (user_id, product_id) pair, record the smallest order_number in
# which the pair appears, i.e. the order where the user first bought it.
first_order_num = (
    df.groupby(['user_id', 'product_id'])['order_number']
      .min()
      .to_frame(name='first_order_num')
      .reset_index()
)
# Preview the first rows.
first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_num
0,1,196,1
1,1,10258,2
2,1,10326,5
3,1,12427,1
4,1,13032,2


In [8]:
# Two-column lookup: user_id -> total_orders.
total_orders = user.loc[:, ['user_id', 'total_orders']]
# Attach each user's total_orders to the per-pair table (left join keeps
# every user/product row).
# NOTE(review): this rebinds first_order_num, so running the cell a second
# time would merge a second total_orders column (suffixed _x/_y).
first_order_num = pd.merge(first_order_num, total_orders, how='left', on='user_id')
first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_num,total_orders
0,1,196,1,11
1,1,10258,2,11
2,1,10326,5,11
3,1,12427,1,11
4,1,13032,2,11


In [9]:
# Bring the per-pair purchase count (total_bought) alongside first_order_num
# and total_orders, matching on both keys.
first_order_num = pd.merge(
    first_order_num,
    user_prod,
    how='left',
    on=['user_id', 'product_id'],
)
first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_num,total_orders,total_bought
0,1,196,1,11,11
1,1,10258,2,11,10
2,1,10326,5,11,1
3,1,12427,1,11,10
4,1,13032,2,11,4


In [10]:
# up_reorders: how many of the user's orders fall between the first order
# containing the product and the user's last order, inclusive (hence + 1).
first_order_num['up_reorders'] = (
    first_order_num['total_orders'] - first_order_num['first_order_num'] + 1
)
# up_reorder_ratio: share of those orders in which the product was bought.
first_order_num['up_reorder_ratio'] = (
    first_order_num['total_bought'] / first_order_num['up_reorders']
)
first_order_num.head()

Unnamed: 0,user_id,product_id,first_order_num,total_orders,total_bought,up_reorders,up_reorder_ratio
0,1,196,1,11,11,11,1.0
1,1,10258,2,11,10,10,1.0
2,1,10326,5,11,1,7,0.142857
3,1,12427,1,11,10,11,0.909091
4,1,13032,2,11,4,10,0.4


In [11]:
# Mean of days_since_prior_order over the orders that contain each
# (user_id, product_id) pair. This averages the gap before each such order,
# not the gap between consecutive purchases of the product itself.
# A pair whose only days_since_prior_order values are missing gets NaN.
avg_days = (
    df.groupby(['user_id', 'product_id'])['days_since_prior_order']
      .mean()
      .to_frame(name='avg_days_btw_order')
      .reset_index()
)
# Preview the first rows.
avg_days.head()

Unnamed: 0,user_id,product_id,avg_days_btw_order
0,1,196,19.0
1,1,10258,19.0
2,1,10326,28.0
3,1,12427,19.555556
4,1,13032,19.75


In [12]:

# Rebind user_prod to the full per-pair feature table: everything gathered
# in first_order_num plus the average-days column.
user_prod = pd.merge(
    first_order_num,
    avg_days,
    how='left',
    on=['user_id', 'product_id'],
)
user_prod.head()

Unnamed: 0,user_id,product_id,first_order_num,total_orders,total_bought,up_reorders,up_reorder_ratio,avg_days_btw_order
0,1,196,1,11,11,11,1.0,19.0
1,1,10258,2,11,10,10,1.0,19.0
2,1,10326,5,11,1,7,0.142857,28.0
3,1,12427,1,11,10,11,0.909091,19.555556
4,1,13032,2,11,4,10,0.4,19.75
5,1,13176,2,11,2,10,0.2,21.5
6,1,14084,1,11,1,11,0.090909,
7,1,17122,5,11,1,7,0.142857,28.0
8,1,25133,3,11,9,9,1.0,19.444444
9,1,26088,1,11,3,11,0.272727,14.5


In [13]:
# Show the column labels currently held by df before it is narrowed below.
df.columns

Index(['order_id', 'user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'add_to_cart_order',
       'reordered', 'aisle_id', 'department_id'],
      dtype='object')

In [14]:
# Keep only the key, order, target and days_since_prior_order columns of df.
# NOTE(review): the saved output under this cell does not show
# days_since_prior_order, so that output appears to predate this column list.
df = df.loc[:, ['user_id', 'product_id', 'order_number', 'order_id',
                'reordered', 'days_since_prior_order']]
# Attach each user's total_orders so the final order can be picked out next.
# NOTE(review): df is rebound here; running the cell twice would merge
# total_orders again and yield suffixed duplicate columns.
df = pd.merge(df, total_orders, how='left', on='user_id')
df.head()

Unnamed: 0,user_id,product_id,order_number,order_id,reordered,total_orders
0,1,196,1,2539329,0,11
1,1,196,2,2398795,1,11
2,1,196,3,473747,1,11
3,1,196,4,2254736,1,11
4,1,196,5,431534,1,11


In [15]:
# Keep only the rows belonging to each user's last order (order_number equal
# to the user's total_orders), sorted by user and product.
# reset_index(drop=True) renumbers the rows without materialising the old
# index as an 'index' column, so no follow-up in-place drop is needed.
df = (
    df.loc[df['order_number'] == df['total_orders']]
      .sort_values(by=['user_id', 'product_id'])
      .reset_index(drop=True)
)
# Preview the first rows.
df.head()

Unnamed: 0,user_id,product_id,order_number,order_id,reordered,total_orders
0,1,196,11,1187899,1,11
1,1,10258,11,1187899,1,11
2,1,13032,11,1187899,1,11
3,1,25133,11,1187899,1,11
4,1,26088,11,1187899,1,11


In [16]:
# List the columns available in user_prod before choosing model features.
user_prod.columns

Index(['user_id', 'product_id', 'first_order_num', 'total_orders',
       'total_bought', 'up_reorders', 'up_reorder_ratio',
       'avg_days_btw_order'],
      dtype='object')

In [17]:
# Per-pair features that will be joined onto the last-order rows.
up_df = user_prod.loc[:, ['user_id', 'product_id', 'total_bought',
                          'up_reorder_ratio', 'avg_days_btw_order']]
# Product-level reorder share, keyed by product_id.
p_df = product.loc[:, ['product_id', 'percent_reorder']]
# Left join keeps every user/product pair even if the product has no entry
# in the product table.
up_df = pd.merge(up_df, p_df, how='left', on='product_id')
up_df.head()

Unnamed: 0,user_id,product_id,total_bought,up_reorder_ratio,avg_days_btw_order,percent_reorder
0,1,196,11,1.0,19.0,0.777843
1,1,10258,10,1.0,19.0,0.71561
2,1,10326,1,0.142857,28.0,0.653439
3,1,12427,10,0.909091,19.555556,0.740182
4,1,13032,4,0.4,19.75,0.661117


In [18]:
# Row/column count of the per-pair feature table.
up_df.shape

(13863746, 6)

In [19]:
# Row/column count of the last-order rows, checked before the feature merge.
df.shape

(2165417, 6)

In [20]:
# Join the per-pair features onto each last-order row; the left join leaves
# the row count of df unchanged as long as up_df has one row per pair.
df = pd.merge(
    df,
    up_df,
    how='left',
    on=['user_id', 'product_id'],
)
df.head()

Unnamed: 0,user_id,product_id,order_number,order_id,reordered,total_orders,total_bought,up_reorder_ratio,avg_days_btw_order,percent_reorder
0,1,196,11,1187899,1,11,11,1.0,19.0,0.777843
1,1,10258,11,1187899,1,11,10,1.0,19.0,0.71561
2,1,13032,11,1187899,1,11,4,0.4,19.75,0.661117
3,1,25133,11,1187899,1,11,9,1.0,19.444444,0.740364
4,1,26088,11,1187899,1,11,3,0.272727,14.5,0.540429


In [21]:
# Column labels of the modelling table after the feature merge.
df.columns

Index(['user_id', 'product_id', 'order_number', 'order_id', 'reordered',
       'total_orders', 'total_bought', 'up_reorder_ratio',
       'avg_days_btw_order', 'percent_reorder'],
      dtype='object')

In [22]:
# Predictor columns passed to the model.
# NOTE(review): user_id, product_id and order_id are identifiers, and
# order_number equals total_orders on every row after the last-order filter;
# confirm these are intended as predictors.
# NOTE(review): total_bought, up_reorder_ratio and avg_days_btw_order were
# aggregated over df before it was restricted to the last order, so they
# include the order whose 'reordered' flag is the target -- check for leakage.
features = [
    'user_id',
    'product_id',
    'order_number',
    'order_id',
    'total_orders',
    'total_bought',
    'up_reorder_ratio',
    'avg_days_btw_order',
    'percent_reorder',
]
X = df.loc[:, features]
y = df['reordered']

# Hold out 25% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=123,
)

In [23]:
# Standardize the features using statistics learned from the training split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Apply the training mean/scale to the test split. Refitting on X_test would
# put the two splits on different scales and use test-set statistics.
X_test = scaler.transform(X_test)