# Import all modules

In [1]:
import pandas as pd # to start reading csv
import numpy as np
import matplotlib.pyplot as plt #plotting basic
import seaborn as sns
%matplotlib inline

# Reading all the files

In [2]:
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")
order_products_prior = pd.read_csv("order_products__prior.csv")
order_products_train = pd.read_csv("order_products__train.csv")
orders = pd.read_csv("orders.csv")
products = pd.read_csv("products.csv")

In [3]:
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


# Opening the Training data

In [4]:
train_orders=orders[orders['eval_set']=='train']
trains=pd.merge(order_products_train, train_orders,how='left', on='order_id')
trains.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,1,49302,1,1,112108,train,4,4,10,9.0
1,1,11109,2,1,112108,train,4,4,10,9.0
2,1,10246,3,0,112108,train,4,4,10,9.0
3,1,49683,4,0,112108,train,4,4,10,9.0
4,1,43633,5,1,112108,train,4,4,10,9.0


# Feature Engineering Part I: Product Features

- purchase_count: How many people purchased this product
- reordered_count: How many people reordered this product

In [5]:
prior_orders=orders[orders['eval_set']=='prior']
priors = pd.merge(order_products_prior, prior_orders, on='order_id')
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


In [6]:
priors_product = pd.merge(priors, products, on='product_id')
priors_product.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16


In [7]:
priors_product_purchase= priors_product.groupby(["product_id"]).size().reset_index(name ='purchase_count')
priors_product_purchase.head()

Unnamed: 0,product_id,purchase_count
0,1,1852
1,2,90
2,3,277
3,4,329
4,5,15


In [8]:
priors_reordered = priors_product[priors_product['reordered'] == 1]
priors_reordered.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16
5,537,33120,2,1,180135,prior,15,2,8,3.0,Organic Egg Whites,86,16
6,582,33120,7,1,193223,prior,6,2,19,10.0,Organic Egg Whites,86,16


In [9]:
priors_product_reordered= priors_reordered.groupby(["product_id"]).size().reset_index(name ='reordered_count')
priors_product_reordered.head()

Unnamed: 0,product_id,reordered_count
0,1,1136
1,2,12
2,3,203
3,4,147
4,5,9


In [10]:
priors_product_merged1 = pd.merge(priors_product, priors_product_purchase, on='product_id')
priors_product_merged1.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,19400
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,19400
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,19400
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,19400


In [11]:
priors_product_merged2 = pd.merge(priors_product_merged1, priors_product_reordered, on='product_id')
priors_product_merged2.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744
1,26,33120,5,0,153404,prior,2,0,16,7.0,Organic Egg Whites,86,16,19400,13744
2,120,33120,13,0,23750,prior,11,6,8,10.0,Organic Egg Whites,86,16,19400,13744
3,327,33120,5,1,58707,prior,21,6,9,8.0,Organic Egg Whites,86,16,19400,13744
4,390,33120,28,1,166654,prior,48,0,12,9.0,Organic Egg Whites,86,16,19400,13744


In [12]:
FE1=priors_product_merged2[["product_id","purchase_count","reordered_count"]]
FE1.head()

Unnamed: 0,product_id,purchase_count,reordered_count
0,33120,19400,13744
1,33120,19400,13744
2,33120,19400,13744
3,33120,19400,13744
4,33120,19400,13744


# Feature Engineering Part 2: User Features

- average number of days the user comes back from shopping
- average size of user cart

In [13]:
avg_reorder_days = prior_orders.groupby(["user_id"])['days_since_prior_order'].aggregate('count').reset_index(name='avg_days_prior_order')
avg_reorder_days.head()

Unnamed: 0,user_id,avg_days_prior_order
0,1,9
1,2,13
2,3,11
3,4,4
4,5,3


In [14]:
avg_usercart_size = priors_reordered.groupby(["user_id"])['add_to_cart_order'].aggregate('count').reset_index(name='avg_user_cart_size')
avg_usercart_size.head()

Unnamed: 0,user_id,avg_user_cart_size
0,1,41
1,2,93
2,3,55
3,4,1
4,5,14


In [15]:
priors_product_merged3 = pd.merge(priors_product_merged2, avg_reorder_days, on='user_id')
priors_product_merged3.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count,avg_days_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744,7
1,104690,33120,2,1,202279,prior,8,6,10,30.0,Organic Egg Whites,86,16,19400,13744,7
2,132412,33120,1,1,202279,prior,6,5,9,30.0,Organic Egg Whites,86,16,19400,13744,7
3,2808715,33120,1,1,202279,prior,5,5,13,9.0,Organic Egg Whites,86,16,19400,13744,7
4,2894949,33120,5,0,202279,prior,1,5,9,,Organic Egg Whites,86,16,19400,13744,7


In [16]:
priors_product_merged4 = pd.merge(priors_product_merged3, avg_usercart_size, on='user_id')
priors_product_merged4.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count,avg_days_prior_order,avg_user_cart_size
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744,7,43
1,104690,33120,2,1,202279,prior,8,6,10,30.0,Organic Egg Whites,86,16,19400,13744,7,43
2,132412,33120,1,1,202279,prior,6,5,9,30.0,Organic Egg Whites,86,16,19400,13744,7,43
3,2808715,33120,1,1,202279,prior,5,5,13,9.0,Organic Egg Whites,86,16,19400,13744,7,43
4,2894949,33120,5,0,202279,prior,1,5,9,,Organic Egg Whites,86,16,19400,13744,7,43


# Feature Engineering Part 3: Product User features

- how many times this specific user buy this product
- reorder rate of this product by the user: reorder rate = product reordered by this customer/number of order by this customer

In [17]:
priors_product_purchase_spec= priors_product.groupby(["user_id","product_id"]).size().reset_index(name ='purchase_count_spec')
priors_product_purchase_spec.head()

Unnamed: 0,user_id,product_id,purchase_count_spec
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


In [18]:
priors_product_reordered_spec= priors_reordered.groupby(["user_id","product_id"]).size().reset_index(name ='reordered_count_spec')
priors_product_reordered_spec.head()

Unnamed: 0,user_id,product_id,reordered_count_spec
0,1,196,9
1,1,10258,8
2,1,12427,9
3,1,13032,2
4,1,13176,1


In [19]:
priors_product_spec1 = pd.merge(priors_product_merged4, priors_product_purchase_spec)
priors_product_spec1.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count,avg_days_prior_order,avg_user_cart_size,purchase_count_spec
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744,7,43,5
1,104690,33120,2,1,202279,prior,8,6,10,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5
2,132412,33120,1,1,202279,prior,6,5,9,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5
3,2808715,33120,1,1,202279,prior,5,5,13,9.0,Organic Egg Whites,86,16,19400,13744,7,43,5
4,2894949,33120,5,0,202279,prior,1,5,9,,Organic Egg Whites,86,16,19400,13744,7,43,5


In [20]:
priors_product_spec2 = pd.merge(priors_product_spec1, priors_product_reordered_spec)
priors_product_spec2.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count,avg_days_prior_order,avg_user_cart_size,purchase_count_spec,reordered_count_spec
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4
1,104690,33120,2,1,202279,prior,8,6,10,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4
2,132412,33120,1,1,202279,prior,6,5,9,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4
3,2808715,33120,1,1,202279,prior,5,5,13,9.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4
4,2894949,33120,5,0,202279,prior,1,5,9,,Organic Egg Whites,86,16,19400,13744,7,43,5,4


In [21]:
priors_product_spec2['reorder_ratio']=priors_product_spec2['reordered_count_spec']/priors_product_spec2['purchase_count_spec']
priors_product_spec2.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,purchase_count,reordered_count,avg_days_prior_order,avg_user_cart_size,purchase_count_spec,reordered_count_spec,reorder_ratio
0,2,33120,1,1,202279,prior,3,5,9,8.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4,0.8
1,104690,33120,2,1,202279,prior,8,6,10,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4,0.8
2,132412,33120,1,1,202279,prior,6,5,9,30.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4,0.8
3,2808715,33120,1,1,202279,prior,5,5,13,9.0,Organic Egg Whites,86,16,19400,13744,7,43,5,4,0.8
4,2894949,33120,5,0,202279,prior,1,5,9,,Organic Egg Whites,86,16,19400,13744,7,43,5,4,0.8


# Merging All prior data to train data
- merging train data with F.E 1, product id
- merging the product of the merged data with F.E 2, user id
- merging the product of the merged data with F.E 3, product id, user id

- Note that we should have 8-12 features up until here

In [None]:
trains_fe1 = pd.merge(trains, FE1)
trains_fe1.head()

In [None]:
trains.size()

In [None]:
sdsdsds

In [None]:
orders_time= orders[['order_id','order_hour_of_day']]
prior_reorder=pd.merge(prior_reorder,orders_time, how='left', on='order_id')
prior_reorder.head()

In [None]:
prior_orders=orders[orders['eval_set']=='prior']
prior_orders= prior_orders[['order_id','user_id']]
prior_orders.head()

In [None]:
prior_reorder_merged=pd.merge(prior_reorder,prior_orders, how='left', on='order_id')
prior_reorder_merged.head()

In [None]:
prior_reorder_grouped=prior_reorder_merged.groupby(["user_id", "product_id"]).size().reset_index(name="product_count")
prior_reorder_grouped.head()

In [None]:
prior_reorder_all=pd.merge(prior_reorder_merged,prior_reorder_grouped, how='left', on=['user_id','product_id'])
prior_reorder_all.head()

In [None]:
prior_reorder_all_count=prior_reorder_all.groupby(["product_id","user_id"]).size().reset_index(name="customer_count")
prior_reorder_all_count.head()

In [None]:
prior_reorder_complete=pd.merge(prior_reorder_all,prior_reorder_all_count, how='left', on=['user_id','product_id'])

In [None]:
print(prior_reorder_complete.head(20))
print(prior_reorder_complete.dtypes)

In [None]:
prior_reorder_complete_smpl=prior_reorder_complete[['product_id','product_count','customer_count']]
prior_reorder_complete_smpl.head()

# 4a) Machine Learning Section

In [None]:
#train_reorder.head()

With the train dataset, it is required for it to be merged with the features from prior dataset
- Product id, purchase count, reordered_count product
- user id, no of days,avg size
- product_used, user id

In [None]:
train_set= order_products_train[['order_id','product_id','user_id']]
train_set.head()

In [None]:
train_set.info()

In [None]:
prior_reorder_complete_smpl.info()

In [None]:
asaas

In [None]:
product_features= pd.merge(train_set, prior_reorder_complete_smpl, on='product_id', how='inner')
product_features.head()

In [None]:
sdsdsds

In [None]:
orders.groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
avg_size = order_products_prior.groupby(["order_id"]).aggregate("count").reset_index()
avg_size.head()
avg_size_smpl=avg_size_smpl[['order_id','product_id','user_id']]

In [None]:
#Xtrain = train_reorder.drop(['reordered','product_name','aisle_id','department_id', 'add_to_cart_order','aisle','department', 'product_count','customer_count'], axis=1)
#ytrain = train_reorder[['reordered','product_count']]
x = orders_merged.drop(['order_id','eval_set_x','eval_set_y','reordered', 'add_to_cart_order'], axis=1)
y = orders_merged['reordered']

In [None]:
#Xtrain.head()
x.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
model = LogisticRegression()

In [None]:
#model.fit(Xtrain,ytrain)
model.fit(x,y)


In [None]:
test_set = orders[orders['eval_set']=='test']
test_set = test_set.drop(['eval_set'],axis=1)
test_set.head()

In [None]:
test_set['reordered'] = model.predict(test_set)

In [None]:
train_set[train_set['reordered']==0].groupby('order_id')['product_id'].apply(list)

In [None]:
train_set[train_set['reordered']==1].groupby('order_id')['product_id'].apply(list)

In [None]:
model.score(x,y)

In [None]:
training_accuracy = model.score(Xtrain, ytrain)
test_accuracy = model.score(Xtest, ytest)
print("Accuracy on training data: {:0.2f}".format(training_accuracy))
print("Accuracy on test data:     {:0.2f}".format(test_accuracy))
return model, Xtrain, ytrain, Xtest, ytest