In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns; sns.set()



# Import instacart data

In [2]:
aisles = pd.read_csv("aisles.csv")
departments = pd.read_csv("departments.csv")
orders = pd.read_csv("orders.csv")
prior = pd.read_csv("order_products__prior.csv")
train = pd.read_csv("order_products__train.csv")
products = pd.read_csv("products.csv")

In [3]:
# appending train with prior to obtain whole sample size 
full = prior.append(train)

In [4]:
# sorting values by order id to keep some structure
full.sort_values(by = 'order_id', inplace = True, kind = 'mergesort')

In [5]:
# merging orders and full to include time and product information
full= full.merge(orders, on = 'order_id', how = 'left')

In [6]:
# merging to substitute id for actual name
full = full.merge(products, on = 'product_id', how = 'left')

In [7]:
# merging to substitute id for actual name
full = full.merge(aisles, on = 'aisle_id', how = 'left')

In [8]:
# merging to substitute id for actual name
full = full.merge(departments, on = 'department_id', how = 'left')


In [9]:
# removing id in place of actual name 
full.pop('product_id')
full.pop('aisle_id')
full.pop('department_id')
full.head()

Unnamed: 0,order_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle,department
0,1,1,1,112108,train,4,4,10,9.0,Bulgarian Yogurt,yogurt,dairy eggs
1,1,2,1,112108,train,4,4,10,9.0,Organic 4% Milk Fat Whole Milk Cottage Cheese,other creams cheeses,dairy eggs
2,1,3,0,112108,train,4,4,10,9.0,Organic Celery Hearts,fresh vegetables,produce
3,1,4,0,112108,train,4,4,10,9.0,Cucumber Kirby,fresh vegetables,produce
4,1,5,1,112108,train,4,4,10,9.0,Lightly Smoked Sardines in Olive Oil,canned meat seafood,canned goods


In [10]:
# reordering sequence of columns for easy lookup
full = full[[ 'order_id',
             'order_number',
             'user_id',
             'department',
             'aisle',
             'product_name',
             'add_to_cart_order',
             'days_since_prior_order',
             'order_dow',
             'order_hour_of_day',
             'reordered',
             'eval_set']]
full.head()

Unnamed: 0,order_id,order_number,user_id,department,aisle,product_name,add_to_cart_order,days_since_prior_order,order_dow,order_hour_of_day,reordered,eval_set
0,1,4,112108,dairy eggs,yogurt,Bulgarian Yogurt,1,9.0,4,10,1,train
1,1,4,112108,dairy eggs,other creams cheeses,Organic 4% Milk Fat Whole Milk Cottage Cheese,2,9.0,4,10,1,train
2,1,4,112108,produce,fresh vegetables,Organic Celery Hearts,3,9.0,4,10,0,train
3,1,4,112108,produce,fresh vegetables,Cucumber Kirby,4,9.0,4,10,0,train
4,1,4,112108,canned goods,canned meat seafood,Lightly Smoked Sardines in Olive Oil,5,9.0,4,10,1,train


# Feature Engineering

In [11]:
dff = full.sort_values(by ='user_id').head(10000)

In [12]:
dff.head(50)

Unnamed: 0,order_id,order_number,user_id,department,aisle,product_name,add_to_cart_order,days_since_prior_order,order_dow,order_hour_of_day,reordered,eval_set
4265667,431534,5,1,beverages,soft drinks,Soda,1,28.0,4,15,1,prior
22289627,2254736,4,1,snacks,nuts seeds dried fruit,Pistachios,3,29.0,4,7,1,prior
22289628,2254736,4,1,dairy eggs,packaged cheese,Organic String Cheese,4,29.0,4,7,1,prior
22289629,2254736,4,1,household,paper goods,XL Pick-A-Size Paper Towel Rolls,5,29.0,4,7,1,prior
22690189,2295261,9,1,beverages,soft drinks,Zero Calorie Cola,2,0.0,1,16,1,prior
22690190,2295261,9,1,dairy eggs,packaged cheese,Organic String Cheese,3,0.0,1,16,1,prior
22690191,2295261,9,1,beverages,soft drinks,Soda,4,0.0,1,16,1,prior
22690192,2295261,9,1,snacks,nuts seeds dried fruit,Pistachios,5,0.0,1,16,1,prior
22690193,2295261,9,1,snacks,popcorn jerky,Original Beef Jerky,6,0.0,1,16,1,prior
5437567,550135,7,1,breakfast,cereal,Cinnamon Toast Crunch,5,20.0,1,9,1,prior


In [13]:
dff.fillna(0, axis = 0, inplace = True)

In [14]:
####restart point##########

In [15]:
# count of product per user 
grs = dff.groupby(['user_id']).product_name.value_counts() 

In [16]:
# convert to data frame for merge
grs = grs.to_frame()

In [17]:
# renaming columns for reindex
grs.rename(columns = {'product_name' : 'product_counts'}, inplace = True)

In [18]:
grs.reset_index(inplace = True)

In [19]:
# number of orders to divide 
amount_orders = dff.groupby('user_id',as_index=False).order_number.max()

In [20]:
# merging number of orders with amount of products 
grs = grs.merge(amount_orders, on = 'user_id')

In [21]:
grs.rename(columns = {'order_number':'amount_orders'}, inplace = True)

In [22]:
grs.head()


Unnamed: 0,user_id,product_name,product_counts,amount_orders
0,1,Soda,11,11
1,1,Original Beef Jerky,10,11
2,1,Pistachios,10,11
3,1,Organic String Cheese,9,11
4,1,Cinnamon Toast Crunch,4,11


In [23]:
grs = pd.merge(grs, dff[['user_id', 'order_id', 'order_number', 'product_name', 'department', 'aisle', 'add_to_cart_order', 'days_since_prior_order', 'order_dow', 'order_hour_of_day', 'reordered', 'eval_set' ]], on = ['user_id', 'product_name'], how = 'outer')

In [24]:
# ratio to infer how often the user purchases the same item 
grs['order_ratio'] = round(grs['product_counts'] / grs['amount_orders'], 3)

In [25]:
# count of the number of products purchased by user 
userct = grs.user_id.value_counts().to_frame().reset_index()
userct.columns = ['user_id', 'count']

In [26]:
# threshold to determine if user is an impulsive buyer 10% number of products repurchased
userrt = grs.where(grs['order_ratio'] > .10).user_id.value_counts().to_frame().reset_index()
userrt.columns = ['user_id', 'met_threshold']

In [27]:
# impulse ratio calculation  
impulse = userct.merge(userrt, on = 'user_id')
impulse['nonimpulse_ratio'] = impulse['met_threshold'] / impulse['count']

In [28]:
grs = grs.merge(impulse, on = 'user_id')

In [29]:
grs.drop(['count', 'met_threshold'], axis = 1, inplace = True)

In [30]:
grs.iloc[4000:4500]

Unnamed: 0,user_id,product_name,product_counts,amount_orders,order_id,order_number,department,aisle,add_to_cart_order,days_since_prior_order,order_dow,order_hour_of_day,reordered,eval_set,order_ratio,nonimpulse_ratio
4000,31,Soda,2,20,2231262,17,beverages,soft drinks,14,8.0,3,11,1,prior,0.100,0.317726
4001,31,Sparkling Cranberry Juice,2,20,2647083,16,beverages,soft drinks,10,8.0,2,8,0,prior,0.100,0.317726
4002,31,Sparkling Cranberry Juice,2,20,813240,19,beverages,soft drinks,22,4.0,0,17,1,prior,0.100,0.317726
4003,31,Sparkling Pink Lemonade,2,20,2647083,16,beverages,soft drinks,23,8.0,2,8,1,prior,0.100,0.317726
4004,31,Sparkling Pink Lemonade,2,20,2135842,4,beverages,soft drinks,2,6.0,3,13,0,prior,0.100,0.317726
4005,31,Spicy Mango With Jalapeño Smoked Chicken Sausage,2,20,813240,19,meat seafood,hot dogs bacon sausage,4,4.0,0,17,1,prior,0.100,0.317726
4006,31,Spicy Mango With Jalapeño Smoked Chicken Sausage,2,20,2951746,14,meat seafood,hot dogs bacon sausage,12,6.0,2,9,0,prior,0.100,0.317726
4007,31,Swedish Meatballs,2,20,695934,2,frozen,frozen meals,3,0.0,5,11,0,prior,0.100,0.317726
4008,31,Swedish Meatballs,2,20,813240,19,frozen,frozen meals,17,4.0,0,17,1,prior,0.100,0.317726
4009,31,Synergy Organic & Raw Cosmic Cranberry,2,20,3035956,13,beverages,refrigerated,6,6.0,3,11,0,prior,0.100,0.317726


In [31]:
# sns.pairplot(data_raw)

# Preproccessing

In [32]:
grs.fillna(0, axis = 0, inplace = True)

In [33]:
grs.set_index(['user_id', 'order_id'], inplace = True)

In [34]:
ready = pd.get_dummies(grs, prefix = '', prefix_sep = '', columns = ['department', 'aisle', 'product_name'], dtype = 'int64')

In [None]:
# sns.pairplot(ready)

In [35]:
# training set
train = ready.loc[grs['eval_set'] == 'prior']

In [36]:
train.drop('eval_set', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [37]:
train.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 9516 entries, (1, 431534) to (74, 1386615)
Columns: 3226 entries, product_counts to Zucchini Squash
dtypes: float64(3), int64(3223)
memory usage: 234.2 MB


In [38]:
# train.replace([np.inf, -np.inf], 0)

In [40]:
# Testing set
tests = ready.loc[grs['eval_set'] == 'train']

In [41]:
tests.drop('eval_set', inplace = True, axis = 1)

# Train and Test Set

In [42]:
X_test = tests.drop('reordered', axis = 1)
y_test = tests['reordered']

In [None]:
X = train.drop('reordered', axis = 1)
y = train['reordered']

# Logistic Regression 

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [45]:
lr = LogisticRegression(random_state = 13, class_weight = 'balanced')

In [46]:
lr.fit(X, y)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=13, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
from sklearn.metrics import confusion_matrix, classification_report

In [48]:
tests.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,product_counts,amount_orders,order_number,add_to_cart_order,days_since_prior_order,order_dow,order_hour_of_day,reordered,order_ratio,nonimpulse_ratio,...,Zen Tea,Zero Calorie Cola,Zero Calorie Cola Soda,Zero Calorie Tonic Water,Zero Calories Berry Nutrient Enhanced Water,Zero Go-Go Mixed Berry Vitamin Water,Zero Soda,Zero Vitamin Water,Zucchini Noodles,Zucchini Squash
user_id,order_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,1187899,11,11,11,1,14.0,4,8,1,1.0,0.9,...,0,0,0,0,0,0,0,0,0,0
1,1187899,10,11,11,6,14.0,4,8,1,0.909,0.9,...,0,0,0,0,0,0,0,0,0,0
1,1187899,9,11,11,2,14.0,4,8,1,0.818,0.9,...,0,0,0,0,0,0,0,0,0,0
1,1187899,4,11,11,7,14.0,4,8,1,0.364,0.9,...,0,0,0,0,0,0,0,0,0,0
1,1187899,4,11,11,11,14.0,4,8,1,0.364,0.9,...,0,1,0,0,0,0,0,0,0,0


In [73]:
lr_pred = lr.predict(X_test)

In [74]:
confusion_matrix(y_test, lr_pred)

array([[183,   8],
       [ 82, 211]], dtype=int64)

In [52]:
print(classification_report(y_test, lr_pred))

              precision    recall  f1-score   support

           0       0.69      0.96      0.80       191
           1       0.96      0.72      0.82       293

    accuracy                           0.81       484
   macro avg       0.83      0.84      0.81       484
weighted avg       0.86      0.81      0.82       484



# Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

In [56]:
rf = RandomForestClassifier(max_features="log2", max_depth=11, n_estimators=24,min_samples_split=1000, 
                               oob_score=True)

In [58]:
#fitting
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=11, max_features='log2', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=1000,
                       min_weight_fraction_leaf=0.0, n_estimators=24,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
#predictions
predrf = rf.predict(X_test)

In [61]:
confusion_matrix(y_test, predrf)

array([[103,  88],
       [  4, 289]], dtype=int64)

In [62]:
print(classification_report(y_test, predrf))

              precision    recall  f1-score   support

           0       0.96      0.54      0.69       191
           1       0.77      0.99      0.86       293

    accuracy                           0.81       484
   macro avg       0.86      0.76      0.78       484
weighted avg       0.84      0.81      0.80       484



# LGBM

In [64]:
import lightgbm as lgb

In [65]:
model= lgb.LGBMClassifier()


In [66]:
model.fit(X, y)


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [67]:
pred=model.predict(X_test)


In [68]:
confusion_matrix(y_test, pred)

array([[191,   0],
       [  6, 287]], dtype=int64)

In [75]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       191
           1       1.00      0.98      0.99       293

    accuracy                           0.99       484
   macro avg       0.98      0.99      0.99       484
weighted avg       0.99      0.99      0.99       484



# XGBoost

In [76]:
from xgboost import XGBClassifier

In [87]:
xgb = XGBClassifier()
xgb.fit(X.values, y.values)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [86]:
# test_df.columns.duplicated().sum()
# xgboost does not like multi-index

0

In [88]:
xgb_pred = model.predict(X_test)

In [89]:
confusion_matrix(y_test, xgb_pred)

array([[191,   0],
       [  6, 287]], dtype=int64)

In [90]:
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       191
           1       1.00      0.98      0.99       293

    accuracy                           0.99       484
   macro avg       0.98      0.99      0.99       484
weighted avg       0.99      0.99      0.99       484



# Apriori

In [91]:
from apyori import apriori, load_transactions

In [119]:
items = prior[['order_id', 'product_id']]

In [None]:
transactions = []
for order in items['order_id']:
    transactions.append((order, item) for item in items['product_id'] )

In [None]:
rules = apriori(transactions)

In [None]:
results = list(rules)



In [None]:
results