# Main Questions to keep in mind

1. Predict the next order; more specifically, which item will the user purchase next?
2. Which products will be 'discovered', i.e. what should be recommended based on prior purchases?
3. Which products are usually purchased together, and which items aren't purchased together?

# Data Wrangling/ Cleaning

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns; sns.set()

In [10]:
# Load the six raw tables; every CSV is read from the current working directory.
aisles, departments, orders, prior, train, products = (
    pd.read_csv(stem + ".csv")
    for stem in (
        "aisles",
        "departments",
        "orders",
        "order_products__prior",
        "order_products__train",
        "products",
    )
)


FileNotFoundError: [Errno 2] File b'aisles.csv' does not exist: b'aisles.csv'

In [None]:
# Stack the train order lines under the prior ones to obtain the whole sample.
# pd.concat is used instead of DataFrame.append, which is deprecated and was
# removed in pandas 2.0; like append, it keeps each frame's own row labels.
full = pd.concat([prior, train])

In [None]:
# Order the rows by order id; mergesort is stable, so rows sharing an
# order id keep their relative position.
full = full.sort_values(by='order_id', kind='mergesort')

In [None]:
# Left-join the orders table onto every order line (matched on order_id)
# to bring in the per-order columns such as user and timing information.
full = pd.merge(full, orders, how='left', on='order_id')

In [None]:
# Left-join the products table (matched on product_id) so each line carries
# the product's descriptive columns alongside its id.
full = pd.merge(full, products, how='left', on='product_id')

In [None]:
# Left-join the aisles table (matched on aisle_id) to attach the aisle name.
full = pd.merge(full, aisles, how='left', on='aisle_id')

In [None]:
# Left-join the departments table (matched on department_id) to attach the
# department name.
full = pd.merge(full, departments, how='left', on='department_id')

In [None]:
# The names are attached now, so drop the three id columns in place.
for id_column in ('product_id', 'aisle_id', 'department_id'):
    full.pop(id_column)
full.head()

In [None]:
# Put the columns in a fixed, easy-to-scan order: identifiers first, then the
# product description, then the ordering/timing fields.
column_order = [
    'order_id',
    'order_number',
    'user_id',
    'department',
    'aisle',
    'product_name',
    'add_to_cart_order',
    'days_since_prior_order',
    'order_dow',
    'order_hour_of_day',
    'reordered',
    'eval_set',
]
full = full[column_order]
full.head()

# Storytelling, Exploratory Data Analysis

In [None]:
# How can the items be classified? Count the order lines per department
# (produce has the most) -- will this show up in the recommended products?
plt.figure(figsize=(10, 10))
sns.countplot(data=full, y='department', orient='h', saturation=0.5)
plt.title('What department is purchased from the most?')

In [None]:
# Number of rows in the products table
total_products = len(products)
total_products

In [None]:
# Number of rows in the aisles table
total_aisles = len(aisles)
total_aisles

In [None]:
# Number of rows in the departments table
total_departments = len(departments)
total_departments

In [None]:
# Data for the department / aisle comparison.
# `full` has one row per purchased item, so each product is de-duplicated
# first: the totals are then the number of distinct products per aisle
# rather than the number of purchase lines.
stock = full[['department', 'aisle', 'product_name']].drop_duplicates()
# size() yields a Series; reset_index(name=...) turns it into a DataFrame with
# a named 'totals' column, ready to be passed to the plotting functions.
total_units = (
    stock.groupby(['department', 'aisle'])
    .size()
    .sort_values()
    .reset_index(name='totals')
)
total_units.head()

In [None]:
has_missing_label = full['aisle'].eq('missing') | full['department'].eq('missing')
full.loc[has_missing_label]
# 77396 observations contain "missing"
# because 1258 products have "missing" in the aisle and/or department column

In [None]:
# Dot plot of the per-aisle totals, coloured by department, for quicker
# comparison than the raw table allows.
fig, ax = plt.subplots(figsize=(20, 65), dpi=325)

# Background: a faint guide line for every aisle
ax.hlines(
    y=total_units.aisle,
    xmin=350,
    xmax=1050,
    color='gray',
    alpha=0.3,
    linewidth=2,
    linestyles='dashdot',
)

# `s` is the marker size, passed down to plt.scatter at draw time.
# The axes are passed explicitly instead of relying on the current-axes state.
sns.scatterplot(
    x='totals',
    y='aisle',
    hue='department',
    data=total_units,
    palette='colorblind',
    s=500,
    ax=ax,
)

# Title and legend
ax.set_title('Product Aisle Items', fontdict={'size': 63})
ax.legend(loc='lower right', prop={'size': 50}, markerscale=5)

# x axis: pin the tick positions before labelling them, so each label is
# guaranteed to sit on the value it names.
x_ticks = [0, 200, 400, 600, 800, 1000, 1200]
ax.set_xlim(0, 1300)
ax.set_xlabel('Items Available', fontdict={'size': 50})
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticks, fontdict={'size': 36})
ax.tick_params(axis='x', labelsize=50, which='major')

# y axis: one tick per aisle, labelled with the title-cased aisle name
ax.tick_params(axis='y', labelsize=25, which='major')
ax.set_ylabel('Aisle', fontdict={'size': 21})
ax.set_yticks(total_units.aisle)
ax.set_yticklabels(
    total_units.aisle.str.title(),
    fontdict={'horizontalalignment': 'right'},
)

plt.show()

In [None]:
# The 20 product names that occur most often across all order lines
product_counts = full['product_name'].value_counts()
top_20_items = product_counts.iloc[:20]
top_20_items

In [None]:
# Horizontal bar chart of the most purchased products
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
ax.barh(top_20_items.index, top_20_items, align='center')
# flip the axis so the labels read top-to-bottom
ax.invert_yaxis()
ax.set_xlabel('Amount Purchased')
ax.set_title('What items are the most purchased?')

In [None]:
# Which items are reordered the most? (only slightly different from the
# most purchased items)
reordered_lines = full.loc[full['reordered'] == 1]
reordered_amt = reordered_lines.groupby('product_name').size().sort_values(ascending=False)
top_20_reordered = reordered_amt.head(20)

In [None]:
# Horizontal bar chart of the most reordered products
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
ax.barh(top_20_reordered.index, top_20_reordered, align='center')
# flip the axis so the labels read top-to-bottom
ax.invert_yaxis()
ax.set_xlabel('Amount Reordered')
ax.set_title('What items are reordered the most ?')

In [None]:
# Which items are usually added to the cart first?
first_in_cart = full.loc[full['add_to_cart_order'] == 1]
first_picks = first_in_cart.groupby('product_name').size().sort_values(ascending=False)
first_picks_top_20 = first_picks.head(20)

In [None]:
# Horizontal bar chart of the products most often added to the cart first
fig, ax = plt.subplots()
fig.set_size_inches(10, 10)
ax.barh(first_picks_top_20.index, first_picks_top_20, align='center')
# flip the axis so the labels read top-to-bottom
ax.invert_yaxis()
ax.set_xlabel('Number of times chosen first')
ax.set_title('What items are chosen first?')

In [None]:
# Count of order lines per value of the `reordered` flag
reorder_ratio = full['reordered'].value_counts()

In [None]:
# Pie chart of the reorder split; the extra kwargs are forwarded to pyplot.pie
pie_kwargs = {
    'startangle': 90,
    'labels': ['Reordered', 'Not Reordered'],
    'autopct': '%.1f%%',
    'fontsize': 'x-large',
}
# Fix the slice order to (1, 0) so the hard-coded labels above always match
# their class, instead of depending on value_counts' by-frequency ordering.
reorder_ratio.reindex([1, 0]).plot(
    kind='pie',
    figsize=(8, 8),
    title='Percentages of Items Reordered',
    shadow=True,
    **pie_kwargs
).set_ylabel('')

In [None]:
# Highest order_number seen for each user
order_amt = full.groupby('user_id')['order_number'].max()
order_amt

In [None]:
# Distribution of the per-user order amount, with the mean marked
mean_orders = order_amt.mean()

plt.figure(figsize=(10, 10))
sns.distplot(
    order_amt,
    bins=10,
    kde=True,
    color='chocolate',
    axlabel='Amount of Purchases',
)
plt.title('How many orders are there per user?')
plt.ylabel('percentage')
# dashed vertical line at the mean, annotated just to its right
plt.axvline(mean_orders, linestyle='dashed')
plt.text(
    mean_orders + 1,
    0.08,
    'mean = ' + str(round(mean_orders, 2)),
    verticalalignment='top',
)

In [None]:
# Highest add_to_cart_order position reached within each order
items_per_purchase = full.groupby('order_id')['add_to_cart_order'].max()
items_per_purchase

In [None]:
# Distribution of the items per purchase, with the mean marked
mean_items = items_per_purchase.mean()

plt.figure(figsize=(10, 10))
sns.distplot(
    items_per_purchase,
    kde=True,
    color='coral',
    axlabel='Items per Purchase',
)
plt.title('How many items are purchased each order?')
plt.ylabel('percentage')
# dashed vertical line at the mean; the annotation sits at a fixed x of 18
plt.axvline(mean_items, linestyle='dashed')
plt.text(
    18,
    0.06,
    'mean = ' + str(round(mean_items, 2)),
    verticalalignment='top',
)

In [None]:
# Mean of days_since_prior_order for each user
days_since_means = full.groupby('user_id')['days_since_prior_order'].mean()
days_since_means

In [None]:
# Distribution of each user's mean days since prior order, with the overall
# mean of those per-user means marked
overall_mean_gap = days_since_means.mean()

plt.figure(figsize=(10, 10))
sns.distplot(
    days_since_means,
    kde=True,
    color='chocolate',
    axlabel='Mean of Days Since Last Order',
)
plt.title('What is the mean amount of days since prior order')
plt.ylabel('percentage')
# dashed vertical line at the mean, annotated just to its right
plt.axvline(overall_mean_gap, linestyle='dashed')
plt.text(
    overall_mean_gap + 2,
    0.06,
    'mean = ' + str(round(overall_mean_gap, 2)),
    verticalalignment='top',
)

In [None]:
# Scatterplot: does a user's mean days since prior order relate to the
# amount of orders that user has made?
plt.figure(figsize=(10, 10))

plt.scatter(order_amt, days_since_means, marker='o', alpha=0.02)

# Overlay the mean gap for each order_number. x and y are taken from the SAME
# grouped series so they are aligned; value_counts().index is ordered by
# frequency, not by order_number, and would pair x values with the wrong means.
mean_gap_by_order_number = orders.groupby('order_number').days_since_prior_order.mean()
plt.plot(
    mean_gap_by_order_number.index,
    mean_gap_by_order_number,
    color='b',
)
plt.title('Amount of orders made vs mean amount of days since prior order')
plt.xlabel('Number of orders made in the past')
plt.ylabel('mean number of days since prior order')

In [None]:
# Number of orders for every (day of week, hour of day) combination
order_times = (
    orders.groupby(["order_dow", "order_hour_of_day"])["order_number"]
    .agg("count")
    .reset_index()
)
# Reshape to hours (rows) x days (columns). Keyword arguments are required:
# positional arguments to DataFrame.pivot were removed in pandas 2.0.
order_times = order_times.pivot(
    index='order_hour_of_day',
    columns='order_dow',
    values='order_number',
)
order_times

In [None]:
# Heatmap of order counts: hour of day (rows) against day of week (columns)
plt.figure(figsize=(10, 10))
sns.heatmap(data=order_times, robust=True)
plt.title("Time of the Day vs. Day of the Week")


In [None]:
# How do the days compare to one another?
# value_counts orders by frequency, so sort by the day code: stem position i
# then holds the count for day code i and lines up with its tick label.
dow = full.order_dow.value_counts().sort_index()
plt.figure(figsize=(10, 10))
plt.stem(dow)
plt.xlabel('Day')
plt.ylabel('Amount of Orders')
# NOTE(review): the names assume day code 0 is Monday -- confirm against the
# dataset documentation.
plt.xticks([0, 1, 2, 3, 4, 5, 6], ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.title('What day has the most orders?')

In [None]:
# How do the hours of the day compare?
# normalize=True gives each hour's share as a fraction; sort by hour so the
# stems run in hour order.
time_of_day = full.order_hour_of_day.value_counts(normalize=True).sort_index()
plt.figure(figsize=(10, 10))
plt.stem(time_of_day)
plt.xlabel('Hour')
plt.xticks([0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22])
plt.ylabel('Percentage of Orders')
plt.title('What are the peak hours?')
# Reference line at a share of 0.01, i.e. 1% (the label previously said 10%)
plt.axhline(y=0.01, linewidth=0.5, linestyle='--', color='magenta')
plt.text(x=0, y=0.015, s=' 1% of orders')

In [None]:
# Organic foods seem very popular.
# Take an explicit copy: adding a column to a slice of `full` would trigger
# pandas' SettingWithCopyWarning and may not behave as intended.
organic_yn = full[['order_id', 'product_name', 'reordered']].copy()
# Flag every line whose product name contains "organic" (case-insensitive)
organic_yn['Is it organic?'] = organic_yn['product_name'].str.contains('Organic', case=False)
organic_yn

In [None]:
# Number of order lines flagged organic vs not organic
organic_flag = organic_yn['Is it organic?']
organic_flag.value_counts()

In [None]:
# 2x2 table of line counts: organic flag (rows) against reordered flag
# (columns), used to consolidate and plot reorder vs organic
organic_table = organic_yn.pivot_table(
    values='order_id',
    index='Is it organic?',
    columns='reordered',
    aggfunc='count',
)
organic_table

In [None]:
# Grouped bars: reordered vs not reordered counts for each organic flag
organic_table.plot.bar()
plt.ylabel('millions')
plt.title('Do organic foods get reordered more often?')

# Statistical Exploratory Data Analysis

To see whether organic foods get reordered more often, a hypothesis test will be conducted.
significance = 0.01.

H0 - Organic food does not have a greater likelihood of being reordered.     
Ha - Organic food does have a greater likelihood of being reordered.

In [None]:
# Move the organic flag out of the index into an ordinary column
organic_table = organic_table.reset_index()

In [None]:
# Parameters for the two beta distributions, read from the count table:
# column 1 holds the reordered counts (successes), column 0 the not-reordered
# counts (failures). Group "a" is table row 1 and group "b" is table row 0.
row_a = organic_table.iloc[1]
row_b = organic_table.iloc[0]
success_a, failure_a = row_a[1], row_a[0]
success_b, failure_b = row_b[1], row_b[0]

In [None]:
# Draw 1000 samples from each group's beta distribution
n_draws = 1000
a_samples_beta = np.random.beta(success_a, failure_a, size=n_draws)
b_samples_beta = np.random.beta(success_b, failure_b, size=n_draws)

In [None]:
# Wrap both sample arrays in pandas Series for easier handling later
a_samples_beta, b_samples_beta = pd.Series(a_samples_beta), pd.Series(b_samples_beta)

In [None]:
# Kernel density estimate of both sample sets on a shared set of axes
beta_draws = pd.DataFrame({'Organic': a_samples_beta, 'Not Organic': b_samples_beta})
beta_draws.plot(kind='kde', title='Beta Distribution')



In [None]:
# The combined graph shows a significant difference but was not effective for
# reading each distribution, so the first sample set is plotted on its own.
hist_axes = a_samples_beta.hist()
hist_axes.set_title('beta distribution')

In [None]:
# Histogram of the second sample set, to check the shape of its distribution
hist_axes = b_samples_beta.hist()
hist_axes.set_title('beta distribution')

In [None]:
 # share of paired draws in which the organic sample exceeds the non-organic one
 share_a_ahead = ((a_samples_beta - b_samples_beta) > 0).mean()
 str(100 * share_a_ahead) + '% Confident in organic having better rate of reorder.' 

The null hypothesis has been rejected; therefore, it can be assumed that organic foods have a higher likelihood of being reordered.

significance = 0.01

H0 - Items that are picked first have no relationship to reorder status

Ha - Items that are picked first have a relationship with reorder status

In [None]:
# Frequentist type hypothesis test: split the lines that were added to the
# cart first by their reordered flag
first_picks = full[full['add_to_cart_order'].eq(1)]
reorder_flag = first_picks['reordered']
reordered_first_picks = first_picks[reorder_flag.eq(1)]
not_reordered_first_picks = first_picks[reorder_flag.eq(0)]

In [None]:
def hypothesis_test(df1, df2, z_critical=2.576):
    """Confidence interval for the difference in mean per-product row counts.

    For each frame, the number of rows per ``product_name`` is counted; the
    two sets of counts are then compared through a pooled-variance interval
    for the difference of their means.

    Args:
        df1: DataFrame with a ``product_name`` column.
        df2: DataFrame with a ``product_name`` column.
        z_critical: Multiplier applied to the standard error. The default,
            2.576, is the two-sided 99% critical value of the standard normal.

    Returns:
        Tuple ``(lower_interval, upper_interval, mean_diff)``, where
        ``mean_diff`` is the mean count of ``df1`` minus that of ``df2``.
    """
    # rows per product; computed once per frame and reused below
    counts1 = df1.product_name.value_counts()
    counts2 = df2.product_name.value_counts()
    # number of distinct products in each frame
    length1 = counts1.shape[0]
    length2 = counts2.shape[0]
    # Pooled variance estimate. The divisor is the combined degrees of freedom
    # (n1 + n2 - 2) and must be parenthesised; without the parentheses only
    # length1 divides and "+ length2 - 2" is added to the quotient.
    var_pop_est = (
        (length1 - 1) * counts1.var() + (length2 - 1) * counts2.var()
    ) / (length1 + length2 - 2)
    # standard error of the difference of means
    SE_diff = np.sqrt(var_pop_est) * np.sqrt(1 / length1 + 1 / length2)
    mean_diff = counts1.mean() - counts2.mean()
    lower_interval = mean_diff - z_critical * SE_diff
    upper_interval = mean_diff + z_critical * SE_diff

    return lower_interval, upper_interval, mean_diff

In [None]:
# Keep the (lower, upper, mean difference) result in a variable so the values
# can be accessed later, then echo it as the cell output.
first_pick_interval = hypothesis_test(reordered_first_picks, not_reordered_first_picks)
first_pick_interval

The confidence interval does not contain zero; therefore, the null hypothesis is rejected.

significance = 0.01

H0 - Any item picked will have no relationship to reorder status

Ha - Any item picked will have a relationship to reorder status

In [None]:
# Split every order line by its reordered flag
reorder_flag = full['reordered']
reordered = full[reorder_flag.eq(1)]
not_reordered = full[reorder_flag.eq(0)]

In [None]:
# Same interval as before, now over all reordered vs not-reordered lines
hypothesis_test(df1=reordered, df2=not_reordered)

In [None]:
# Parameters: counts of the 30 most purchased and 30 most reordered products
product_counts = full['product_name'].value_counts()
top_30_purchases = product_counts.head(30)
top_30_reorders = reordered_amt.head(30)

In [None]:
# One Dirichlet draw per table, using the top-30 counts as concentrations
purchase_samples1 = np.random.dirichlet(alpha=top_30_purchases)
reorder_samples1 = np.random.dirichlet(alpha=top_30_reorders)

In [None]:
# Convert both draws to pandas Series
purchase_samples1, reorder_samples1 = pd.Series(purchase_samples1), pd.Series(reorder_samples1)

In [None]:
# Kernel density estimate of the two Dirichlet draws. The title names the
# distribution that was actually sampled (np.random.dirichlet), not the beta
# distribution used in the earlier organic test.
pd.DataFrame(
    {
        'Purchase': purchase_samples1,
        'Reorder': reorder_samples1,
    }
).plot(
    kind='kde',
    title='Dirichlet Distribution',
)

In [None]:
# One multinomial draw of 30 trials each, using the Dirichlet draws as the
# category probabilities
purchase_samples2 = np.random.multinomial(n=30, pvals=purchase_samples1)
reorder_samples2 = np.random.multinomial(n=30, pvals=reorder_samples1)

In [None]:
# Convert both multinomial draws to pandas Series
purchase_samples2, reorder_samples2 = pd.Series(purchase_samples2), pd.Series(reorder_samples2)

In [None]:
# How often each per-category count occurs within the two multinomial draws
purchase_freq, reorder_freq = (
    draws.value_counts() for draws in (purchase_samples2, reorder_samples2)
)

In [None]:
# Kernel density estimate of the two multinomial draws
multinomial_draws = pd.DataFrame({'Purchase': purchase_samples2, 'Reorder': reorder_samples2})
multinomial_draws.plot(kind='kde', title='Multinomial Distribution')

In [None]:
# Element-wise product of the Dirichlet draw and the multinomial draw,
# treated below as the posterior.
# NOTE(review): confirm that this product is the intended posterior quantity.
purchase = purchase_samples1.mul(purchase_samples2)
reorder = reorder_samples1.mul(reorder_samples2)

In [None]:
# Kernel density estimate of both product series, with a dashed marker at zero
posterior_draws = pd.DataFrame(
    {
        'Posterior Purchased Most': purchase,
        'Posterior Reordered Most': reorder,
    }
)
posterior_draws.plot(kind='kde', title='Most Reordered Items vs Most Purchased Items ')
plt.axvline(0, linestyle='--', color='black')

In [None]:
# share of positions where the purchase value exceeds the reorder value
share_purchase_ahead = ((purchase - reorder) > 0).mean().round(2)
'Confidence level for the most reordered items differing from most purchased items ' + str(100 * share_purchase_ahead) + '%'


We fail to reject the null hypothesis, which suggests that there is no difference in likelihood between the most purchased items and the most reordered items.