In [1]:
import pandas as pd
import numpy as np

from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression
from scipy.special import logit

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

COORDINATION / UPDATE NOTES

* I created a Logistical regression model (create_model function) and applied it to transformed training data (pivoted the original dataset on the basis of customers, dropped some of the less-purchased items to balance out the dataset under the rule of 10); also created top_5 function to output top 5 product recommendations based on that model. -- Zac (3/27)

In [2]:
#Load in the retail data
df = pd.read_excel("../data/Online Retail.xlsx")

#Cut the size of the df to make it less cumbersome
df = df.iloc[:9000]


In [None]:
"""
FUNCTIONS

Functions to perform various analytic/transformation tasks

"""


def create_model(df, item):
    """
    Inputs the transaction dataframe and specific stock item (by ID) and outputs five top recommended purchases based on transaction data

    NOTE: This is a work in progress; haven't been able to overcome perfect separation problems + R2s of 1, speak nothing of the slow speed for fitting.

    """

    #Get the predictors
    predictors = list(df.columns)
    predictors.remove(item)

    df['intercept'] = 1.0

    #Create the model
    m = Logit(df[item], df[predictors])
    m = m.fit(maxiter=1000, method='bfgs')
    #m = m.fit_regularized(maxiter=1000, method="l1")

    return m

def top_5(fitted_model, df):
    """
    Inputs fitted model from create_model, prints top 5 recommendations based on that item.
    """
    t5i = list(fitted_model.params.sort_values(key=abs, ascending=False).iloc[1:].head(5).index)

    for no, x in enumerate(t5i):
        print(F'RECOMMENDATION #{no + 1}: ', df[df['StockCode'] == x].iloc[0]['Description'])






In [None]:
"""
PREPARING THE DATA

In this section we adjust the datatypes and create new data points as needed.
"""

#Total money spent in each purchase
df['TotalSpend'] = df.apply(lambda row: row.UnitPrice * row.Quantity, axis=1)

# Remove abnormal stock codes (Post, D, DOT, etc)
mask1 = df['StockCode'].str.contains(r'[a-zA-Z]', regex=True, na=False)
mask2 = df['StockCode'].str.match(r'^[^0-9]', na=False)
df = df[~mask2]

#Pivot the dataframe to focus on customer behavior
df['Purchased'] = 1 #Adds a binary column for the pivot
client_df1 = df.pivot_table(index='CustomerID', columns='StockCode', values='Purchased', aggfunc='max', fill_value=0)

#Create a reference to determine the customers making the most purchases.
client_df1['PurchaseNo'] = client_df1.apply(lambda row: row.sum(), axis=1)

#RULE OF 10 - shave off some customers who don't make a lot of purchases in order to rectify the feature/column ratio
cutoff_point = 15 # No. of purchases to serve as cutoff threshold for training data
client_df1 = client_df1[client_df1['PurchaseNo'] > 12]





In [None]:
"""
EXAMINING THE DATA

In this section we see what we have with the data set.
"""
#DATAFRAME IN GENERAL
#How many df entries in total? - 541909
transaction_no = len(df)

#How many customers in total? - 4373
customer_no = len(df['CustomerID'].unique())

#How many transactions in total? - 25900
transaction_no = len(df['InvoiceNo'].unique())


#INDIVIDUAL CUSTOMERS
#Average individual customer spend (those that spent money/did not get refunds) - 1923.48
customer_spend = df.groupby('CustomerID')['TotalSpend'].aggregate('sum')
customer_spend = customer_spend[customer_spend > 0]
avg_spend = np.mean(customer_spend).round(2)

#PRODUCTS
# How many products in total? - 4070
product_no = len(df['StockCode'].unique())

#Value counts of various products (possible grouping less popular items as 'other' on the basis of how many times they appear)
prod_counts = df['StockCode'].value_counts()
#print(len(prod_counts[prod_counts < 15]))




In [None]:
"""
EXECUTING THE MODEL

In this section we apply the model to the data set.
"""

#This creates the model based on one product - StockCode 84380 - in the hope of eventually outputting the five most suitable product recommendations based on customers cross-purchasing.
m = create_model(client_df1, 84380)


Optimization terminated successfully.
         Current function value: 0.000000
         Iterations: 24
         Function evaluations: 30
         Gradient evaluations: 30




In [None]:
top_5(m, df)


RECOMMENDATION #1:  PACK OF 72 SKULL CAKE CASES
RECOMMENDATION #2:  PAPER CHAIN KIT 50'S CHRISTMAS 
RECOMMENDATION #3:  HEART OF WICKER LARGE
RECOMMENDATION #4:  JUMBO STORAGE BAG SUKI
RECOMMENDATION #5:  SET 6 FOOTBALL CELEBRATION CANDLES


In [None]:
"""
TESTING THE MODEL

In this section we discern the accuracy of the model.
"""

'\nTESTING THE MODEL\n\nIn this section we discern the accuracy of the model.\n'