### Final Project


In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import scipy.sparse as sparse
import numpy as np

from pandas.api.types import CategoricalDtype
import random
import implicit
from sklearn import metrics

## Helper functions

In [2]:
def train_test_split(ratings, pct_test = 0.2):
    '''
    This function takes the original user-item matrix and "mask" a percentage of the original ratings where a
    user-item interaction has taken place for use as a test set. The returned parameter test_set will contain all the original ratings, 
    while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 
    
    parameters: 
    
        ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
        copy of the original set. This is in the form of a sparse csr_matrix. 
        
        pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
        training set for later comparison to the test set, which contains all of the original ratings. 
    
    returns:
    
        training_set - The altered version of the original data with a certain percentage of the user-item pairs 
        that originally had interaction set back to zero.
        
        test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
        compares with the actual interactions.
        
        user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
        This will be necessary later when evaluating the performance via AUC.
    '''
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

# evaluate the model
def auc_score(predictions, test):
    '''
    This simple function will output the area under the curve using sklearn's metrics. 
    
    parameters:
    
        - predictions: your prediction output
        
        - test: the actual target result you are comparing to
    
    returns:
    
        - AUC (area under the Receiver Operating Characterisic curve)
    '''
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)  

def calc_mean_metrics(training_set, altered_users, predictions, test_set, func):
    '''
    This function will calculate the mean AUC by user for any user that had their user-item matrix altered. 
    
    parameters:
    
        training_set - The training set resulting from train_test_split, where a certain percentage of the original
        user/item interactions are reset to zero to hide them from the model 
        
        altered_users - The indices of the users where at least one user/item pair was altered from train_test_split function        
        
        predictions - The matrix of your predicted ratings for each user/item pair as output from the implicit MF.
        These should be stored in a list, with user vectors as item zero and item vectors as item one. 
        
        test_set - The test set constucted earlier from train_test_split function

        func - The metrics function
    
    returns:
    
        The mean AUC (area under the Receiver Operator Characteristic curve) of the test set only on user-item interactions
    there were originally zero to test ranking ability in addition to the most popular items as a benchmark.
    '''
    
    store_auc = [] # An empty list to store the AUC for each user that had an item removed from the training set
    popularity_auc = [] # To store popular AUC scores
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1) # Get sum of item iteractions to find most popular
    item_vecs = predictions[1]
    for user in altered_users: # Iterate through each user that had an item altered
        training_row = training_set[user,:].toarray().reshape(-1) # Get the training set row
        zero_inds = np.where(training_row == 0) # Find where the interaction had not yet occurred
        # Get the predicted values based on our user/item vectors
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        # Get only the items that were originally zero
        # Select all ratings from the MF prediction for this user that originally had no iteraction
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        # Select the binarized yes/no interaction pairs from the original full data
        # that align with the same pairs in training 
        pop = pop_items[zero_inds] # Get the item popularity for our chosen items
        store_auc.append(func(pred, actual)) # Calculate AUC for the given user and store
        popularity_auc.append(func(pop, actual)) # Calculate AUC using most popular and score
    # End users iteration

    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc)) ,float('%.3f'%np.median(store_auc)), float('%.3f'%np.median(popularity_auc)) 
    # Return the mean AUC rounded to three decimal places for both test and popularity benchmark

def get_items_purchased(customer_id, user_item_train, customers_list, products_list, item_lookup):
    '''
    This function retrieves which items have been already purchased by a specific user in the training set. 
    
    parameters: 
    
        customer_id - Input the customer's id number that you want to see prior purchases of at least once
        
        user_item_train - The initial ratings training set used (without weights applied)
        
        customers_list - The array of customers used in the ratings matrix
        
        products_list - The array of products used in the ratings matrix
        
        item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
    
    returns:
    
        A list of item IDs and item descriptions for a particular customer that were already purchased in the training set
    '''
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    purchased_ind = user_item_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
    return item_lookup.loc[item_lookup.StockCode.isin(prod_codes)]

def recommend_for_one_user(customer_id, customers_list, model, user_item_train, item_lookup, user_item_df, item_id:str = 'StockCode'):
    '''
    This function retrieves the recommendation items for a single user. 
    
    parameters: 
    
        customer_id - Input the customer's id number that you want to see recommendations
        
        customers_list - The array of customers used in the ratings matrix
        
        model - the implicit model object
        
        user_item_train - The initial ratings training set used (without weights applied)
        
        item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available

        user_item_df - The df that contains the original index for user item records

        item_id - The column name for the item id
    
    returns:
    
        A list of item IDs, item descriptions, and scores in desc order for a particular customer
    '''
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    # Use the implicit recommender.
    recommended = model.recommend(cust_ind, user_item_train)

    item_codes = [user_item_df.loc[item[0],item_id] for item in recommended]
    scores = [item[1] for item in recommended]

    score_df = pd.DataFrame({item_id: item_codes, 'Score': scores})
    items = item_desc.loc[item_lookup[item_id].isin(item_codes)]
    recommendations = score_df.merge(items,on=item_id)
    return recommendations


## 1. Pre-processing

### 1.1 Check the cleaned data

In [3]:
# read input data
df= pd.read_excel("Online Retail After Cleaning.xlsx")
print(df.shape)
print(df.dtypes)
# invoice number could contain character
df.head(3)

(331442, 8)
InvoiceNo               int64
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID              int64
Country                object
dtype: object


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom


In [4]:
# check the cleaned data
df.describe()

Unnamed: 0,InvoiceNo,Quantity,UnitPrice,CustomerID
count,331442.0,331442.0,331442.0,331442.0
mean,560831.914223,7.538975,2.099396,15326.786943
std,13144.721504,6.794875,1.413986,1712.035802
min,536365.0,1.0,0.001,12347.0
25%,549302.0,2.0,1.25,13993.0
50%,562213.0,6.0,1.65,15249.0
75%,572306.0,12.0,2.95,16824.0
max,581587.0,27.0,6.0,18287.0


In [5]:
# find out data time range
print(df['InvoiceDate'].min(),df['InvoiceDate'].max())

2010-12-01 08:26:00 2011-12-09 12:50:00


In [6]:
# check item count
item_count = df.groupby('StockCode').count()
item_count.reset_index(inplace=True)
item_count['InvoiceNo'].describe()

count    3291.000000
mean      100.711638
std       151.249657
min         1.000000
25%        11.000000
50%        44.000000
75%       127.000000
max      1686.000000
Name: InvoiceNo, dtype: float64

In [7]:
user_order_count = df.groupby(['CustomerID','InvoiceNo']).count()
user_order_count.reset_index(inplace=True)
user_order_count['StockCode'].describe()

count    16720.000000
mean        19.823086
std         22.891105
min          1.000000
25%          6.000000
50%         14.000000
75%         25.000000
max        424.000000
Name: StockCode, dtype: float64

### 1.2 Prepare the data for Collaborative Filtering model

In [8]:
# keep item StockCode and its description for reference
item_desc = df[['StockCode', 'Description']].drop_duplicates()
item_desc.reset_index(inplace=True,drop=True)
item_desc.tail(3)

Unnamed: 0,StockCode,Description
3482,90089,PINK CRYSTAL SKULL PHONE CHARM
3483,84707B,PINK JEWELLED MIRROR TRINKET TRAY
3484,85123A,CREAM HANGING HEART T-LIGHT HOLDER


In [9]:
# build the necessary columns for collaborative filtering
df2 = df[['StockCode', 'Quantity', 'CustomerID']]
grouped_purchased = df2.groupby(['CustomerID', 'StockCode']).sum().reset_index() # Group together
#### !! For the second model, uncomment this line so that we don't use quantity as preference
# grouped_purchased['Quantity'] = 1 # don't record the quantity of the items purchased
grouped_purchased.describe()

Unnamed: 0,CustomerID,Quantity
count,227352.0,227352.0
mean,15301.476789,10.990592
std,1706.042352,13.736908
min,12347.0,1.0
25%,13899.0,3.0
50%,15272.0,6.0
75%,16791.0,12.0
max,18287.0,504.0


In [10]:
# build the user-item matrix for collaborative filtering

customers = list(np.sort(grouped_purchased['CustomerID'].unique())) # Get our unique customers
products = list(grouped_purchased['StockCode'].unique()) # Get our unique products that were purchased
quantity = list(grouped_purchased['Quantity']) # All of our purchases

cat_type1 = CategoricalDtype(categories=customers, ordered=False)
cat_type2 = CategoricalDtype(categories=products, ordered=False)

rows = grouped_purchased['CustomerID'].astype(cat_type1).cat.codes 
# Get the associated row indices
cols = grouped_purchased['StockCode'].astype(cat_type2).cat.codes 
# Get the associated column indices
user_item_all = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(products)))

In [11]:
# check the sparcity, a maximum of 99.5% is recommended by Jesse Steinweg-Woods
matrix_size = user_item_all.shape[0]*user_item_all.shape[1] # Number of possible interactions in the matrix
num_purchases = len(user_item_all.nonzero()[0]) # Number of items interacted with
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

98.34966666245649

In [12]:
# split train test dataset
user_item_train, user_item_test, selected_user_index = train_test_split(user_item_all, pct_test = 0.33)
item_user_train = user_item_train.T.tocsr()
item_user_test = user_item_test.T.tocsr()

## 2. Build the model

In [13]:
# fit the recommendation model

# The implicit library expects data as a item-user matrix 
# so we use item-user matrix for model training
# and user-item for recommendations later

alpha = 40 # ref: http://yifanhu.net/PUB/cf.pdf
model = implicit.als.AlternatingLeastSquares(factors=20, regularization = 0.1, iterations = 50, random_state= 42)
model.fit((item_user_train*alpha).astype('double'))

100%|██████████| 50/50 [00:11<00:00,  4.39it/s]


## 3. Evaluate the model

In [14]:
# Get user and item vectors from our trained model
user_vecs, item_vecs = model.user_factors, model.item_factors

In [15]:
# check the dimensions
print(user_item_train.shape)
print(user_vecs.shape)
print(item_vecs.shape)
print(user_item_test.shape)

(4186, 3291)
(4186, 20)
(3291, 20)
(4186, 3291)


In [16]:
# check the AUC score for this recommender system
calc_mean_metrics(user_item_train, selected_user_index, 
                [sparse.csr_matrix(user_vecs), sparse.csr_matrix(item_vecs.T)], user_item_test, func = auc_score)


(0.858, 0.819, 0.874, 0.832)

## 4. Examples of recommendations

In [17]:
customers_arr = np.array(customers) # Array of customer IDs from the ratings matrix
products_arr = np.array(products) # Array of product IDs from the ratings matrix
customers_arr[:20]

array([12347, 12348, 12349, 12350, 12352, 12353, 12354, 12355, 12356,
       12357, 12358, 12359, 12360, 12361, 12362, 12363, 12364, 12365,
       12367, 12370])

In [18]:
userid = 12357
# check the items purchased in the training data
get_items_purchased(userid, user_item_train, customers_arr, products_arr, item_desc)

Unnamed: 0,StockCode,Description
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
88,15056BL,EDWARDIAN PARASOL BLACK
125,84030E,ENGLISH ROSE HOT WATER BOTTLE
733,35598B,BLACK CHRISTMAS TREE 60CM
1185,15056P,EDWARDIAN PARASOL PINK
1922,85159A,"BLACK TEA,COFFEE,SUGAR JARS"
3257,35599B,BLACK CHRISTMAS TREE 120CM


In [19]:
# Create recommendations
rec = recommend_for_one_user(userid,customers_arr, model, user_item_train, item_desc, grouped_purchased, item_id = 'StockCode')
rec

Unnamed: 0,StockCode,Score,Description
0,21106,1.95698,CREAM SLICE FLANNEL CHOCOLATE SPOT
1,23188,1.795498,VINTAGE 2 METER FOLDING RULER
2,23188,1.795498,VINTAGE 2 METRE FOLDING RULER
3,22821,1.634983,GIFT BAG PSYCHEDELIC APPLES
4,21216,1.626036,"SET 3 RETROSPOT TEA,COFFEE,SUGAR"
5,23426,1.57671,METAL SIGN DROP YOUR PANTS
6,23256,1.566309,CHILDRENS CUTLERY SPACEBOY
7,23256,1.566309,KIDS CUTLERY SPACEBOY
8,23256,1.516222,CHILDRENS CUTLERY SPACEBOY
9,23256,1.516222,KIDS CUTLERY SPACEBOY


In [20]:
# check what the user bought in original data
df.loc[(df['CustomerID']==userid),['StockCode','Description']].drop_duplicates().reset_index()

Unnamed: 0,index,StockCode,Description
0,270569,22064,PINK DOUGHNUT TRINKET POT
1,270570,21232,STRAWBERRY CERAMIC TRINKET POT
2,270571,22067,CHOC TRUFFLE GOLD TRINKET POT
3,270572,21555,CERAMIC STRAWBERRY TRINKET TRAY
4,270573,22316,200 BENDY SKULL STRAWS
...,...,...,...
89,270658,85159B,"WHITE TEA,COFFEE,SUGAR JARS"
90,270659,21216,"SET 3 RETROSPOT TEA,COFFEE,SUGAR"
91,270660,23168,CLASSIC CAFE SUGAR DISPENSER
92,270661,23302,KNEELING MAT HOUSEWORK DESIGN
