# Implicit Recommender System - LightFM

In [1]:
# conventional libraries
import os
import calendar
import pandas as pd
import numpy as np
import datetime
import random

# data visualizations
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
import seaborn as sns
import matplotlib.pyplot as plt

# algorithm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.evaluation import ranking_metrics_at_k
from tqdm import tqdm
import scipy.sparse as sparse
import implicit
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
from sklearn import preprocessing
from lightfm import LightFM
from scipy.sparse import csr_matrix
from lightfm.evaluation import auc_score
import time
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics.pairwise import cosine_similarity


LightFM was compiled without OpenMP support. Only a single thread will be used.



### Loading data

In [2]:
df = pd.read_csv('df_withC.csv', index_col=[0])
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,StockCode,Description,SC_Clean,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,StockCode_NR,StockCode_L,...,Week,Year_Month,Hour,weekday_nr,Day,is_cancelled,weekday,Quarter,Date,RefundFlg
0,85123A,white hanging heart t-light holder,3716,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123,A,...,48,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False
1,71053,white metal lantern,3724,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,reg,...,48,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False
2,84406B,cream cupid hearts coat hanger,861,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406,B,...,48,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False
3,84029G,knitted union flag hot water bottle,1813,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029,G,...,48,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False
4,84029E,red woolly hottie white heart.,2776,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029,E,...,48,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False


In [3]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

### Defining Cancels and Buys based on Quantity

In [4]:
df['Event'] = df.Quantity.apply(lambda x: 'Cancel' if x < 0 else 'Buy')
df.head()

Unnamed: 0,StockCode,Description,SC_Clean,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,StockCode_NR,StockCode_L,...,Year_Month,Hour,weekday_nr,Day,is_cancelled,weekday,Quarter,Date,RefundFlg,Event
0,85123A,white hanging heart t-light holder,3716,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,85123,A,...,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False,Buy
1,71053,white metal lantern,3724,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,71053,reg,...,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False,Buy
2,84406B,cream cupid hearts coat hanger,861,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,84406,B,...,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False,Buy
3,84029G,knitted union flag hot water bottle,1813,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029,G,...,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False,Buy
4,84029E,red woolly hottie white heart.,2776,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,84029,E,...,2010-12,8,3,1,0,Wednesday,Q3,2010-12-01 00:00:00,False,Buy


In [5]:
buy_count = df.loc[df.Event == "Buy"].shape[0]
cancel_count = df.loc[df.Event == "Cancel"].shape[0]

print(f"Amount of purchase datapoints: {buy_count} \n\
Amount of cancel datapoints: {cancel_count}")

Amount of purchase datapoints: 391163 
Amount of cancel datapoints: 8504


In [6]:
df.Event.unique()

array(['Buy', 'Cancel'], dtype=object)

## Transforming implicit data
Here the implicit data will be divided into negative and positive feedback
We assume that a purchase could be labelled as a positive event
And a cancellation should neutralize the positive effect yet not entirely as the intention to purchase the item (was) is there

In [7]:
event_type_strength = {
    "Buy": 3.0,
    "Cancel": -2.5
}

df['eventStrength'] = df['Event'].apply(lambda x: event_type_strength[x])

Converting `Objects` into `Category` to decrease processing time: [Reference](https://stackoverflow.com/questions/30601830/when-to-use-category-rather-than-object)

---

## LightFM

In [8]:
df_limit = df[['InvoiceDate','Description','SC_Clean','eventStrength','CustomerID']]
df_limit.rename(columns={'SC_Clean':'StockCode'}, inplace=True)
df_limit.head()

Unnamed: 0,InvoiceDate,Description,StockCode,eventStrength,CustomerID
0,2010-12-01 08:26:00,white hanging heart t-light holder,3716,3.0,17850
1,2010-12-01 08:26:00,white metal lantern,3724,3.0,17850
2,2010-12-01 08:26:00,cream cupid hearts coat hanger,861,3.0,17850
3,2010-12-01 08:26:00,knitted union flag hot water bottle,1813,3.0,17850
4,2010-12-01 08:26:00,red woolly hottie white heart.,2776,3.0,17850


In [9]:
df = df_limit.copy()

## Checking the dataframe

In [10]:
df.head()

Unnamed: 0,InvoiceDate,Description,StockCode,eventStrength,CustomerID
0,2010-12-01 08:26:00,white hanging heart t-light holder,3716,3.0,17850
1,2010-12-01 08:26:00,white metal lantern,3724,3.0,17850
2,2010-12-01 08:26:00,cream cupid hearts coat hanger,861,3.0,17850
3,2010-12-01 08:26:00,knitted union flag hot water bottle,1813,3.0,17850
4,2010-12-01 08:26:00,red woolly hottie white heart.,2776,3.0,17850


## Reducing noise by excluding irrelevant features

In [11]:
recData = df[['CustomerID','StockCode','Description','eventStrength']]

In [12]:
recData["CustomerID"]=recData["CustomerID"].astype(int)
recData["StockCode"]=recData["StockCode"].astype(str)
recData["Description"]=recData["Description"].astype(str)

In [13]:
def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):

    interactions = df.groupby([user_col, item_col])[rating_col] \
            .sum().unstack().reset_index(). \
            fillna(0).set_index(user_col)
    if norm:
        interactions = interactions.applymap(lambda x: 1 if x > threshold else 0)
    return interactions

In [14]:
interactions = create_interaction_matrix(df = recData,
                                         user_col = 'CustomerID',
                                         item_col = 'StockCode',
                                         rating_col = 'eventStrength')
interactions.head()

StockCode,0,1,10,100,1000,1001,1002,1003,1004,1005,...,990,991,992,993,994,995,996,997,998,999
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12352,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# transforming 'StockCode' into integers
recData['StockCode'] = recData['StockCode'].astype(str).astype(int)

In [16]:
def create_user_dict(interactions):
    '''
    Function to create a user dictionary based on their index and number in interaction dataset
    Required Input - 
        interactions - dataset create by create_interaction_matrix
    Expected Output -
        user_dict - Dictionary type output containing interaction_index as key and user_id as value
    '''
    user_id = list(interactions.index)
    user_dict = {}
    counter = 0 
    for i in user_id:
        user_dict[i] = counter
        counter += 1
    return user_dict
    
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value
    '''
    item_dict ={}
    for i in range(df.shape[0]):
        item_dict[(df.loc[i,id_col])] = df.loc[i,name_col]
    return item_dict

In [17]:
# create user dict
user_dict = create_user_dict(interactions = interactions)

In [18]:
# create item dict
item_dict = create_item_dict(recData,id_col = "StockCode",name_col="Description" )

In [19]:
def informed_train_test(rating_df, train_ratio,user_dict,item_dict):
    split_cut = np.int(np.round(rating_df.shape[0] * train_ratio))
    train_df = rating_df.iloc[0:split_cut]
    test_df = rating_df.iloc[split_cut::]
    test_df = test_df[(test_df['CustomerID'].isin(train_df['CustomerID'])) & (test_df['StockCode'].isin(train_df['StockCode']))]
    id_cols = ['CustomerID', 'StockCode']
    trans_cat_train = dict()
    trans_cat_test = dict()
    for k in id_cols:
        cate_enc = preprocessing.LabelEncoder()
        trans_cat_train[k] = cate_enc.fit_transform(train_df[k].values)
        trans_cat_test[k] = cate_enc.transform(test_df[k].values)
        
# --- Encode ratings:
    cate_enc = preprocessing.LabelEncoder()
    scores = dict()
    scores['train'] = cate_enc.fit_transform(train_df.eventStrength)
    scores['test'] = cate_enc.transform(test_df.eventStrength)
    n_users = len(user_dict)
    n_items = len(item_dict)
    train = coo_matrix((scores['train'], (trans_cat_train['CustomerID'], \
                                                          trans_cat_train['StockCode'])) \
                                      , shape=(n_users, n_items))
    test = coo_matrix((scores['test'], (trans_cat_test['CustomerID'], \
                                                        trans_cat_test['StockCode'])) \
                                     , shape=(n_users, n_items))
    return train, test, train_df

In [20]:
train, test, raw_train_df = informed_train_test(recData, 0.8, user_dict, item_dict)

## Evaluation with AUC Test and Train Dataset

In [21]:
# Run mode
start_time = time.time()
model_2=LightFM(no_components=115,learning_rate=0.027,loss='warp')
model_2.fit(train,epochs=12,num_threads=4)

# (Mean) Auc Score
auc_train = auc_score(model_2, train).mean()
auc_test = auc_score(model_2, test).mean()

print("--- Run time:  {} mins ---".format((time.time() - start_time)/60))
print("Train AUC Score: {}".format(auc_train))
print("Test AUC Score: {}".format(auc_test))

--- Run time:  0.4271711548169454 mins ---
Train AUC Score: 0.9365456104278564
Test AUC Score: 0.8644058108329773


In [22]:
user_item_matrix =raw_train_df.pivot_table(index='CustomerID', columns='StockCode', values='eventStrength')

### Creating an user/item matrix with CustomerID and StockCode

In [23]:
user_item_matrix =raw_train_df.pivot_table(index='CustomerID', columns='StockCode', values='eventStrength')
user_item_matrix.fillna(0, inplace = True)
user_item_matrix = user_item_matrix.astype(np.int32)
print(user_item_matrix.shape)
user_item_matrix.head()

(4010, 3822)


StockCode,0,1,2,3,4,5,6,7,8,9,...,3886,3887,3888,3889,3890,3891,3892,3893,3894,3895
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12348,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
item_df = pd.DataFrame(list(item_dict.items()),columns = ['StockCode','Description']) 

In [25]:
def user_item_dikts(interaction_matrix, items_df):
    user_ids = list(interaction_matrix.index)
    user_dikt = {}
    counter = 0 
    for i in user_ids:
        user_dikt[i] = counter
        counter += 1

    item_dikt ={}
    for i in range(items_df.shape[0]):
        item_dikt[(items_df.loc[i,'StockCode'])] = items_df.loc[i,'Description']
    
    return user_dikt, item_dikt

In [26]:
def similar_recommendation(model, interaction_matrix, user_id, user_dikt, 
                               item_dikt,threshold = 0,number_rec_items = 10):

    #Function to produce user recommendations

    n_users, n_items = interaction_matrix.shape
    user_x = user_dikt[user_id]
    scores = pd.Series(model.predict(user_x,np.arange(n_items)))
    scores.index = interaction_matrix.columns
    scores = list(pd.Series(scores.sort_values(ascending=False).index))
    
    known_items = list(pd.Series(interaction_matrix.loc[user_id,:][interaction_matrix.loc[user_id,:] > threshold].index).sort_values(ascending=False))
    
    scores = [x for x in scores if x not in known_items]
    score_list = scores[0:number_rec_items]
    known_items = list(pd.Series(known_items).apply(lambda x: item_dikt[x]))
    scores = list(pd.Series(score_list).apply(lambda x: item_dikt[x]))

    print("Items that were liked by the User:")
    counter = 1
    for i in known_items[:25]:
        print(str(counter) + '- ' + i)
        counter+=1

    print("\n Recommended Items:")
    counter = 1
    for i in scores:
        print(str(counter) + '- ' + i)
        counter+=1
#     return score_list

In [27]:
user_dikt, item_dikt = user_item_dikts(user_item_matrix, item_df)

### Model output:
First the items that were 'liked' by the User (customerId = 12444) are depicted (purchased history). <br>
Next, the top 10 recommended items are illustrated below

In [28]:
similar_recommendation(model_2, user_item_matrix, 12444, user_dikt, 
                               item_dikt)

Items that were liked by the User:
1- zinc willie winkie  candle stick
2- zinc heart lattice charger large
3- wooden picture frame white finish
4- wooden box of dominoes
5- wood black board ant white finish
6- white spot blue ceramic drawer knob
7- white soap rack with 2 bottles
8- white brocante soap dish
9- vintage red enamel trim jug 
10- vintage coffee grinder box
11- victorian sewing box medium
12- victorian  metal postcard spring
13- two door curio cabinet
14- soap dish brocante
15- small apothecary measuring jar 
16- single heart zinc t-light holder
17- shelf with 4 hooks home sweet home
18- set/3 decoupage stacking tins
19- set of tea coffee sugar tins pantry
20- set of 3 regency cake tins
21- set of 3 cake tins pantry design 
22- set of 2 wooden market crates
23- set of 2 trays home sweet home
24- set of 2 round tins camembert 
25- set 3 wicker storage baskets 

 Recommended Items:
1- roses regency teacup and saucer 
2- red stripe ceramic drawer knob
3- regency teapot roses 
4

In [29]:
def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to create item-item distance embedding matrix
    Required Input -
        - model = Trained matrix factorization model
        - interactions = dataset used for training the model
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
    '''
    df_item_norm_sparse = sparse.csr_matrix(model.item_embeddings)
    similarities = cosine_similarity(df_item_norm_sparse)
    item_emdedding_distance_matrix = pd.DataFrame(similarities)
    item_emdedding_distance_matrix.columns = interactions.columns
    item_emdedding_distance_matrix.index = interactions.columns
    return item_emdedding_distance_matrix

In [30]:
item_item_dist = create_item_emdedding_distance_matrix(model = model_2,
                                                       interactions = interactions)
## Checking item embedding distance matrix
item_item_dist.head()

StockCode,0,1,10,100,1000,1001,1002,1003,1004,1005,...,990,991,992,993,994,995,996,997,998,999
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,-0.133745,-0.385863,-0.09608,-0.058373,0.314556,0.444623,-0.209469,-0.20102,-0.210142,...,0.102443,0.177235,0.141118,0.071869,0.113424,0.130346,0.140336,0.181057,0.14735,0.181048
1,-0.133745,1.0,0.071821,-0.044747,-0.134126,0.525607,-0.268339,0.843576,-0.122238,0.332269,...,-0.21658,-0.161315,-0.194228,-0.15139,-0.118696,-0.1721,-0.235618,-0.20072,-0.250822,-0.229561
10,-0.385863,0.071821,1.0,0.493876,0.07354,-0.257035,-0.052513,0.151173,0.116449,0.544018,...,-0.148952,-0.15751,-0.143291,-0.091826,-0.133068,-0.118289,-0.16544,-0.143283,-0.148479,-0.219698
100,-0.09608,-0.044747,0.493876,1.0,0.29686,-0.030634,0.109871,0.060443,0.341672,0.287711,...,0.264939,0.280665,0.298995,0.264545,0.186471,0.248172,0.317569,0.294102,0.284477,0.215431
1000,-0.058373,-0.134126,0.07354,0.29686,1.0,0.074017,0.093093,0.024123,-0.296208,0.203394,...,0.844737,0.852094,0.87151,0.899149,0.878136,0.87752,0.85176,0.85497,0.843443,0.848611


In [31]:
def item_item_recommendation(item_emdedding_distance_matrix, item_id, 
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe containing cosine distance matrix b/w items
        - item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
    Expected Output -
        - recommended_items = List of recommended items
    '''
    recommended_items = list(pd.Series(item_emdedding_distance_matrix.iloc[item_id,:]. \
                                  sort_values(ascending = False).head(n_items+1). \
                                  index[1:n_items+1]))
    print(len(recommended_items))
    if show == True:
        print("Item of interest :{0}".format(item_dict[item_id]))
        print("Item similar to the above item:")
        counter = 1
        for i in range(len(recommended_items)):
            print(str(counter),str(item_dict[i]))
            counter+=1
    return recommended_items

In [32]:
list(pd.Series(item_item_dist.iloc[1557,:]. \
                                  sort_values(ascending = False).head(10+1). \
                                  index[1:10+1]))

['1937',
 '2403',
 '1492',
 '2389',
 '1805',
 '3763',
 '1821',
 '1942',
 '2449',
 '2387']

## Item-Item Collaborative Filtering
Here, we use an item-item collaborative filtering method to suggest similar items based on a stockcode given as input. <br>

In [33]:
rec_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    item_id = 3716,
                                    item_dict = item_dict)

10
Item of interest :white hanging heart t-light holder
Item similar to the above item:
1  4 purple flock dinner candles
2  50's christmas gift bag large
3  dolly girl beaker
4  i love london mini backpack
5  i love london mini rucksack
6  nine drawer office tidy
7  oval wall mirror diamante 
8  red spot gift bag large
9  set 2 tea towels i love london 
10  spaceboy baby gift set
