In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
import plotly.express as px
import random 
import time
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

# for NDCG
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import expr
import pyspark.sql.functions as F

# for lightFM
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

import os
import zipfile
import requests

import numpy as np


LightFM was compiled without OpenMP support. Only a single thread will be used.



In [2]:
users = pd.read_pickle('users.pkl')
ratings = pd.read_pickle('ratings.pkl')
business = pd.read_pickle('business.pkl')

In [3]:
# select active ratings
user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

Get sub sample  
1% of the user as subsample, the most recent record as subsample test

In [4]:
# randomly select 1% users
import random 
random.seed(12345)
sample_1pc = random.sample(active_users,int(0.01*len(active_users)))
sample_ratings=ratings.loc[ratings['user_id'].isin(sample_1pc)]
sample_business=business.loc[business['business_id'].isin(sample_ratings['business_id'])]
sample_users=users.loc[users['user_id'].isin(sample_ratings['user_id'])]
print("number of users in the sample: ",len(sample_users))
print("number of business in the sample: ",len(sample_business))
print("number of ratings in the sample: ",len(sample_ratings))

number of users in the sample:  2861
number of business in the sample:  28191
number of ratings in the sample:  49133


In [5]:
sample_users['year']=2019-pd.DatetimeIndex(sample_users['yelping_since']).year



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [6]:
# reset the index 
sample_ratings = sample_ratings.reset_index(drop=True)
sample_business = sample_business.reset_index(drop=True)
sample_users = sample_users.reset_index(drop=True)

# Cross Validation

In [7]:
# parameter set
loss=["bpr", "warp"]
no_components = [10,20,30]
num_epochs = [10,20,30]
item_alpha = [0,0.0001]
user_alpha = [0,0.0001]

In [8]:
def cross_validation(dt1, dt2,loss,no_components,num_epochs,item_alpha,user_alpha):
    result = []
    for i in range(len(loss)):
        for j in range(len(no_components)):
            for k in range(len(num_epochs)):
                for l in range(len(item_alpha)):
                    for m in range(len(user_alpha)):
                        
                        model1 = LightFM(loss=loss[i],no_components=no_components[j],
                                        item_alpha=item_alpha[l], user_alpha=user_alpha[m])
                        model1.fit(dt1,epochs=num_epochs[k],item_features=item_features,user_features=user_features)
                        
                        precision1 = precision_at_k(model1, dt2,item_features=item_features, user_features=user_features,k=10).mean()
                        auc1 = auc_score(model1, dt2,item_features=item_features,user_features=user_features).mean()
                        
                        # swap train and test
                        model2 = LightFM(loss=loss[i],no_components=no_components[j],
                                        item_alpha=item_alpha[l], user_alpha=user_alpha[m])
                        model2.fit(dt2,epochs=num_epochs[k],item_features=item_features,user_features=user_features)
                        
                        precision2 = precision_at_k(model2, dt1,item_features=item_features, user_features=user_features,k=10).mean()
                        auc2 = auc_score(model2, dt1,item_features=item_features,user_features=user_features).mean()
                        
                        result.append([loss[i],no_components[j],num_epochs[k],item_alpha[l],user_alpha[m],
                                       (precision1+precision2)/2,(auc1+auc2)/2])
                        
    return pd.DataFrame(result,columns=['loss','no_components','num_epochs','item_alpha','user_alpha','precision_10','AUC'])

In [9]:
# prepare the dataset
dataset = Dataset()
dataset.fit((sample_ratings['user_id']),
            (sample_ratings['business_id']))
dataset.fit_partial(items=(sample_business['business_id']),
                    item_features = (sample_business['stars']))

dataset.fit_partial(items=(sample_business['business_id']),
                    item_features = (sample_business['state']))

dataset.fit_partial(items=(sample_business['business_id']),
                    item_features = (sample_business['review_count']))

dataset.fit_partial(users=(sample_users['user_id']),
                    user_features = (sample_users['year']))


(interactions, weights) = dataset.build_interactions((sample_ratings['user_id'][i],sample_ratings['business_id'][i]) 
                                                     for i in range(len(sample_ratings)))

item_features = dataset.build_item_features(((sample_business['business_id'][i], [sample_business['stars'][i],
                             sample_business['state'][i],sample_business['review_count'][i]])
                                              for i in range(len(sample_business))))

user_features = dataset.build_user_features(((sample_users['user_id'][i], [sample_users['year'][i]])
                                              for i in range(len(sample_users))))


# 2-fold cross validation
from lightfm.cross_validation import random_train_test_split
my_r=np.random.seed(12345)
dt1, dt2 = random_train_test_split(interactions, test_percentage=0.5, random_state=my_r)


cv_result=cross_validation(dt1,dt2,loss,no_components,num_epochs,item_alpha,user_alpha)


In [10]:
cv_result

Unnamed: 0,loss,no_components,num_epochs,item_alpha,user_alpha,precision_10,AUC
0,bpr,10,10,0.0000,0.0000,0.001726,0.815912
1,bpr,10,10,0.0000,0.0001,0.001726,0.819884
2,bpr,10,10,0.0001,0.0000,0.001708,0.810619
3,bpr,10,10,0.0001,0.0001,0.001462,0.810165
4,bpr,10,20,0.0000,0.0000,0.002026,0.853017
5,bpr,10,20,0.0000,0.0001,0.002325,0.854825
6,bpr,10,20,0.0001,0.0000,0.002166,0.845511
7,bpr,10,20,0.0001,0.0001,0.002677,0.848225
8,bpr,10,30,0.0000,0.0000,0.002854,0.865131
9,bpr,10,30,0.0000,0.0001,0.002166,0.864193


BPR: Bayesian Personalised Ranking pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.

WARP: Weighted Approximate-Rank Pairwise [2] loss. Maximises the rank of positive examples by repeatedly sampling negative examples until rank violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.  

In [17]:
cv_result.loc[cv_reult['AUC'] == cv_result['AUC'].max(),]

Unnamed: 0,loss,no_components,num_epochs,item_alpha,user_alpha,precision_10,AUC
61,warp,30,10,0.0,0.0001,0.009866,0.883216


In [19]:
cv_result.loc[cv_result['precision_10'] == cv_result['precision_10'].max(),]

Unnamed: 0,loss,no_components,num_epochs,item_alpha,user_alpha,precision_10,AUC
67,warp,30,20,0.0001,0.0001,0.0102,0.879504


AUC: Measure the ROC AUC metric for a model: the probability that a randomly chosen positive example has a higher score than a randomly chosen negative example. A perfect score is 1.0.

Precision_at_k: Measure the precision at k metric for a model: the fraction of known positives in the first k positions of the ranked list of results. A perfect score is 1.0.