In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
import plotly.express as px
import random 
import time
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

# for NDCG
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import expr
import pyspark.sql.functions as F

# for lightFM
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

import os
import zipfile
import requests

import numpy as np


LightFM was compiled without OpenMP support. Only a single thread will be used.



In [2]:
# Load the pre-processed users / ratings / business tables from local pickles.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
users, ratings, business = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('users.pkl', 'ratings.pkl', 'business.pkl')
)

In [3]:
# Derive the user feature 'year': 2019 minus the calendar year of 'yelping_since'.
# The reference year 2019 is hard-coded.
yelping_since_index = pd.DatetimeIndex(users['yelping_since'])
users['year'] = 2019 - yelping_since_index.year

In [4]:
# Keep only "active" users (at least 5 ratings) and restrict the ratings,
# business and user tables to the rows that involve them.
user_counts = ratings["user_id"].value_counts()
is_active = user_counts >= 5
active_users = user_counts[is_active].index.tolist()

act_ratings = ratings[ratings['user_id'].isin(active_users)]
act_business = business[business['business_id'].isin(act_ratings['business_id'])]
act_users = users[users['user_id'].isin(act_ratings['user_id'])]

# Report the size of each filtered table.
for label, table in (
    ("number of active users: ", act_users),
    ("number of business appears in active users: ", act_business),
    ("number of ratings made by active users: ", act_ratings),
):
    print(label, len(table))

number of active users:  286130
number of business appears in active users:  185723
number of ratings made by active users:  4538272


In [5]:
# Hold out one rating per user as the test set; all remaining ratings are training.
# NOTE(review): tail(1) takes each user's last row in the frame's current order,
# so this is the "most recent" rating only if act_ratings is already sorted by
# date -- confirm.
ratings_test = act_ratings.groupby('user_id').tail(1)
ratings_training = act_ratings.drop(index=ratings_test.index)

# Feature interactions do not need to be built separately per split, so this
# part can be skipped; the per-split tables feed the counts printed below.
business_training = business[business['business_id'].isin(ratings_training['business_id'])]
business_test = business[business['business_id'].isin(ratings_test['business_id'])]

users_training = users[users['user_id'].isin(ratings_training['user_id'])]
users_test = users[users['user_id'].isin(ratings_test['user_id'])]

# Report the size of every table, training split first and then test split.
for label, table in (
    ("number of users in the training: ", users_training),
    ("number of business in the training: ", business_training),
    ("number of ratings in the training: ", ratings_training),
    ("number of users in the test: ", users_test),
    ("number of business in the test: ", business_test),
    ("number of ratings in the test: ", ratings_test),
):
    print(label, len(table))

number of users in the training:  286130
number of business in the training:  183637
number of ratings in the training:  4252142
number of users in the test:  286130
number of business in the test:  45788
number of ratings in the test:  286130


In [6]:
# Give each frame a fresh 0..n-1 index (old labels are discarded), so rows can
# be addressed by position-like integer labels later on.
act_ratings, act_business, act_users, ratings_training = (
    frame.reset_index(drop=True)
    for frame in (act_ratings, act_business, act_users, ratings_training)
)

In [7]:
# Build the LightFM id/feature mappings from the whole active dataset
# (act_ratings still contains both the training and the held-out rows).
dataset = Dataset()
dataset.fit(users=act_ratings['user_id'], items=act_ratings['business_id'])

# Register every distinct value of each business column as an item feature name.
# NOTE(review): the raw values of all three columns share one feature namespace,
# so e.g. a 'stars' value of 4.0 and a 'review_count' of 4 would presumably be
# treated as the same feature -- confirm, and consider prefixing the values
# with their column name (here and where the feature matrices are built).
for item_feature_column in ('stars', 'state', 'review_count'):
    dataset.fit_partial(
        items=act_business['business_id'],
        item_features=act_business[item_feature_column],
    )

# Register the distinct 'year' values as user feature names.
dataset.fit_partial(users=act_users['user_id'], user_features=act_users['year'])



In [8]:
# Build the training interaction matrix, plus the item/user feature matrices.
# The feature matrices are built from all active businesses/users, so the same
# matrices serve both training and evaluation.
#
# Rows are paired with zip() over the columns instead of one label lookup per
# row (series[i]): this is far cheaper on millions of rows and no longer
# depends on the frames carrying a fresh 0..n-1 index.
(interactions_training, weights) = dataset.build_interactions(
    zip(ratings_training['user_id'], ratings_training['business_id'])
)

# One (business_id, [feature names]) pair per business.
# NOTE(review): the item-feature matrix printed below stores slightly fewer
# entries than (identity + 3 features) per business, which suggests some
# feature names coincide across columns -- confirm.
item_features = dataset.build_item_features(
    (business_id, [stars, state, review_count])
    for business_id, stars, state, review_count in zip(
        act_business['business_id'],
        act_business['stars'],
        act_business['state'],
        act_business['review_count'],
    )
)

# One (user_id, [feature names]) pair per user.
user_features = dataset.build_user_features(
    (user_id, [year])
    for user_id, year in zip(act_users['user_id'], act_users['year'])
)

print(repr(interactions_training))
print(repr(item_features))
print(repr(user_features))

<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 4252142 stored elements in COOrdinate format>
<185723x186942 sparse matrix of type '<class 'numpy.float32'>'
	with 734110 stored elements in Compressed Sparse Row format>
<286130x286145 sparse matrix of type '<class 'numpy.float32'>'
	with 572260 stored elements in Compressed Sparse Row format>


### fit model1 - FM with features

In [9]:
# Hybrid model: WARP loss, 30 latent components, small regularisation
# (item_alpha / user_alpha) on the item and user feature embeddings.
# random_state pins LightFM's random number generator so a fresh
# Restart & Run All is repeatable; without it every run gives different scores.
model1 = LightFM(
    loss='warp',
    no_components=30,
    item_alpha=0.0001,
    user_alpha=0.0001,
    random_state=42,
)
# Train for 20 epochs on the training interactions with both feature matrices.
model1.fit(
    interactions_training,
    epochs=20,
    item_features=item_features,
    user_features=user_features,
)

<lightfm.lightfm.LightFM at 0x1aac513588>

### fit model2 - MF (FM without features)

In [10]:
# Baseline model: 30 latent components, trained on interactions only
# (no item or user feature matrices are passed to fit).
# random_state pins LightFM's random number generator so a fresh
# Restart & Run All is repeatable; without it every run gives different scores.
# NOTE(review): this baseline keeps LightFM's default loss and no
# regularisation, whereas model1 uses loss='warp' with item_alpha/user_alpha
# set -- the comparison therefore mixes "features vs. no features" with a
# change of loss; confirm this is intended.
model2 = LightFM(no_components=30, random_state=42)
# Train for 20 epochs on the training interactions.
model2.fit(interactions_training, epochs=20)

<lightfm.lightfm.LightFM at 0x1abe326400>

### build interactions for test

In [12]:
# Renumber the held-out ratings 0..n-1 so they can be addressed by integer label.
ratings_test = ratings_test.reset_index(
    drop=True,  # discard the old labels instead of keeping them as a column
)

In [13]:
# Build the interaction matrix for the held-out (test) ratings, using the same
# id mappings as the training matrix.
#
# The (user, business) pairs come from zip() over the two columns instead of
# one label lookup per row: this is faster and does not depend on ratings_test
# carrying a fresh 0..n-1 index.
# Note that 'weights' is re-bound here to the test weights.
(interactions_test, weights) = dataset.build_interactions(
    zip(ratings_test['user_id'], ratings_test['business_id'])
)


print(repr(interactions_test))


<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 286130 stored elements in COOrdinate format>


### evaluate results

In [14]:
# Mean precision@5 over users on the held-out ratings, for the feature model
# (model1) and the interactions-only baseline (model2).
# The training interactions are passed to both calls, and the train/test
# intersection check is switched off.
FM_precision_per_user = precision_at_k(
    model1,
    interactions_test,
    train_interactions=interactions_training,
    check_intersections=False,
    item_features=item_features,
    user_features=user_features,
    k=5,
)
FM_precision_overall = FM_precision_per_user.mean()
print(FM_precision_overall)

BL_precision_per_user = precision_at_k(
    model2,
    interactions_test,
    train_interactions=interactions_training,
    check_intersections=False,
    k=5,
)
BL_precision_overall = BL_precision_per_user.mean()
print(BL_precision_overall)

0.00067521754
7.7587116e-05


In [15]:
# Mean AUC over users on the held-out ratings, for the feature model (model1)
# and the interactions-only baseline (model2).
# The training interactions are passed to both calls, and the train/test
# intersection check is switched off.
FM_auc_per_user = auc_score(
    model1,
    interactions_test,
    item_features=item_features,
    user_features=user_features,
    train_interactions=interactions_training,
    check_intersections=False,
)
FM_auc_overall = FM_auc_per_user.mean()
print(FM_auc_overall)

BL_auc_per_user = auc_score(
    model2,
    interactions_test,
    train_interactions=interactions_training,
    check_intersections=False,
)
BL_auc_overall = BL_auc_per_user.mean()
print(BL_auc_overall)

0.94939375
0.8178871
