In [1]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics
import plotly.express as px
import random 
import time
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

# for NDCG
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import expr
import pyspark.sql.functions as F

# for lightFM
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

import os
import zipfile
import requests

import numpy as np


LightFM was compiled without OpenMP support. Only a single thread will be used.



In [2]:
# import data
users = pd.read_pickle('users.pkl')
ratings = pd.read_pickle('ratings.pkl')
business = pd.read_pickle('business.pkl')

In [4]:
# transform users feature
users['year']=2019-pd.DatetimeIndex(users['yelping_since']).year

In [5]:
# select active ratings
user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

act_ratings=ratings.loc[ratings['user_id'].isin(active_users)]
act_business=business.loc[business['business_id'].isin(act_ratings['business_id'])]
act_users=users.loc[users['user_id'].isin(act_ratings['user_id'])]
print("number of active users: ",len(act_users))
print("number of business appears in active users: ",len(act_business))
print("number of ratings made by active users: ",len(act_ratings))

number of active users:  286130
number of business appears in active users:  185723
number of ratings made by active users:  4538272


In [6]:
# select the most recent rating as test, the rest as training

ratings_test = act_ratings.groupby('user_id').tail(1)
ratings_training = act_ratings.drop(ratings_test.index)

# no need to build feature interactions seperatly, can skip this part
business_training=business.loc[business['business_id'].isin(ratings_training['business_id'])]
business_test=business.loc[business['business_id'].isin(ratings_test['business_id'])]

users_training=users.loc[users['user_id'].isin(ratings_training['user_id'])]
users_test=users.loc[users['user_id'].isin(ratings_test['user_id'])]

print("number of users in the training: ",len(users_training))
print("number of business in the training: ",len(business_training))
print("number of ratings in the training: ",len(ratings_training))

print("number of users in the test: ",len(users_test))
print("number of business in the test: ",len(business_test))
print("number of ratings in the test: ",len(ratings_test))

number of users in the training:  286130
number of business in the training:  183637
number of ratings in the training:  4252142
number of users in the test:  286130
number of business in the test:  45788
number of ratings in the test:  286130


In [7]:
# reset the index 
act_ratings = act_ratings.reset_index(drop=True)
act_business = act_business.reset_index(drop=True)
act_users = act_users.reset_index(drop=True)

ratings_training = ratings_training.reset_index(drop=True)

In [8]:
# build mapping using whole dataset
dataset = Dataset()
dataset.fit((act_ratings['user_id']),
            (act_ratings['business_id']))
dataset.fit_partial(items=(act_business['business_id']),
                    item_features = (act_business['stars']))

dataset.fit_partial(items=(act_business['business_id']),
                    item_features = (act_business['state']))

dataset.fit_partial(items=(act_business['business_id']),
                    item_features = (act_business['review_count']))

dataset.fit_partial(users=(act_users['user_id']),
                    user_features = (act_users['year']))



In [9]:
# build rating interactions for training
# build feature interactions for both training and test

(interactions_training, weights) = dataset.build_interactions((ratings_training['user_id'][i],ratings_training['business_id'][i]) 
                                                     for i in range(len(ratings_training)))

item_features = dataset.build_item_features(((act_business['business_id'][i], [act_business['stars'][i],
                             act_business['state'][i],act_business['review_count'][i]])
                                              for i in range(len(act_business))))

user_features = dataset.build_user_features(((act_users['user_id'][i], [act_users['year'][i]])
                                              for i in range(len(act_users))))

print(repr(interactions_training))
print(repr(item_features))
print(repr(user_features))

<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 4252142 stored elements in COOrdinate format>
<185723x186942 sparse matrix of type '<class 'numpy.float32'>'
	with 734110 stored elements in Compressed Sparse Row format>
<286130x286145 sparse matrix of type '<class 'numpy.float32'>'
	with 572260 stored elements in Compressed Sparse Row format>


### fit model1 - FM with features

In [10]:
model1 = LightFM(loss='warp',no_components=30, item_alpha=0.0001, user_alpha=0.0001)
model1.fit(interactions_training,epochs=20,item_features=item_features,user_features=user_features)

<lightfm.lightfm.LightFM at 0x20ea4d28cc0>

### fit model2 - MF (FM without features)

In [11]:
model2 = LightFM(no_components=30)
model2.fit(interactions_training,epochs=20)

<lightfm.lightfm.LightFM at 0x20ea4d28ac8>

### build interactions for test

In [12]:
# group the items into popular (>=300 ratings), moderate (10-300 ratings) and unpopular (<10 ratings)
item_counts = act_ratings["business_id"].value_counts()
popular_items = item_counts.loc[item_counts >= 300].index.tolist()
moderate_items = item_counts.loc[(item_counts < 300) & (item_counts >= 10)].index.tolist()
unpopular_items = item_counts.loc[item_counts < 10].index.tolist()

# group the users into active (>=20 ratings), moderate (10-20 ratings) and inactive (<10 ratings)
user_counts = act_ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 20].index.tolist()
moderate_users = user_counts.loc[(user_counts < 20) & (user_counts >= 10)].index.tolist()
inactive_users = user_counts.loc[user_counts < 10].index.tolist()

In [13]:
# divide test ratings into groups 
popular_item_ratings=ratings_test.loc[ratings_test['business_id'].isin(popular_items)]
moderate_item_ratings=ratings_test.loc[ratings_test['business_id'].isin(moderate_items)]
unpopular_item_ratings=ratings_test.loc[ratings_test['business_id'].isin(unpopular_items)]

active_users_ratings=ratings_test.loc[ratings_test['user_id'].isin(active_users)]
moderate_users_ratings=ratings_test.loc[ratings_test['user_id'].isin(moderate_users)]
inactive_users_ratings=ratings_test.loc[ratings_test['user_id'].isin(inactive_users)]

In [14]:
# reset the index 
popular_item_ratings = popular_item_ratings.reset_index(drop=True)
moderate_item_ratings = moderate_item_ratings.reset_index(drop=True)
unpopular_item_ratings = unpopular_item_ratings.reset_index(drop=True)

active_users_ratings = active_users_ratings.reset_index(drop=True)
moderate_users_ratings = moderate_users_ratings.reset_index(drop=True)
inactive_users_ratings = inactive_users_ratings.reset_index(drop=True)

In [15]:
# build rating interactions for test

(interactions_popular_item, weights) = dataset.build_interactions((popular_item_ratings['user_id'][i],popular_item_ratings['business_id'][i]) 
                                                     for i in range(len(popular_item_ratings)))
(interactions_moderate_item, weights) = dataset.build_interactions((moderate_item_ratings['user_id'][i],moderate_item_ratings['business_id'][i]) 
                                                     for i in range(len(moderate_item_ratings)))
(interactions_unpopular_item, weights) = dataset.build_interactions((unpopular_item_ratings['user_id'][i],unpopular_item_ratings['business_id'][i]) 
                                                     for i in range(len(unpopular_item_ratings)))


(interactions_active_users, weights) = dataset.build_interactions((active_users_ratings['user_id'][i],active_users_ratings['business_id'][i]) 
                                                     for i in range(len(active_users_ratings)))
(interactions_moderate_users, weights) = dataset.build_interactions((moderate_users_ratings['user_id'][i],moderate_users_ratings['business_id'][i]) 
                                                     for i in range(len(moderate_users_ratings)))
(interactions_inactive_users, weights) = dataset.build_interactions((inactive_users_ratings['user_id'][i],inactive_users_ratings['business_id'][i]) 
                                                     for i in range(len(inactive_users_ratings)))


print(repr(interactions_popular_item))
print(repr(interactions_moderate_item))
print(repr(interactions_unpopular_item))
print(repr(interactions_active_users))
print(repr(interactions_moderate_users))
print(repr(interactions_inactive_users))


<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 68309 stored elements in COOrdinate format>
<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 187882 stored elements in COOrdinate format>
<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 29939 stored elements in COOrdinate format>
<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 47944 stored elements in COOrdinate format>
<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 74880 stored elements in COOrdinate format>
<286130x185723 sparse matrix of type '<class 'numpy.int32'>'
	with 163306 stored elements in COOrdinate format>


### AUC

In [17]:
# popular item
FM_auc_popular_item = auc_score(model1,interactions_popular_item, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_popular_item = auc_score(model2,interactions_popular_item, train_interactions = interactions_training, check_intersections=False).mean()

In [18]:
# moderate item
FM_auc_moderate_item = auc_score(model1,interactions_moderate_item, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_moderate_item = auc_score(model2,interactions_moderate_item, train_interactions = interactions_training, check_intersections=False).mean()

In [19]:
# unpopular item
FM_auc_unpopular_item = auc_score(model1,interactions_unpopular_item, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_unpopular_item = auc_score(model2,interactions_unpopular_item, train_interactions = interactions_training, check_intersections=False).mean()

In [23]:
# active users
FM_auc_active_users = auc_score(model1,interactions_active_users, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_active_users = auc_score(model2,interactions_active_users, train_interactions = interactions_training, check_intersections=False).mean()

In [20]:
# moderate users
FM_auc_moderate_users = auc_score(model1,interactions_moderate_users, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_moderate_users = auc_score(model2,interactions_moderate_users, train_interactions = interactions_training, check_intersections=False).mean()

In [21]:
# inactive users
FM_auc_inactive_users = auc_score(model1,interactions_inactive_users, train_interactions = interactions_training, check_intersections=False,item_features=item_features,user_features=user_features).mean()

BL_auc_inactive_users = auc_score(model2,interactions_inactive_users, train_interactions = interactions_training, check_intersections=False).mean()

In [24]:
result_auc=[['popular_item',FM_auc_popular_item,BL_auc_popular_item],
         ['moderate_item',FM_auc_moderate_item,BL_auc_moderate_item],
         ['unpopular_item',FM_auc_unpopular_item,BL_auc_unpopular_item],
         ['active_users',FM_auc_active_users,BL_auc_active_users],
         ['moderate_users',FM_auc_moderate_users,BL_auc_moderate_users],
         ['inactive_users',FM_auc_inactive_users,BL_auc_inactive_users]]

result_auc_df=pd.DataFrame(result_auc,columns=['groups','lightFM','Matrix Factorization from lightFM'])
print(result_auc_df)

           groups   lightFM  Matrix Factorization from lightFM
0    popular_item  0.996481                           0.992258
1   moderate_item  0.961698                           0.845464
2  unpopular_item  0.766378                           0.246989
3    active_users  0.961347                           0.806422
4  moderate_users  0.953436                           0.815486
5  inactive_users  0.944330                           0.822355
