In [70]:
# !pip install pyspark

In [71]:
# !pip install lightfm

In [72]:
#import libraries
import numpy as np
import pandas as pd
import re
import itertools

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score, precision_at_k
from lightfm.cross_validation import random_train_test_split

#choose hyperparameters
TOP_N = 10

In [73]:
#import rating data and create train and test set
df_rating = pd.read_csv('rating.csv')
df_rating = df_rating[:30000]
df_rating.sort_values(by='timestamp', inplace=True, ignore_index=True)
df_rating.drop('timestamp', inplace=True, axis = 1)
df_rating_train = df_rating[:-10000]
df_rating_test = df_rating[-10000:].reset_index()
df_rating_train.head()

Unnamed: 0,userId,movieId,rating
0,172,296,3.0
1,172,380,3.0
2,172,150,3.0
3,172,592,3.0
4,172,590,4.0


In [74]:
# delete new users fropm test to avoid cold start problem
train_users = df_rating_train.userId.unique()
df_rating_test['old_users'] = df_rating_test.userId.isin(train_users)
df_rating_test.drop(df_rating_test[df_rating_test.old_users == False].index, inplace=True)
df_rating_test.reset_index()
df_rating_test.drop('old_users', axis=1, inplace=True)
df_rating_test.head()

Unnamed: 0,index,userId,movieId,rating
0,20000,124,784,3.0
1,20001,124,2324,4.0
2,20002,124,5445,4.5
3,20003,124,2268,4.5
4,20004,124,58,1.0


In [75]:
df_movies = pd.read_csv('movie.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# ALS

In [76]:
spark = SparkSession.builder.getOrCreate()

In [77]:
#create and define ALS best model using rmse evaluator and ParamGridBuilder

train = spark.createDataFrame(df_rating_train)
test = spark.createDataFrame(df_rating_test)

als = ALS(userCol='userId', itemCol='movieId', ratingCol='rating')
param_grid = ParamGridBuilder()\
            .addGrid(als.rank, [12, 13, 14])\
            .addGrid(als.maxIter, [10, 15, 20])\
            .addGrid(als.regParam, [0.1, 0.15, 0.2])\
            .build()
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
cv = CrossValidator(estimator=als, 
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3) 
model = cv.fit(train)
best_model = model.bestModel

In [78]:
def compute_metrics(df_true, df_pred, top_N):
    """
    returns calculated metrics to evaluate predictions
    """
    result = {}
    test_recs = df_true.set_index(['userId', 'movieId']).join(df_pred.set_index(['userId', 'movieId']))
    test_recs = test_recs.sort_values(by=['userId', 'rank'])

    test_recs['users_item_count'] = test_recs.groupby(level='userId')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='userId').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']
    
    users_count = test_recs.index.get_level_values('userId').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs['rank'] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count

    result[f'MAP@{top_N}'] = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
    result[f'MRR'] = test_recs.groupby(level='userId')['reciprocal_rank'].max().mean()
    return pd.Series(result)

In [79]:
item_titles = pd.Series(df_movies['title'].values, index=df_movies['movieId']).to_dict()

In [80]:
# predict recommendations 
users = test.select(als.getUserCol()).distinct()
recs = best_model.recommendForUserSubset(users, TOP_N)

# prepare to calculating metrics
recs = recs.toPandas()
recs['movieId'] = [list(map(lambda x: x[0], string)) for _, string in enumerate(recs.recommendations)]
recs.drop('recommendations', inplace=True, axis=1)
recs['titles'] = [[item_titles[id] for id in m_ids] for m_ids in recs.movieId]
recs.head()



Unnamed: 0,userId,movieId,titles
0,91,"[6947, 5899, 7153, 527, 1449, 2019, 30749, 295...",[Master and Commander: The Far Side of the Wor...
1,220,"[858, 318, 5060, 2019, 4304, 8638, 6214, 1193,...","[Godfather, The (1972), Shawshank Redemption, ..."
2,90,"[31427, 1198, 3578, 858, 912, 318, 2028, 4866,...","[Hide and Seek (2005), Raiders of the Lost Ark..."
3,58,"[1361, 910, 50, 527, 903, 3007, 1211, 6331, 12...",[Paradise Lost: The Child Murders at Robin Hoo...
4,135,"[7153, 198, 3396, 3179, 106, 199, 750, 1721, 5...","[Lord of the Rings: The Return of the King, Th..."


In [81]:
# compute metrics
recs = recs.explode('movieId')
recs['rank'] = recs.groupby('userId').cumcount() + 1

recs.drop('titles', inplace=True, axis=1)
compute_metrics(df_rating_test, recs, TOP_N)

Precision@1     0.000000
Recall@1        0.000000
Precision@2     0.000000
Recall@2        0.000000
Precision@3     0.000000
Recall@3        0.000000
Precision@4     0.035714
Recall@4        0.000389
Precision@5     0.028571
Recall@5        0.000389
Precision@6     0.047619
Recall@6        0.002110
Precision@7     0.061224
Recall@7        0.003832
Precision@8     0.089286
Recall@8        0.006048
Precision@9     0.079365
Recall@9        0.006048
Precision@10    0.071429
Recall@10       0.006048
MAP@10          0.001153
MRR             0.095238
dtype: float64

# LightFM

### prepare film dataset

In [82]:
# detecting genres 
genres = set()
genres_data = df_movies.genres.unique()
for film_genres in genres_data:
    curr_genres = film_genres.split('|')
    genres.update(curr_genres)

# using dummy-encoding for genres and adding a 'year' column 
df_movies[list(genres)] = np.zeros((len(df_movies), len(genres)), dtype=bool)
df_movies['year'] = np.zeros(len(df_movies), dtype=str)

pattern = r'\d{4}'
for i in list(df_movies.index):
    curr_genres = df_movies.genres.iloc[i].split('|')
    for g in curr_genres:
        df_movies.loc[i,g] = True
    
    title = df_movies.loc[i, 'title']
    year = re.findall(pattern, title)
    try:
        df_movies.loc[i, 'year'] = year[-1] 
    except:
        df_movies.loc[i, 'year'] = '2000' # in case there is not any information about year

In [83]:
df_movies.head()

Unnamed: 0,movieId,title,genres,Romance,Action,Sci-Fi,Mystery,Horror,Musical,Crime,...,Children,Fantasy,Animation,(no genres listed),Comedy,Thriller,Documentary,Adventure,Western,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,False,False,False,False,False,False,False,...,True,True,True,False,True,False,False,True,False,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,True,False,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,True,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1995
4,5,Father of the Bride Part II (1995),Comedy,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,1995


### prepare data for using lightFM Dataset

In [84]:
# define movie features
movie_features = []
col = ['year'] * len(df_movies.year.unique())
unique_values = list(df_movies.year.unique())
for genre in genres:
    col += [genre] * 2
    unique_values += [True, False]

for x,y in zip(col, unique_values):
    res = str(x)+ ":" +str(y)
    movie_features.append(res)
print(movie_features)

['year:1995', 'year:1994', 'year:1996', 'year:1976', 'year:1992', 'year:1988', 'year:1967', 'year:1993', 'year:1964', 'year:1977', 'year:1965', 'year:1982', 'year:1985', 'year:1990', 'year:1991', 'year:1989', 'year:1937', 'year:1940', 'year:1969', 'year:1981', 'year:1973', 'year:1970', 'year:1960', 'year:1955', 'year:1959', 'year:1968', 'year:1980', 'year:1975', 'year:1986', 'year:1948', 'year:1943', 'year:1950', 'year:1946', 'year:1987', 'year:1997', 'year:1974', 'year:1956', 'year:1958', 'year:1949', 'year:1972', 'year:1998', 'year:1933', 'year:1952', 'year:1951', 'year:1957', 'year:1961', 'year:1954', 'year:1934', 'year:1944', 'year:1963', 'year:1942', 'year:1941', 'year:1953', 'year:1939', 'year:1947', 'year:1945', 'year:1938', 'year:1935', 'year:1936', 'year:1926', 'year:1932', 'year:1979', 'year:1971', 'year:1978', 'year:1966', 'year:1962', 'year:1983', 'year:1984', 'year:1931', 'year:1922', 'year:1999', 'year:1927', 'year:1929', 'year:1930', 'year:1928', 'year:1925', 'year:1914'

### making dataset and sparse matrix

In [85]:
dataset = Dataset()
dataset.fit(
        df_rating_train['userId'].unique(), 
        df_movies['movieId'].unique(), 
        item_features = movie_features
)

In [86]:
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df_rating_train.values])

### building movies features

In [87]:
features_names = list(genres) + ['year']

movies_features = df_movies[features_names]
values_list = [list(x) for x in movies_features.values]
feature_list = []
for values in values_list:
    res = []
    for x,y in zip(features_names,values):
        res.append(str(x) +':'+ str(y))
    feature_list.append(res)

movies_tuple = list(zip(df_movies.movieId, feature_list))
print(movies_tuple[0])

(1, ['Romance:False', 'Action:False', 'Sci-Fi:False', 'Mystery:False', 'Horror:False', 'Musical:False', 'Crime:False', 'Film-Noir:False', 'IMAX:False', 'War:False', 'Drama:False', 'Children:True', 'Fantasy:True', 'Animation:True', '(no genres listed):False', 'Comedy:True', 'Thriller:False', 'Documentary:False', 'Adventure:True', 'Western:False', 'year:1995'])


In [88]:
item_features = dataset.build_item_features(movies_tuple, normalize= False)
item_features.todense()

matrix([[1., 0., 0., ..., 0., 0., 1.],
        [0., 1., 0., ..., 0., 0., 1.],
        [0., 0., 1., ..., 1., 0., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 1., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [89]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
items_inv_mapping = {v: k for k, v in item_id_map.items()}

### searching for best hyperparameters

In [90]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """
    while True:
        yield {
            "no_components": np.random.choice([16, 32, 64]),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "num_epochs": np.random.randint(5, 50),
        }


def random_search(train, test, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.
    
    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs)

        score = precision_at_k(model, test, train_interactions=train, k = TOP_N).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [91]:
train_int, val_int = random_train_test_split(interactions, test_percentage=0.2, random_state=42)
(score, hyperparams, model) = max(random_search(train = train_int, test = val_int), key=lambda x: x[0])
print("Best score {} at {}".format(score, hyperparams))

Best score 0.17421384155750275 at {'no_components': 16, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.03695667938051607, 'item_alpha': 3.8799929383614655e-09, 'num_epochs': 41}


### evaluating best model

In [92]:
nc, ls, loss, lr, item_alpha, num_epochs = hyperparams.values()
model = LightFM(no_components=nc, learning_schedule=ls, loss=loss, learning_rate=lr, item_alpha=item_alpha)
model.fit(interactions, 
      item_features= item_features, 
      sample_weight= weights, 
      epochs=num_epochs)

train_auc = auc_score(model,
                      interactions,
                      item_features=item_features
                     ).mean()
print('lightFM training AUC: %s' % train_auc)

lightFM training AUC: 0.9959921


In [93]:
users = df_rating_test['userId'].unique()
titles = []
movie_ids = []
recs = pd.DataFrame({
    'userId': df_rating_test['userId'].unique()
})

for user in users:
    row_id = user_id_map[user]
    pred = model.predict(row_id, list(item_id_map.values()), item_features=item_features)
    top = np.argpartition(pred, -np.arange(TOP_N))[-TOP_N:][::-1]
    ids = [items_inv_mapping[el] for el in top]
    movie_ids.append(ids)
    titles.append([item_titles[el] for el in ids]) 

recs = pd.DataFrame({
    'userId': users,
    'movieId': movie_ids,
    'titles': titles
})

recs.head()

Unnamed: 0,userId,movieId,titles
0,124,"[593, 480, 589, 356, 318, 1210, 296, 380, 377,...","[Silence of the Lambs, The (1991), Jurassic Pa..."
1,208,"[1172, 924, 1721, 1131, 1197, 1617, 858, 2858,...",[Cinema Paradiso (Nuovo cinema Paradiso) (1989...
2,91,"[356, 110, 593, 1196, 1210, 260, 595, 318, 107...","[Forrest Gump (1994), Braveheart (1995), Silen..."
3,58,"[593, 3535, 2710, 1219, 2762, 1617, 4148, 3706...","[Silence of the Lambs, The (1991), American Ps..."
4,135,"[364, 595, 480, 588, 594, 1, 589, 356, 318, 150]","[Lion King, The (1994), Beauty and the Beast (..."


In [94]:
recs.drop('titles', inplace=True, axis=1)
recs = recs.explode('movieId')
recs['rank'] = recs.groupby('userId').cumcount() + 1

In [95]:
metrics = compute_metrics(df_rating_test, recs, TOP_N)
metrics

Precision@1     0.285714
Recall@1        0.002803
Precision@2     0.357143
Recall@2        0.005996
Precision@3     0.333333
Recall@3        0.008800
Precision@4     0.285714
Recall@4        0.009882
Precision@5     0.257143
Recall@5        0.011603
Precision@6     0.238095
Recall@6        0.012685
Precision@7     0.204082
Recall@7        0.012685
Precision@8     0.214286
Recall@8        0.015489
Precision@9     0.222222
Recall@9        0.018292
Precision@10    0.228571
Recall@10       0.020402
MAP@10          0.017125
MRR             0.357143
dtype: float64