In [158]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import linear_kernel

from src.recommender import evaluate_recommendations

In [106]:
N_RECOMMENDATIONS = 100

## Dataset

In [149]:
ratings_df = pd.read_csv("../datasets/movielens/ml-1m/ratings.csv")
movies_infos_df = pd.read_json("../datasets/movielens/omdb.csv", lines=True)

full_df = pd.merge(
    ratings_df,
    movies_infos_df,
    left_index=True,
    right_index=True
)

movies = ratings_df["movieId"].unique().tolist()
users = ratings_df["userId"].unique().tolist()

movie_index_to_id = movies_infos_df["id"].to_dict()
movie_id_to_index = {v: k for (k, v) in movie_index_to_id.items()}

In [108]:
full_df_filtered = full_df.loc[full_df["rating"] >= 4]

train_df, test_df = train_test_split(full_df_filtered)

In [109]:
target_recommendations_per_user = \
    test_df.groupby("userId")["movieId"].apply(set).to_dict()

# e.g: `target_recommendations_per_user[2]` shows items we should predict for user 2

## Machine Learning

In [118]:
def evaluate_predict():
    scores = []

    for user_id in test_df["userId"].unique():
        target = target_recommendations_per_user[user_id]
        predictions = predict(user_id)

        score = evaluate_recommendations(predictions, target, k=N_RECOMMENDATIONS)
        scores.append(score)

    return np.mean(scores)

### Random model

In [190]:
def predict(user_id):
    recommended_movies = np.random.choice(movies, N_RECOMMENDATIONS)
    
    return recommended_movies

In [191]:
%time evaluate_predict()

CPU times: user 138 ms, sys: 11.6 ms, total: 149 ms
Wall time: 162 ms


0.03279629928650603

### Content Based

In [159]:
genre_vect = TfidfVectorizer()

X = sparse.hstack([
    genre_vect.fit_transform(movies_infos_df["Genre"].fillna("")),
])

In [160]:
cosine_sim = linear_kernel(X, X)

In [187]:
def predict(user_id):
    movie_ids_liked_by_user = \
        train_df.loc[train_df["userId"] == user_id]["id"].tolist()
    
    movie_indices_liked_by_user = [movie_id_to_index[movie_id]
                                   for movie_id in movie_ids_liked_by_user]

    recommended_movies = []
    
    for movie_index in movie_indices_liked_by_user:
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        sim_scores_sorted = sim_scores_sorted[:(N_RECOMMENDATIONS // len(movie_indices_liked_by_user))]

        recommended_movies_tmp = [movie_index_to_id[recommended_index]
                                  for (recommended_index, score) in sim_scores_sorted]
        
        recommended_movies += recommended_movies_tmp
        
    return recommended_movies

In [188]:
evaluate_predict()

0.042441582802978564