In [16]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import linear_kernel, pairwise_distances

from src.recommender import evaluate_recommendations

In [2]:
N_RECOMMENDATIONS = 100

## Dataset

In [3]:
ratings_df = pd.read_csv("../datasets/movielens/ml-1m/ratings.csv")
movies_infos_df = pd.read_json("../datasets/movielens/omdb.csv", lines=True)

full_df = pd.merge(
    ratings_df,
    movies_infos_df,
    left_index=True,
    right_index=True
)

movies = ratings_df["movieId"].unique().tolist()
users = ratings_df["userId"].unique().tolist()

movie_index_to_id = movies_infos_df["id"].to_dict()
movie_id_to_index = {v: k for (k, v) in movie_index_to_id.items()}

In [4]:
full_df_filtered = full_df.loc[full_df["rating"] >= 4]

train_df, test_df = train_test_split(full_df_filtered)

In [113]:
likes_per_user = \
    train_df.groupby("userId")["movieId"].apply(set).to_dict()

target_recommendations_per_user = \
    test_df.groupby("userId")["movieId"].apply(set).to_dict()

# e.g: `target_recommendations_per_user[2]` shows items we should predict for user 2

## Machine Learning

In [6]:
def evaluate_predict():
    scores = []

    for user_id in test_df["userId"].unique():
        target = target_recommendations_per_user[user_id]
        predictions = predict(user_id)

        score = evaluate_recommendations(predictions, target, k=N_RECOMMENDATIONS)
        scores.append(score)

    return np.mean(scores)

### Random model

In [7]:
def predict(user_id):
    recommended_movies = np.random.choice(movies, N_RECOMMENDATIONS)
    
    return recommended_movies

In [8]:
%time evaluate_predict()

CPU times: user 138 ms, sys: 2.73 ms, total: 141 ms
Wall time: 139 ms


0.04288574288451783

### Content Based

In [158]:
genre_vect = TfidfVectorizer()

X = sparse.hstack([
    genre_vect.fit_transform(movies_infos_df["Genre"].fillna("")),
])

In [159]:
cosine_sim = linear_kernel(X, X)

In [160]:
def predict(user_id):
    movie_ids_liked_by_user = \
        train_df.loc[train_df["userId"] == user_id]["id"].tolist()
    
    movie_indices_liked_by_user = [movie_id_to_index[movie_id]
                                   for movie_id in movie_ids_liked_by_user]

    recommended_movies = []
    
    for movie_index in movie_indices_liked_by_user:
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        sim_scores_sorted = sim_scores_sorted[:(N_RECOMMENDATIONS // len(movie_indices_liked_by_user) + 1)]

        recommended_movies_tmp = [movie_index_to_id[recommended_index]
                                  for (recommended_index, score) in sim_scores_sorted]
        
        recommended_movies += recommended_movies_tmp
        
    return recommended_movies

In [161]:
evaluate_predict()

0.04753234834849813

### Collaborative Filtering

In [162]:
v = CountVectorizer(token_pattern="[0-9]+")
v.fit(train_df["movieId"].astype(str));

In [163]:
def make_matrix(df, groupby_column, data_column):
    grouped_df = df.groupby(groupby_column)
    group_keys = list(grouped_df.groups.keys())
    
    data = grouped_df[data_column]\
            .apply(list)\
            .apply(lambda r: " ".join(list(map(str, r))))\
            .tolist()
    
    return v.transform(data), group_keys

In [164]:
X_train, train_users = make_matrix(train_df, groupby_column="userId", data_column="movieId")

In [165]:
cosine_sim = 1 - pairwise_distances(X_train, metric="cosine")

In [166]:
cosine_sim.shape

(185, 185)

In [173]:
def predict(user_id, similarity_threshold=0.2):
    user_index = train_users.index(user_id)
    
    sim_scores = list(enumerate(cosine_sim[user_index]))
    sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    recommended_movies = []
    
    candidates = [(user_index, score)
                  for user_index, score in sim_scores_sorted
                  if score > similarity_threshold]
    
    for similar_user_index, similarity_score in candidates:
        similar_user_id = train_users[similar_user_index]
        
        if similar_user_id == user_id:
            continue
            
        similar_user_likes = list(likes_per_user[similar_user_id])
        
        recommended_movies += similar_user_likes[:(N_RECOMMENDATIONS // len(candidates)) + 1]
        
    return recommended_movies

In [174]:
evaluate_predict()

0.20323759379413636