In [42]:
import sys
sys.path.append("..")

import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import linear_kernel, pairwise_distances

from src.recommender import evaluate_recommendations

In [2]:
N_RECOMMENDATIONS = 100

## Dataset

In [3]:
ratings_df = pd.read_csv("../datasets/movielens/ml-1m/ratings.csv")
movies_infos_df = pd.read_json("../datasets/movielens/omdb.csv", lines=True)

full_df = pd.merge(
    ratings_df,
    movies_infos_df,
    left_index=True,
    right_index=True
)

movies = ratings_df["movieId"].unique().tolist()
users = ratings_df["userId"].unique().tolist()

In [4]:
full_df_filtered = full_df.loc[full_df["rating"] >= 4]
train_df, test_df = train_test_split(full_df_filtered)

In [5]:
likes_per_user = \
    train_df.groupby("userId")["movieId"].apply(set).to_dict()

target_recommendations_per_user = \
    test_df.groupby("userId")["movieId"].apply(set).to_dict()

# e.g: `target_recommendations_per_user[2]` shows items we should predict for user 2

## Machine Learning

In [6]:
def evaluate_predict():
    scores = []

    for user_id in test_df["userId"].unique():
        target = target_recommendations_per_user[user_id]
        predictions = predict(user_id)

        score = evaluate_recommendations(predictions, target, k=N_RECOMMENDATIONS)
        scores.append(score)

    return np.mean(scores)

### Random model

In [67]:
def predict(user_id):
    recommended_movies = np.random.choice(movies, N_RECOMMENDATIONS)
    
    return recommended_movies

In [68]:
%time evaluate_predict()

CPU times: user 151 ms, sys: 4.14 ms, total: 155 ms
Wall time: 152 ms


0.04449388786776339

### Content Based

In [69]:
genre_vect = TfidfVectorizer()

X = sparse.hstack([
    genre_vect.fit_transform(movies_infos_df["Genre"].fillna("")),
])

In [70]:
cosine_sim = linear_kernel(X, X)

In [71]:
movie_index_to_id = movies_infos_df["id"].to_dict()
movie_id_to_index = {v: k for (k, v) in movie_index_to_id.items()}

def predict(user_id):
    movie_ids_liked_by_user = \
        train_df.loc[train_df["userId"] == user_id]["id"].tolist()
    
    movie_indices_liked_by_user = [movie_id_to_index[movie_id]
                                   for movie_id in movie_ids_liked_by_user]

    recommended_movies = set()
    
    for movie_index in movie_indices_liked_by_user:
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]

        recommended_movies_tmp = [movie_index_to_id[recommended_index]
                                  for (recommended_index, score) in sim_scores_sorted]
        
        recommended_movies.update(recommended_movies_tmp)
        
    return list(recommended_movies)[:N_RECOMMENDATIONS]

In [72]:
evaluate_predict()

0.0733327740029165

### Collaborative Filtering

In [30]:
v = CountVectorizer(token_pattern="[0-9]+")
v.fit(train_df["movieId"].astype(str));

def make_matrix(df, groupby_column, data_column):
    grouped_df = df.groupby(groupby_column)
    group_keys = list(grouped_df.groups.keys())
    
    data = grouped_df[data_column]\
            .apply(list)\
            .apply(lambda r: " ".join(list(map(str, r))))\
            .tolist()
    
    return v.transform(data), group_keys

#### User-Based Collaborative Filtering

In [24]:
X_train, train_users = make_matrix(train_df, groupby_column="userId", data_column="movieId")

In [11]:
cosine_sim = 1 - pairwise_distances(X_train, metric="cosine")

In [12]:
def predict(user_id, similarity_threshold=0.2):
    user_index = train_users.index(user_id)
    
    sim_scores = list(enumerate(cosine_sim[user_index]))
    sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    recommended_movies = set()
    
    candidates = [(user_index, score)
                  for user_index, score in sim_scores_sorted
                  if score > similarity_threshold]
    
    for similar_user_index, similarity_score in candidates:
        similar_user_id = train_users[similar_user_index]
        
        if similar_user_id == user_id:
            continue
            
        similar_user_likes = set(list(likes_per_user[similar_user_id])[:10])
        
        recommended_movies.update(similar_user_likes)
        
    return list(recommended_movies)[:N_RECOMMENDATIONS]

In [13]:
evaluate_predict()

0.18735409277748163

#### Item-Based Collaborative Filtering

In [40]:
X_train, train_movies = make_matrix(train_df, groupby_column="movieId", data_column="userId")

In [41]:
X_train.shape

(3244, 3244)

In [15]:
cosine_sim = 1 - pairwise_distances(X_train, metric="cosine")

In [16]:
def predict(user_id):
    recommended_movies = set()
    
    for movie_id in likes_per_user[user_id]:
        movie_index = train_movies.index(movie_id)
        
        sim_scores = list(enumerate(cosine_sim[movie_index]))
        sim_scores_sorted = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]
        
        recommended_movies.update([train_movies[i] for i, _ in sim_scores_sorted])
    
    return list(recommended_movies)[:N_RECOMMENDATIONS]

In [17]:
evaluate_predict()

0.2026131587238082

In [18]:
cosine_sim.shape

(3244, 3244)