In [16]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds 

import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from sklearn.neighbors import NearestNeighbors

## 0- Load Data

In [2]:
def load_data(path, chunksize):
    '''
    This function returns a dataframe generated from a csv file path as input

    Args:
    - path : the path where the csv file is located, as a string type
    - chunksize : number of lines to read from the file per chunk, as an int type

    Returns:
    - df : the dataframe loaded from the csv file

    '''

    data = pd.read_csv(str(path), iterator=True, chunksize=int(chunksize))
    df = df = pd.concat(data, ignore_index=True)

    return df

In [3]:
df = load_data('movie_db_clean.csv', 250000)

## 1- Data Pre-Processing

Inspired by the following repo : https://github.com/topspinj/tmls-2020-recommender-workshop

In [4]:
def create_matrix(df):
    
    """
    This function creates a sparse user-movie matrix from a dataframe

    Args:
    - a Dataframe that contains at least the columns movieId, userId, and rating

    Returns:
    - matrix : a sparse user-movie matrix of size NxM with N the number of unique users and M the number of unique movies
    - map_user : a dictionary that maps user_ids to their respective indices
    - map_user_inv : a dictionary that maps indices to the user_id
    - map_movie : a dictionary that maps movie_ids to their respective indices
    - map_movie_inv : a dictionary that maps indices to the movie_id

    """

    N = df['userId'].nunique()
    M = df['movieId'].nunique()

    map_user = dict(zip(np.unique(df['userId']), list(range(df['userId'].nunique()))))
    map_movie = dict(zip(np.unique(df['movieId']), list(range(df['movieId'].nunique()))))

    map_user_inv = {v: k for k, v in map_user.items()}
    map_movie_inv = {v: k for k, v in map_movie.items()}

    user_idx = [map_user[i] for i in df['userId']]
    movie_idx = [map_movie[i] for i in df['movieId']]

    matrix = csr_matrix((df["rating"], (user_idx, movie_idx)), shape=(N,M))

    df_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)

    return matrix, df_matrix, map_user, map_user_inv, map_movie, map_movie_inv, user_idx, movie_idx

In [5]:
matrix, df_matrix, map_user, map_user_inv, map_movie, map_movie_inv, user_idx, movie_idx = create_matrix(df)

## 2- Create SVD algorithm

Inspired by the following repo : https://github.com/vivdalal/movie-recommender-system/blob/master/movie_recommendation_system.ipynb

In [6]:
def svd(matrix, n_factors=50):
    """
    This function returns a dataframe with the predicted ratings for all users within the dataframe

    Args:
    - matrix : the sparse user-movie matrix created during step 1
    - n_factors : the number of factors / rank of the latent matrix for factorization

    Returns:
    - predictions : a DataFrame containing the predicted ratings for all users in the original dataset
    """
    # The following code creates :
    # U : user matrix of dimension (n_users, n_factors)
    # sigma : the diagonal matrix of singular values
    # V_t : the transposed movie matrix of dimension (n_factors, n_movies)
    
    U, sigma, V_t = svds(matrix, k = n_factors)

    sigma = np.diag(sigma)

    pred_ratings = np.dot((U @ sigma), V_t)

    predictions = pd.DataFrame(pred_ratings)

    predictions.rename(columns=dict(zip(predictions.columns, list(map_movie.keys()))))
    predictions.index = list(map_user.keys())
    
    return predictions

In [7]:
df_pred = svd(matrix, n_factors=50)

## 3- Recommand movies to user

Inspired by the following repo : https://github.com/vivdalal/movie-recommender-system/blob/master/movie_recommendation_system.ipynb

In [8]:
def recommend_movies(df_pred, user_id, df, df_matrix, n_recommendations):

    """
    This function returns a DataFrame with movies recommandations based on user's previously rated movies

    Args:
    - df_pred : the dataframe with the rating predictions for all users in the dataset
    - user_id : the user id of the user we want to make recommandations to
    - df : the original dataframe generated with the csv import 
    - df_matrix : the user-movie matrix containing the ratings 
    - n_recommandations : the number of movies we want to recommand to the user

    Returns:
    - recommandations : a dataframe containing the movie_id and the titles of the movies we recommand to the user
    """

    # Sort user's predictions
    sort_pred = df_pred.iloc[user_id].sort_values(ascending=False)
    
    # User data
    user_data = df_matrix.iloc[user_id]

    # Get the index of movies already seen by user
    # We filled by 0 the missing values and there was no 0 rating in the original database
    seen_movies = list(user_data[user_data != 0.0].index)

    print('User {0} has already rated {1} movies.'.format(user_id, len(seen_movies)))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(n_recommendations))
    
    # Recommend the highest predicted rating movies that the user hasn't seen yet
    reco_movies = sort_pred[~sort_pred.index.isin(seen_movies)][:n_recommendations].index

    # Return the recommanded movies with their respective titles
    recommandations = df[['movieId', 'title']].drop_duplicates(subset=['movieId']).set_index('movieId').iloc[reco_movies]

    return recommandations

In [9]:
recommandations = recommend_movies(df_pred, 972, df, df_matrix, 10)

recommandations

User 972 has already rated 43 movies.
Recommending highest 10 predicted ratings movies not already rated.


Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
260,Star Wars: Episode IV - A New Hope
1210,Star Wars: Episode VI - Return of the Jedi
1196,Star Wars: Episode V - The Empire Strikes Back
356,Forrest Gump
1,Toy Story
858,"Godfather, The"
780,Independence Day (a.k.a. ID4)
480,Jurassic Park
588,Aladdin
364,"Lion King, The"


**What are the movies the user has already watched and the associated ratings?**

In [10]:
seen_movies = list(df_matrix.iloc[972][df_matrix.iloc[972] != 0.0].index)

In [11]:
df[['movieId', 'title', 'rating']].drop_duplicates(subset=['movieId']).set_index('movieId').iloc[seen_movies]

Unnamed: 0_level_0,title,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Jumanji,5.0
20,Money Train,1.0
40,"Cry, the Beloved Country",4.0
107,Muppet Treasure Island,4.0
157,Canadian Bacon,3.0
187,Party Girl,3.0
231,Dumb & Dumber (Dumb and Dumber),4.0
254,Jefferson in Paris,3.0
268,Little Odessa,4.0
286,Nemesis 2: Nebula,2.0


## 4- NLP/LLM based Recommandations

Inspired by the following repo : https://github.com/pritishmishra703/Recommendation-System-with-Universal-Sentence-Encoder

In [12]:
def nlp_reco(prompt, model_url, df, n_recommendations):

    '''
    This function returns a dataframe with a selection of movies recommandations based on user's specific text input

    Args:
    - prompt : the user's text input describing the kind of movies he wants to see
    - model_url : the type of model (encoder) we want to use to make NLP recommandations
    - df : the original dataframe generated with the csv import 
    - n_recommandations : the number of movies we want to recommand to the user

    Returns:
    - recommandations : a dataframe containing the movie_id, the titles and the genres of the movies we recommand to the user
    
    '''

    model = hub.load(str(model_url))

    synopsis = list(df.drop_duplicates(subset=['movieId'])['Synopsis'])

    synop_embed = model([synopsis])
    prompt_embed = model([prompt])

    nn = NearestNeighbors(n_neighbors=n_recommendations)
    nn.fit(synop_embed)

    reco_idx = nn.kneighbors(prompt_embed, return_distance=False)[0]

    df_filter = df[['movieId', 'title', 'genres']].drop_duplicates(subset=['movieId']).set_index('movieId')

    reco = df_filter.iloc[reco_idx].sort_index()

    return reco

In [13]:
prompt_1 = 'I want to this a funny movie about a cowboy doll'

prompt_2 = "Je veux voir un film d'animation qui parle d'une poupée cowboy"

model_url = 'https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/multilingual/versions/2'

In [14]:
nlp_reco(prompt_1, model_url, df, 10)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy']
12,Dracula: Dead and Loving It,['Comedy|Horror']
2253,Toys,['Comedy|Fantasy']
2659,It Came from Hollywood,['Comedy|Documentary']
5703,Wholly Moses,['Comedy']
7203,Final Cut,['Drama']
7408,Jack and the Beanstalk,['Children|Comedy|Fantasy']
48673,Kummelin jackpot,['Comedy']
97194,"Thing: Terror Takes Shape, The",['Documentary']
172909,Cheburashka,['Animation']


In [15]:
nlp_reco(prompt_2, model_url, df, 10)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,['Adventure|Animation|Children|Comedy|Fantasy']
2253,Toys,['Comedy|Fantasy']
3964,"Adventures of Ichabod and Mr. Toad, The",['Animation|Children']
7304,Allegro non troppo,['Animation|Comedy|Fantasy|Musical']
83219,"Pixar Story, The",['Documentary']
97194,"Thing: Terror Takes Shape, The",['Documentary']
133127,Barbie of Swan Lake,['Animation|Children']
136542,Mickey's Christmas Carol,['Animation|Children']
160874,"Life, Animated",['Animation|Documentary']
172909,Cheburashka,['Animation']
