In [1]:
import pandas as pd 
import numpy as np 
from scipy.sparse import csr_matrix

## Test fonctions Databases

In [2]:
def load_data(path: str, chunk_size: int) -> pd.DataFrame:
    '''
    Read a csv file chunk by chunk and return it as a single dataframe.

    Args:
    - path : the path where the csv file is located, as a string type
    - chunk_size : number of lines to read from the file per chunk, as an int type

    Returns:
    - df : the dataframe loaded from the csv file, with a fresh 0..n-1 index

    '''

    # Passing `chunksize` already makes read_csv return an iterator of DataFrames;
    # the context manager guarantees the file handle is released even if
    # concatenation fails part-way through.
    with pd.read_csv(path, chunksize=chunk_size) as reader:
        df = pd.concat(reader, ignore_index=True)

    return df

In [3]:
# Load the three csv files; the second argument is the number of lines read per chunk.
ratings = load_data('ratings_final.csv', 100000)
movies = load_data('movies_affiche_final.csv', 500)
sql = load_data('db_sql_final.csv', 500)

In [4]:
def create_matrix(df):
    """
    This function creates a sparse user-movie matrix, its dense pivot table and
    the id-to-index mappings from a ratings dataframe

    Args:
    - df : a DataFrame that contains at least the columns 'UserId', 'MovieId' and 'Rating'

    Returns:
    - matrix : a sparse (CSR) user-movie matrix of size NxM with N the number of unique users
      and M the number of unique movies, holding the ratings
    - df_matrix : a dense DataFrame with the user ids as index, the movie ids as columns and
      the ratings as values (0 where a user has no rating for a movie)
    - map_user : a dictionary that maps user_ids to their respective row indices in matrix
    - map_movie : a dictionary that maps movie_ids to their respective column indices in matrix

    Raises:
    - ValueError : raised by DataFrame.pivot when the same (UserId, MovieId) pair appears
      more than once in df

    Inspired by the following repo : https://github.com/topspinj/tmls-2020-recommender-workshop
    """

    # np.unique returns the sorted unique ids and, with return_inverse=True, the
    # position of every row's id inside that sorted array - i.e. exactly the
    # row/column index of each rating, without a Python-level lookup per row.
    user_ids, user_idx = np.unique(df['UserId'].to_numpy(), return_inverse=True)
    movie_ids, movie_idx = np.unique(df['MovieId'].to_numpy(), return_inverse=True)

    map_user = dict(zip(user_ids, range(len(user_ids))))
    map_movie = dict(zip(movie_ids, range(len(movie_ids))))

    matrix = csr_matrix(
        (df['Rating'].to_numpy(), (user_idx, movie_idx)),
        shape=(len(user_ids), len(movie_ids)),
    )

    df_matrix = df.pivot(index='UserId', columns='MovieId', values='Rating').fillna(0)

    return matrix, df_matrix, map_user, map_movie

In [5]:
# Build the sparse matrix, the dense pivot table and the id-to-index maps from the ratings.
matrix, df_matrix, map_user, map_movie = create_matrix(ratings)

## Tests fonctions Recommandations

In [20]:
from scipy.sparse.linalg import svds

In [8]:
def svd(matrix, map_user: dict, map_movie: dict, n_factors: int):
    """
    This function returns a dataframe with the predicted ratings for all users within the dataframe

    Args:
    - matrix : the sparse user-movie matrix created using the create_matrix function
    - map_user : a dictionary that maps user_ids to their respective indices
    - map_movie : a dictionary that maps movie_ids to their respective indices
    - n_factors : the number of factors / rank of the latent matrix for factorization

    Returns:
    - predictions : a DataFrame containing the predicted ratings for all users in the original
      dataset, with the user ids as index and the movie ids as columns

    Raises:
    - ValueError : if n_factors is smaller than 1 or not strictly smaller than the smallest
      dimension of the matrix

    Inspired by the following Git repo : https://github.com/vivdalal/movie-recommender-system

    """
    # Validate up front instead of printing and then failing on an unbound
    # `predictions` variable: svds only supports 1 <= k < min(matrix.shape).
    if not 1 <= n_factors < min(matrix.shape):
        raise ValueError(
            'The number of factors ({0}) must be at least 1 and strictly smaller than the '
            'smallest dimension of the matrix shape ({1})'.format(n_factors, matrix.shape)
        )

    # svds rejects integer matrices, so integer ratings are promoted to float.
    if not np.issubdtype(matrix.dtype, np.inexact):
        matrix = matrix.astype(np.float64)

    # The following code creates :
    # U : user matrix of dimension (n_users, n_factors)
    # sigma : the vector of singular values
    # V_t : the transposed movie matrix of dimension (n_factors, n_movies)
    U, sigma, V_t = svds(matrix, k=n_factors)

    # Scaling the columns of U by sigma is the same as U @ diag(sigma),
    # without materialising the diagonal matrix.
    pred_ratings = (U * sigma) @ V_t

    # Order the labels by their matrix index rather than relying on dict insertion order.
    user_ids = sorted(map_user, key=map_user.get)
    movie_ids = sorted(map_movie, key=map_movie.get)

    predictions = pd.DataFrame(pred_ratings, index=user_ids, columns=movie_ids)

    return predictions

In [9]:
# Predicted ratings for every user, using 50 latent factors.
df_pred = svd(matrix, map_user, map_movie, 50)

In [11]:
def generate_reco(df, user_id: int, n_recommandations: int, n_factors: int = 50):
    """
    This function returns a list of movie recommandations based on user's previously rated movies

    Args:
    - df : a Dataframe that contains at least the columns 'MovieId', 'UserId' and 'Rating'
    - user_id : the user id of the user we want to make recommandations to
    - n_recommandations : the number of movies we want to recommand to the user
    - n_factors : the number of factors / rank of the latent matrix for factorization (default is 50)

    Returns:
    - recommandations : a list containing the movie_id of the movies we recommand to the user,
      ordered from the highest to the lowest predicted rating

    Raises:
    - KeyError : if user_id does not appear in df

    Inspired by the following Git repo : https://github.com/vivdalal/movie-recommender-system
    """

    matrix, df_matrix, map_user, map_movie = create_matrix(df)

    # Fail early with an explicit message rather than deep inside a .loc lookup.
    if user_id not in map_user:
        raise KeyError('Unknown user id: {0!r}'.format(user_id))

    user_pred = svd(matrix, map_user, map_movie, n_factors).loc[user_id].sort_values(ascending=False)

    # A movie counts as already seen when the user has a non-zero rating for it.
    user_data = df_matrix.loc[user_id]
    seen_movies = user_data[user_data != 0.0].index

    unseen_pred = user_pred[~user_pred.index.isin(seen_movies)]

    recommandations = list(unseen_pred.index[:n_recommandations])

    return recommandations

In [12]:
# Ten movie ids recommended to user 997, using 50 latent factors.
reco = generate_reco(ratings, user_id=997, n_recommandations=10, n_factors=50)

**Test fonction NLP avec Spacy**

Before running this section, run the following commands:

- !pip install nltk
- !pip install spacy
- !spacy download xx_ent_wiki_sm
- !python -m spacy download en_core_web_sm (this is the model imported by the code below)
- !pip install deep-translator

In [226]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import en_core_web_sm

from collections import Counter

from deep_translator import GoogleTranslator

In [177]:
# Download the NLTK 'stopwords' and 'punkt' resources
# (the English stop-word list is read inside preprocess_sentence).
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [240]:
def preprocess_sentence(sentence, num_keywords=3, pos=("NOUN", "ADJ")):
    """
    Extract the most frequent keywords of an English sentence.

    The tokens "movie" / "movies", English stop words and punctuation are removed,
    the remaining tokens are lemmatized, and only the lemmas whose part-of-speech
    tag is listed in `pos` are counted.

    Args:
    - sentence : the English text to analyse
    - num_keywords : the maximum number of keywords to return (default is 3)
    - pos : the spaCy part-of-speech tags to keep (default is nouns and adjectives)

    Returns:
    - a string with the most frequent keywords, separated by a single space
    """

    # Loading the spaCy pipeline and the stop-word list is slow, so both are
    # stored on the function object and reused by later calls.
    cache = preprocess_sentence.__dict__
    if "_nlp" not in cache:
        cache["_nlp"] = en_core_web_sm.load()
        cache["_stop_words"] = frozenset(stopwords.words('english'))
    nlp = cache["_nlp"]
    stop_words = cache["_stop_words"]

    ignored_words = ("movie", "movies")

    corpus = nlp(sentence)
    non_movies_corpus = nlp(" ".join(
        token.text for token in corpus if token.text.lower() not in ignored_words
    ))
    # The lemmas are parsed again so that the part-of-speech tags used below
    # are computed on the cleaned text.
    corpus_clean = nlp(" ".join(
        token.lemma_ for token in non_movies_corpus
        if token.text.lower() not in stop_words and not token.is_punct
    ))
    frequent_words = Counter(token.text for token in corpus_clean if token.pos_ in pos)
    keywords = [word for word, _ in frequent_words.most_common(num_keywords)]

    return ' '.join(keywords)

In [208]:
# The same request written in French and in English, used by the next cells.
prompt_1 = "Je veux voir un film sur des requins, de l'eau et du sang"
prompt_2 = "I want to watch a movie with sharks, water and blood"

In [238]:
# Translate the French prompt to English, then extract up to 3 noun/adjective keywords.
translated = GoogleTranslator(source='auto', target='en').translate(prompt_1)
preprocess_sentence(translated, 3, pos=["NOUN", "ADJ"])

'shark water blood'

In [236]:
# The English prompt is passed to the keyword extraction directly, without translation.
preprocess_sentence(prompt_2, 3, pos=["NOUN", "ADJ"])

'shark water blood'

In [241]:
def nlp_reco(prompt: str, df, n_recommendations: int):
    '''
    This function returns a dataframe with a selection of movies recommandations based on user's specific text input

    Args:
    - prompt : the user's text input describing the kind of movies he wants to see
    - df : a Dataframe containing at least the columns 'MovieId', 'Title', 'Genres' and 'Synopsis'
    - n_recommendations : the number of movies we want to recommand to the user
      (capped at the number of rows of df)

    Returns:
    - reco : a dataframe indexed by MovieId containing the titles, the genres and the synopsis
      of the movies we recommand to the user

    '''

    # A missing synopsis would make TfidfVectorizer raise; treat it as an empty document.
    synopsis = df['Synopsis'].fillna('').astype(str)

    vectorizer = TfidfVectorizer(stop_words="english")
    synopsis_tfidf = vectorizer.fit_transform(synopsis)

    # kneighbors raises when asked for more neighbours than there are fitted samples.
    n_neighbors = min(n_recommendations, len(df))
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(synopsis_tfidf)

    # The prompt is translated to English and reduced to its keywords before being
    # projected into the tf-idf space fitted on the synopses.
    prompt_translate = GoogleTranslator(source='auto', target='en').translate(prompt)
    prompt_extract = preprocess_sentence(prompt_translate, 3, pos=["NOUN", "ADJ"])
    prompt_tfidf = vectorizer.transform([prompt_extract])

    reco_idx = nn.kneighbors(prompt_tfidf, return_distance=False)[0]

    reco = df[['MovieId', 'Title', 'Genres', 'Synopsis']].set_index('MovieId').iloc[reco_idx]

    return reco

In [242]:
# Ask for 10 recommendations from the `movies` dataframe for a French prompt,
# then display the resulting dataframe.
prompt = "Je veux voir un film sur des requins, de l'eau et du sang"
nlp_recommandation = nlp_reco(prompt, movies, 10)
nlp_recommandation

Unnamed: 0_level_0,Title,Genres,Synopsis
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
248,Houseguest,['Comedy'],"In hot water with the mob over an unpaid debt,..."
1387,Jaws,['Action|Horror'],When a killer shark unleashes chaos on a beach...
1388,Jaws 2,['Horror|Thriller'],Police chief Brody must protect the citizens o...
1389,Jaws 3-D,['Action|Horror'],A giant thirty-five-foot shark becomes trapped...
2081,The Little Mermaid,['Animation|Children|Comedy|Musical|Romance'],A young mermaid makes a deal with a sea witch ...
3927,Fantastic Voyage,['Adventure|Sci-Fi'],When a blood clot renders a scientist comatose...
8907,Shark Tale,['Animation|Children|Comedy'],When a son of a gangster shark boss is acciden...
30810,The Life Aquatic with Steve Zissou,['Adventure|Comedy|Fantasy'],With a plan to exact revenge on a mythical sha...
69844,Harry Potter and the Half-Blood Prince,['Adventure|Fantasy|Mystery|Romance|IMAX'],As Harry Potter begins his sixth year at Hogwa...
119141,The Interview,['Action|Comedy'],"Based on Anne Rice's iconic novel, follow Loui..."


**Test fonction NLP/LLM avec des Transformers (Neural Nets)**

Before running this section, run the following command:

- !pip install sentence-transformers (it takes around 1 min 30 s)


In [41]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
def llm_reco(prompt, df, n_recommendations):

    '''
    This function returns a dataframe with a selection of movies recommandations based on user's specific text input

    Args:
    - prompt : the user's text input describing the kind of movies he wants to see
    - df : a Dataframe containing at least the columns 'MovieId', 'Title', 'Genres' and 'Synopsis'
      (it is not modified by this function)
    - n_recommendations : the number of movies we want to recommand to the user

    Returns:
    - reco : a dataframe indexed by MovieId containing the titles, the genres and the synopsis
      of the movies we recommand to the user, ordered from the most to the least similar

    '''

    model = SentenceTransformer('distiluse-base-multilingual-cased')

    # NOTE(review): preprocess_text is not defined in the visible part of this notebook;
    # it has to exist in the kernel before this function is called - confirm where it lives.
    prompt = preprocess_text(prompt)
    # The preprocessed synopses are kept in a local list so that the caller's
    # dataframe does not silently gain a 'preprocess_synopsis' column.
    synopsis = df['Synopsis'].apply(preprocess_text).tolist()

    prompt_embed = model.encode(prompt, convert_to_tensor=True)
    synopsis_embed = model.encode(synopsis, convert_to_tensor=True)

    # Row 0 holds the cosine similarity between the prompt and every synopsis.
    similarities = util.pytorch_cos_sim(prompt_embed, synopsis_embed)[0]

    # Convert the tensor of positions to a plain list of ints before indexing with iloc.
    similar_idx = similarities.argsort(descending=True)[:n_recommendations].tolist()

    reco = df[['MovieId', 'Title', 'Genres', 'Synopsis']].set_index('MovieId').iloc[similar_idx]

    return reco

In [47]:
# Ask for 10 recommendations from the `movies` dataframe for a French prompt,
# then display the resulting dataframe.
prompt = 'Je veux voir un film sur des requins'
llm_recommandation = llm_reco(prompt, movies, 10)
llm_recommandation

Unnamed: 0_level_0,Title,Genres,Synopsis
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1387,Jaws,['Action|Horror'],When a killer shark unleashes chaos on a beach...
108729,Enemy,['Mystery|Thriller'],A man seeks out his exact look-alike after spo...
1388,Jaws 2,['Horror|Thriller'],Police chief Brody must protect the citizens o...
344,Ace Ventura: Pet Detective,['Comedy'],A goofy detective specializing in animals goes...
1060,Swingers,['Comedy|Drama'],A wannabe actor has a hard time moving on from...
1389,Jaws 3-D,['Action|Horror'],A giant thirty-five-foot shark becomes trapped...
2713,Lake Placid,['Horror|Thriller'],Four people attempt to stop a gigantic crocodi...
1111,Microcosmos,['Documentary'],A documentary on insect life in meadows and po...
3146,Deuce Bigalow: Male Gigolo,['Comedy'],An average aquarium cleaner house-sits for a g...
3007,American Movie,['Documentary'],Documentary about an aspiring filmmaker's atte...
