In [1]:
import pandas as pd 
import numpy as np 
from scipy.sparse import csr_matrix

## Test fonctions Databases

In [2]:
def load_data(path: str, chunk_size: int):
    '''
    Load a csv file into a single dataframe, reading it chunk by chunk.

    Args:
    - path : the path where the csv file is located, as a string type
    - chunk_size : number of lines to read from the file per chunk, as an int type

    Returns:
    - df : the dataframe obtained by concatenating every chunk, with a fresh 0..n-1 index

    '''

    # The chunked reader is used as a context manager so the underlying file
    # handle is closed as soon as all chunks are consumed (or if concat fails).
    with pd.read_csv(path, chunksize=chunk_size) as reader:
        df = pd.concat(reader, ignore_index=True)

    return df

In [3]:
# Read the three CSV exports used by the notebook, each with its own chunk size.
ratings = load_data(path='ratings_final.csv', chunk_size=100_000)
movies = load_data(path='movies_affiche_final.csv', chunk_size=500)
sql = load_data(path='db_sql_final.csv', chunk_size=500)

In [4]:
def create_matrix(df):
    """
    Build the user-movie rating matrix (sparse and dense) and the id -> index maps.

    Args:
    - df : a DataFrame that contains at least the columns UserId, MovieId and Rating

    Returns:
    - matrix : a sparse (CSR) user-movie matrix of size NxM with N the number of unique users and M the number of unique movies
    - df_matrix : the dense pivot of df (one row per UserId, one column per MovieId), missing ratings filled with 0
    - map_user : a dictionary that maps user_ids to their respective row indices in matrix
    - map_movie : a dictionary that maps movie_ids to their respective column indices in matrix

    Raises:
    - ValueError : raised by DataFrame.pivot when the same (UserId, MovieId) pair appears more than once

    Inspired by the following repo : https://github.com/topspinj/tmls-2020-recommender-workshop
    """

    # np.unique gives the sorted distinct ids and, for every row of df, the
    # position of its id in that sorted array - i.e. directly the row / column
    # index, without a per-row Python dictionary lookup.
    unique_users, user_idx = np.unique(df['UserId'].to_numpy(), return_inverse=True)
    unique_movies, movie_idx = np.unique(df['MovieId'].to_numpy(), return_inverse=True)

    N = len(unique_users)
    M = len(unique_movies)

    map_user = dict(zip(unique_users, range(N)))
    map_movie = dict(zip(unique_movies, range(M)))

    matrix = csr_matrix((df['Rating'].to_numpy(), (user_idx, movie_idx)), shape=(N, M))

    df_matrix = df.pivot(index='UserId', columns='MovieId', values='Rating').fillna(0)

    return matrix, df_matrix, map_user, map_movie

In [5]:
# Build the sparse matrix, its dense pivot and the two id -> index maps from the loaded ratings.
matrix, df_matrix, map_user, map_movie = create_matrix(ratings)

## Tests fonctions Recommandations

In [20]:
from scipy.sparse.linalg import svds

In [8]:
def svd(matrix, map_user: dict, map_movie: dict, n_factors: int):
    """
    This function returns a dataframe with the predicted ratings for all users within the dataframe

    Args:
    - matrix : the sparse user-movie matrix created using the create_matrix function
    - map_user : a dictionary that maps user_ids to their respective indices
    - map_movie : a dictionary that maps movie_ids to their respective indices
    - n_factors : the number of factors / rank of the latent matrix for factorization

    Returns:
    - predictions : a DataFrame containing the predicted ratings for all users in the original dataset,
      with one row per key of map_user and one column per key of map_movie (in dictionary order)

    Raises:
    - ValueError : if the truncated SVD cannot be computed, typically because n_factors is smaller
      than 1 or not smaller than both dimensions of the matrix

    Inspired by the following Git repo : https://github.com/vivdalal/movie-recommender-system

    """
    # svds works on floating-point data; integer ratings are cast up front so
    # that a dtype problem is not reported as a rank problem below.
    if not np.issubdtype(matrix.dtype, np.inexact):
        matrix = matrix.astype(np.float64)

    # The following code creates :
    # U : user matrix of dimension (n_users, n_factors)
    # sigma : the vector of the n_factors singular values
    # V_t : the transposed movie matrix of dimension (n_factors, n_movies)
    try:
        U, sigma, V_t = svds(matrix, k=n_factors)
    except ValueError as err:
        # Fail loudly: there is no prediction to return when the factorization fails.
        raise ValueError(
            'The number of factors ({0}) is either smaller than 1 or larger than one dimension of '
            'the matrix shape ({1}): {2}'.format(n_factors, matrix.shape, err)
        ) from err

    # Scaling the columns of U by sigma is U @ diag(sigma) without building the diagonal matrix.
    pred_ratings = (U * sigma) @ V_t

    predictions = pd.DataFrame(
        pred_ratings,
        index=list(map_user.keys()),
        columns=list(map_movie.keys()),
    )

    return predictions

In [9]:
# Rank-50 factorization of the rating matrix: one predicted rating per (user, movie) pair.
df_pred = svd(matrix=matrix, map_user=map_user, map_movie=map_movie, n_factors=50)

In [11]:
def generate_reco(df, user_id: int, n_recommandations: int, n_factors: int = 50):
    """
    This function returns movie recommandations for a user, based on the movies he previously rated

    Args:
    - df : a DataFrame that contains at least the columns UserId, MovieId and Rating
    - user_id : the user id of the user we want to make recommandations to
    - n_recommandations : the number of movies we want to recommand to the user
    - n_factors : the number of factors / rank of the latent matrix for factorization (default is 50)

    Returns:
    - recommandations : a list containing the movie_id of the movies we recommand to the user,
      ordered from the highest to the lowest predicted rating

    Raises:
    - KeyError : if user_id has no rating in df
    - ValueError : propagated from svd when n_factors does not fit the matrix shape

    Inspired by the following Git repo : https://github.com/vivdalal/movie-recommender-system
    """

    matrix, _, map_user, map_movie = create_matrix(df)

    if user_id not in map_user:
        raise KeyError('User {0} has no rating in the provided dataframe'.format(user_id))

    # Predicted ratings of this user for every movie, best first
    user_pred = svd(matrix, map_user, map_movie, n_factors).loc[user_id].sort_values(ascending=False)

    # Movies already rated are read from the ratings themselves rather than from
    # non-zero cells of the dense pivot, so a rating equal to 0 still counts as seen.
    seen_movies = df.loc[df['UserId'] == user_id, 'MovieId'].unique()

    unseen_pred = user_pred[~user_pred.index.isin(seen_movies)]

    # The index of the predictions already holds the movie ids
    recommandations = unseen_pred.index[:n_recommandations].tolist()

    return recommandations

In [12]:
# Ten recommendations for user 997, computed with a 50-factor decomposition of the ratings.
reco = generate_reco(ratings, user_id=997, n_recommandations=10, n_factors=50)

**Test fonction NLP avec Spacy**

Before running this section, install the required packages (once per environment):

- %pip install nltk
- %pip install spacy
- !python -m spacy download xx_ent_wiki_sm


In [30]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [31]:
# Fetch the NLTK resources needed by preprocess_text: the stop-word lists and the
# 'punkt' tokenizer models used by word_tokenize.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of 'punkt' - confirm
# against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jerem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
# Stop-word sets already loaded from the NLTK corpus, keyed by language name.
_STOP_WORDS_CACHE = {}


def _stop_words(language: str = 'english'):
    """Return the NLTK stop-word set for `language`, reading the corpus only on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """
    Normalize a text before vectorization: tokenize, lowercase, keep alphanumeric tokens
    only and drop English stop words.

    Args:
    - text : the raw text to clean; anything that is not a string (e.g. None or a NaN
      coming from a missing synopsis) is treated as an empty text

    Returns:
    - the remaining tokens joined by single spaces, as a string ('' when nothing is left)
    """
    if not isinstance(text, str):
        return ''

    # The set is cached: this function is applied to every row of a dataframe,
    # and reloading the corpus on each call dominates the run time.
    stop_words = _stop_words('english')

    words = word_tokenize(text)
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    return ' '.join(filtered_words)

In [38]:
def nlp_reco(prompt: str, df, n_recommendations: int):
    '''
    This function returns a dataframe with a selection of movies recommandations based on user's specific text input.
    Synopses and prompt are compared through their TF-IDF vectors with a nearest-neighbours search.

    Args:
    - prompt : the user's text input describing the kind of movies he wants to see
    - df : a Dataframe containing at least the columns MovieId, Title, Genres and Synopsis (left unmodified)
    - n_recommendations : the number of movies we want to recommand to the user
      (capped to the number of rows of df)

    Returns:
    - reco : a dataframe indexed by MovieId, containing the Title, Genres and Synopsis of the
      recommended movies, sorted by MovieId (not by relevance)

    '''
    # Keep the cleaned synopses in a local Series so the caller's dataframe does not gain a column.
    synopsis_clean = df['Synopsis'].apply(preprocess_text)

    vectorizer = TfidfVectorizer(stop_words="english")
    synopsis_tfidf = vectorizer.fit_transform(synopsis_clean)

    # kneighbors cannot return more neighbours than there are fitted rows
    n_neighbors = min(n_recommendations, len(df))
    nn = NearestNeighbors(n_neighbors=n_neighbors)
    nn.fit(synopsis_tfidf)

    # The prompt is cleaned directly: passing it through a spaCy pipeline only to read
    # back Doc.text returns the same string, so no model is loaded here.
    # A prompt sharing no word with the synopsis vocabulary maps to an all-zero vector.
    prompt_text = preprocess_text(prompt)
    prompt_tfidf = vectorizer.transform([prompt_text])

    reco_idx = nn.kneighbors(prompt_tfidf, return_distance=False)[0]

    reco = df[['MovieId', 'Title', 'Genres', 'Synopsis']].set_index('MovieId').iloc[reco_idx].sort_index()

    return reco

In [39]:
# French-language request ("I want to see a movie about sharks")
prompt = 'Je veux voir un film sur des requins'
nlp_recommandation = nlp_reco(prompt, movies, 10)
# Bare last expression so the notebook renders the resulting dataframe
nlp_recommandation

Unnamed: 0_level_0,Title,Genres,Synopsis
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,Waiting to Exhale,['Comedy|Drama|Romance'],"Based on Terry McMillan's novel, this film fol..."
145,Bad Boys,['Action|Comedy|Crime|Drama|Thriller'],Plot unknown. Fourth installment of the 'Bad B...
520,Robin Hood: Men in Tights,['Comedy'],"A spoof of Robin Hood in general, and Robin de..."
1080,Monty Python's Life of Brian,['Comedy'],Monty Python's The Life of Brian (1979) - Writ...
2505,8MM,['Drama|Mystery|Thriller'],A private investigator is hired to discover if...
2671,Notting Hill,['Comedy|Romance'],The life of a simple bookshop owner changes wh...
2710,The Blair Witch Project,['Drama|Horror|Thriller'],Three film students vanish after traveling int...
2770,Bowfinger,['Comedy'],When a desperate movie producer fails to get a...
3159,Fantasia 2000,['Animation|Children|Musical|IMAX'],An update of the original film with new interp...
4017,Pollock,['Drama'],A film about the life and career of the Americ...


**Test fonction NLP/LLM avec des Transformers (Neural Nets)**

Before running this section, install the required package (once per environment):

- %pip install sentence-transformers (takes around 1 min 30)


In [41]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
def llm_reco(prompt, df, n_recommendations):

    '''
    This function returns a dataframe with a selection of movies recommandations based on user's specific text input.
    Synopses and prompt are embedded with a multilingual sentence-transformer and compared by cosine similarity.

    Args:
    - prompt : the user's text input describing the kind of movies he wants to see
    - df : a Dataframe containing at least the columns MovieId, Title, Genres and Synopsis (left unmodified)
    - n_recommendations : the number of movies we want to recommand to the user

    Returns:
    - reco : a dataframe indexed by MovieId, containing the Title, Genres and Synopsis of the
      recommended movies, sorted by MovieId (not by similarity)

    '''

    model = SentenceTransformer('distiluse-base-multilingual-cased')

    prompt_clean = preprocess_text(prompt)
    # Keep the cleaned synopses local so the caller's dataframe does not gain a column.
    synopsis_clean = df['Synopsis'].apply(preprocess_text).tolist()

    prompt_embed = model.encode(prompt_clean, convert_to_tensor=True)
    synopsis_embed = model.encode(synopsis_clean, convert_to_tensor=True)

    # Row 0 holds the similarity of the (single) prompt with every synopsis
    similarities = util.pytorch_cos_sim(prompt_embed, synopsis_embed)[0]

    # Positions of the most similar synopses, as plain Python ints on the CPU:
    # .iloc must not be handed a tensor that may live on a GPU.
    similar_idx = similarities.argsort(descending=True)[:n_recommendations].cpu().tolist()

    reco = df[['MovieId', 'Title', 'Genres', 'Synopsis']].set_index('MovieId').iloc[similar_idx].sort_index()

    return reco

In [43]:
# Same French-language request as for nlp_reco ("I want to see a movie about sharks")
prompt = 'Je veux voir un film sur des requins'
llm_recommandation = llm_reco(prompt, movies, 10)
# Bare last expression so the notebook renders the resulting dataframe
llm_recommandation

.gitattributes: 100%|██████████| 690/690 [00:00<?, ?B/s] 
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 187kB/s]
2_Dense/config.json: 100%|██████████| 114/114 [00:00<?, ?B/s] 
pytorch_model.bin: 100%|██████████| 1.58M/1.58M [00:00<00:00, 23.1MB/s]
rust_model.ot: 100%|██████████| 1.58M/1.58M [00:00<00:00, 33.9MB/s]
README.md: 100%|██████████| 2.37k/2.37k [00:00<?, ?B/s]
config.json: 100%|██████████| 607/607 [00:00<?, ?B/s] 
config_sentence_transformers.json: 100%|██████████| 122/122 [00:00<?, ?B/s] 
pytorch_model.bin: 100%|██████████| 539M/539M [00:14<00:00, 37.6MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 145kB/s]
tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 7.80MB/s]
tokenizer_config.json: 100%|██████████| 528/528 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 6.29MB/s]
modules.json: 100%|██████████| 341/341 [00:00<?, ?B/s] 


Unnamed: 0_level_0,Title,Genres,Synopsis
MovieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
344,Ace Ventura: Pet Detective,['Comedy'],A goofy detective specializing in animals goes...
1060,Swingers,['Comedy|Drama'],A wannabe actor has a hard time moving on from...
1111,Microcosmos,['Documentary'],A documentary on insect life in meadows and po...
1387,Jaws,['Action|Horror'],When a killer shark unleashes chaos on a beach...
1388,Jaws 2,['Horror|Thriller'],Police chief Brody must protect the citizens o...
1389,Jaws 3-D,['Action|Horror'],A giant thirty-five-foot shark becomes trapped...
2713,Lake Placid,['Horror|Thriller'],Four people attempt to stop a gigantic crocodi...
3007,American Movie,['Documentary'],Documentary about an aspiring filmmaker's atte...
3146,Deuce Bigalow: Male Gigolo,['Comedy'],An average aquarium cleaner house-sits for a g...
108729,Enemy,['Mystery|Thriller'],A man seeks out his exact look-alike after spo...
