In [100]:
import pandas as pd
import pickle
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from surprise import Reader, Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

In [145]:
# Content-Based filtering
df_content_tags = pd.read_csv("../data/interim/movies_tags.csv")
df_content_tags.dropna(subset=['all_tags'], inplace=True)
tfidf = TfidfVectorizer()
matrice_tfidf = tfidf.fit_transform(df_content_tags['all_tags'])
sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)
sim_euclidienne = 1 / (1 + euclidean_distances(matrice_tfidf))
indices = pd.Series(range(0,len(df_content_tags)), index=df_content_tags.title)

def recommandations_content(titre, cos_sim, num_recommendations=10):
    idx = indices[titre]
    scores_similarite = list(enumerate(cos_sim[idx]))
    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)
    top_similair = scores_similarite[1:num_recommendations+1]
    res = [(indices.index[idx], score) for idx, score in top_similair]
    res = pd.DataFrame(res)
    res = res.rename(columns={0: 'title', 1: 'score'})
    return res

In [102]:
print(matrice_tfidf.shape)
print(recommandations_content("Toy Story (1995)", sim_cosinus))

(27053, 7877)
                                               title     score
0                                 Toy Story 2 (1999)  0.574920
1                              Monsters, Inc. (2001)  0.401519
2                                        Cars (2006)  0.357911
3     Adventures of Ichabod and Mr. Toad, The (1949)  0.322854
4                                   Pinocchio (1940)  0.313041
5                                Finding Nemo (2003)  0.303094
6  101 Dalmatians (One Hundred and One Dalmatians...  0.295764
7                         Toy Story of Terror (2013)  0.295070
8                        Beauty and the Beast (1991)  0.288640
9                                       Shrek (2001)  0.286178


In [12]:
# Collaborative filtering
df_ratings = pd.read_csv("../data/raw/ratings.csv")
df_ratings = df_ratings.drop("timestamp", axis=1)
df_movies = pd.read_csv("../data/raw/movies.csv")
reader = Reader(rating_scale=(0, 5))
df_surprise = Dataset.load_from_df(df_ratings, reader=reader)

svd = SVD()
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7860  0.7862  0.7864  0.7862  0.7864  0.7862  0.0001  
MAE (testset)     0.5980  0.5980  0.5983  0.5983  0.5984  0.5982  0.0002  
Fit time          94.06   119.61  102.93  118.68  100.83  107.22  10.17   
Test time         31.55   28.25   27.57   26.71   30.63   28.94   1.85    


{'test_rmse': array([0.78599116, 0.78619962, 0.78637094, 0.78623411, 0.78638188]),
 'test_mae': array([0.59802472, 0.5980028 , 0.59832207, 0.59826599, 0.59838414]),
 'fit_time': (94.06464791297913,
  119.60788702964783,
  102.93337392807007,
  118.68454623222351,
  100.83284306526184),
 'test_time': (31.551662921905518,
  28.252826929092407,
  27.565049171447754,
  26.70999503135681,
  30.634342908859253)}

In [94]:
train_set = df_surprise.build_full_trainset()
svd.fit(train_set)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15f451c40>

In [103]:
train_set = df_surprise.build_full_trainset()

In [146]:
def recommandations_collab(user_id, num_recommendations=10):
    # Générer l'anti-test set pour l'utilisateur cible
    anti_testset = []
    targetUser = train_set.to_inner_uid(user_id)
    moyenne = train_set.global_mean
    user_note = train_set.ur[targetUser]
    user_livre = [item for (item, _) in user_note]

    for livre in train_set.all_items():
        if livre not in user_livre:
            anti_testset.append((user_id, train_set.to_raw_iid(livre), moyenne))

    # Effectuer les prédictions
    predictionsSVD = svd.test(anti_testset)
    predictionsSVD = pd.DataFrame(predictionsSVD)

    # Mapper les IDs de films aux titres
    df_movies = pd.read_csv("../data/raw/movies.csv")
    movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
    predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)

    # Renommer les colonnes et trier par note estimée
    predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})
    predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
    predictionsSVD.sort_values('note', ascending=False, inplace=True)

    # Retourner les top N recommandations
    return predictionsSVD.head(num_recommendations)

In [134]:
with open("../models/svd_model.pkl", "rb") as filehandler:
        svd_model = pickle.load(filehandler)


In [135]:
def collab_reco(user_id, num_recommendations=10):

    anti_testset =[]
    target_user = train_set.to_inner_uid(user_id)
    moyenne = train_set.global_mean
    user_note = train_set.ur[target_user]
    user_film = [item for (item, _) in user_note]

    for film in train_set.all_items():
        if film not in user_film:
            anti_testset.append((user_id, train_set.to_raw_iid(film), moyenne))

    predictions_svd = svd_model.test(anti_testset)
    predictions_svd = pd.DataFrame(predictions_svd)

    df_movies = pd.read_csv("../data/raw/movies.csv")
    movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
    predictions_svd['title'] = predictions_svd['iid'].map(movieId_title_map)

    predictions_svd = predictions_svd.rename(columns={'uid': 'userId', 'est': 'note'})
    predictions_svd = predictions_svd[['userId', 'title', 'note']]
    predictions_svd.sort_values('note', ascending=False, inplace=True)

    return predictions_svd.head(num_recommendations)

In [142]:
user_id = 1
num_recommendations = 20
top_recommendations_collab = collab_reco(user_id, num_recommendations)
print(top_recommendations_collab)

       userId                                              title      note
7662        1                  Dylan Moran: Like, Totally (2006)  4.518033
15365       1      Zero Motivation (Efes beyahasei enosh) (2014)  4.493053
6427        1                        Zeitgeist: The Movie (2007)  4.425366
15704       1  If a Tree Falls: A Story of the Earth Liberati...  4.390903
10397       1                               Prime Suspect (1991)  4.345804
12753       1                      Very Potter Musical, A (2009)  4.344184
10757       1                   Pillars of the Earth, The (2010)  4.340872
10342       1          Death on the Staircase (SoupÃ§ons) (2004)  4.329217
16755       1                                    Girl Shy (1924)  4.326247
2497        1                          The Imitation Game (2014)  4.315304
14488       1  Memories of Underdevelopment (Memorias del sub...  4.307039
1676        1  Harry Potter and the Deathly Hallows: Part 2 (...  4.303772
13607       1            

In [147]:
user_id = 123
num_recommendations = 20
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
print(top_recommendations_collab)

       userId                                              title      note
2193      123   Three Colors: Red (Trois couleurs: Rouge) (1994)  4.462192
8158      123                     Alone in the Wilderness (2004)  4.440168
3981      123  Dear Zachary: A Letter to a Son About His Fath...  4.429980
9299      123                          Fawlty Towers (1975-1979)  4.421757
2063      123   Three Colors: Blue (Trois couleurs: Bleu) (1993)  4.355871
2071      123                            Band of Brothers (2001)  4.345737
5313      123                                      Baraka (1992)  4.344656
3792      123   Song of the Little Road (Pather Panchali) (1955)  4.340330
3152      123      Chungking Express (Chung Hing sam lam) (1994)  4.337888
2588      123                                        Once (2006)  4.318958
5245      123                       Louis C.K.: Hilarious (2010)  4.307030
5026      123                  Nine Queens (Nueve reinas) (2000)  4.297056
1277      123  Lives of O

In [85]:
with open("../models/svd_model.pkl", "wb") as f:
    pickle.dump(svd, f)

In [148]:
def generer_nouvelles_recommandations(recommandations, nb_recommandations=10):
    # Convertir les recommandations en liste
    recommandations_liste = recommandations.values.tolist()
    # Mélanger aléatoirement les recommandations
    random.shuffle(recommandations_liste)
    # Sélectionner un sous-ensemble de recommandations
    nouvelles_recommandations = recommandations_liste[:nb_recommandations]
    # Convertir la liste en DataFrame
    nouvelles_recommandations_df = pd.DataFrame(nouvelles_recommandations, columns=recommandations.columns)
    return nouvelles_recommandations_df

# Afficher les recommandations initiales
top_recommendations_collab = recommandations_collab(user_id, num_recommendations)
afficher_recommandations(top_recommendations_collab)

# Recueillir le retour utilisateur
satisfaction = input("Êtes-vous satisfait de ces recommandations ? (Oui/Non): ")

if satisfaction.lower() == "non":
    # Générer de nouvelles recommandations sans réentraîner le modèle
    nouvelles_recommandations = generer_nouvelles_recommandations(top_recommendations_collab)
    # Afficher les nouvelles recommandations
    afficher_recommandations(nouvelles_recommandations)


NameError: name 'afficher_recommandations' is not defined

In [143]:
from sklearn.preprocessing import MinMaxScaler
def recommandations_hybride(user_id, titre, num_recommendations=10, alpha=0.8, n=1000):

    scaler = MinMaxScaler()
    # Obtenir les recommandations basées sur le contenu
    rec_content = recommandations_content(titre, sim_euclidienne, num_recommendations*n)
    rec_content = rec_content.set_index('title')
    rec_content = rec_content.rename(columns={'score': 'score_content'})
    rec_content['score_content'] = scaler.fit_transform(rec_content[['score_content']])
    print("Recommandations basées sur le contenu pour '{}':\n{}".format(titre, rec_content.head(10)))

    # Obtenir les recommandations basées sur le filtrage collaboratif
    rec_collab = collab_reco(user_id, num_recommendations*n)
    rec_collab = rec_collab.set_index('title')
    rec_collab = rec_collab.rename(columns={'note': 'score_collab'})
    rec_collab['score_collab'] = scaler.fit_transform(rec_collab[['score_collab']])
    print("Recommandations collaboratives pour l'utilisateur {}:\n{}".format(user_id, rec_collab.head(10)))
    
    # Fusionner les scores
    rec_combined = rec_content.join(rec_collab, how='outer').fillna(0)
    rec_combined['score'] = (alpha * rec_combined['score_content']) +((1 - alpha) * rec_combined['score_collab'])

    # Trier et retourner les recommandations
    rec_combined = rec_combined.sort_values('score', ascending=False)
    rec_combined = rec_combined[['score_content', 'score_collab', 'score']].reset_index()
    return rec_combined.head(num_recommendations)

In [144]:
user_id = 1000
titre = "Braveheart (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Braveheart (1995)':
                            score_content
title                                    
Spartacus (1960)                 1.000000
Lawrence of Arabia (1962)        0.926684
Patton (1970)                    0.883285
Black Hawk Down (2001)           0.834704
Saving Private Ryan (1998)       0.816444
Rob Roy (1995)                   0.796507
Gladiator (2000)                 0.764112
Doctor Zhivago (1965)            0.758606
Dances with Wolves (1990)        0.743466
Last Emperor, The (1987)         0.713222
Recommandations collaboratives pour l'utilisateur 1000:
                                                    userId  score_collab
title                                                                   
Interstellar (2014)                                   1000      1.000000
Dark Knight Rises, The (2012)                         1000      0.986494
Shawshank Redemption, The (1994)                      1000      0.980251
Boondock Sa

In [99]:
user_id = 1000
titre = "Toy Story (1995)"
num_recommendations = 10
recommandations_final = recommandations_hybride(user_id, titre, num_recommendations, alpha=0.8)
print("Recommandations hybrides pour '{}':\n{}".format(titre, recommandations_final))


Recommandations basées sur le contenu pour 'Toy Story (1995)':
                                                    score_content
title                                                            
Toy Story 2 (1999)                                       1.000000
Monsters, Inc. (2001)                                    0.585150
Cars (2006)                                              0.500061
Adventures of Ichabod and Mr. Toad, The (1949)           0.435868
Pinocchio (1940)                                         0.418515
Finding Nemo (2003)                                      0.401188
101 Dalmatians (One Hundred and One Dalmatians)...       0.388581
Toy Story of Terror (2013)                               0.387395
Beauty and the Beast (1991)                              0.376461
Shrek (2001)                                             0.372302
Recommandations collaboratives pour l'utilisateur 1000:
                                             userId  score_collab
title                  