In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Charger les données
movies = pd.read_csv("../data/cleaned_movies.csv")
ratings = pd.read_csv("../data/cleaned_ratings.csv")
users = pd.read_csv("../data/cleaned_users.csv")

# Vérifier les colonnes disponibles
print(movies.columns)
print(movies.head())

Index(['movie_id', 'movie_title', 'release_date', 'imdb_url', 'unknown',
       'Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'release_year'],
      dtype='object')
   movie_id        movie_title release_date  \
0         1   Toy Story (1995)  01-Jan-1995   
1         2   GoldenEye (1995)  01-Jan-1995   
2         3  Four Rooms (1995)  01-Jan-1995   
3         4  Get Shorty (1995)  01-Jan-1995   
4         5     Copycat (1995)  01-Jan-1995   

                                            imdb_url  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us

## Préparer un corpus textuel

1. Combine les titres et genres pour former une description textuelle de chaque film

In [3]:
# Préparer le corpus textuel en utilisant les titres et les genres
# Convertir les genres binaires en mots-clés pour enrichir le signal sémantique
genre_columns = [col for col in movies.columns if col not in ["movie_id", "movie_title", "release_date", "imdb_url", "release_year"]]
def genres_to_keywords(row):
    keywords = []
    for genre in genre_columns:
        if row[genre] == 1:
            keywords.append(genre)
    return " ".join(keywords)

movies["genres"] = movies.apply(genres_to_keywords, axis=1)
movies["corpus"] = movies["movie_title"] + " " + movies["genres"]
print(movies[["movie_id", "corpus"]].head())

   movie_id                                        corpus
0         1  Toy Story (1995) Animation Children's Comedy
1         2    GoldenEye (1995) Action Adventure Thriller
2         3                    Four Rooms (1995) Thriller
3         4         Get Shorty (1995) Action Comedy Drama
4         5           Copycat (1995) Crime Drama Thriller


2. Applique TF-IDF pour transformer les descriptions textuelles en vecteurs

In [4]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), max_df=0.8, min_df=0.02)
movie_features = vectorizer.fit_transform(movies["corpus"])
print("TF-IDF Matrix Shape:", movie_features.shape)

TF-IDF Matrix Shape: (1679, 46)


**TF-IDF (Term Frequency-Inverse Document Frequency)** : Une mesure de l'importance d'un mot dans un document et dans l'ensemble des documents.

- Ce code prépare les données pour un système de recommandations basé sur le contenu :
  - Le corpus combine le titre et les genres pour représenter chaque film comme un document textuel.
  - La transformation TF-IDF encode ces documents dans une matrice sparse, prête pour des calculs de similarité.

In [5]:
movie_similarity = cosine_similarity(movie_features)
movie_similarity_df = pd.DataFrame(movie_similarity, index=movies["movie_id"], columns=movies["movie_id"])
print("Movie Similarity Matrix:")
print(movie_similarity_df.head())

movie_similarity_df.to_csv("../data/movie_similarity.csv")

Movie Similarity Matrix:
movie_id      1         2         3         4         5         6     \
movie_id                                                               
1         1.000000  0.104347  0.225830  0.190048  0.104797  0.177986   
2         0.104347  1.000000  0.462061  0.547814  0.214420  0.171466   
3         0.225830  0.462061  1.000000  0.240423  0.464052  0.371090   
4         0.190048  0.547814  0.240423  1.000000  0.162167  0.275423   
5         0.104797  0.214420  0.464052  0.162167  1.000000  0.250302   

movie_id      7         8         9         10    ...      1673      1674  \
movie_id                                          ...                       
1         0.108361  0.697607  0.177986  0.138093  ...  0.133747  0.000000   
2         0.104392  0.106372  0.171466  0.133034  ...  0.780185  0.000000   
3         0.225926  0.230213  0.371090  0.287915  ...  0.592245  0.000000   
4         0.167683  0.495403  0.275423  0.213691  ...  0.702159  0.235958   
5       

## Identifier les films similaires

In [6]:
def get_similar_movies(movie_id, movie_similarity_df, k=5):
    if movie_id not in movie_similarity_df.index:
        return []
    movie_similarities = movie_similarity_df.loc[movie_id]
    similar_movies = movie_similarities.sort_values(ascending=False)
    similar_movies = similar_movies[similar_movies.index != movie_id]  # Exclure le film lui-même
    return similar_movies.head(k).index.tolist()

## Recommander des films similaires

In [7]:
def recommend_similar_movies(movie_id, movie_similarity_df, movies, k=5):
    similar_movie_ids = get_similar_movies(movie_id, movie_similarity_df, k)
    recommended_movies = movies[movies["movie_id"].isin(similar_movie_ids)]["movie_title"]
    return recommended_movies.tolist()

## Tester la fonction

In [8]:
movie_id = 1
recommendations = recommend_similar_movies(movie_id, movie_similarity_df, movies, k=5)
print(f"Films similaires à {movies[movies['movie_id'] == movie_id]['movie_title'].values[0]}:")
print(recommendations)

Films similaires à Toy Story (1995):
['Aladdin and the King of Thieves (1996)', 'Space Jam (1996)', 'Big Green, The (1995)', 'Goofy Movie, A (1995)', 'Gumby: The Movie (1995)']


In [9]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Créer une matrice utilisateur-film pour l'ensemble d'entraînement
train_user_item_matrix = train.pivot_table(index="user_id", columns="item_id", values="rating").fillna(0)
test_user_item_matrix = test.pivot_table(index="user_id", columns="item_id", values="rating").fillna(0)

In [10]:
def calculate_metrics(y_true, y_pred):
    from sklearn.metrics import precision_score, recall_score, f1_score
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=1)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=1)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=1)
    return precision, recall, f1

In [11]:
def evaluate_content(user_id, k=5, threshold=3.5):
    if user_id not in test_user_item_matrix.index:
        return 0, 0, 0

    actual_positive_movies = test_user_item_matrix.loc[user_id][test_user_item_matrix.loc[user_id] >= threshold].index
    recommendations = set()

    for movie_id in train_user_item_matrix.loc[user_id][train_user_item_matrix.loc[user_id] > 0].index:
        recommendations.update(get_similar_movies(movie_id, movie_similarity_df, k))

    y_true = [1 if movie in actual_positive_movies else 0 for movie in train_user_item_matrix.columns]
    y_pred = [1 if movie in recommendations else 0 for movie in train_user_item_matrix.columns]

    return calculate_metrics(y_true, y_pred)

In [12]:
content_metrics = []
for user_id in test_user_item_matrix.index[:10]:
    precision, recall, f1 = evaluate_content(user_id)
    content_metrics.append({"User": user_id, "Precision": precision, "Recall": recall, "F1": f1})

# Afficher les résultats
content_results = pd.DataFrame(content_metrics)
print("Métriques moyennes :")
print(content_results.mean())

Métriques moyennes :
User         5.500000
Precision    0.980237
Recall       0.819648
F1           0.886276
dtype: float64


In [13]:
content_results.to_csv("../data/content_results.csv", index=False)