In [12]:
import pandas as pd
import numpy as np

file_path = 'train_ratings.csv'
ratings_df = pd.read_csv(file_path)
num_users = ratings_df['user_id'].nunique()
num_movies = ratings_df['movie_id'].nunique()

print("Nombre d'utilisateurs uniques :", num_users)
print("Nombre de films uniques :", num_movies)
print(ratings_df.shape)

missing_values = ratings_df.isnull().sum()
rows_with_nan = ratings_df[ratings_df.isna().any(axis=1)].index.tolist()
#print("Nombre de valeurs manquantes par colonne :")
#print(missing_values)

user_movie_matrix = ratings_df.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
#print(user_movie_matrix)

# Fonction pour calculer la similarité cosinus entre deux vecteurs
def cosine_similarity(vector_a, vector_b):
    dot_product = np.dot(vector_a, vector_b)
    norm_a = np.linalg.norm(vector_a) 
    norm_b = np.linalg.norm(vector_b) 
    if norm_a == 0 or norm_b == 0:
        return 0
    return dot_product / (norm_a * norm_b)

# Calcul de la similarité cosinus entre tous les films (colonnes)
movie_ids = user_movie_matrix.columns
movie_similarity = pd.DataFrame(index=movie_ids, columns=movie_ids)

for i in movie_ids:
    for j in movie_ids:
        if i != j:
            movie_similarity.loc[i, j] = cosine_similarity(user_movie_matrix[i], user_movie_matrix[j])
        else:
            movie_similarity.loc[i, j] = 1

print(movie_similarity.head())

Nombre d'utilisateurs uniques : 6040
Nombre de films uniques : 3675
(800167, 4)
movie_id      1         2         3         4         5         6     \
movie_id                                                               
1                1  0.306919  0.200327  0.136721  0.197376  0.278412   
2         0.306919         1  0.204602  0.126866  0.203671  0.200241   
3         0.200327  0.204602         1  0.165122  0.252018  0.157374   
4         0.136721  0.126866  0.165122         1  0.231629  0.107153   
5         0.197376  0.203671  0.252018  0.231629         1  0.131258   

movie_id      7         8         9         10    ...      3943      3944  \
movie_id                                          ...                       
1         0.235776  0.090328  0.082599  0.328742  ...  0.083554  0.003782   
2         0.192632  0.132255  0.135231  0.337208  ...  0.063388       0.0   
3         0.227628  0.068536  0.091582   0.19845  ...  0.038718  0.044798   
4          0.16079  0.055909  

In [13]:

def predict_rating(user_id, movie_id, user_movie_matrix, movie_similarity, top_n=5):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index
    similarities = movie_similarity.loc[movie_id, rated_movies]
    similarities = similarities.sort_values(ascending=False)[:top_n]
    
    weighted_ratings = 0
    similarity_sum = 0
    for rated_movie in similarities.index:
        similarity = similarities[rated_movie]
        rating = user_ratings[rated_movie]
        weighted_ratings += similarity * rating
        similarity_sum += similarity
    
    if similarity_sum == 0:
        return 0
    
    return weighted_ratings / similarity_sum


def predict_missing_ratings(user_movie_matrix, movie_similarity, top_n=5):
    predictions = {}
    
    for user_id in user_movie_matrix.index:
        predictions[user_id] = {}
        for movie_id in user_movie_matrix.columns:
            if user_movie_matrix.loc[user_id, movie_id] == 0:
                predicted_rating = predict_rating(user_id, movie_id, user_movie_matrix, movie_similarity, top_n)
                predictions[user_id][movie_id] = predicted_rating
                
    return predictions

# Calculer les notes prédites pour tous les films non évalués
predicted_ratings = predict_missing_ratings(user_movie_matrix, movie_similarity)

# Affichage d'un exemple de prédictions pour un utilisateur
example_user_id = 1  # Remplacez par un identifiant d'utilisateur
print(f"Prédictions de notes pour l'utilisateur {example_user_id} :")
print(predicted_ratings[example_user_id])


Prédictions de notes pour l'utilisateur 1 :
{2: 4.198345841196195, 3: 3.9896950563025864, 4: 3.969537431266605, 5: 4.3443170006756135, 6: 4.409259681462814, 7: 3.7773735078305077, 8: 3.7921979868858653, 9: 4.19946782279879, 10: 4.59846848579751, 11: 4.202310551420905, 12: 4.179647253469886, 13: 3.815561351305948, 14: 4.446124478202977, 15: 3.9753614346956763, 16: 4.589270501755336, 17: 4.223171165287427, 18: 3.7982390182487737, 19: 3.9957535288681365, 20: 4.591919799915725, 21: 4.603437023880622, 22: 4.592465508511196, 23: 4.398939414368032, 24: 4.392573151664828, 25: 4.605624817142771, 26: 4.446458037583358, 27: 4.046074449650383, 28: 3.5687599129326553, 29: 3.802910554686713, 30: 4.597485123411545, 31: 4.2309594036824665, 32: 4.198844191716142, 33: 4.603810199395197, 34: 4.610292884236721, 35: 4.218593613575823, 36: 4.7778123207788, 37: 4.362466176050009, 38: 3.9891108226899243, 39: 4.185672560323908, 40: 4.562907051471094, 41: 4.58523043355707, 42: 4.414451016787066, 43: 3.790174010

In [62]:

similarity_threshold = 0.05  # Par exemple, seules les similarités >= 0.05 sont prises en compte
def generate_recommendations(predicted_ratings, top_n=5):
    recommendations = {}

    for user_id, movie_ratings in predicted_ratings.items():
        sorted_ratings = sorted(movie_ratings.items(), key=lambda x: x[1], reverse=True)
        top_recommendations = sorted_ratings[:top_n]
        recommendations[user_id] = top_recommendations
    return recommendations

top_n = 10
recommendations = generate_recommendations(predicted_ratings, top_n=top_n)

"""
user_id = 2  # Remplacez par un identifiant d'utilisateur
for movie_id, predicted_rating in recommendations[example_user_id]:
    print(f"user {user_id} movie {movie_id} rate {predicted_rating:.2f}")
"""

"""
for user_id, user_recommendations in recommendations.items():
    for movie_id, predicted_rating in user_recommendations:
        print(f"user {user_id} movie {movie_id} note {predicted_rating:.2f}")
    print("\n")  # Ajouter une ligne vide entre les utilisateurs pour une meilleure lisibilité
"""


user 2 movie 127 rate 5.00
user 2 movie 133 rate 5.00
user 2 movie 642 rate 5.00
user 2 movie 1321 rate 5.00
user 2 movie 3347 rate 5.00
user 2 movie 3382 rate 5.00
user 2 movie 3412 rate 5.00
user 2 movie 3233 rate 4.87
user 2 movie 1132 rate 4.82
user 2 movie 2020 rate 4.81
user 2 movie 1012 rate 4.81
user 2 movie 2240 rate 4.81
user 2 movie 1982 rate 4.81
user 2 movie 616 rate 4.81
user 2 movie 1131 rate 4.80
user 2 movie 1186 rate 4.80
user 2 movie 1219 rate 4.80
user 2 movie 590 rate 4.80
user 2 movie 3021 rate 4.80
user 2 movie 953 rate 4.80
user 2 movie 1185 rate 4.80
user 2 movie 1574 rate 4.80
user 2 movie 919 rate 4.79
user 2 movie 73 rate 4.67
user 2 movie 2942 rate 4.64
user 2 movie 3607 rate 4.64
user 2 movie 826 rate 4.63
user 2 movie 2172 rate 4.62
user 2 movie 2991 rate 4.62
user 2 movie 3461 rate 4.62
user 2 movie 1411 rate 4.62
user 2 movie 1097 rate 4.61
user 2 movie 3123 rate 4.61
user 2 movie 1224 rate 4.61
user 2 movie 3609 rate 4.61
user 2 movie 2459 rate 4.61
us

In [58]:
def predict_rating(user_id, movie_id, user_movie_matrix, movie_similarity, top_n=5):
    user_ratings = user_movie_matrix.loc[user_id]
    rated_movies = user_ratings[user_ratings > 0].index
    similarities = movie_similarity.loc[movie_id, rated_movies]
    print(len(similarities))
    similarities= similarities[similarities > 0.2]
    print(len(similarities))
    #similarities = similarities.sort_values(ascending=False)[:top_n]
    weighted_ratings = 0
    similarity_sum = 0
    for rated_movie in similarities.index:
        similarity = similarities[rated_movie]
        rating = user_ratings[rated_movie]
        weighted_ratings += similarity * rating
        similarity_sum += similarity
    if similarity_sum == 0:
        return 0
    return weighted_ratings / similarity_sum

val= predict_rating(3456,3, user_movie_matrix,movie_similarity)
print(val)

48
9
4.556411222797798
