In [11]:
# load datset concerning ratings

import pandas as pd
data = pd.read_csv (r"C:\Users\msi\PycharmProjects\Data science projects\Recommendation system\ml-latest-small\ratings.csv")
print (data.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [12]:
# create a user matrix item to represent the interactions between user and movies

user_item_matrix = data.pivot (index= "userId", columns = "movieId", values= "rating")

In [13]:
#collaborative filtering based on similarity
#calculating similarities between users and items (cosine similarity)
#use of surprise library for a simple model

!pip install scikit-surprise

from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

algo = SVD()
cross_validate(algo, data, cv=5, verbose=True)



Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8730  0.8718  0.8696  0.8770  0.8767  0.8736  0.0029  
MAE (testset)     0.6701  0.6721  0.6662  0.6762  0.6720  0.6713  0.0032  
Fit time          1.41    1.37    1.31    1.31    1.33    1.35    0.04    
Test time         0.17    0.22    0.15    0.21    0.16    0.18    0.03    


{'test_rmse': array([0.87297811, 0.87179828, 0.86962176, 0.87704459, 0.87672646]),
 'test_mae': array([0.67014739, 0.67209561, 0.66618817, 0.67621544, 0.67198355]),
 'fit_time': (1.4146902561187744,
  1.3716683387756348,
  1.3086533546447754,
  1.3092830181121826,
  1.3294732570648193),
 'test_time': (0.16770696640014648,
  0.22151803970336914,
  0.14880752563476562,
  0.21367359161376953,
  0.15500664710998535)}

In [16]:
#make some predictions
#predict a note for a user and a given item

algo = SVD()
algo.fit(data.build_full_trainset())

user_id = ratings['userId'].iloc[0]  
item_id = ratings['movieId'].iloc[0]

prediction = algo.predict(user_id, item_id)
print(prediction)

NameError: name 'ratings' is not defined

In [None]:
# basic data visualisation
# ratings distribution

import matplotlib.pyplot as plt
import seaborn as sns

# Distribution des notes
sns.histplot(ratings['rating'], bins=10, kde=False, color='blue')
plt.title("Ratings distrbution")
plt.xlabel("Score")
plt.ylabel("Number of ratings")
plt.show()




In [None]:
# top 10 movies

# Counting number of ratings per movie
top_movies = ratings.groupby('movieId')['rating'].count().nlargest(10)

# Join with movie titles
top_movies = pd.merge(top_movies, movies, on='movieId', how='left')

# Visualise
plt.figure(figsize=(10, 6))
sns.barplot(x=top_movies['rating'], y=top_movies['title'], palette="viridis")
plt.title("Top 10 most rated")
plt.xlabel("Number of ratings")
plt.ylabel("Movie title")
plt.show()


In [None]:
# assess the model's performances
#RMSE (Root Mean Square Error) to measure the mean deviation between predictions and real evaluations

from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

# load data with Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# divide data in train/test
trainset, testset = train_test_split(data, test_size=0.25)

# train an SVD model
algo = SVD()
algo.fit(trainset)

# make predictions on test data
predictions = algo.test(testset)

# Calculate RMSE
print("RMSE :", rmse(predictions))


In [None]:
# calculate MAE (Mean Absolute Error) measures the mean absolute difference between predictions and real scores

from surprise.accuracy import mae

# Calculer le MAE
print("MAE :", mae(predictions))


In [None]:
# visualise the predictions
# histogramme of deviations to explore how close the predictions are to the real scores

# Calculate the deviations
errors = [pred.est - pred.r_ui for pred in predictions]

# Visualise
sns.histplot(errors, bins=20, kde=True, color='red')
plt.title("Distribution of Error prediction")
plt.xlabel("Error (Prediction - Real rating)")
plt.ylabel("Frequency")
plt.show()


In [None]:
# explore the  generated recommendations
# the  movies suggested to a given user

# get a list of all movies the user didn't rate
user_id = 1
all_movie_ids = set(movies['movieId'])
rated_movie_ids = set(ratings[ratings['userId'] == user_id]['movieId'])
unrated_movie_ids = list(all_movie_ids - rated_movie_ids)

# forecast the ratings for these movies
recommendations = []
for movie_id in unrated_movie_ids:
    pred = algo.predict(user_id, movie_id)
    recommendations.append((movie_id, pred.est))

# class by predicted rating
recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:10]

# Add movie titles
recommended_movies = pd.DataFrame(recommendations, columns=['movieId', 'PredictedRating'])
recommended_movies = pd.merge(recommended_movies, movies, on='movieId', how='left')

# Display the recommendations
print(recommended_movies[['title', 'PredictedRating']])


In [None]:
# assess the model's performance on different users group

# RMSE to see if some users are more predictable than others

from collections import defaultdict

#group predictions by user

user_errors = defaultdict(list)
for pred in predictions:
    user_errors[pred.uid].append(abs(pred.est - pred.r_ui))

# Calculate the mean errors per user
user_rmse = {uid: sum(errors) / len(errors) for uid, errors in user_errors.items()}

# Visualise
sns.histplot(list(user_rmse.values()), bins=20, kde=True, color='green')
plt.title("RMSE distribution per user")
plt.xlabel("RMSE")
plt.ylabel("Number of users")
plt.show()

