Machine Learning Project
- Grégory Bournassenko
- Reda Aoua
- Chadw Lasheen

# Section 1 : Présentation des données

In [None]:
import pandas as pd
import numpy as np
import pickle
import requests
from io import BytesIO
import pprint

def load_data_from_url(url):
    response = requests.get(url)
    response.raise_for_status()
    return pickle.loads(response.content)

ratings_url = "https://s3.eu-west-3.amazonaws.com/dauphine.projet/movie_ratings_500_id.pkl"
metadata_url = "https://s3.eu-west-3.amazonaws.com/dauphine.projet/movie_metadata.pkl"

In [None]:
response = requests.get(metadata_url)
if response.status_code == 200:
    data = pickle.loads(response.content)
    pprint.pprint(data)
else:
    print("Failed to download the file. Status code:", response.status_code)

{'tt0118661': {'actors': ['Ralph Fiennes',
                          'Uma Thurman',
                          'Sean Connery',
                          'Jim Broadbent'],
               'director': 'Jeremiah S. Chechik',
               'genre': ['Action', 'Adventure', 'Sci-Fi'],
               'title': 'The Avengers'},
 'tt0118715': {'actors': ['Jeff Bridges',
                          'John Goodman',
                          'Julianne Moore',
                          'Steve Buscemi',
                          'Philip Seymour Hoffman',
                          'Peter Stormare',
                          'John Turturro'],
               'director': 'Joel Coen Ethan Coen',
               'genre': ['Comedy', 'Crime'],
               'title': 'The Big Lebowski'},
 'tt0118744': {'actors': ['Jessica Lange',
                          'Gwyneth Paltrow',
                          'Johnathon Schaech'],
               'director': 'Jonathan Darby',
               'genre': ['Drama', 'Thriller'],


In [None]:
response = requests.get(ratings_url)
if response.status_code == 200:
    data = pickle.loads(response.content)
    pprint.pprint(data)
else:
    print("Failed to download the file. Status code:", response.status_code)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
                'user_rating': '4',
                'user_rating_date': '2005-10-01'},
               {'user_id': '1065438',
                'user_rating': '1',
                'user_rating_date': '2005-10-10'},
               {'user_id': '1283798',
                'user_rating': '3',
                'user_rating_date': '2005-11-01'},
               {'user_id': '2181176',
                'user_rating': '1',
                'user_rating_date': '2005-07-22'},
               {'user_id': '2415952',
                'user_rating': '2',
                'user_rating_date': '2005-07-25'},
               {'user_id': '2412710',
                'user_rating': '2',
                'user_rating_date': '2005-07-27'},
               {'user_id': '2248080',
                'user_rating': '1',
                'user_rating_date': '2005-08-09'},
               {'user_id': '278092',
                'user_rating': '2',
                'user_rat

# Section 2 : Exploitation des données

In [None]:
movie_ratings = load_data_from_url(ratings_url)
movie_metadata = load_data_from_url(metadata_url)

ratings_data = []
for movie_id, ratings in movie_ratings.items():
    for rating in ratings:
        ratings_data.append([rating['user_id'], movie_id, rating['user_rating']])
ratings_df = pd.DataFrame(ratings_data, columns=['user_id', 'movie_id', 'rating'])

ratings_df['rating'] = pd.to_numeric(ratings_df['rating'])

ratings_df.head()

Unnamed: 0,user_id,movie_id,rating
0,1380819,tt0305224,4
1,185150,tt0305224,3
2,1351377,tt0305224,4
3,386143,tt0305224,2
4,2173336,tt0305224,3


In [None]:
metadata_data = []
for movie_id, metadata in movie_metadata.items():
    metadata_row = [movie_id]
    metadata_row.extend([metadata[key] for key in sorted(metadata)])
    metadata_data.append(metadata_row)

columns = ['movie_id'] + sorted(metadata.keys())
metadata_df = pd.DataFrame(metadata_data, columns=columns)

metadata_df.head()

Unnamed: 0,movie_id,actors,director,genre,title
0,tt0305224,"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Peter Segal,[Comedy],Anger Management
1,tt0245046,"[Cate Blanchett, James Fleet, Abigail Cruttenden]",Gillian Armstrong,"[Drama, Romance, Thriller]",Charlotte Gray
2,tt0185125,"[Cecilia Roth, Marisa Paredes, Candela Peña, P...",Pedro Almodóvar,[Drama],All About My Mother
3,tt0196229,"[Ben Stiller, Owen Wilson, Christine Taylor, W...",Ben Stiller,[Comedy],Zoolander
4,tt0308644,"[Johnny Depp, Kate Winslet, Julie Christie, Du...",Marc Forster,"[Biography, Drama, Family]",Finding Neverland


# Section 3 : Comparatif des différentes baselines

In [None]:
!pip install fastFM



In [None]:
!pip install surprise



In [None]:
import plotly.graph_objs as go
import plotly.express as px

## User-based Collaborative Filtering

In [None]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

user_based_cf = KNNBasic(sim_options={'user_based': True, 'name': 'cosine', 'min_support': 5, 'k': 20})
user_based_cf.fit(trainset)


Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7af2da775a80>

In [None]:
from surprise import accuracy

predictions = user_based_cf.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

RMSE: 1.0421
MAE:  0.8339


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse, mae]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

# Item-based Collaborative Filtering

In [None]:
from surprise import KNNBasic, Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

item_based_cf = KNNBasic(sim_options={'user_based': False})
item_based_cf.fit(trainset)

predictions = item_based_cf.test(testset)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 1.0189
MAE:  0.7979
RMSE: 1.01893168577243
MAE: 0.7978541646285675


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse, mae]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

# Vanilla MF

In [None]:
from surprise import SVD, Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

vanilla_mf = SVD()
vanilla_mf.fit(trainset)

predictions = vanilla_mf.test(testset)
print(f"RMSE: {rmse(predictions)}")
print(f"MAE: {mae(predictions)}")

RMSE: 0.9540
RMSE: 0.9540385581401114
MAE:  0.7538
MAE: 0.753814915618836


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse(predictions), mae(predictions)]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

RMSE: 0.9540
MAE:  0.7538


# SVD with bias

In [None]:
df = ratings_df

mu = df['rating'].mean()

n_users = df['user_id'].nunique()
n_items = df['movie_id'].nunique()
n_factors = 10

user_bias = np.zeros(n_users)
item_bias = np.zeros(n_items)
user_factors = np.random.normal(0, .1, (n_users, n_factors))
item_factors = np.random.normal(0, .1, (n_items, n_factors))

user_id_index = {user_id: idx for idx, user_id in enumerate(df['user_id'].unique())}
movie_id_index = {movie_id: idx for idx, movie_id in enumerate(df['movie_id'].unique())}
mu, user_bias, item_bias, user_factors, item_factors, user_id_index, movie_id_index

(3.303296923230877,
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0

In [None]:
learning_rate = 0.01
n_epochs = 10
lambda_reg = 0.02

for epoch in range(n_epochs):
    for idx, row in df.iterrows():
        user_idx = user_id_index[row['user_id']]
        item_idx = movie_id_index[row['movie_id']]
        rating = row['rating']

        pred = mu + user_bias[user_idx] + item_bias[item_idx] + np.dot(user_factors[user_idx], item_factors[item_idx])

        error = rating - pred

        user_bias[user_idx] += learning_rate * (error - lambda_reg * user_bias[user_idx])
        item_bias[item_idx] += learning_rate * (error - lambda_reg * item_bias[item_idx])
        user_factors[user_idx] += learning_rate * (error * item_factors[item_idx] - lambda_reg * user_factors[user_idx])
        item_factors[item_idx] += learning_rate * (error * user_factors[user_idx] - lambda_reg * item_factors[item_idx])

user_bias, item_bias, user_factors, item_factors

(array([ 0.06850832, -0.4688836 , -0.22796985, ..., -0.02568119,
         0.16665207,  0.06342488]),
 array([-0.27013012,  0.2118366 ,  0.71297826,  0.03017911,  0.54795874,
        -0.14908201,  0.58833522, -0.56854251,  0.08816816,  0.15820235,
        -0.20782358,  0.37969106,  0.41564199, -1.1609567 , -0.39108241,
        -0.3088975 , -0.32756703, -0.07678117,  0.68604222, -0.06834468,
        -0.09019829, -1.26789641,  0.04233033, -0.5146634 ,  0.00877982,
         0.12348451, -0.89431618, -0.38307974,  0.11388313,  0.43368664,
         0.05229563,  0.14912816, -0.07227151,  0.32138235,  0.04270998,
         0.16979496,  0.69249288, -0.00466215, -0.25241469,  0.12653303,
        -0.07853526,  0.15825165, -0.15936153,  0.39707371, -0.2374074 ,
         0.34189204,  0.37198008, -0.68095169, -0.21806892,  0.3533631 ,
         0.32210404,  0.13970182,  0.50377257,  0.26681463, -0.06011031,
         0.36134824, -0.22218112,  0.06045517, -0.23492613,  0.08793197,
        -0.10800056, -0

In [None]:
def predict_rating(user_id, movie_id):
    user_idx = user_id_index[user_id]
    item_idx = movie_id_index[movie_id]
    prediction = mu + user_bias[user_idx] + item_bias[item_idx] + np.dot(user_factors[user_idx], item_factors[item_idx])
    return prediction

errors_squared = 0
absolute_errors = 0
n_ratings = len(df)

for idx, row in df.iterrows():
    predicted_rating = predict_rating(row['user_id'], row['movie_id'])
    actual_rating = row['rating']
    errors_squared += (actual_rating - predicted_rating) ** 2
    absolute_errors += abs(actual_rating - predicted_rating)

rmse = np.sqrt(errors_squared / n_ratings)
mae = absolute_errors / n_ratings

rmse, mae


(0.8480372520038627, 0.6709444482880511)

In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse, mae]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

# SVD++

In [None]:
from surprise import SVDpp, Reader, Dataset
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae


reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2)

svd_pp = SVDpp()
svd_pp.fit(trainset)

predictions = svd_pp.test(testset)
print(f"RMSE: {rmse(predictions)}")
print(f"MAE: {mae(predictions)}")

RMSE: 0.9364
RMSE: 0.9363862550581047
MAE:  0.7356
MAE: 0.7356399023008192


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse(predictions), mae(predictions)]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

RMSE: 0.9364
MAE:  0.7356


# FM

In [None]:
ratings_df_sampled = ratings_df
ratings_df_sampled['movie_id'] = ratings_df_sampled['movie_id'].map(movie_metadata)
metadata_df = ratings_df_sampled['movie_id'].apply(pd.Series)
ratings_with_metadata = pd.concat([ratings_df_sampled.drop('movie_id', axis=1), metadata_df], axis=1)

In [None]:
ratings_with_metadata

Unnamed: 0,user_id,rating,director,genre,actors,title
0,1380819,4,Peter Segal,[Comedy],"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Anger Management
1,185150,3,Peter Segal,[Comedy],"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Anger Management
2,1351377,4,Peter Segal,[Comedy],"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Anger Management
3,386143,2,Peter Segal,[Comedy],"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Anger Management
4,2173336,3,Peter Segal,[Comedy],"[Jack Nicholson, Adam Sandler, Marisa Tomei, W...",Anger Management
...,...,...,...,...,...,...
259813,1139877,5,Brad Anderson,"[Drama, Thriller]","[Christian Bale, Jennifer Jason Leigh, Aitana ...",The Machinist
259814,1460015,4,Brad Anderson,"[Drama, Thriller]","[Christian Bale, Jennifer Jason Leigh, Aitana ...",The Machinist
259815,1098265,5,Brad Anderson,"[Drama, Thriller]","[Christian Bale, Jennifer Jason Leigh, Aitana ...",The Machinist
259816,1962894,4,Brad Anderson,"[Drama, Thriller]","[Christian Bale, Jennifer Jason Leigh, Aitana ...",The Machinist


In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer()

def build_dataset_with_metadata(df):
    features = df[['user_id', 'actors', 'director', 'genre']].to_dict(orient='records')
    X = v.fit_transform(features)
    y = df['rating'].values
    return X, y

X, y = build_dataset_with_metadata(ratings_with_metadata)

In [None]:
from fastFM import als
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

fm = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
fm.fit(X_train, y_train)

y_pred = fm.predict(X_test)

rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

RMSE: 1.0348762431901857
MAE: 0.789260070106665


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse, mae]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

# MLP

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

metadata_df = pd.DataFrame.from_dict(movie_metadata, orient='index')
mlb_genre = MultiLabelBinarizer()
genres_encoded = mlb_genre.fit_transform(metadata_df['genre'])
le_director = LabelEncoder()
directors_encoded = le_director.fit_transform(metadata_df['director']).reshape(-1, 1)
metadata_features = np.concatenate([genres_encoded, directors_encoded], axis=1)
metadata_df['movie_id'] = metadata_df.index
movie_id_to_index = {id: idx for idx, id in enumerate(metadata_df['movie_id'])}

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
ratings_df_sampled = ratings_df.sample(frac=0.2)
ratings_df_sampled['movie_id'] = ratings_df_sampled['movie_id'].astype(str)
ratings_df_sampled['movie_idx'] = ratings_df_sampled['movie_id'].map(movie_id_to_index)
user_le = LabelEncoder()
ratings_df_sampled['user_id_encoded'] = user_le.fit_transform(ratings_df_sampled['user_id'])
num_users = ratings_df_sampled['user_id_encoded'].nunique()
users_encoded = to_categorical(ratings_df_sampled['user_id_encoded'], num_users)
X_metadata = np.array([metadata_features[idx] for idx in ratings_df_sampled['movie_idx']])
X = np.concatenate([users_encoded, X_metadata], axis=1)
y = ratings_df_sampled['rating'].values

In [None]:
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7e8a36f89990>

In [None]:
y_pred = model.predict(X_test).flatten()

rmse = sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')


RMSE: 1.0651270353809188
MAE: 0.8526922889110455


In [None]:
df = pd.DataFrame({
    'Metric': ['RMSE', 'MAE'],
    'Value': [rmse, mae]
})

fig = go.Figure(data=[
    go.Bar(
        x=df['Metric'],
        y=df['Value'],
        marker_color=['blue', 'red']
    )
])

fig.update_layout(
    title='Comparaison RMSE et MAE',
    xaxis_title='Métrique',
    yaxis_title='Valeur'
)

fig.show()

## Recap

In [None]:
import pandas as pd
import plotly.express as px

models = ["User-based CF", "Item-based CF", "Vanilla MF", "SVD with bias", "SVD++", "FM", "MLP"]
rmse_scores = [1.0421, 1.0189, 0.9540, 0.8480, 0.9364, 1.0348, 1.0651]
mae_scores = [0.8339, 0.7979, 0.7538, 0.6709, 0.7356, 0.7892, 0.8526]

results_df = pd.DataFrame({
    "Model": models,
    "RMSE": rmse_scores,
    "MAE": mae_scores
})

results_df_sorted = results_df.sort_values(by="RMSE")

fig = px.bar(
    results_df_sorted,
    x="Model",
    y="RMSE",
    title="RMSE by Model",
    labels={"Model": "Model", "RMSE": "Root Mean Squared Error (RMSE)"},
    text="RMSE",
    color="RMSE",
    color_continuous_scale="Viridis",
)

fig.show()

mae_df = pd.DataFrame({
    "Model": models,
    "MAE": mae_scores
})

mae_df_sorted = mae_df.sort_values(by="MAE")

fig = px.bar(
    mae_df_sorted,
    x="Model",
    y="MAE",
    title="MAE by Model",
    labels={"Model": "Model", "MAE": "Mean Absolute Error (MAE)"},
    text="MAE",
    color="MAE",
    color_continuous_scale="Viridis",
)

fig.show()


In [None]:
results_df

Unnamed: 0,Model,RMSE,MAE
0,User-based CF,1.0421,0.8339
1,Item-based CF,1.0189,0.7979
2,Vanilla MF,0.954,0.7538
3,SVD with bias,0.848,0.6709
4,SVD++,0.9364,0.7356
5,FM,1.0348,0.7892
6,MLP,1.0651,0.8526


# Section 4 : Mise en place de notre système de recommandation

Nous avons choisis d'implémenter un système de recommandation basé sur la méthode ALS - Alternating Least Squares

In [None]:
from surprise import AlgoBase, Dataset, Reader
from surprise.model_selection import cross_validate
import numpy as np

class MyALSAlgorithm(AlgoBase):
    def __init__(self, n_factors=100, n_epochs=10, reg=0.1):
        AlgoBase.__init__(self)
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.reg = reg

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)

        self.global_mean = np.mean([rating for (_, _, rating) in trainset.all_ratings()])

        # Initializing user and item factors
        self.user_factors = np.random.normal(size=(trainset.n_users, self.n_factors))
        self.item_factors = np.random.normal(size=(trainset.n_items, self.n_factors))

        # ALS optimization
        for _ in range(self.n_epochs):
            self.user_factors = self.als_step(self.item_factors, trainset.ur, self.user_factors)
            self.item_factors = self.als_step(self.user_factors, trainset.ir, self.item_factors, user_based=False)

        return self


    def als_step(self, fixed_factors, interactions, variable_factors, user_based=True):
        for i in range(variable_factors.shape[0]):
            # Extracting the indices of the items/users that the current user/item has interacted with
            if user_based:
                idx = [j for j, _ in interactions[i]]
            else:
                idx = [j for j, _ in interactions[i]]

            # Extracting the corresponding factors
            A = fixed_factors[idx, :]

            # Building the ratings vector for the user/item
            ratings = np.array([r for _, r in interactions[i]])

            # Regularization term - to prevent overfitting
            reg_term = np.eye(self.n_factors) * self.reg

            # Solving the least squares problem: (A^T * A + reg_term) * x = A^T * ratings
            variable_factors[i, :] = np.linalg.solve(np.dot(A.T, A) + reg_term, np.dot(A.T, ratings))

        return variable_factors

    def estimate(self, u, i):
        # Predict rating
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.user_factors[u, :], self.item_factors[i, :].T)
        else:
            # Returning the global mean if the user or item is unknown
            return self.global_mean

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

algo = MyALSAlgorithm()

cv = cross_validate(algo, data, verbose=True)


Evaluating RMSE, MAE of algorithm MyALSAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    2.0613  2.0664  2.0752  2.0727  2.0682  2.0688  0.0049  
MAE (testset)     1.7434  1.7443  1.7515  1.7496  1.7464  1.7470  0.0031  
Fit time          148.12  146.53  146.34  149.48  147.58  147.61  1.14    
Test time         1.00    0.39    0.40    0.84    0.38    0.60    0.26    


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

test_rmse = cv['test_rmse']
test_mae = cv['test_mae']
fit_time = cv['fit_time']
test_time = cv['test_time']

fig = make_subplots(rows=2, cols=2, subplot_titles=("RMSE per Fold", "MAE per Fold", "Fit Time per Fold", "Test Time per Fold"))

fig.add_trace(go.Scatter(x=list(range(1, 6)), y=test_rmse, mode='markers+lines', name='RMSE'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(1, 6)), y=test_mae, mode='markers+lines', name='MAE'), row=1, col=2)
fig.add_trace(go.Scatter(x=list(range(1, 6)), y=fit_time, mode='markers+lines', name='Fit Time'), row=2, col=1)
fig.add_trace(go.Scatter(x=list(range(1, 6)), y=test_time, mode='markers+lines', name='Test Time'), row=2, col=2)

fig.update_layout(height=600, width=800, title_text="Performance Metrics Across Folds")
fig.show()


## NDCG / HR

In [None]:
from surprise import Dataset, Reader
from surprise.model_selection import split
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import numpy as np

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)
algo = MyALSAlgorithm()
algo.fit(trainset)


<__main__.MyALSAlgorithm at 0x7e88768b2980>

In [None]:
predictions = algo.test(testset)

In [None]:
def hit_rate(top_n, actual_items):
    """Calculating Hit Rate for recommended items"""
    hits = sum(1 for item in top_n if item in actual_items)
    return hits / len(top_n)

def ndcg(top_n, actual_items):
    """Calculating NDCG for recommended items"""
    dcg = sum(1.0 / np.log2(i + 2) if top_n[i] in actual_items else 0 for i in range(len(top_n)))
    idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(top_n), len(actual_items))))
    return dcg / idcg if idcg > 0 else 0


In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Returning the top-N recommendation for each user from a set of predictions."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=10)


In [None]:
hr_scores = []
ndcg_scores = []

for uid, user_ratings in top_n.items():
    test_items = set([iid for (user, iid, _) in testset if user == uid])
    top_n_items = [iid for (iid, _) in user_ratings]

    hr_scores.append(hit_rate(top_n_items, test_items))
    ndcg_scores.append(ndcg(top_n_items, test_items))

mean_hr = np.mean(hr_scores)
mean_ndcg = np.mean(ndcg_scores)

print(f"Mean HR: {mean_hr}, Mean NDCG: {mean_ndcg}")


Mean HR: 1.0, Mean NDCG: 1.0


# Recap

## Recap

In [None]:
import pandas as pd
import plotly.express as px

models = ["User-based CF", "Item-based CF", "Vanilla MF", "SVD with bias", "SVD++", "FM", "MLP", "ALS"]
rmse_scores = [1.0421, 1.0189, 0.9540, 0.8480, 0.9364, 1.0348, 1.0651, 2.0688]
mae_scores = [0.8339, 0.7979, 0.7538, 0.6709, 0.7356, 0.7892, 0.8526, 1.7470]

results_df = pd.DataFrame({
    "Model": models,
    "RMSE": rmse_scores,
    "MAE": mae_scores
})

results_df_sorted = results_df.sort_values(by="RMSE")

fig = px.bar(
    results_df_sorted,
    x="Model",
    y="RMSE",
    title="RMSE by Model",
    labels={"Model": "Model", "RMSE": "Root Mean Squared Error (RMSE)"},
    text="RMSE",
    color="RMSE",
    color_continuous_scale="Viridis",
)

fig.show()

mae_df = pd.DataFrame({
    "Model": models,
    "MAE": mae_scores
})

mae_df_sorted = mae_df.sort_values(by="MAE")

fig = px.bar(
    mae_df_sorted,
    x="Model",
    y="MAE",
    title="MAE by Model",
    labels={"Model": "Model", "MAE": "Mean Absolute Error (MAE)"},
    text="MAE",
    color="MAE",
    color_continuous_scale="Viridis",
)

fig.show()


In [None]:
results_df

Unnamed: 0,Model,RMSE,MAE
0,User-based CF,1.0421,0.8339
1,Item-based CF,1.0189,0.7979
2,Vanilla MF,0.954,0.7538
3,SVD with bias,0.848,0.6709
4,SVD++,0.9364,0.7356
5,FM,1.0348,0.7892
6,MLP,1.0651,0.8526
7,ALS,2.0688,1.747


## Comparing methods

User-Based Collaborative Filtering:

    Advantages:
        Captures user preferences by finding similar users.
        No need for item metadata.

    Disadvantages:
        Cold start problem for new users.
        Sparsity of data can lead to poor recommendations.

Item-Based Collaborative Filtering:

    Advantages:
        Less affected by the cold start problem.
        Can provide recommendations for new items.

    Disadvantages:
        Computationally expensive for large datasets.
        May suffer from data sparsity.

Vanilla Matrix Factorization:

    Advantages:
        Captures latent factors in the data.
        Can handle missing values.

    Disadvantages:
        Prone to overfitting.
        May require extensive hyperparameter tuning.

SVD with Bias:

    Advantages:
        Effective in capturing complex relationships.
        Low RMSE and MAE indicate accurate predictions.

    Disadvantages:
        Computationally intensive.
        Sensitive to noisy data.

SVD++:

    Advantages:
        Builds upon SVD by considering implicit feedback.
        Better handling of sparse data.

    Disadvantages:
        Requires a large amount of data to perform well.
        May still suffer from scalability issues.

Factorization Machines (FM):

    Advantages:
        Can capture high-order interactions between features.
        Versatile and can be applied to various recommendation scenarios.

    Disadvantages:
        May require feature engineering.
        Can be computationally expensive for large feature spaces.

Multilayer Perceptron (MLP):

    Advantages:
        Non-linear model capable of capturing complex patterns.
        Good performance with sufficient data.
        
    Disadvantages:
        May overfit with limited data.
        Requires tuning of architecture and hyperparameters.

# Section 5 : Implémentation de notre système de recommandation au sein de notre application

Pour accéder à l'application : http://52.90.64.156/