# Recommendation

Data: https://grouplens.org/datasets/movielens/latest/

In [1]:
import os
import pandas as pd
from surprise import Reader, Dataset, accuracy, SVDpp
from surprise.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
movielens_dir = 'data/ml-latest-small'

ratings_file = os.path.join(movielens_dir, "ratings.csv")
tags_file = os.path.join(movielens_dir, "tags.csv")
movies_file = os.path.join(movielens_dir, "movies.csv")

df = pd.read_csv(ratings_file)
tags = pd.read_csv(tags_file)
movies = pd.read_csv(movies_file)

In [3]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
user_item_matrix = df.pivot(index='userId', columns='movieId', values='rating')

In [7]:
reader = Reader(rating_scale=(0.5, 5))

data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

In [8]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [10, 20, 30],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1, 0.2]
}

grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(data)

print("Best parameters:", grid_search.best_params)
print("Best RMSE:", grid_search.best_score['rmse'])
print("Best MAE:", grid_search.best_score['mae'])


In [None]:
best_model = grid_search.best_estimator['rmse']

best_model.fit(trainset)

In [11]:
algo = SVDpp(n_factors=100,
             n_epochs=5,
             lr_all=0.01,
             reg_all=0.01,
             verbose=True)


algo.fit(trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x77167225c810>

In [12]:
predictions = algo.test(testset)

y_true = [pred.r_ui for pred in predictions]
y_pred = [pred.est for pred in predictions]

mae = mean_absolute_error(y_true, y_pred)
print(f'Mean Absolute Error (MAE): {mae:.4f}')

r2 = r2_score(y_true, y_pred)
print(f'R²: {r2:.4f}')


Mean Absolute Error (MAE): 0.6754
R²: 0.2931


In [13]:
def get_top_n_recommendations(user_id, n=5):
    all_items = user_item_matrix.columns

    rated_items = user_item_matrix.loc[user_id].dropna().index

    unrated_items = [item for item in all_items if item not in rated_items]
    predictions = [algo.predict(user_id, item) for item in unrated_items]

    predictions.sort(key=lambda x: x.est, reverse=True)

    top_n_items = [pred.iid for pred in predictions[:n]]
    return top_n_items

recommendations = get_top_n_recommendations(user_id=1, n=5)
print(f'Top 5 recommendations for user 1: {recommendations}')

Top 5 recommendations for user 1: [858, 904, 4973, 4993, 5618]


In [14]:
for id in recommendations:
    print(movies[movies['movieId']==id]['title'])

659    Godfather, The (1972)
Name: title, dtype: object
686    Rear Window (1954)
Name: title, dtype: object
3622    Amelie (Fabuleux destin d'Amélie Poulain, Le) ...
Name: title, dtype: object
3638    Lord of the Rings: The Fellowship of the Ring,...
Name: title, dtype: object
3984    Spirited Away (Sen to Chihiro no kamikakushi) ...
Name: title, dtype: object
