In [1]:
import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms.matrix_factorization import SVD
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [2]:
ratings_path = r"C:\Users\ASUS\Desktop\movie_recommender\data\ratings.csv"
movies_path = r"C:\Users\ASUS\Desktop\movie_recommender\data\movies.csv"

In [3]:
ratings_df = pd.read_csv(ratings_path)
movies_df = pd.read_csv(movies_path)

In [4]:
df = pd.merge(ratings_df, movies_df[['movieId', 'genres']], on = 'movieId', how = 'left')
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance
2,1,6,4.0,964982224,Action|Crime|Thriller
3,1,47,5.0,964983815,Mystery|Thriller
4,1,50,5.0,964982931,Crime|Mystery|Thriller
5,1,70,3.0,964982400,Action|Comedy|Horror|Thriller
6,1,101,5.0,964980868,Adventure|Comedy|Crime|Romance
7,1,110,4.0,964982176,Action|Drama|War
8,1,151,5.0,964984041,Action|Drama|Romance|War
9,1,157,5.0,964984100,Comedy|War


In [5]:
encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

df['userId'] = encoder.fit_transform(df['userId'])
df['movieId'] = encoder.fit_transform(df['movieId'])

df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres').str.split('|')), columns = mlb.classes_, index = df.index ))
     

In [6]:
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,964982703,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,964981247,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,964982224,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,964983815,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,964982931,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,0,62,3.0,964982400,0,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
6,0,89,5.0,964980868,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
7,0,97,4.0,964982176,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,124,5.0,964984041,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
9,0,130,5.0,964984100,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [7]:
df.drop(columns = "(no genres listed)", inplace = True)
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,964982703,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,964981247,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,964982224,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,964983815,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,964982931,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
5,0,62,3.0,964982400,1,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
6,0,89,5.0,964980868,0,1,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
7,0,97,4.0,964982176,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,0,124,5.0,964984041,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
9,0,130,5.0,964984100,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 23 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId       100836 non-null  int64  
 1   movieId      100836 non-null  int64  
 2   rating       100836 non-null  float64
 3   timestamp    100836 non-null  int64  
 4   Action       100836 non-null  int32  
 5   Adventure    100836 non-null  int32  
 6   Animation    100836 non-null  int32  
 7   Children     100836 non-null  int32  
 8   Comedy       100836 non-null  int32  
 9   Crime        100836 non-null  int32  
 10  Documentary  100836 non-null  int32  
 11  Drama        100836 non-null  int32  
 12  Fantasy      100836 non-null  int32  
 13  Film-Noir    100836 non-null  int32  
 14  Horror       100836 non-null  int32  
 15  IMAX         100836 non-null  int32  
 16  Musical      100836 non-null  int32  
 17  Mystery      100836 non-null  int32  
 18  Romance      100836 non-

In [9]:
train_df, test_df = train_test_split(df, test_size = 0.2)
train_df

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
47053,306,1616,2.5,1186162276,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18031,112,1062,3.0,980306983,0,0,0,0,1,0,...,0,0,0,1,0,1,0,0,0,0
82455,522,6621,4.0,1503126258,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
41546,281,1776,5.0,1378489057,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2886,18,2485,2.0,965703425,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15219,97,7449,5.0,1532457912,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67466,436,16,4.0,859720978,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
25788,176,8388,4.0,1435837729,1,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
18825,120,486,4.0,847656290,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80668 entries, 47053 to 54813
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userId       80668 non-null  int64  
 1   movieId      80668 non-null  int64  
 2   rating       80668 non-null  float64
 3   timestamp    80668 non-null  int64  
 4   Action       80668 non-null  int32  
 5   Adventure    80668 non-null  int32  
 6   Animation    80668 non-null  int32  
 7   Children     80668 non-null  int32  
 8   Comedy       80668 non-null  int32  
 9   Crime        80668 non-null  int32  
 10  Documentary  80668 non-null  int32  
 11  Drama        80668 non-null  int32  
 12  Fantasy      80668 non-null  int32  
 13  Film-Noir    80668 non-null  int32  
 14  Horror       80668 non-null  int32  
 15  IMAX         80668 non-null  int32  
 16  Musical      80668 non-null  int32  
 17  Mystery      80668 non-null  int32  
 18  Romance      80668 non-null  int32  
 19  Sci-F

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20168 entries, 27924 to 96036
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userId       20168 non-null  int64  
 1   movieId      20168 non-null  int64  
 2   rating       20168 non-null  float64
 3   timestamp    20168 non-null  int64  
 4   Action       20168 non-null  int32  
 5   Adventure    20168 non-null  int32  
 6   Animation    20168 non-null  int32  
 7   Children     20168 non-null  int32  
 8   Comedy       20168 non-null  int32  
 9   Crime        20168 non-null  int32  
 10  Documentary  20168 non-null  int32  
 11  Drama        20168 non-null  int32  
 12  Fantasy      20168 non-null  int32  
 13  Film-Noir    20168 non-null  int32  
 14  Horror       20168 non-null  int32  
 15  IMAX         20168 non-null  int32  
 16  Musical      20168 non-null  int32  
 17  Mystery      20168 non-null  int32  
 18  Romance      20168 non-null  int32  
 19  Sci-F

In [12]:
train_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,...,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0,80668.0
mean,325.171183,3092.623816,3.4993,1205182000.0,0.304259,0.239736,0.069495,0.091523,0.38713,0.164737,...,0.008554,0.072457,0.04076,0.041392,0.075842,0.180158,0.170886,0.261244,0.047925,0.019438
std,182.483809,2622.701371,1.042546,216443100.0,0.460096,0.426925,0.254295,0.288354,0.487097,0.370945,...,0.09209,0.259245,0.197734,0.199196,0.264746,0.384322,0.376411,0.439315,0.213609,0.138058
min,0.0,0.0,0.5,828124600.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,176.0,899.0,3.0,1014822000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,325.0,2248.0,3.5,1183239000.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,476.0,5057.25,4.0,1435994000.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,609.0,9723.0,5.0,1537799000.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
scale = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], scale)
trainset = data.build_full_trainset()

In [14]:
model_svd = SVD(n_factors=100, n_epochs=30, lr_all=0.005, reg_all=0.04)
model_svd.fit(trainset)

predictions_svd = model_svd.test(trainset.build_anti_testset())
rmse_score = accuracy.rmse(predictions_svd)
print(f"Updated RMSE: {rmse_score}")

RMSE: 0.4986
Updated RMSE: 0.4986220266810801


In [19]:
def get_top_n_recommendations(user_id, n=12):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))

  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model_svd.test(user_movie_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]

  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)


  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]

  top_n_movies = encoder.inverse_transform(top_n_movie_ids)

  return top_n_movies

In [17]:
test_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
27924,189,8554,4.0,1504310669,0,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
28114,194,520,5.0,974706650,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
17367,110,2637,3.5,1516140996,0,0,0,0,0,1,...,0,1,0,0,1,0,0,1,0,0
8741,59,904,3.0,1393542084,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
56331,371,1050,5.0,874415328,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
user_id = 59
recommendations = get_top_n_recommendations(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 12 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

2.1715670963931637
2.228004054398255
2.2744082784056325
2.2903049731043237
2.3293168205473562
2.453465537973381
2.4698880708648865
2.471150761508248
2.5118408598266497
2.5403786762291856
2.563894751765771
2.5923872702158826
Top 5 Recommendations for User 59:
1.Stuart Saves His Family (1995)
2.Anaconda (1997)
3.Speed 2: Cruise Control (1997)
4.Godzilla (1998)
5.Honey, I Blew Up the Kid (1992)
6.I Still Know What You Did Last Summer (1998)
7.Rocky V (1990)
8.Wild Wild West (1999)
9.Inspector Gadget (1999)
10.Battlefield Earth (2000)
11.Catwoman (2004)
12.Disaster Movie (2008)
