# Collaborative Filtering with surprise

## NormalPredictor

In [1]:
import pandas as pd
from surprise import Reader
from surprise import Dataset

In [2]:
df_ratings = pd.read_csv("../data/raw/ratings.csv")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
df_ratings = df_ratings.drop("timestamp", axis=1)
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [4]:
df_movies = pd.read_csv("../data/raw/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
reader = Reader(rating_scale=(0, 5))
df_surprise = Dataset.load_from_df(df_ratings, reader=reader)


In [6]:
from surprise import NormalPredictor
from surprise.model_selection import cross_validate

normpred = NormalPredictor()

cross_validate(normpred, df_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.4382  1.4371  1.4377  1.4374  1.4371  1.4375  0.0004  
MAE (testset)     1.1467  1.1456  1.1458  1.1461  1.1458  1.1460  0.0004  
Fit time          9.35    11.56   11.84   11.73   12.31   11.36   1.04    
Test time         28.28   21.91   29.45   23.12   28.43   26.24   3.09    


{'test_rmse': array([1.43818598, 1.43705562, 1.43766151, 1.43738785, 1.43708963]),
 'test_mae': array([1.14672847, 1.14556725, 1.14581694, 1.14606023, 1.14583065]),
 'fit_time': (9.346808195114136,
  11.562877893447876,
  11.844789028167725,
  11.729647159576416,
  12.313029766082764),
 'test_time': (28.28259587287903,
  21.908761024475098,
  29.44517707824707,
  23.124305725097656,
  28.42549705505371)}

In [7]:
user_id = 1000

In [8]:
train_set = df_surprise.build_full_trainset()

anti_testset = []

targetUser = train_set.to_inner_uid(user_id)

moyenne = train_set.global_mean

user_note = train_set.ur[targetUser]

user_livre = [item for (item,_) in (user_note)]

ratings = train_set.all_ratings()

for livre in train_set.all_items():

    if livre not in user_livre:

        anti_testset.append((user_id, train_set.to_raw_iid(livre), moyenne))


In [9]:
movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
predictions_df = pd.DataFrame(predictions)
predictions_df['title'] = predictions_df['iid'].map(movieId_title_map)
predictions_df = predictions_df.rename(columns={'uid': 'userId', 'est': 'note'})
predictions_df = predictions_df[['userId', 'title', 'note']]

predictions_df.head(10)

NameError: name 'predictions' is not defined

## SVD

In [10]:
from surprise import SVD
from surprise.model_selection import cross_validate
svd = SVD()
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7862  0.7866  0.7850  0.7859  0.7860  0.7859  0.0005  
MAE (testset)     0.5983  0.5984  0.5975  0.5981  0.5978  0.5980  0.0003  
Fit time          97.77   114.94  114.26  114.93  110.23  110.43  6.57    
Test time         42.22   43.51   44.04   34.46   42.84   41.41   3.53    


{'test_rmse': array([0.78621578, 0.78657014, 0.78502879, 0.78587988, 0.78596539]),
 'test_mae': array([0.59825212, 0.59844251, 0.59754015, 0.59807276, 0.59775006]),
 'fit_time': (97.7685718536377,
  114.94235634803772,
  114.25651407241821,
  114.93496108055115,
  110.2253828048706),
 'test_time': (42.219940185546875,
  43.51165795326233,
  44.03701305389404,
  34.45718193054199,
  42.83836913108826)}

In [11]:
movieId_title_map = df_movies.set_index('movieId')['title'].to_dict()
predictionsSVD = svd.test(anti_testset)

predictionsSVD = pd.DataFrame(predictionsSVD)

predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)
predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})

predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
predictionsSVD.sort_values('note', ascending=False, inplace=True)
predictionsSVD.head(10)

Unnamed: 0,userId,title,note
2960,1000,Serenity (2005),4.814452
1809,1000,"Dark Knight Rises, The (2012)",4.794528
878,1000,Howl's Moving Castle (Hauru no ugoku shiro) (2...,4.783524
12,1000,"Shawshank Redemption, The (1994)",4.779427
1754,1000,Stardust (2007),4.740225
1337,1000,"Green Mile, The (1999)",4.735305
2684,1000,"Passion of the Christ, The (2004)",4.717988
1815,1000,"Hobbit: The Desolation of Smaug, The (2013)",4.71152
1743,1000,"Illusionist, The (2006)",4.710351
5653,1000,"Batman: The Dark Knight Returns, Part 1 (2012)",4.706515


## SVD Tronqué

In [17]:
from surprise import SVD
svd = SVD(n_factors=50)
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7872  0.7866  0.7867  0.7860  0.7872  0.7867  0.0004  
MAE (testset)     0.5990  0.5987  0.5989  0.5981  0.5995  0.5989  0.0004  
Fit time          77.50   82.54   86.39   84.79   89.85   84.21   4.12    
Test time         27.12   35.86   31.14   32.40   29.12   31.13   2.97    


{'test_rmse': array([0.78718928, 0.78664585, 0.78671536, 0.78597414, 0.7871939 ]),
 'test_mae': array([0.59902864, 0.59874219, 0.59886645, 0.59814903, 0.59946982]),
 'fit_time': (77.49506688117981,
  82.53794598579407,
  86.38784098625183,
  84.78652095794678,
  89.8542652130127),
 'test_time': (27.119922161102295,
  35.863821029663086,
  31.138434886932373,
  32.403340578079224,
  29.12393283843994)}

In [18]:
predictionsSVD = svd.test(anti_testset)

predictionsSVD = pd.DataFrame(predictionsSVD)

predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)
predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})

predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
predictionsSVD.sort_values('note', ascending=False, inplace=True)
predictionsSVD.head(10)

Unnamed: 0,userId,title,note
562,1000,Gladiator (2000),4.911072
1809,1000,"Dark Knight Rises, The (2012)",4.881962
2960,1000,Serenity (2005),4.826835
823,1000,Kill Bill: Vol. 1 (2003),4.809776
89,1000,"Sixth Sense, The (1999)",4.807095
2612,1000,Interstellar (2014),4.790193
12,1000,"Shawshank Redemption, The (1994)",4.784331
172,1000,Braveheart (1995),4.775283
143,1000,Pirates of the Caribbean: The Curse of the Bla...,4.77377
807,1000,"Matrix Reloaded, The (2003)",4.720901


In [21]:
from surprise import SVD
svd = SVD(n_factors=25)
cross_validate(svd, df_surprise,measures=['RMSE', 'MAE'], cv=5, verbose=True )

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7889  0.7893  0.7886  0.7883  0.7881  0.7886  0.0004  
MAE (testset)     0.6005  0.6009  0.6005  0.6004  0.6003  0.6005  0.0002  
Fit time          77.18   69.26   80.87   76.94   83.43   77.53   4.79    
Test time         47.90   34.65   34.74   34.71   34.98   37.40   5.25    


{'test_rmse': array([0.78885566, 0.78930682, 0.78863671, 0.78832992, 0.78810567]),
 'test_mae': array([0.6004796 , 0.6008774 , 0.60048894, 0.60039586, 0.60029309]),
 'fit_time': (77.18194508552551,
  69.25884127616882,
  80.86783480644226,
  76.93908715248108,
  83.4263117313385),
 'test_time': (47.901293992996216,
  34.64883995056152,
  34.744434118270874,
  34.70640993118286,
  34.97965598106384)}

In [22]:
predictionsSVD = svd.test(anti_testset)

predictionsSVD = pd.DataFrame(predictionsSVD)

predictionsSVD['title'] = predictionsSVD['iid'].map(movieId_title_map)
predictionsSVD = predictionsSVD.rename(columns={'uid': 'userId', 'est': 'note'})

predictionsSVD = predictionsSVD[['userId', 'title', 'note']]
predictionsSVD.sort_values('note', ascending=False, inplace=True)
predictionsSVD.head(10)

Unnamed: 0,userId,title,note
1317,1000,Good Will Hunting (1997),4.884728
12,1000,"Shawshank Redemption, The (1994)",4.872083
562,1000,Gladiator (2000),4.85042
1066,1000,Life Is Beautiful (La Vita Ã¨ bella) (1997),4.789677
1809,1000,"Dark Knight Rises, The (2012)",4.767926
2599,1000,Intouchables (2011),4.75867
359,1000,Forrest Gump (1994),4.757844
172,1000,Braveheart (1995),4.753261
1600,1000,"Bourne Identity, The (2002)",4.745722
2905,1000,Man on Fire (2004),4.741475
