# How To Run Recommender Systems

link: https://predictivehacks.com/how-to-run-recommender-systems-in-python/

In [1]:
import pandas as pd
import numpy as np

In [2]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

In [3]:
# Load the major data

df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
# Movie information

columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')

In [5]:
movies

Unnamed: 0,item_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
movie_names = movies[['item_id', 'movie title']]
movie_names

Unnamed: 0,item_id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [7]:
combined_movies_data = pd.merge(df, movie_names, on='item_id')
combined_movies_data = combined_movies_data[['user_id','movie title', 'rating']]
combined_movies_data.head()

Unnamed: 0,user_id,movie title,rating
0,196,Kolya (1996),3
1,63,Kolya (1996),3
2,226,Kolya (1996),5
3,154,Kolya (1996),3
4,306,Kolya (1996),5


In [8]:
# my user_id is the 1001

my_ratings = pd.read_csv('ml-100k/my_movies_rating.csv')
my_ratings.columns = ['user_id', 'movie title', 'rating']

my_ratings

Unnamed: 0,user_id,movie title,rating
0,1001,Aladdin (1992),1.0
1,1001,Braveheart (1995),5.0
2,1001,"Clockwork Orange, A (1971)",2.0
3,1001,Dances with Wolves (1990),3.5
4,1001,"English Patient, The (1996)",2.0
5,1001,Face/Off (1997),3.5
6,1001,Forrest Gump (1994),4.0
7,1001,"Game, The (1997)",3.5
8,1001,"Godfather, The (1972)",5.0
9,1001,Jurassic Park (1993),3.5


In [9]:
combined_movies_data = pd.concat([combined_movies_data, my_ratings], axis=0)
combined_movies_data

Unnamed: 0,user_id,movie title,rating
0,196,Kolya (1996),3.0
1,63,Kolya (1996),3.0
2,226,Kolya (1996),5.0
3,154,Kolya (1996),3.0
4,306,Kolya (1996),5.0
...,...,...,...
20,1001,Star Trek: The Wrath of Khan (1982),1.0
21,1001,Star Wars (1977),1.0
22,1001,Terminator 2: Judgment Day (1991),3.5
23,1001,Titanic (1997),4.0


In [11]:
# rename the columns to userID, itemID and rating
combined_movies_data.columns = ['userID', 'itemID', 'rating']

In [12]:
combined_movies_data

Unnamed: 0,userID,itemID,rating
0,196,Kolya (1996),3.0
1,63,Kolya (1996),3.0
2,226,Kolya (1996),5.0
3,154,Kolya (1996),3.0
4,306,Kolya (1996),5.0
...,...,...,...
20,1001,Star Trek: The Wrath of Khan (1982),1.0
21,1001,Star Wars (1977),1.0
22,1001,Terminator 2: Judgment Day (1991),3.5
23,1001,Titanic (1997),4.0


In [13]:
# count reviews for each move

combined_movies_data['reviews'] = combined_movies_data.groupby(['itemID'])['rating'].transform('count')
combined_movies_data['reviews']

0     117
1     117
2     117
3     117
4     117
     ... 
20    245
21    584
22    296
23    351
24    251
Name: reviews, Length: 100025, dtype: int64

In [14]:
combined_movies_data= combined_movies_data[combined_movies_data.reviews>25][['userID', 'itemID', 'rating']]

In [15]:
combined_movies_data

Unnamed: 0,userID,itemID,rating
0,196,Kolya (1996),3.0
1,63,Kolya (1996),3.0
2,226,Kolya (1996),5.0
3,154,Kolya (1996),3.0
4,306,Kolya (1996),5.0
...,...,...,...
20,1001,Star Trek: The Wrath of Khan (1982),1.0
21,1001,Star Wars (1977),1.0
22,1001,Terminator 2: Judgment Day (1991),3.5
23,1001,Titanic (1997),4.0


# Use Surprise

In [16]:
from surprise import NMF, SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, CoClustering
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset

In [17]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(combined_movies_data, reader)

In [18]:
# get the list of the movie ids
unique_ids = combined_movies_data['itemID'].unique()
len(unique_ids)

851

In [19]:
# get the list of the ids that the userid 1001 has rated
iids1001 = combined_movies_data.loc[combined_movies_data['userID']==1001, 'itemID']

In [20]:
iids1001

0                          Aladdin (1992)
1                       Braveheart (1995)
2              Clockwork Orange, A (1971)
3               Dances with Wolves (1990)
4             English Patient, The (1996)
5                         Face/Off (1997)
6                     Forrest Gump (1994)
7                        Game, The (1997)
8                   Godfather, The (1972)
9                    Jurassic Park (1993)
10                       Liar Liar (1997)
11                  Lion King, The (1994)
12                    Pulp Fiction (1994)
13                  Reservoir Dogs (1992)
14              Return of the Jedi (1983)
15                       Rock, The (1996)
16                          Scream (1996)
17                   Seven (Se7en) (1995)
18       Silence of the Lambs, The (1991)
19        Star Trek: First Contact (1996)
20    Star Trek: The Wrath of Khan (1982)
21                       Star Wars (1977)
22      Terminator 2: Judgment Day (1991)
23                         Titanic

In [21]:
# remove the rated movies for the recommendations
movies_to_predict = np.setdiff1d(unique_ids,iids1001)
movies_to_predict

array(['101 Dalmatians (1996)', '12 Angry Men (1957)', '187 (1997)',
       '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)',
       '2001: A Space Odyssey (1968)', '39 Steps, The (1935)',
       '8 1/2 (1963)', 'Absolute Power (1997)', 'Abyss, The (1989)',
       'Ace Ventura: Pet Detective (1994)',
       'Ace Ventura: When Nature Calls (1995)',
       'Addams Family Values (1993)', 'Addicted to Love (1997)',
       'Adventures of Pinocchio, The (1996)',
       'Adventures of Priscilla, Queen of the Desert, The (1994)',
       'Adventures of Robin Hood, The (1938)',
       'Affair to Remember, An (1957)', 'African Queen, The (1951)',
       'Age of Innocence, The (1993)', 'Air Bud (1997)',
       'Air Force One (1997)', 'Airheads (1994)', 'Akira (1988)',
       'Aladdin and the King of Thieves (1996)',
       'Alice in Wonderland (1951)', 'Alien (1979)', 'Alien 3 (1992)',
       'Alien: Resurrection (1997)', 'Aliens (1986)',
       'All About Eve (1950)', '

# Recommender Systems using NMF

In [22]:
algo = NMF()
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x7f857494d6d0>

In [23]:
my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=1001,iid=iid).est))

#My recommendations according to NMF:
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
667,Singin' in the Rain (1952),3.907929
608,Rear Window (1954),3.906779
141,Casablanca (1942),3.902223
55,As Good As It Gets (1997),3.849125
661,"Shawshank Redemption, The (1994)",3.807796
724,Swingers (1996),3.782632
178,Cool Hand Luke (1967),3.715865
600,Raiders of the Lost Ark (1981),3.695051
660,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,3.684342
680,Some Folks Call It a Sling Blade (1993),3.671098


# Recommender Systems using SVD

In [24]:
algo = SVD()
algo.fit(data.build_full_trainset())

my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=1001,iid=iid).est))
    
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
593,Psycho (1960),4.020073
141,Casablanca (1942),4.017672
641,Schindler's List (1993),3.943196
110,"Boot, Das (1981)",3.909336
536,North by Northwest (1959),3.857742
731,Taxi Driver (1976),3.760249
329,GoodFellas (1990),3.753714
327,Good Will Hunting (1997),3.72888
168,"Close Shave, A (1995)",3.708267
608,Rear Window (1954),3.699643


# Recommender Systems using SVD++


In [25]:
algo = SVDpp()
algo.fit(data.build_full_trainset())

my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=1001,iid=iid).est))
    
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)

Unnamed: 0,iid,predictions
641,Schindler's List (1993),4.115901
549,One Flew Over the Cuckoo's Nest (1975),4.063863
593,Psycho (1960),3.992354
661,"Shawshank Redemption, The (1994)",3.975734
757,To Kill a Mockingbird (1962),3.908425
46,Apocalypse Now (1979),3.902816
719,Sunset Blvd. (1950),3.81281
330,"Graduate, The (1967)",3.812287
536,North by Northwest (1959),3.801013
354,Henry V (1989),3.758968


# Recommender Systems using KNN with Z-Score

In [26]:
algo = KNNWithZScore()
algo.fit(data.build_full_trainset())
my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=1001,iid=iid).est))
    
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)


Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,iid,predictions
425,L.A. Confidential (1997),4.284275
661,"Shawshank Redemption, The (1994)",4.214673
168,"Close Shave, A (1995)",4.177236
820,"Wrong Trousers, The (1993)",4.161339
608,Rear Window (1954),4.118845
55,As Good As It Gets (1997),4.10819
1,12 Angry Men (1957),4.096164
549,One Flew Over the Cuckoo's Nest (1975),4.088914
784,"Usual Suspects, The (1995)",4.086552
647,Secrets & Lies (1996),4.082041


# Recommender Systems using Co-Clustering

In [27]:
algo = CoClustering()
algo.fit(data.build_full_trainset())
my_recs = []
for iid in movies_to_predict:
    my_recs.append((iid, algo.predict(uid=1001,iid=iid).est))
    
pd.DataFrame(my_recs, columns=['iid', 'predictions']).sort_values('predictions', ascending=False).head(10)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  algo.fit(data.build_full_trainset())


Unnamed: 0,iid,predictions
168,"Close Shave, A (1995)",3.902191
641,Schindler's List (1993),3.877562
820,"Wrong Trousers, The (1993)",3.877221
141,Casablanca (1942),3.867909
796,Wallace & Gromit: The Best of Aardman Animatio...,3.858881
661,"Shawshank Redemption, The (1994)",3.856349
1,12 Angry Men (1957),3.837551
608,Rear Window (1954),3.798679
784,"Usual Suspects, The (1995)",3.796887
680,Some Folks Call It a Sling Blade (1993),3.786233


In [28]:
cv = []
# Iterate over all recommender system algorithms
for recsys in [NMF(), SVD(), SVDpp(), KNNWithZScore(), CoClustering()]:
    # Perform cross validation
    tmp = cross_validate(recsys, data, measures=['RMSE'], cv=3, verbose=False)
    cv.append((str(recsys).split(' ')[0].split('.')[-1], tmp['test_rmse'].mean()))
pd.DataFrame(cv, columns=['RecSys', 'RMSE'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,RecSys,RMSE
0,NMF,0.956529
1,SVD,0.934578
2,SVDpp,0.917004
3,KNNWithZScore,0.943103
4,CoClustering,0.949282


SVV+ gives the best result.