# Collaborative Filtering

## The Framework

In [None]:
#!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip (movie, rating, users)

In [8]:
#Download the MovieLens 100k archive; the loading cells below read from the ml-100k/ folder
!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

--2020-09-03 06:56:39--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2020-09-03 06:56:40 (5.82 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]



In [9]:
import zipfile

#Extract the ml-100k archive fetched by the wget cell into the working directory.
#The loading cells below read /content/ml-100k/u.user, u.item and u.data,
#so the archive opened here must be ml-100k.zip (not ml-1m.zip).
with zipfile.ZipFile("/content/ml-100k.zip", "r") as zip_ref:
    zip_ref.extractall()

In [3]:
import pandas as pd
import numpy as np

In [6]:
#Column labels for u.user; they are supplied here via names= when reading the file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

#Read the pipe-delimited user table into a dataframe
users = pd.read_csv(
    '/content/ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

#Report (rows, columns), then preview the first few users
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
#Column labels for u.item: five descriptive fields followed by one flag column per genre
i_cols = [
    'movie_id', 'title' ,'release date','video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

#Read the pipe-delimited movie table into a dataframe
movies = pd.read_csv(
    '/content/ml-100k/u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

#Report (rows, columns), then preview the first few movies
print(movies.shape)
movies.head()

(1682, 24)


Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [10]:
#Keep only the movie ID and title columns; the release/URL/genre columns are discarded
movies = movies.loc[:, ['movie_id', 'title']]

In [12]:
#Column labels for u.data; they are supplied here via names= when reading the file
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

#Read the tab-delimited ratings table into a dataframe
ratings = pd.read_csv(
    '/content/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

#Report (rows, columns), then preview the first few ratings
print(ratings.shape)

ratings.head()

(100000, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [13]:
#Drop the timestamp column, leaving user_id, movie_id and rating.
#errors='ignore' makes the cell safe to re-run: without it a second execution
#raises KeyError because the column has already been removed.
ratings = ratings.drop(columns='timestamp', errors='ignore')

In [14]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#X is a copy of the full ratings dataframe; y (the user_id column) is only
#used as the stratification label
X = ratings.copy()
y = ratings['user_id']

#Hold out 25% of the ratings as the test set, stratified along user_id,
#with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [15]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Root mean squared error (RMSE) between actual and predicted ratings
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of y_pred against y_true."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [16]:
#Baseline model: predicts the same rating for every user-movie pair
def baseline(user_id, movie_id):
    """Return the constant rating 3.0; both arguments are accepted but ignored."""
    constant_rating = 3.0
    return constant_rating

In [17]:
#RMSE obtained on the testing set by a model
def score(cf_model):
    """Evaluate cf_model on X_test and return its RMSE.

    cf_model is called as cf_model(user_id, movie_id) once for every row of
    X_test and must return the predicted rating for that pair.
    """
    #Predict a rating for each (user, movie) pair of the testing dataset
    predictions = np.array([
        cf_model(user, movie)
        for user, movie in zip(X_test['user_id'], X_test['movie_id'])
    ])

    #Ratings the users actually gave in the test data
    actual = np.array(X_test['rating'])

    return rmse(actual, predictions)

In [18]:
#Compute RMSE for the constant-rating baseline model
score(baseline)

1.2470926188539486

## User Based Collaborative Filtering

### Ratings Matrix

In [19]:
#Pivot the training ratings into a matrix with one row per user_id and one
#column per movie_id; pairs without a training rating appear as NaN
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,1633,1634,1635,1636,1637,1638,1639,1640,1641,1642,1643,1644,1645,1647,1648,1649,1651,1652,1653,1654,1656,1657,1658,1659,1660,1661,1662,1663,1664,1668,1669,1670,1671,1673,1674,1675,1676,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,2.0,5.0,,5.0,,,,4.0,5.0,4.0,1.0,,4.0,3.0,4.0,3.0,,4.0,1.0,,3.0,5.0,4.0,,1.0,2.0,,3.0,4.0,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,2.0,,,4.0,4.0,,,,,3.0,,,,,,4.0,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,3.0,,,,,,,,,,,,,,,4.0,,,,3.0,,,4.0,3.0,,,,4.0,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Mean

In [20]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    """Predict a rating as the mean of all ratings given to the movie in r_matrix.

    user_id is accepted for interface compatibility with score() but does not
    influence the prediction.
    """
    #Without any rating for the movie in r_matrix, fall back to 3.0
    if movie_id not in r_matrix:
        return 3.0

    #Mean over every user who rated the movie (NaN entries are skipped by mean())
    return r_matrix[movie_id].mean()

In [21]:
#Compute RMSE for the Mean model on the testing set
score(cf_user_mean)

1.0234701463131335

### Weighted Mean

In [22]:
#Zero-filled variant of the ratings matrix (fillna returns a new dataframe,
#so r_matrix itself keeps its NaN entries)
r_matrix_dummy = r_matrix.fillna(0)

In [23]:
#Import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

#Pairwise cosine similarity between the rows (users) of the dummy ratings matrix;
#with a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(r_matrix_dummy)

In [24]:
#Wrap the similarity array in a dataframe labelled by user_id on both axes
cosine_sim = pd.DataFrame(
    cosine_sim,
    index=r_matrix.index,
    columns=r_matrix.index,
)

cosine_sim.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.0,0.118076,0.029097,0.011628,0.264677,0.312419,0.308729,0.224269,0.026017,0.286411,0.220437,0.213117,0.351204,0.235971,0.138522,0.341201,0.136597,0.340766,0.088945,0.168888,0.168084,0.245544,0.363621,0.24438,0.266342,0.184639,0.14204,0.218228,0.050924,0.149212,0.091746,0.129824,0.03811,0.011462,0.016556,0.0,0.153244,0.148866,0.0309,0.038933,...,0.103657,0.109444,0.070591,0.217109,0.238477,0.126022,0.154188,0.200165,0.175615,0.340986,0.068807,0.0,0.429265,0.107925,0.160852,0.21424,0.036933,0.233471,0.262186,0.124593,0.26624,0.06884,0.096171,0.179761,0.160145,0.238781,0.243788,0.12381,0.275831,0.395287,0.308475,0.055872,0.197862,0.131367,0.152449,0.084456,0.293293,0.056765,0.103536,0.326491
2,0.118076,1.0,0.099097,0.10768,0.034279,0.152789,0.086705,0.078864,0.06894,0.092399,0.098726,0.074706,0.143962,0.088732,0.352885,0.067225,0.141859,0.111234,0.085395,0.023666,0.117823,0.043818,0.085681,0.142768,0.082198,0.415777,0.108961,0.039521,0.196305,0.183749,0.052282,0.159113,0.083984,0.071035,0.070759,0.041877,0.071331,0.050981,0.190476,0.171597,...,0.17343,0.110674,0.228069,0.163651,0.052568,0.121685,0.302777,0.035253,0.031939,0.107282,0.011763,0.17294,0.083593,0.241509,0.028323,0.221194,0.283037,0.141837,0.08004,0.171161,0.143195,0.02992,0.391046,0.099513,0.091941,0.054671,0.177199,0.374315,0.011658,0.065508,0.086927,0.259636,0.289092,0.318824,0.149105,0.186347,0.168034,0.106748,0.136796,0.080358
3,0.029097,0.099097,1.0,0.252131,0.026893,0.062539,0.039767,0.089474,0.078162,0.03767,0.031866,0.08656,0.138187,0.040749,0.098841,0.057337,0.006849,0.027751,0.0,0.039608,0.11487,0.027501,0.022252,0.044016,0.0,0.115385,0.006513,0.013229,0.136596,0.063315,0.154689,0.071696,0.319455,0.095315,0.201552,0.067167,0.0,0.053754,0.168958,0.296983,...,0.047419,0.15727,0.058609,0.045193,0.058653,0.07512,0.049184,0.0118,0.032073,0.057332,0.0,0.27135,0.014727,0.027311,0.00808,0.205548,0.34503,0.084334,0.011482,0.138457,0.051355,0.156487,0.171978,0.013431,0.032057,0.033273,0.006673,0.143409,0.0,0.034622,0.040918,0.019031,0.065417,0.055373,0.086503,0.018418,0.096993,0.109631,0.092574,0.018987
4,0.011628,0.10768,0.252131,1.0,0.0,0.045543,0.078812,0.095354,0.059498,0.053879,0.074209,0.038437,0.060181,0.034741,0.107286,0.059952,0.0,0.014787,0.044219,0.031658,0.085692,0.014654,0.022396,0.142399,0.026656,0.068105,0.041645,0.089874,0.182678,0.048197,0.037467,0.084593,0.228098,0.206432,0.167468,0.080916,0.034698,0.069562,0.186854,0.142598,...,0.0,0.041901,0.036034,0.065676,0.101574,0.066713,0.076241,0.025151,0.042725,0.068204,0.0,0.119287,0.015694,0.0,0.0,0.121569,0.080093,0.053259,0.07342,0.067841,0.020524,0.156763,0.160368,0.09304,0.027331,0.0,0.0,0.040219,0.019493,0.024598,0.024226,0.050703,0.056561,0.107294,0.098892,0.0,0.1329,0.142798,0.097066,0.015176
5,0.264677,0.034279,0.026893,0.0,1.0,0.202843,0.299619,0.163724,0.038474,0.153021,0.290192,0.101406,0.298501,0.203679,0.090818,0.198008,0.024542,0.211799,0.123906,0.16377,0.138529,0.310043,0.322358,0.105729,0.182711,0.09395,0.052063,0.275308,0.0,0.102223,0.028427,0.111519,0.0,0.00339,0.0,0.0,0.146963,0.184862,0.0,0.003598,...,0.033272,0.028901,0.024854,0.1325,0.230388,0.0,0.081344,0.313078,0.166869,0.263148,0.05291,0.0,0.236461,0.061731,0.128287,0.109758,0.0,0.191999,0.329955,0.112303,0.197299,0.06384,0.0,0.195483,0.113108,0.19375,0.114039,0.062417,0.254115,0.303009,0.262547,0.048524,0.048312,0.022202,0.09191,0.066,0.156172,0.115842,0.124297,0.267574
6,0.312419,0.152789,0.062539,0.045543,0.202843,1.0,0.375963,0.131795,0.110944,0.400758,0.181573,0.144113,0.362558,0.255405,0.140799,0.318539,0.18146,0.435097,0.124124,0.081459,0.112906,0.163946,0.323844,0.239829,0.29529,0.19193,0.100198,0.144158,0.11217,0.115962,0.173279,0.091918,0.019113,0.055186,0.017519,0.009984,0.077397,0.121422,0.043595,0.122732,...,0.068162,0.163542,0.141608,0.19752,0.276502,0.094167,0.188212,0.242689,0.307533,0.35213,0.003155,0.078274,0.319394,0.079365,0.182315,0.222591,0.071534,0.181535,0.166834,0.081613,0.257186,0.034775,0.142396,0.103892,0.223762,0.322596,0.193909,0.202405,0.342766,0.362498,0.287549,0.080312,0.162988,0.182856,0.114262,0.09209,0.261859,0.097606,0.206104,0.187637
7,0.308729,0.086705,0.039767,0.078812,0.299619,0.375963,1.0,0.211282,0.107795,0.328923,0.253871,0.196016,0.405534,0.279351,0.094242,0.316566,0.047996,0.368376,0.146381,0.14941,0.197145,0.267456,0.305163,0.227525,0.253992,0.086829,0.063341,0.224672,0.062056,0.069856,0.093859,0.068116,0.067619,0.01759,0.055698,0.013365,0.217697,0.192564,0.052888,0.036407,...,0.089398,0.052483,0.103166,0.180978,0.28364,0.046556,0.141965,0.29155,0.248274,0.352779,0.065464,0.033958,0.335681,0.066402,0.189979,0.198241,0.042018,0.197898,0.30832,0.111075,0.281859,0.092209,0.074814,0.232053,0.141826,0.248047,0.088436,0.06369,0.39633,0.330114,0.290002,0.07417,0.094619,0.084235,0.11562,0.100625,0.233843,0.039199,0.224227,0.296332
8,0.224269,0.078864,0.089474,0.095354,0.163724,0.131795,0.211282,1.0,0.03704,0.183375,0.126203,0.112603,0.229197,0.12404,0.108578,0.250784,0.040091,0.157039,0.086363,0.170033,0.079572,0.349877,0.173679,0.14969,0.156184,0.115001,0.0,0.207795,0.025086,0.135901,0.014635,0.079943,0.10223,0.011999,0.026664,0.0,0.379606,0.061935,0.029859,0.038209,...,0.020188,0.049101,0.021113,0.062531,0.094421,0.0,0.198924,0.080591,0.023469,0.209176,0.0,0.021179,0.200436,0.0,0.025226,0.068037,0.074658,0.182845,0.240855,0.122974,0.160335,0.021985,0.081099,0.135236,0.116767,0.178544,0.067707,0.106042,0.121352,0.275864,0.165008,0.066843,0.058766,0.068759,0.087159,0.129381,0.188662,0.121223,0.08391,0.273238
9,0.026017,0.06894,0.078162,0.059498,0.038474,0.110944,0.107795,0.03704,1.0,0.155435,0.032419,0.115581,0.08288,0.113484,0.00838,0.044574,0.062707,0.091315,0.059361,0.0,0.060805,0.102293,0.090551,0.035983,0.093039,0.062982,0.130446,0.052991,0.049047,0.041408,0.03219,0.213934,0.043874,0.084456,0.031278,0.0,0.0,0.029296,0.058377,0.089644,...,0.0,0.035999,0.051599,0.032915,0.07552,0.0,0.089554,0.027011,0.064238,0.057987,0.084495,0.0,0.016855,0.109401,0.081378,0.125632,0.062557,0.071497,0.136342,0.102001,0.060613,0.0,0.049208,0.030745,0.04892,0.152324,0.007637,0.051832,0.080598,0.057787,0.011708,0.0,0.10171,0.034568,0.045002,0.052699,0.107486,0.055766,0.070065,0.088281
10,0.286411,0.092399,0.03767,0.053879,0.153021,0.400758,0.328923,0.183375,0.155435,1.0,0.164532,0.135594,0.364255,0.234072,0.060259,0.294155,0.155878,0.35353,0.119455,0.097595,0.139478,0.13748,0.347185,0.242786,0.217935,0.117602,0.06353,0.188181,0.092529,0.122234,0.209571,0.063483,0.032459,0.023743,0.022215,0.01899,0.117444,0.124322,0.051827,0.03714,...,0.030661,0.131036,0.131929,0.103528,0.210078,0.10686,0.189905,0.198438,0.181681,0.29317,0.072013,0.016543,0.270468,0.07437,0.173392,0.194202,0.04443,0.090768,0.172086,0.093143,0.238077,0.050881,0.059704,0.10372,0.123344,0.271309,0.14193,0.083595,0.317441,0.3729,0.278558,0.04931,0.153506,0.065471,0.060088,0.033686,0.197107,0.085402,0.118945,0.162538


In [25]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of the movie's ratings.

    Each rating of movie_id in r_matrix is weighted by the cosine similarity
    (from cosine_sim) between user_id and the user who gave that rating.

    Returns 3.0 when there is nothing to base a prediction on:
      * the movie has no column in r_matrix,
      * the user has no column in cosine_sim, or
      * the similarities to all users who rated the movie sum to zero
        (dividing by that sum would otherwise yield NaN).
    """
    #Default to a rating of 3.0 in the absence of any information
    if movie_id not in r_matrix or user_id not in cosine_sim:
        return 3.0

    #Ratings actually given to the movie (users who did not rate it are dropped)
    m_ratings = r_matrix[movie_id].dropna()

    #Similarity of the user in question to exactly those raters, in the same order
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    #A zero total weight means no rater shares anything with this user;
    #guard the division instead of returning NaN
    total_sim = sim_scores.sum()
    if total_sim == 0:
        return 3.0

    #Compute the final weighted mean
    return np.dot(sim_scores, m_ratings) / total_sim

In [26]:
#Compute RMSE for the Weighted Mean model
score(cf_user_wmean)

1.0174483808407588

### Demographics

In [27]:
#Attach each rater's demographic columns to the training ratings,
#joining on user_id (the only column the two dataframes share)
merged_df = X_train.merge(users, on='user_id')

merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,889,684,2,24,M,technician,78704
1,889,279,2,24,M,technician,78704
2,889,29,3,24,M,technician,78704
3,889,190,3,24,M,technician,78704
4,889,232,3,24,M,technician,78704


In [28]:
#Mean rating of every movie by gender: a series indexed by (movie_id, sex)
gender_mean = merged_df.groupby(['movie_id', 'sex'])['rating'].mean()

In [29]:
#Set the index of the users dataframe to the user_id.
#The guard makes the cell safe to re-run: once user_id has been moved into
#the index, calling set_index('user_id') again would raise a KeyError.
if users.index.name != 'user_id':
    users = users.set_index('user_id')

In [30]:
#Gender Based Collaborative Filter using Mean Ratings
def cf_gender(user_id, movie_id):
    """Predict a rating as the mean rating the user's gender gave to the movie.

    Looks up the user's 'sex' in the users dataframe (indexed by user_id) and
    reads the matching entry of gender_mean. Returns 3.0 when the movie is not
    in r_matrix or when gender_mean has no entry for that gender and movie.
    """
    #Default to 3.0 when the movie is absent from r_matrix (the training set)
    if movie_id not in r_matrix:
        return 3.0

    #Identify the gender of the user
    gender = users.loc[user_id, 'sex']

    #Per-gender mean ratings for this movie
    movie_means = gender_mean[movie_id]

    #Default to 3.0 when that gender has not rated the movie
    if gender not in movie_means:
        return 3.0

    return movie_means[gender]

In [31]:
#Compute RMSE for the Gender based model
score(cf_gender)

1.0330308800874282

In [32]:
#Mean rating per movie for every (occupation, sex) combination:
#rows are movie_id, columns are a two-level (occupation, sex) index
gen_occ_mean = merged_df.pivot_table(
    index='movie_id',
    columns=['occupation', 'sex'],
    values='rating',
    aggfunc='mean',
)

gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,entertainment,executive,executive,healthcare,healthcare,homemaker,homemaker,lawyer,lawyer,librarian,librarian,marketing,marketing,none,none,other,other,programmer,programmer,retired,retired,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2
1,4.0,4.222222,4.25,3.5,3.666667,3.5,3.923077,4.0,3.970588,5.0,3.222222,3.0,4.066667,2.25,4.0,,,,4.166667,4.0,3.571429,4.0,4.25,3.5,5.0,4.0,4.107143,4.0,4.28,,3.666667,4.0,4.0,3.5,3.888889,3.833333,3.709091,4.0,4.2,4.166667,3.142857
2,3.0,3.75,,,,,3.25,,3.363636,,3.0,,3.0,2.666667,,,,,5.0,3.0,,,3.0,5.0,4.0,3.666667,3.5,,3.0,,2.0,,,,,2.333333,3.333333,,2.714286,5.0,2.666667
3,3.5,2.5,,,,4.0,2.5,,3.625,,,,2.0,,,,,,,2.5,,,2.0,1.0,5.0,3.5,3.25,,2.6,,,,1.0,,,2.0,3.217391,,4.0,,1.0
4,3.0,3.888889,,4.666667,3.0,2.75,3.636364,,3.555556,,4.0,,3.4,3.0,,,,,4.5,3.6,3.0,,3.5,5.0,2.0,3.75,2.888889,3.0,3.615385,4.0,4.0,4.0,3.666667,,3.6,3.285714,3.724138,,3.2,4.25,3.5
5,4.0,2.333333,,,,4.0,1.5,,2.666667,,3.25,,4.5,3.0,,,,,,3.25,4.0,,3.0,5.0,4.0,4.333333,3.6,,3.285714,,3.5,,,,3.5,4.333333,3.272727,,3.333333,4.0,2.666667


In [33]:
#Gender and Occupation Based Collaborative Filter using Mean Ratings
def cf_gen_occ(user_id, movie_id):
    """Predict a rating as the mean given to the movie by the user's occupation and gender.

    Reads the user's 'sex' and 'occupation' from the users dataframe (indexed
    by user_id) and looks up the matching cell of gen_occ_mean. Returns 3.0
    when the movie, the occupation or the gender is missing from gen_occ_mean,
    or when the stored mean is NaN.
    """
    #Movies absent from gen_occ_mean get the default rating
    if movie_id not in gen_occ_mean.index:
        return 3.0

    #Identify the user's gender and occupation
    user = users.loc[user_id]
    gender, occ = user['sex'], user['occupation']

    #Row of mean ratings for this movie, keyed by (occupation, sex)
    movie_means = gen_occ_mean.loc[movie_id]

    #Default if the occupation does not appear for this movie
    if occ not in movie_means:
        return 3.0

    #Default if the gender does not appear under that occupation
    occ_means = movie_means[occ]
    if gender not in occ_means:
        return 3.0

    #Extract the required rating, defaulting to 3.0 if it is null
    rating = occ_means[gender]
    if np.isnan(rating):
        return 3.0
    return rating

In [34]:
#Compute RMSE for the Gender and Occupation based model
score(cf_gen_occ)

1.1391976012043645

## Model Based Approaches

In [36]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 4.4MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1670954 sha256=6f39abc0ad3a5f6185c1e1dbde4936f0cebcbaa3cabca755d5323dc20b606beb
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [39]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import cross_validate #replace for evaluate()

#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
#(ratings holds user_id, movie_id and rating once the timestamp column is dropped)
data = Dataset.load_from_df(ratings, reader)

#Define the algorithm object; in this case kNN
knn = KNNBasic()

#Evaluate the performance in terms of RMSE
#(cross_validate is used in place of the evaluate() helper)


#Run 3-fold cross-validation (cv=3) and print results
cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9864  0.9925  0.9892  0.9893  0.0025  
Fit time          0.44    0.46    0.49    0.46    0.02    
Test time         6.12    6.28    6.33    6.24    0.09    


{'fit_time': (0.44236111640930176, 0.4626145362854004, 0.48784852027893066),
 'test_rmse': array([0.98635249, 0.99248302, 0.98919075]),
 'test_time': (6.11767578125, 6.2827231884002686, 6.328152179718018)}

In [40]:
#Import SVD
from surprise import SVD

#Define the SVD algorithm object
svd = SVD()

#Evaluate the performance in terms of RMSE
#(cross_validate is used in place of the evaluate() helper)

#Run 3-fold cross-validation (cv=3) and print results
cross_validate(svd, data, measures=['RMSE'], cv=3, verbose=True)

Evaluating RMSE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9441  0.9451  0.9458  0.9450  0.0007  
Fit time          4.55    4.63    4.72    4.63    0.07    
Test time         0.42    0.26    0.40    0.36    0.07    


{'fit_time': (4.5466179847717285, 4.634291172027588, 4.718250274658203),
 'test_rmse': array([0.94408006, 0.94509942, 0.94584326]),
 'test_time': (0.41843652725219727, 0.2573983669281006, 0.39621663093566895)}