In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-20m-dataset/rating.csv
/kaggle/input/movielens-20m-dataset/link.csv
/kaggle/input/movielens-20m-dataset/genome_tags.csv
/kaggle/input/movielens-20m-dataset/genome_scores.csv
/kaggle/input/movielens-20m-dataset/tag.csv
/kaggle/input/movielens-20m-dataset/movie.csv


In [2]:
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')

# Organize data and shrink

In [3]:
display(ratings)
total_user = ratings['userId'].nunique()
total_movie = ratings['movieId'].nunique()
print(f'The total number of unique users: {total_user}') # Total number of users is 138493
print(f'The total number of unique movies: {total_movie}') # Total number of users is 26744
# Ｓince the total number of user and movie is too large for this exercise, I will shrink the data to save computational resources.
# To do so, the filter would grab the 1000 movies with the most ratings, and the top 1000 users that rated these movies.

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


The total number of unique users: 138493
The total number of unique movies: 26744


In [4]:
# Grab the 1000 movies with the most ratings
ratings['userId'] = ratings['userId'].astype(str)
ratings['movieId'] = ratings['movieId'].astype(str)

most_rated_movies = ratings['movieId'].value_counts().head(200).index  # get the 1000 most rated films
most_rated_movies = ratings[ratings['movieId'].isin(most_rated_movies)]  # get the 1000 most rated films
display(most_rated_movies)
num_movie = most_rated_movies['movieId'].nunique()
print(f'The total number of unique movies: {num_movie}') 

# # Grab the most frequent raters of the 1000 movies
most_frequent_raters = most_rated_movies['userId'].value_counts().head(200).index  # get the 1000 most rated films
most_frequent_raters = most_rated_movies[most_rated_movies['userId'].isin(most_frequent_raters)]  # get the 1000 most rated films
display(most_frequent_raters)
num_raters = most_frequent_raters['userId'].nunique()
print(f'The total number of raters: {num_raters}') 

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
7,1,223,4.0,2005-04-02 23:46:13
...,...,...,...,...
20000150,138493,6874,5.0,2009-10-17 19:08:23
20000159,138493,7153,4.0,2009-11-16 16:50:20
20000164,138493,7361,5.0,2009-10-17 19:26:47
20000167,138493,7438,5.0,2009-10-17 19:08:26


The total number of unique movies: 200


Unnamed: 0,userId,movieId,rating,timestamp
106582,741,1,5.0,2007-10-16 21:51:03
106583,741,2,3.0,2007-10-16 22:34:54
106586,741,6,3.5,2007-10-16 22:04:06
106588,741,10,4.0,2007-10-16 22:33:40
106592,741,16,4.0,2007-11-10 18:48:00
...,...,...,...,...
19962747,138208,7361,5.0,2004-04-02 22:27:50
19962753,138208,7438,4.5,2004-05-19 03:45:25
19962808,138208,8961,4.5,2004-11-15 05:17:55
19962864,138208,33794,3.5,2005-06-26 05:32:25


The total number of raters: 200


In [5]:
shrink_df = most_frequent_raters
# shrink_df is the shrinked dataframe, it holds the 1000 most rated movies and 1000 most frequent raters of those movies.
display(shrink_df)
# now, we should rename user ID and movie ID to build are matrix, to do so, I will use a dictionary to map the old ID to the mew
map_user_ID = {}
map_movie_ID = {}
new_user_id = 0
for old_id in shrink_df['userId'].unique():
    if old_id not in map_user_ID:
        map_user_ID[old_id] = new_user_id
        new_user_id += 1
    else:
        continue

new_movie_id = 0
for old_id in shrink_df['movieId'].unique():
    if old_id not in map_movie_ID:
        map_movie_ID[old_id] = new_movie_id
        new_movie_id += 1
    else:
        continue
        
shrink_df['new_user_id'] =shrink_df['userId'].map(map_user_ID)
shrink_df['new_movie_id'] =shrink_df['movieId'].map(map_movie_ID)

Unnamed: 0,userId,movieId,rating,timestamp
106582,741,1,5.0,2007-10-16 21:51:03
106583,741,2,3.0,2007-10-16 22:34:54
106586,741,6,3.5,2007-10-16 22:04:06
106588,741,10,4.0,2007-10-16 22:33:40
106592,741,16,4.0,2007-11-10 18:48:00
...,...,...,...,...
19962747,138208,7361,5.0,2004-04-02 22:27:50
19962753,138208,7438,4.5,2004-05-19 03:45:25
19962808,138208,8961,4.5,2004-11-15 05:17:55
19962864,138208,33794,3.5,2005-06-26 05:32:25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shrink_df['new_user_id'] =shrink_df['userId'].map(map_user_ID)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shrink_df['new_movie_id'] =shrink_df['movieId'].map(map_movie_ID)


In [6]:
display(shrink_df)

Unnamed: 0,userId,movieId,rating,timestamp,new_user_id,new_movie_id
106582,741,1,5.0,2007-10-16 21:51:03,0,0
106583,741,2,3.0,2007-10-16 22:34:54,0,1
106586,741,6,3.5,2007-10-16 22:04:06,0,2
106588,741,10,4.0,2007-10-16 22:33:40,0,3
106592,741,16,4.0,2007-11-10 18:48:00,0,4
...,...,...,...,...,...,...
19962747,138208,7361,5.0,2004-04-02 22:27:50,199,188
19962753,138208,7438,4.5,2004-05-19 03:45:25,199,189
19962808,138208,8961,4.5,2004-11-15 05:17:55,199,190
19962864,138208,33794,3.5,2005-06-26 05:32:25,199,191


# User-User CF
The first method is a user-user approach. Predict user_I's rating based on other users similar to user_I.

In [7]:
import numpy as np
def get_user_rating(user_id,main_df):
    '''
    This fxn filters the dataframe and return all movies rated by user_id
    '''
    df = main_df[main_df['userId'] == user_id]
    
    return df

def merge_two_users_df(user_one_df,user_two_df):
    '''
    This fxn merges does an inner join and the resulting datafram will contain movies that both user_one and user_two have rated.
    '''

    merged_df = pd.merge(user_one_df, user_two_df, on='movieId', how='inner')
    
    return merged_df
def calculate_similiarity(vector1,vector2):
    '''
    This fxn calculates the similarity between two users. 
    vector1 -> Contains all the ratings by user_I, but only with movies that user_J also rated.
    vector2 -> Contains all the ratings by user_J, but only with movies that user_I also rated.

    return
    cosine_sim -> The similiarity of user_I, user_J when rating movies.
    '''

    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    
    cosine_sim = dot_product / (norm_vector1 * norm_vector2)
    
    # print("Cosine Similarity:", cosine_sim)
    return cosine_sim

In [8]:
# loop through all users, calculate the cosine similiarity and store them in a 2D list.
num_rows = shrink_df['userId'].nunique()
num_cols = shrink_df['userId'].nunique()

rows = []
count = 0
for user_1 in shrink_df['userId'].unique():
    row = []
    # if count%(shrink_df['movieId'].nunique()/10) == 0:
    #     print(map_user_ID[user_1])
        
    for user_2 in shrink_df['userId'].unique():
            
        user_rating_1 = get_user_rating(user_id = user_1,main_df = shrink_df)
        user_rating_2 = get_user_rating(user_id = user_2,main_df = shrink_df)

        merged_df = merge_two_users_df(user_one_df = user_rating_1,user_two_df = user_rating_2)
        
        vector_1 = merged_df['rating_x']
        vector_2 = merged_df['rating_y']

        weight = calculate_similiarity(vector_1,vector_2)

        row.append(weight)
    count += 1
    rows.append(row)

In [9]:
weight_matrix_arr = np.array(rows)
weight_matrix = rows
# weights_matrix is a symmetric matrix that stores the similiarity of two users. EX: W[i][j] is the similiarity of user i and user j.
print(f'The shape of the weight matrix is {weight_matrix_arr.shape}')


The shape of the weight matrix is (200, 200)


In [10]:
def get_largest_indices(arr, n):
    '''
    This fxn returns the indices of the largest n values in a np array
    '''
    return np.argpartition(arr, -n)[-n:]
    
def user_average_rating(user_id,main_df):
    '''
    This fxn returns the average ratings of a user in main_df
    '''
    avg =  main_df[main_df['new_user_id'] == user_id]['rating'].mean()

    return avg

def user_rating_movie(user_id,movie_id,main_df):
    '''
    This fxn gives the rating of user_i to movie_j
    '''

    rating = main_df[(main_df['new_user_id'] == user_id) & (main_df['new_movie_id'] == movie_id)]['rating'].values[0]

    return rating


def predict_score(user_i_id, movie_m_id, main_df, weight_matrix):

    user_i_avg = user_average_rating(user_id = user_i_id,main_df = main_df)

    closest_K_indices = get_largest_indices(weight_matrix[user_i_id], n=26) # find the closest K users to calculate score, closest K will be the largest similiarity in weight_matrix
    
    numerator_lst = []
    denominator_lst = []

    for user_j in closest_K_indices[1:]: 

        try:
            
            user_j_avg = user_average_rating(user_id = user_j,main_df = main_df)
            user_j_rating = user_rating_movie(user_id = user_j,movie_id = movie_m_id,main_df = main_df)
        
        except:          

            continue
            
        numerator_lst.append(weight_matrix[user_i_id][user_j] * (user_j_rating - user_j_avg))
        denominator_lst.append(abs(weight_matrix[user_i_id][user_j]))

    numerator =  sum(numerator_lst) # sum {weights * ( user j rating of movie m - average rating of user j)}
    denominator = sum(denominator_lst) # absolute weights of the closest K users to calculate score

    score = (numerator / denominator) + user_i_avg
    # print(f'Predicted score: {score}')
    
    return score

def MSE(predict,ground_truth):

    predict_arr = np.array(predict)
    ground_truth_arr = np.array(ground_truth)
    MSE = np.mean((ground_truth_arr-predict_arr)**2)
    
    return MSE

In [11]:
predict_lst_user = []
ground_truth_lst_user = []

for user_i in shrink_df.new_user_id.unique():
    print(user_i)
    for movie_m in shrink_df.new_movie_id.unique():

        try:
            ground_truth = shrink_df[(shrink_df.new_user_id == user_i) & (shrink_df.new_movie_id == movie_m)]['rating'].values[0]
            score = predict_score(user_i_id = user_i, movie_m_id = movie_m, main_df = shrink_df, weight_matrix = weight_matrix_arr)
            predict_lst_user.append(score)
            ground_truth_lst_user.append(ground_truth)
        except:
            # print(f'User {user_i} did not rate movie {movie_m}. Ground truth does not exist.')
            continue
        # break
    # break

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


# Item-Item CF
Use the similiarity of items to predict the ratings of a user. How one user i would rate item to predict what he how he would rate another item that is similar. In application, this method shuold be more accurate

In [12]:
#We still have to calculate the similiarity but this time it's the similiarity of two items. The same fxn can be used here. 
def Item_df(item_id,main_df):
    '''
    This fxn filters the main_df and return only users that rated item_I
    '''

    item_df = main_df[main_df['movieId'] == item_id]

    return item_df

def merge_item_df(item_i_df,item_j_df):
    '''
    This fxn inner joins two item df and returns the intersect of both items.
    The return df should include users that rated both Item_I and Item_J
    '''
    
    return pd.merge(item_i_df, item_j_df, on='userId', how='inner') 

def calculate_similiarity(vector1,vector2):
    '''
    This fxn calculates the similarity between two users. 
    vector1 -> Contains all the ratings of Item_I, but only with movies that Item_J also rated.
    vector2 -> Contains all the ratings by Item_J, but only with movies that Item_I also rated.

    return
    cosine_sim -> The similiarity of user_I, user_J when rating movies.
    '''

    dot_product = np.dot(vector1, vector2)
    norm_vector1 = np.linalg.norm(vector1)
    norm_vector2 = np.linalg.norm(vector2)
    
    cosine_sim = dot_product / (norm_vector1 * norm_vector2)
    
    # print("Cosine Similarity:", cosine_sim)
    return cosine_sim

In [13]:
rows = []
count = 0
for item_i in shrink_df['movieId'].unique():
    row = []
    # if count%(shrink_df['movieId'].nunique()/10) == 0:
    #     print(map_movie_ID[item_i]/10)
        
    for item_j in shrink_df['movieId'].unique():
            
        item_rating_i = Item_df(item_id = item_i,main_df = shrink_df)
        item_rating_j = Item_df(item_id = item_j,main_df = shrink_df)

        merge_item_df = pd.merge(item_rating_i, item_rating_j, on='userId', how='inner') 
        
        vector_1 = merge_item_df['rating_x']
        vector_2 = merge_item_df['rating_y']

        weight = calculate_similiarity(vector_1,vector_2)

        row.append(weight)
        # break
    # break
    count += 1
    rows.append(row)

    

In [14]:
weight_matrix_arr = np.array(rows)
weight_matrix = rows
# weight_matrix is a symmetric matrix that stores the similiarity of two items based on how two items were rated by the same users. EX: W[i][j] is the similiarity of item i and item j.
print(f'The shape of the weight matrix is {weight_matrix_arr.shape}')
# weight_matrix_arr

The shape of the weight matrix is (200, 200)


In [15]:
def get_largest_indices(arr, n):
    '''
    This fxn returns the indices of the largest n values in a np array
    '''
    return np.argpartition(arr, -n)[-n:]
    
def items_average_rating(movie_id,main_df):
    '''
    This fxn returns the average ratings of a user in main_df
    '''
    avg =  main_df[main_df['new_movie_id'] == movie_id]['rating'].mean()

    return avg

def user_rating_movie(user_id,movie_id,main_df):
    '''
    This fxn gives the rating of user_i to movie_j
    '''

    rating = main_df[(main_df['new_user_id'] == user_id) & (main_df['new_movie_id'] == movie_id)]['rating'].values[0]

    return rating


def predict_score(user_i_id, movie_m_id, main_df, weight_matrix):

    movie_m_avg = items_average_rating(movie_id = movie_m_id,main_df = main_df)

    closest_K_indices = get_largest_indices(weight_matrix[movie_m_id], n=26) # find the closest K users to calculate score, closest K will be the largest similiarity in weight_matrix
    
    numerator_lst = []
    denominator_lst = []

    for movie_j in closest_K_indices[1:]: 

        try:
            
            movie_j_avg = items_average_rating(movie_id = movie_j,main_df = main_df)
            movie_j_rating = user_rating_movie(user_id = user_i_id,movie_id = movie_j,main_df = main_df)
        
        except:          

            continue
            
        numerator_lst.append(weight_matrix[movie_m_id][movie_j] * (movie_j_rating - movie_j_avg))
        denominator_lst.append(abs(weight_matrix[movie_m_id][movie_j]))

    numerator =  sum(numerator_lst) # sum {weights * ( user j rating of movie m - average rating of user j)}
    denominator = sum(denominator_lst) # absolute weights of the closest K users to calculate score

    score = (numerator / denominator) + movie_m_avg
    # print(f'Predicted score: {score}')
    
    return score

def MSE(predict,ground_truth):

    predict_arr = np.array(predict)
    ground_truth_arr = np.array(ground_truth)
    MSE = np.mean((ground_truth_arr-predict_arr)**2)
    
    return MSE

In [16]:
predict_lst_item = []
ground_truth_lst_item = []

for movie_m in shrink_df.new_movie_id.unique():
    print(movie_m)
    for user_i in shrink_df.new_user_id.unique():

        try:
            ground_truth = shrink_df[(shrink_df.new_user_id == user_i) & (shrink_df.new_movie_id == movie_m)]['rating'].values[0]
            score = predict_score(user_i_id = user_i, movie_m_id = movie_m, main_df = shrink_df, weight_matrix = weight_matrix_arr)
            predict_lst_item.append(score)
            ground_truth_lst_item.append(ground_truth)
        except:
            # print(f'User {user_i} did not rate movie {movie_m}. Ground truth does not exist.')
            continue
        # break
    # break
# MSE = MSE(predict = predict_lst,ground_truth = ground_truth_lst)
# print(f'The final MSE:  {MSE}')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199


In [17]:
user_mse = MSE(predict = predict_lst_user,ground_truth = ground_truth_lst_user)
item_mse = MSE(predict = predict_lst_item,ground_truth = ground_truth_lst_item)
print(f'The final MSE using user-user:  {user_mse}')
print(f'The final MSE using item-item:  {item_mse}')

The final MSE using user-user:  0.46189767055778486
The final MSE using item-item:  0.42629396665770486


# SUMMARY
Collaborative filtering is a recommending technique that is user specific. It can be split into a User- User approach vs a Item-item approach.
## User-user
In this approach, the concept is to use the similiarity between users to predict ratings of a specific item. Meaning: "These users are similar to you in ratings, so you would probabbly like this new item they already rated highly."
To do so. the first step it to calculate the similiarity of two users based on the items they have both rated and store these as weights in a 2D matrix. To predict the score that user i will rate item j, we add the adverage rating of user i and the weighted average of how other users similar to user i(KNN) deviates from their average score, when they rate item j.

## Item-item 
This approach is not as intuitive. The concept is to use the similiarity of items to make recommendations. To do so. the first step it to calculate the similiarity of two items based on users that have rated both items. 
To predict the score that user i will rate item j, we add the average rating of item j to the weighted average of the deviation of the ratings when other items similar to item j is being rated.