In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
path = './data/ml-latest-small/'
ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')

train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [5]:
sparse_matrix = train_df.groupby('movieId').apply(lambda x: pd.Series(x['rating'].values, index=x['userId'])).unstack()
sparse_matrix.index.name='movieId'

sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,,,4.5,,,,...,,,4.0,3.0,4.0,2.5,,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_matrix(a, b):
    cos_values = cosine_similarity(a.values, b.values)
    cos_df = pd.DataFrame(data=cos_values, columns=a.index.values, index=a.index)
    
    return cos_df

## Item-Based

In [9]:
item_matrix = sparse_matrix.fillna(0)
item_cos_df = cos_matrix(item_matrix, item_matrix)
item_cos_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.362258,0.191738,0.000000,0.234845,0.278680,0.184020,0.128930,0.140521,0.329772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.362258,1.000000,0.185543,0.092463,0.218363,0.228204,0.194008,0.160969,0.058837,0.349398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.191738,0.185543,1.000000,0.122155,0.223768,0.177748,0.276536,0.246393,0.194060,0.235465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.092463,0.122155,1.000000,0.035394,0.000000,0.208622,0.189295,0.000000,0.080288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.234845,0.218363,0.223768,0.035394,1.000000,0.209460,0.328209,0.271967,0.193847,0.180153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193585,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [11]:
'''
make user-item matrix
'''
userId_grouped = train_df.groupby('userId')
item_prediction_result_df = pd.DataFrame(index=list(userId_grouped.indices.keys()), columns=item_matrix.index)
item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [34]:
%%time
for userid, group in userId_grouped:
    '''
    userid = userId
    group = movieId, rating, timestamp
    user_sim = (Number of movies rated by user, total number of movies)
    user_sum = (total number of movies) value: sum of similarity
    '''
    user_sim = item_cos_df.loc[group['movieId']]
    user_rating = group['rating']
    user_sum = user_sim.sum(axis=0)
    
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (user_sum + 1)
    item_prediction_result_df.loc[userid] = pred_ratings

Wall time: 11.9 s


In [27]:
item_prediction_result_df.head(5)

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,214.719912,180.574813,169.088997,23.153171,79.066968,188.308546,83.960004,74.276796,34.704113,178.15414,...,0.447872,0.447872,0.447872,0.447872,0.447872,0.447872,0.447872,0.447872,0.447872,3.447774
2,19.173986,16.973849,8.691711,0.358965,6.825172,13.043242,3.826557,5.965096,1.807907,12.903223,...,1.774636,1.774636,1.774636,1.774636,1.774636,1.774636,1.774636,1.774636,1.774636,3.177911
3,6.906998,5.520524,5.065935,0.320967,1.86974,6.995893,2.434319,1.577312,1.496269,7.006525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116.30604,94.492881,69.161818,14.821971,46.995678,95.516684,61.558332,38.872671,16.422225,91.016503,...,0.597874,0.597874,0.597874,0.597874,0.597874,0.597874,0.597874,0.597874,0.597874,2.421628
5,39.617707,38.164025,24.025782,13.464431,22.147551,31.847675,25.482262,16.42529,5.579477,39.219495,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.879993


## User-Based

In [28]:
user_matrix = sparse_matrix.fillna(0).transpose()
user_cos_df = cos_matrix(user_matrix, user_matrix)
user_cos_df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.000000,0.000000,0.004627,0.000000,0.013391,0.029067,0.032754,0.000000,0.080739,...,0.170123,0.020395,0.014415,0.000000,0.000000,0.019846,0.016076,0.055610,0.032404,0.075810
3,0.049021,0.000000,1.000000,0.000000,0.005770,0.004833,0.000000,0.005911,0.000000,0.000000,...,0.006401,0.005889,0.015344,0.000000,0.012783,0.008884,0.004642,0.009433,0.000000,0.031309
4,0.165799,0.004627,0.000000,1.000000,0.133565,0.090914,0.094497,0.050417,0.000000,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.026720,0.068325
5,0.123392,0.000000,0.005770,0.133565,1.000000,0.238812,0.071386,0.393773,0.000000,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.097040,0.205395,0.053090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.139042,0.019846,0.008884,0.162277,0.086797,0.086447,0.137372,0.080967,0.053366,0.061267,...,0.136437,0.077495,0.244189,0.061137,0.132016,1.000000,0.120745,0.224829,0.064349,0.159929
607,0.198771,0.016076,0.004642,0.083074,0.073278,0.135438,0.171735,0.159539,0.014172,0.012561,...,0.093158,0.158940,0.156456,0.101872,0.083353,0.120745,1.000000,0.208673,0.096324,0.097743
608,0.232811,0.055610,0.009433,0.107276,0.097040,0.136393,0.238417,0.155110,0.091135,0.051562,...,0.134926,0.141069,0.188459,0.111872,0.154623,0.224829,0.208673,1.000000,0.110371,0.260886
609,0.112174,0.032404,0.000000,0.026720,0.205395,0.181736,0.052096,0.439794,0.000000,0.028483,...,0.028450,0.306228,0.055558,0.181878,0.093744,0.064349,0.096324,0.110371,1.000000,0.057971


In [32]:
movieId_grouped = train_df.groupby('movieId')
user_prediction_result_df = pd.DataFrame(index=list(movieId_grouped.indices.keys()), columns=user_matrix.index)
user_prediction_result_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [35]:
%%time
for movieId, group in movieId_grouped:
    user_sim = user_cos_df.loc[group['userId']]
    user_rating = group['rating']
    user_sum = user_sim.sum(axis=0)

    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (user_sum+1)
    user_prediction_result_df.loc[movieId] = pred_ratings

Wall time: 19.8 s


In [37]:
user_prediction_result_df.transpose().head(5)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.740314,3.094927,2.789876,0.951121,2.27525,3.703472,2.626654,1.449118,1.70502,3.222512,...,0.030698,0.023876,0.027287,0.027287,0.023876,0.027287,0.023876,0.023876,0.023876,0.228209
2,3.41796,2.674527,1.61624,0.160544,1.448245,3.174517,1.42666,0.520892,0.454905,2.828223,...,0.292802,0.227735,0.260269,0.260269,0.227735,0.260269,0.227735,0.227735,0.227735,0.422754
3,2.382156,1.500534,1.013439,0.073546,0.41241,2.121499,0.650073,0.15634,0.18322,1.803467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.641485,2.972038,2.473705,0.644564,2.028397,3.512056,2.465829,1.077831,1.288335,3.055768,...,0.052803,0.041069,0.046936,0.046936,0.041069,0.046936,0.041069,0.041069,0.041069,0.178123
5,3.730396,3.258272,2.623135,1.359217,2.573024,3.516378,2.826829,1.608303,1.335225,3.16351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161087
