In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
path = '/content/drive/MyDrive/Colab Notebooks/boostcamp/week5/data/others'

ratings_df = pd.read_csv(os.path.join(path, 'ratings.csv'), encoding='utf-8')
movies_df = pd.read_csv(os.path.join(path, 'movies.csv'), index_col='movieId', encoding='utf-8')
tags_df = pd.read_csv(os.path.join(path, 'tags.csv'), encoding='utf-8')

In [6]:
movies_df

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
total_count = len(movies_df.index)
total_genres = list(set([genre for sublist in list(map(lambda x: x.split('|'), movies_df['genres'])) for genre in sublist]))

In [9]:
print(f"전체 영화 수: {total_count}")
print(f"장르: {total_genres}")

전체 영화 수: 9742
장르: ['Mystery', 'Sci-Fi', 'War', 'Western', 'Action', 'Horror', 'Thriller', 'Documentary', 'IMAX', 'Drama', '(no genres listed)', 'Musical', 'Adventure', 'Children', 'Romance', 'Comedy', 'Animation', 'Film-Noir', 'Fantasy', 'Crime']


In [10]:
print(len(total_genres))

20


In [11]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [12]:
genre_count = dict.fromkeys(total_genres)
genre_count.keys()

dict_keys(['Mystery', 'Sci-Fi', 'War', 'Western', 'Action', 'Horror', 'Thriller', 'Documentary', 'IMAX', 'Drama', '(no genres listed)', 'Musical', 'Adventure', 'Children', 'Romance', 'Comedy', 'Animation', 'Film-Noir', 'Fantasy', 'Crime'])

In [20]:
genre_count = pd.DataFrame(np.sum(movies_df["genres"].str.split('|'))).value_counts()

In [21]:
genre_count

Drama                 4361
Comedy                3756
Thriller              1894
Action                1828
Romance               1596
Adventure             1263
Crime                 1199
Sci-Fi                 980
Horror                 978
Fantasy                779
Children               664
Animation              611
Mystery                573
Documentary            440
War                    382
Musical                334
Western                167
IMAX                   158
Film-Noir               87
(no genres listed)      34
dtype: int64

In [24]:
movies_rep = np.log10(total_count/genre_count)
  
movies_rep

Drama                 0.349062
Comedy                0.413923
Thriller              0.711268
Action                0.726672
Romance               0.785615
Adventure             0.887245
Crime                 0.909829
Sci-Fi                0.997422
Horror                0.998309
Fantasy               1.097111
Children              1.166480
Animation             1.202607
Mystery               1.230494
Documentary           1.345195
War                   1.406585
Musical               1.464902
Western               1.765932
IMAX                  1.789991
Film-Noir             2.049129
(no genres listed)    2.457169
dtype: float64

In [25]:
genre_representation = movies_df["genres"].str.get_dummies()
for i in genre_representation.columns:
  genre_representation[i] *= movies_rep[i][0]

genre_representation

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.000000,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.887245,0.000000,1.16648,0.000000,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.349062,0.000000,0.0,0.0,0.0,0.0,0.0,0.785615,0.0,0.0,0.0,0.0
5,0.0,0.000000,0.000000,0.000000,0.00000,0.413923,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.726672,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193583,0.0,0.000000,0.000000,1.202607,0.00000,0.413923,0.0,0.0,0.000000,1.097111,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193585,0.0,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.349062,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
193587,0.0,0.726672,0.000000,1.202607,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [26]:
tag_column = list(map(lambda x: x.split(','), tags_df['tag']))
unique_tags = list(set(list(map(lambda x: x.strip(), list([tag for sublist in tag_column for tag in sublist])))))

print(unique_tags)

['Soundtrack', 'ryan reynolds', 'Pee Wee Herman', 'Tom Clancy', 'David Fincher', 'Suspense', 'slow', 'ben stiller', 'r:disturbing violent images', 'KIDNAPPING', 'Titanic', 'race', 'bad humor', 'writing', 'adolescence', 'Shark', 'Prince', 'needed more autobots', 'confusing ending', 'New York City', 'Notable Nudity', 'meryl streep', 'space epic', 'wine', 'Bechdel Test:Fail', 'wonderwoman', 'space action', 'android(s)/cyborg(s)', 'Wall Street', 'achronological', 'happy ending', 'marijuana', 'Queen Victoria', 'ancient Rome', 'interesting characters', 'Everything you want is here', 'immigration', 'revolutionary', 'drug abuse', 'rape', 'flood', 'Jesse Ventura', 'Bible', 'father-son relationship', 'time travel', 'Tim Burton', 'video', 'Simon and Garfunkel', 'enigmatic', 'acting', 'Hepburn and Tracy', 'symbolic', 'Andrew Lloyd Weber', 'film-noir', 'moldy', 'hip hop', 'villain nonexistent or not needed for good story', 'class', 'Arthur C. Clarke', 'Adam Sandler', 'Poor story', 'Jason Segel', 't

In [27]:
print(len(tag_column))
print(len(unique_tags))

3683
1589


In [28]:
total_movie_count = len(set(tags_df['movieId']))
# key: tag, value: number of movies with such tag
tag_count_dict = dict.fromkeys(unique_tags)

for each_movie_tag_list in tags_df['tag']:
    for tag in each_movie_tag_list.split(","):
        if tag_count_dict[tag.strip()] == None:
            tag_count_dict[tag.strip()] = 1
        else:
            tag_count_dict[tag.strip()] += 1

tag_idf = dict()
for each_tag in tag_count_dict:
    tag_idf[each_tag] = np.log10(total_movie_count / tag_count_dict[each_tag])

tag_idf

{'Soundtrack': 2.895422546039408,
 'ryan reynolds': 3.196452541703389,
 'Pee Wee Herman': 3.196452541703389,
 'Tom Clancy': 2.895422546039408,
 'David Fincher': 3.196452541703389,
 'Suspense': 3.196452541703389,
 'slow': 3.196452541703389,
 'ben stiller': 3.196452541703389,
 'r:disturbing violent images': 3.196452541703389,
 'KIDNAPPING': 3.196452541703389,
 'Titanic': 3.196452541703389,
 'race': 2.2933625547114453,
 'bad humor': 3.196452541703389,
 'writing': 2.7193312869837265,
 'adolescence': 2.155059856545164,
 'Shark': 3.196452541703389,
 'Prince': 3.196452541703389,
 'needed more autobots': 3.196452541703389,
 'confusing ending': 3.196452541703389,
 'New York City': 2.895422546039408,
 'Notable Nudity': 3.196452541703389,
 'meryl streep': 3.196452541703389,
 'space epic': 3.196452541703389,
 'wine': 3.196452541703389,
 'Bechdel Test:Fail': 3.196452541703389,
 'wonderwoman': 3.196452541703389,
 'space action': 2.895422546039408,
 'android(s)/cyborg(s)': 3.196452541703389,
 'Wall S

In [29]:
len(tag_idf.keys())


1589

In [30]:
tag_representation = pd.DataFrame(columns=sorted(unique_tags), index=list(set(tags_df['movieId'])))
for name, group in tqdm(tags_df.groupby(by='movieId')):
    temp_list = list(map(lambda x: x.split(','), list(group['tag'])))
    temp_tag_list = list(set(list(map(lambda x: x.strip(), list([tag for sublist in temp_list for tag in sublist])))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index=[group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation = tag_representation.sort_index(0)
tag_representation

100%|██████████| 1572/1572 [04:38<00:00,  5.64it/s]


Unnamed: 0,"""artsy""",06 Oscar Nominated Best Movie - Animation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001-like,2D animation,70mm,80's,AIDs,AS Byatt,AWESOME,Aardman,Academy award (Best Supporting Actress),Action,Adam Sandler,Adrien Brody,Adventure,Afghanistan,Africa,Agatha Christie,Al Pacino,Alcatraz,Alfred Hitchcock,Alicia Vikander,Amazing Cinematography,American Indians,American propaganda,Amish,Amtrak,Amy Adams,Andrew Lloyd Weber,Andy Garcia,Andy Kaufman,Andy Samberg,Angelina Jolie,...,violence,violence in america,violent,virginity,virtual reality,visual,visually appealing,visually stunning,von Bulow,voyeurism,wapendrama,war,way too long,weak plot,weather forecaster,wedding,weddings,weird,werewolf,western,whales,whimsical,white guilt,widows/widowers,will ferrell,wine,winona ryder,wistful,witty,wizards,women,wonderwoman,workplace,writing,wrongful imprisonment,wry,younger men,zither,zoe kazan,zombies
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
184471,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.19645,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
187593,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
187595,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [31]:
movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [32]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
print(movie_representation.shape)
print(movie_representation.describe())

(9742, 1609)
       (no genres listed)       Action  ...    zoe kazan      zombies
count         9742.000000  9742.000000  ...  9742.000000  9742.000000
mean             0.008576     0.136354  ...     0.000328     0.001241
std              0.144915     0.283726  ...     0.032385     0.054775
min              0.000000     0.000000  ...     0.000000     0.000000
25%              0.000000     0.000000  ...     0.000000     0.000000
50%              0.000000     0.000000  ...     0.000000     0.000000
75%              0.000000     0.000000  ...     0.000000     0.000000
max              2.457169     0.726672  ...     3.196453     2.418301

[8 rows x 1609 columns]


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [34]:
print(movie_representation.head())

   (no genres listed)  Action  Adventure  ...  zither  zoe kazan  zombies
1                 0.0     0.0   0.887245  ...     0.0        0.0      0.0
2                 0.0     0.0   0.887245  ...     0.0        0.0      0.0
3                 0.0     0.0   0.000000  ...     0.0        0.0      0.0
4                 0.0     0.0   0.000000  ...     0.0        0.0      0.0
5                 0.0     0.0   0.000000  ...     0.0        0.0      0.0

[5 rows x 1609 columns]


In [35]:
cs_df = cos_sim_matrix(movie_representation, movie_representation)
cs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9702,9703,9704,9705,9706,9707,9708,9709,9710,9711,9712,9713,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723,9724,9725,9726,9727,9728,9729,9730,9731,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.0,0.124438,0.008403,0.040571,0.011755,0.0,0.016339,0.331122,0.0,0.131794,0.01146,0.035819,0.428332,0.0,0.127943,0.0,0.0,0.093519,0.093519,0.02637,0.012976,0.0,0.0,0.0,0.0,0.0,0.252486,0.0,0.139012,0.0,0.0,0.0,0.058681,0.0,0.127412,0.005151,0.0,0.0,0.0,0.0,...,0.0,0.046287,0.093519,0.164693,0.117018,0.103766,0.399957,0.007761,0.098841,0.0,0.2018,0.254479,0.0,0.043593,0.093519,0.332223,0.071492,0.0,0.131794,0.0,0.0,0.036562,0.0,0.0,0.093519,0.0,0.0,0.27171,0.0,0.13748,0.064466,0.260941,0.071492,0.27171,0.0,0.348295,0.379492,0.0,0.232553,0.093519
2,0.124438,1.0,0.0,0.0,0.0,0.0,0.0,0.240843,0.0,0.095861,0.0,0.0,0.186183,0.0,0.09306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183647,0.0,0.101111,0.0,0.0,0.0,0.042682,0.0,0.082309,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.11979,0.085114,0.075475,0.17385,0.0,0.071892,0.0,0.12849,0.170429,0.0,0.0,0.0,0.222496,0.0,0.0,0.095861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.082123,0.0,0.0,0.0,0.0,0.0,0.108082,0.117763,0.0,0.0,0.0
3,0.008403,0.0,1.0,0.179391,0.011294,0.0,0.072246,0.0,0.0,0.0,0.050673,0.034413,0.0,0.0,0.096374,0.0,0.049018,0.089849,0.089849,0.025335,0.012466,0.0,0.0,0.0,0.050722,0.0,0.0,0.045593,0.0,0.0,0.0,0.0,0.0,0.0,0.01369,0.022777,0.0,0.0,0.0,0.0,...,0.0,0.044471,0.089849,0.0,0.0,0.0,0.0,0.007456,0.0,0.0,0.024159,0.019374,0.0,0.192754,0.089849,0.025292,0.068686,0.0,0.0,0.0,0.0,0.035127,0.0,0.155841,0.089849,0.0,0.0,0.0,0.0,0.023609,0.00656,0.0,0.068686,0.0,0.0,0.020322,0.022142,0.0,0.0,0.089849
4,0.040571,0.0,0.179391,1.0,0.05453,0.0,0.348828,0.0,0.0,0.0,0.282473,0.166156,0.0,0.039185,0.465326,0.053144,0.2834,0.433821,0.433821,0.209317,0.060192,0.042043,0.0,0.120845,0.293251,0.059511,0.104881,0.263595,0.039466,0.131045,0.03317,0.0,0.024376,0.03061,0.066099,0.109975,0.037651,0.049771,0.105052,0.057419,...,0.0,0.21472,0.433821,0.0,0.0,0.074504,0.0,0.036,0.0,0.0,0.116646,0.093542,0.131045,0.930677,0.433821,0.122119,0.567487,0.0,0.0,0.0,0.0,0.290218,0.0,0.900999,0.433821,0.365843,0.365843,0.0,0.0,0.113993,0.031674,0.101979,0.567487,0.0,0.0,0.098119,0.106908,0.365843,0.0,0.433821
5,0.011755,0.0,0.011294,0.05453,1.0,0.0,0.640342,0.0,0.0,0.0,0.015403,0.048143,0.0,0.0,0.0,0.0,0.0,0.125697,0.125697,0.035443,0.01744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213067,0.0,0.0,0.019152,0.006924,0.0,0.0,0.0,0.0,...,0.0,0.062214,0.125697,0.0,0.0,0.0,0.0,0.010431,0.0,0.0,0.033798,0.027103,0.0,0.058592,0.125697,0.035383,0.096091,0.0,0.0,0.0,0.0,0.049142,0.0,0.0,0.125697,0.0,0.0,0.0,0.0,0.033029,0.009177,0.0,0.096091,0.0,0.0,0.028429,0.030976,0.0,0.0,0.125697


In [37]:
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=11)

In [38]:
test_userids = list(set(test_df.userId.values))
test_userids


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

In [39]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userids):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cs_df.loc[user_record_df['movieId']]  # (n, 9742); n은 userId가 평점을 매긴 영화 수
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    # print("user_id=", i, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)

    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)

    prediction_df = pd.DataFrame(prediction, index=cs_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)

100%|██████████| 608/608 [00:18<00:00, 33.36it/s]


In [40]:
result_df.head(10)

Unnamed: 0,movieId,pred_rating,userId,rating,timestamp
0,50,3.719431,1,5.0,964982931
1,110,3.606876,1,4.0,964982176
2,423,4.11032,1,3.0,964982363
3,552,4.102051,1,4.0,964982653
4,592,4.052923,1,4.0,964982271
5,661,4.32889,1,5.0,964982838
6,673,4.178758,1,3.0,964981775
7,1009,4.230842,1,3.0,964981775
8,1023,4.41306,1,5.0,964982681
9,1025,4.280658,1,5.0,964982791


In [41]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)

1.411799611451824 1.188191740188352
