In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV
# inputs stored under MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the raw Netflix ratings CSV (a file path, despite the variable name).
# NOTE(review): the path is relative, so it only resolves when the working
# directory is the parent of the Drive mount point (presumably /content) — confirm.
file_directory = './drive/MyDrive/data.csv'

### Filtering Netflix data. Only the most-rated movies and the most-active users are kept, which reduces the dataset from ~100 million to ~30 million ratings.

In [None]:
# Read the ratings (column names are supplied explicitly) and attach the
# title / release year of every movie.
rating_columns = ['movieId', 'userId', 'rating', 'date']
title_columns = ['movieId', 'year', 'title']
netflix_df = pd.read_csv(file_directory, sep=',', names=rating_columns)
title_df = pd.read_csv('./drive/MyDrive/movie_titles.csv',
                       encoding="ISO-8859-1", header=None, names=title_columns)
netflix_df = pd.merge(netflix_df, title_df, left_on='movieId', right_on='movieId')
f = ['count', 'mean']

# Rating count / mean per movie; movies whose count is below the 90th
# percentile of all movie counts are scheduled for removal.
df_movie_summary = netflix_df.groupby('movieId')['rating'].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.9), 0)
rarely_rated = df_movie_summary['count'] < movie_benchmark
drop_movie_list = df_movie_summary[rarely_rated].index

print(f'Min number of reviews per movie: {movie_benchmark}')

# Same statistics and the same 90th-percentile cut-off, this time per user.
df_cust_summary = netflix_df.groupby('userId')['rating'].agg(f)
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.9), 0)
rarely_rating = df_cust_summary['count'] < cust_benchmark
drop_cust_list = df_cust_summary[rarely_rating].index

print(f'Min number of reviews given per user: {cust_benchmark}')
print(f'Original: {netflix_df.shape}')
# A rating survives only if neither its movie nor its user was dropped.
keep_movie = ~netflix_df['movieId'].isin(drop_movie_list)
keep_user = ~netflix_df['userId'].isin(drop_cust_list)
netflix_df = netflix_df[keep_movie & keep_user]
print(f'Filtered: {netflix_df.shape}')

Min number of reviews per movie: 12304.0
Min number of reviews given per user: 541.0
Original: (100480507, 6)
Filtered: (30344365, 6)


In [None]:
# Size of the filtered ratings table: distinct movies, distinct users and
# the number of (non-null) rating rows.
movie_count = netflix_df['movieId'].nunique()
user_count = netflix_df['userId'].nunique()
rating_count = netflix_df['userId'].count()
for label, value in (("num_movies:", movie_count),
                     ("user_count:", user_count),
                     ("rating_count:", rating_count)):
    print(label, value)

num_movies: 1777
user_count: 48033
rating_count: 30344365


## IMDb data + merging with the filtered Netflix data

In [None]:
# Load the IMDb metadata and keep only the columns used downstream.
# "country" is listed once: selecting it twice produced two identically
# labelled columns that had to be de-duplicated later.
imdb_columns = ["original_title", "country", "genre", "director", "actors",
                "duration", "year", "description"]
imdb_movies_df = pd.read_csv("IMDb movies.csv", engine='python')[imdb_columns]

# Inner join on the exact title string: ratings whose Netflix title has no
# identical IMDb original_title are dropped. Both frames carry a 'year'
# column, so the merge exposes them as year_x (Netflix) and year_y (IMDb).
a = pd.merge(netflix_df, imdb_movies_df, how='inner',
             left_on='title', right_on='original_title')

# Keep a single row per (movie, user) rating; when the join produced several
# rows for the same pair (presumably several IMDb entries sharing a title),
# the last one wins.
a = a.drop_duplicates(subset=['movieId', 'userId'], keep='last').reset_index(drop=True)
a

Unnamed: 0,movieId,userId,rating,date,year_x,title,original_title,country,genre,director,actors,duration,country.1,year_y,description
0,30,900816,3,2005-07-08,2003.0,Something's Gotta Give,Something's Gotta Give,USA,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,USA,2003,A swinger on the cusp of being a senior citize...
1,30,1990901,4,2004-05-24,2003.0,Something's Gotta Give,Something's Gotta Give,USA,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,USA,2003,A swinger on the cusp of being a senior citize...
2,30,1402412,4,2004-05-04,2003.0,Something's Gotta Give,Something's Gotta Give,USA,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,USA,2003,A swinger on the cusp of being a senior citize...
3,30,1601783,3,2004-07-27,2003.0,Something's Gotta Give,Something's Gotta Give,USA,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,USA,2003,A swinger on the cusp of being a senior citize...
4,30,306466,3,2004-04-02,2003.0,Something's Gotta Give,Something's Gotta Give,USA,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,USA,2003,A swinger on the cusp of being a senior citize...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26077004,17764,1011294,4,2005-11-07,1998.0,Shakespeare in Love,Shakespeare in Love,"USA, UK","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...",123,"USA, UK",1998,"The world's greatest ever playwright,"
26077005,17764,2549935,4,2005-11-12,1998.0,Shakespeare in Love,Shakespeare in Love,"USA, UK","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...",123,"USA, UK",1998,"The world's greatest ever playwright,"
26077006,17764,2357910,4,2005-11-17,1998.0,Shakespeare in Love,Shakespeare in Love,"USA, UK","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...",123,"USA, UK",1998,"The world's greatest ever playwright,"
26077007,17764,1331785,5,2005-11-21,1998.0,Shakespeare in Love,Shakespeare in Love,"USA, UK","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...",123,"USA, UK",1998,"The world's greatest ever playwright,"


In [None]:

# Size of the merged Netflix + IMDb table: distinct movies, distinct users
# and the number of (non-null) rating rows.
movie_count = a['movieId'].nunique()
user_count = a['userId'].nunique()
rating_count = a['userId'].count()

for label, value in (("num_movies:", movie_count),
                     ("user_count:", user_count),
                     ("rating_count:", rating_count)):
    print(label, value)


num_movies: 1478
user_count: 48033
rating_count: 26077009


In [None]:
# Keep the rating triple plus the metadata used later in the notebook.
# 'duration' and 'year_x' are retained on purpose: the content-boosted section
# selects them from `a`, and dropping them here made that cell raise KeyError
# on a fresh top-to-bottom run.
a = a[['movieId','userId','rating','description','genre','director','actors','country','original_title','duration','year_x']]
# Selecting a label that occurs twice returns both columns; keep the first.
a = a.loc[:,~a.columns.duplicated()]


In [None]:
# Display the trimmed ratings + metadata frame.
a

Unnamed: 0,movieId,userId,rating,description,genre,director,actors,country,original_title
0,30,900816,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
1,30,1990901,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
2,30,1402412,4,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
3,30,1601783,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
4,30,306466,3,A swinger on the cusp of being a senior citize...,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",USA,Something's Gotta Give
...,...,...,...,...,...,...,...,...,...
26077004,17764,1011294,4,"The world's greatest ever playwright,","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...","USA, UK",Shakespeare in Love
26077005,17764,2549935,4,"The world's greatest ever playwright,","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...","USA, UK",Shakespeare in Love
26077006,17764,2357910,4,"The world's greatest ever playwright,","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...","USA, UK",Shakespeare in Love
26077007,17764,1331785,5,"The world's greatest ever playwright,","Comedy, Drama, History",John Madden,"Geoffrey Rush, Tom Wilkinson, Steven O'Donnell...","USA, UK",Shakespeare in Love


In [None]:
# Persist the merged, trimmed frame as a pickle in the working directory.
a.to_pickle('filtered_data.pkl')

## Baseline: item-based collaborative filtering

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the ratings for evaluation. The fixed random_state makes the
# split, and therefore every metric computed from it, reproducible across runs.
train_df, test_df = train_test_split(a, test_size=0.1, random_state=42)


In [None]:
# User x movie rating matrix built from the training ratings only;
# cells for (user, movie) pairs without a training rating are NaN.
UM = train_df.pivot_table(values='rating', index='userId', columns='movieId')

In [None]:
# Display the user x movie matrix (rows: userId, columns: movieId).
UM

movieId,30,77,83,108,118,143,148,175,187,189,197,199,241,252,257,269,273,285,295,299,305,311,312,313,329,330,331,334,348,353,357,367,406,413,424,442,443,457,482,483,...,17321,17324,17330,17339,17346,17355,17358,17381,17387,17398,17405,17412,17424,17426,17441,17472,17479,17482,17506,17508,17513,17526,17533,17537,17541,17558,17560,17574,17580,17589,17621,17622,17627,17672,17692,17697,17703,17709,17762,17764
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
6,3.0,,,,,,,5.0,,,3.0,,3.0,,,,,,4.0,3.0,,,,,4.0,,,,,,,,,,,,,5.0,,,...,,3.0,,4.0,,3.0,,,,,3.0,,4.0,,,,4.0,,,,,,,,,,,,3.0,,,,5.0,,4.0,,2.0,5.0,,
7,5.0,,5.0,,,,,5.0,,,,,,,5.0,,4.0,5.0,,4.0,,,,5.0,4.0,,,,4.0,,5.0,,,,,,,5.0,,,...,,,,,5.0,5.0,5.0,,,,4.0,,,,5.0,,,3.0,,,,4.0,4.0,2.0,4.0,3.0,,,,,,,,,,,,,,3.0
79,3.0,,,,,,,,,,,,,,5.0,,,3.0,,5.0,,,1.0,,,,3.0,,,,,,4.0,,,,,,,4.0,...,,4.0,,,,2.0,,,,,4.0,,3.0,,5.0,3.0,,4.0,2.0,,,,,,4.0,,5.0,,3.0,2.0,5.0,2.0,4.0,4.0,4.0,1.0,,,,4.0
134,,4.0,,,5.0,5.0,5.0,,,,,,,,,,5.0,,5.0,,,,,,4.0,5.0,,,,,,,,,,,,3.0,,5.0,...,5.0,,5.0,,,5.0,,,,,5.0,,,,,,,5.0,,5.0,5.0,,,,,5.0,5.0,,,4.0,5.0,,,,,,5.0,,4.0,
188,3.0,,,3.0,3.0,,,,,4.0,,4.0,,,,,,3.0,4.0,,,,3.0,2.0,4.0,,,,,,,,,,,,,4.0,,3.0,...,,3.0,,5.0,,3.0,,,,,3.0,,,,,,,,,,4.0,3.0,,4.0,,4.0,4.0,,3.0,,,,,,,,,3.0,,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649257,4.0,,,,,4.0,3.0,,,,3.0,,,,,,,,4.0,5.0,,,3.0,3.0,3.0,,3.0,,,,,,,,,,,,,2.0,...,,3.0,,,,,,,,,,,,,4.0,,3.0,,,,,,,4.0,,,4.0,,3.0,4.0,,4.0,4.0,,,,,4.0,,4.0
2649267,5.0,,,,,4.0,,,,,5.0,,,4.0,5.0,,,,3.0,5.0,,,,5.0,3.0,4.0,3.0,,,3.0,,,,2.0,,,,,,3.0,...,4.0,5.0,,,,3.0,,,3.0,,3.0,,,,,,4.0,3.0,4.0,,5.0,3.0,,,,,,4.0,5.0,,,5.0,,,3.0,,,,,4.0
2649285,3.0,,,,,,2.0,,2.0,1.0,,,,,,,,,,5.0,,,5.0,4.0,4.0,2.0,5.0,4.0,,,,,,,,,,,3.0,,...,2.0,4.0,,,,3.0,,,4.0,,4.0,,,3.0,2.0,,,3.0,2.0,,3.0,,,,,,3.0,,,3.0,,3.0,3.0,,2.0,,,,3.0,4.0
2649296,3.0,4.0,,,,,,3.0,,2.0,3.0,,,,,,,,,3.0,,,,4.0,4.0,3.0,,,,,4.0,,,,,,,3.0,,4.0,...,,4.0,,5.0,,3.0,,,,,,,,3.0,3.0,,4.0,,,,,,,,,,4.0,,,3.0,3.0,,5.0,,3.0,,,,,


In [None]:
# Movie x movie similarity: correlation between every pair of movie columns of
# UM, using pandas' DataFrame.corr defaults.
corrMatrix = UM.corr()

In [None]:
# Display the movie x movie correlation matrix.
corrMatrix

movieId,30,77,83,108,118,143,148,175,187,189,197,199,241,252,257,269,273,285,295,299,305,311,312,313,329,330,331,334,348,353,357,367,406,413,424,442,443,457,482,483,...,17321,17324,17330,17339,17346,17355,17358,17381,17387,17398,17405,17412,17424,17426,17441,17472,17479,17482,17506,17508,17513,17526,17533,17537,17541,17558,17560,17574,17580,17589,17621,17622,17627,17672,17692,17697,17703,17709,17762,17764
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
30,1.000000,0.210486,0.191957,0.139058,0.175844,0.164993,0.277826,0.017159,0.069415,0.135245,0.293888,0.051534,0.101547,0.260389,0.203574,0.202962,0.280455,0.257007,0.142258,0.223593,0.269778,0.009052,0.069044,0.280254,0.108006,0.191414,0.111361,0.239146,0.033579,0.243539,0.160662,0.083233,0.265084,0.091750,-0.015767,0.163450,0.116359,0.070938,0.164279,0.197758,...,0.188191,0.297183,0.197582,0.051559,0.188016,0.271286,0.069874,0.087527,0.031089,0.201272,0.226868,0.071830,0.110338,0.182536,0.204327,0.154848,0.120378,0.170532,0.248533,0.014326,0.174900,0.139116,0.031073,0.148887,0.267171,0.174072,0.118934,0.258324,0.228450,0.134100,0.203591,0.075792,0.207044,0.213320,0.180211,0.267580,0.162050,0.224633,0.120208,0.191795
77,0.210486,1.000000,0.146307,0.143141,0.397949,0.229020,0.403335,-0.024008,0.111793,0.247830,0.408187,0.033232,-0.020376,0.300270,0.238202,0.179422,0.332286,0.394033,0.351664,0.092562,0.378930,-0.091443,-0.002572,0.349683,0.167690,0.288730,0.079106,0.360305,0.056806,0.340892,0.049428,0.102366,0.392317,0.025606,0.014298,0.189715,-0.037706,0.021506,0.026504,0.350383,...,0.204596,0.351329,0.366043,0.104279,0.280817,0.286257,0.083652,-0.101053,-0.021123,0.338453,0.358184,0.020767,0.154356,0.196952,0.064961,0.195820,0.163823,0.191764,0.320240,-0.016429,0.388754,0.014080,-0.052347,0.274977,0.320844,0.349037,0.151366,0.379084,0.378875,0.241485,0.319995,0.105427,0.278143,0.204576,0.369359,0.327886,0.305424,0.153526,0.147759,0.025951
83,0.191957,0.146307,1.000000,0.087642,0.193538,0.203868,0.160391,0.118120,0.112133,0.191229,0.167821,0.268014,0.246476,0.244206,0.286150,0.282406,0.199479,0.231689,0.167510,0.205540,0.257604,0.158223,0.183162,0.185634,0.117804,0.135078,0.107720,0.160958,0.233832,0.135278,0.214080,0.115826,0.165185,0.143209,0.156361,0.330507,0.262964,0.115011,0.260957,0.088100,...,0.262837,0.128626,0.124769,0.170374,0.229178,0.143144,0.194280,0.245845,0.177943,0.155162,0.090090,0.182975,0.164901,0.210838,0.348323,0.220414,0.147487,0.216496,0.103174,0.188635,0.123217,0.274459,0.302568,0.158067,0.151479,0.161405,0.132334,0.288638,0.120715,0.150357,0.133032,0.127543,0.158998,0.246069,0.118611,0.113381,0.080735,0.278966,0.214873,0.205072
108,0.139058,0.143141,0.087642,1.000000,0.176426,0.193304,0.149339,0.151573,0.139354,0.174618,0.236829,0.095387,0.153986,0.164322,0.107426,0.116087,0.194419,0.212930,0.167279,0.045703,0.126930,0.186393,0.157625,0.086874,0.148757,0.157473,0.127180,0.210472,0.104635,0.126071,0.159094,0.169872,0.233242,0.178812,0.067695,0.120316,0.098805,0.169852,0.105051,0.119618,...,0.122799,0.166498,0.235709,0.155466,0.148046,0.161911,0.279466,0.197412,0.139984,0.226147,0.207344,0.159950,0.194228,0.114159,0.073322,0.177842,0.138637,0.086780,0.186488,0.144861,0.132848,0.161882,0.121438,0.129019,0.182682,0.202131,0.137439,0.117272,0.216667,0.186654,0.159165,0.145403,0.083615,0.161043,0.155011,0.213406,0.195895,0.158051,0.226976,0.107491
118,0.175844,0.397949,0.193538,0.176426,1.000000,0.240554,0.272332,0.097482,0.104785,0.335854,0.278630,0.087396,0.007639,0.311324,0.241532,0.212400,0.244566,0.290502,0.352802,0.035766,0.286376,-0.022745,0.026399,0.198745,0.111079,0.289129,0.136452,0.262995,0.033929,0.223479,0.055644,0.168084,0.312150,0.005463,0.035563,0.228409,-0.029238,0.094267,0.011013,0.318514,...,0.203268,0.272721,0.290381,0.212307,0.322146,0.213959,0.155417,0.003210,-0.024348,0.256799,0.278511,0.077275,0.273783,0.189497,0.125500,0.366348,0.143848,0.135985,0.217865,0.057405,0.296685,0.079129,-0.074839,0.311182,0.263265,0.328357,0.260158,0.263529,0.256399,0.199571,0.328418,0.097552,0.207759,0.192223,0.241391,0.250936,0.266177,0.167870,0.112099,-0.007668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17697,0.267580,0.327886,0.113381,0.213406,0.250936,0.181321,0.323295,-0.037587,0.066173,0.125365,0.345276,-0.041190,-0.013773,0.386564,0.240015,0.217844,0.442743,0.270033,0.207196,0.106501,0.321435,0.021005,-0.014232,0.263775,0.100180,0.240955,0.020474,0.458133,0.075704,0.304307,0.103784,0.111133,0.355496,0.089506,-0.101283,0.148445,0.084961,0.049416,0.083084,0.261460,...,0.163575,0.348320,0.325245,0.013145,0.184209,0.279782,0.122017,-0.042980,0.062890,0.333302,0.368751,0.100317,0.164232,0.142176,0.109957,0.167711,0.155754,0.227425,0.354217,-0.008896,0.247942,0.063954,-0.033526,0.180045,0.446710,0.263152,0.082319,0.202294,0.345111,0.197829,0.221190,0.113486,0.235032,0.248382,0.367464,1.000000,0.267453,0.180971,0.084971,0.073352
17703,0.162050,0.305424,0.080735,0.195895,0.266177,0.124303,0.235471,0.088097,0.176482,0.156010,0.245691,0.096313,0.091838,0.249712,0.106155,0.136458,0.278062,0.234885,0.255651,0.046988,0.247202,0.104532,0.079109,0.183585,0.134395,0.198097,0.119087,0.281105,0.107485,0.225654,0.111627,0.127335,0.267918,0.078679,0.073764,0.119552,0.033330,0.133881,0.047880,0.231374,...,0.138108,0.228808,0.309611,0.124698,0.163343,0.209452,0.161498,0.027205,0.082373,0.266406,0.242188,0.059906,0.142472,0.155696,0.029178,0.224798,0.145325,0.092000,0.221463,0.056866,0.207261,0.075146,0.038817,0.181017,0.277973,0.291111,0.130797,0.199041,0.285649,0.243626,0.224595,0.177906,0.166816,0.151473,0.267143,0.267453,1.000000,0.122517,0.165486,0.058719
17709,0.224633,0.153526,0.278966,0.158051,0.167870,0.206425,0.230069,0.152933,0.134594,0.163586,0.217496,0.193990,0.171773,0.201960,0.270275,0.218662,0.159708,0.279516,0.167740,0.184472,0.230743,0.088273,0.185780,0.225521,0.113885,0.183382,0.182981,0.144332,0.145726,0.179879,0.165266,0.153492,0.199854,0.173753,0.083082,0.278081,0.189790,0.105658,0.183070,0.118305,...,0.229048,0.168722,0.150199,0.181576,0.169579,0.184901,0.194243,0.114431,0.089434,0.120328,0.153810,0.179196,0.217968,0.191108,0.185687,0.214106,0.146615,0.152133,0.172167,0.117249,0.190485,0.160497,0.135336,0.175019,0.197414,0.156982,0.152272,0.213760,0.121340,0.138890,0.151449,0.169396,0.142395,0.286902,0.117290,0.180971,0.122517,1.000000,0.224222,0.215771
17762,0.120208,0.147759,0.214873,0.226976,0.112099,0.253577,0.156552,0.169741,0.200802,0.140288,0.182585,0.146352,0.174615,0.153308,0.186807,0.176346,0.105977,0.233004,0.154242,0.145766,0.195872,0.181806,0.199271,0.145613,0.186024,0.203033,0.195276,0.086086,0.184442,0.187907,0.184972,0.205573,0.132643,0.206475,0.120455,0.230249,0.181189,0.204254,0.228352,0.125362,...,0.149774,0.124040,0.126369,0.179863,0.207059,0.155498,0.185901,0.118429,0.215424,0.148378,0.126421,0.150139,0.186018,0.126538,0.135479,0.200114,0.175662,0.160249,0.142267,0.204567,0.153952,0.154911,0.172267,0.147541,0.135071,0.180406,0.141976,0.212287,0.145355,0.233390,0.143356,0.276429,0.132654,0.179858,0.100407,0.084971,0.165486,0.224222,1.000000,0.180329


In [None]:
# Sanity check: the correlation result is a DataFrame, so it can be indexed by movieId.
type(corrMatrix)

pandas.core.frame.DataFrame

In [None]:
def predict_score(movie_of_interest, user_of_interest, num_sim_movies_to_consider=10):
  """Predict a user's rating for a movie with item-based collaborative filtering.

  The prediction is the similarity-weighted average of the user's own ratings
  for the movies most correlated with ``movie_of_interest``. Reads the
  notebook-level ``UM`` (user x movie ratings, NaN = not rated) and
  ``corrMatrix`` (movie x movie correlations).

  Args:
    movie_of_interest: movie id; must be a column of ``corrMatrix``.
    user_of_interest: user id; must be a row label of ``UM``.
    num_sim_movies_to_consider: maximum number of neighbour movies averaged.

  Returns:
    The predicted rating. When no usable neighbour exists, or the similarity
    weights sum to zero, the user's mean rating is returned instead.

  Raises:
    KeyError: if the movie is missing from ``corrMatrix`` or the user from ``UM``.
  """
  # Only movies this user has actually rated can contribute.
  user_ratings = UM.loc[user_of_interest].dropna()
  similarities = corrMatrix[movie_of_interest].reindex(user_ratings.index)
  # Exclude the target movie explicitly. For held-out pairs the user has no
  # rating for it, so blindly discarding the first candidate would throw away
  # the most similar neighbour instead. Undefined (NaN) correlations are
  # skipped as well.
  similarities = similarities.drop(labels=movie_of_interest, errors='ignore').dropna()
  neighbours = similarities.nlargest(num_sim_movies_to_consider)
  denominator = neighbours.sum()
  if neighbours.empty or denominator == 0:
    return user_ratings.mean()
  return (neighbours * user_ratings[neighbours.index]).sum() / denominator
def MSE(predictions, labels):
    """Return the mean squared error between two equally long sequences.

    Args:
        predictions: iterable of predicted values.
        labels: iterable of true values, in the same order as ``predictions``.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: if the inputs differ in length or are empty.
    """
    predictions = list(predictions)
    labels = list(labels)
    # zip() would silently truncate to the shorter input and report a
    # misleading score, so a mismatch is rejected explicitly.
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length, got {} and {}".format(
                len(predictions), len(labels)))
    if not predictions:
        raise ValueError("MSE is undefined for empty input")
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [None]:
# Score every held-out (user, movie) pair with the item-based baseline.
rating_label = test_df['rating'].tolist()
movie_id = test_df['movieId'].tolist()
user_id = test_df['userId'].tolist()
predictions = []
num = 0
print(len(rating_label))
# Report progress sparsely: printing one line per test rating floods the cell
# output when the test set holds millions of rows.
PROGRESS_EVERY = 100000
for num, (movie_of_interest, user_of_interest) in enumerate(zip(movie_id, user_id), start=1):
  predictions.append(predict_score(movie_of_interest, user_of_interest))
  if num % PROGRESS_EVERY == 0:
    print(num)

print(MSE(predictions, rating_label))


In [None]:
# Re-print the baseline MSE on its own so the score is easy to find.
print(MSE(predictions, rating_label))

0.7072740967339217


## Content-boosted. Will it improve performance?

In [None]:
# One metadata row per movie: take the last rating row of every movieId and
# keep only the descriptive columns.
movie_columns = ['movieId', "original_title", "genre", "director", "actors",
                 "duration", "year_x", "country"]
movies = a.drop_duplicates(subset='movieId', keep="last")[movie_columns]
# Drop any repeated column label, keeping its first occurrence.
is_first_occurrence = ~movies.columns.duplicated()
movies = movies.loc[:, is_first_occurrence]
movies

Unnamed: 0,movieId,original_title,genre,director,actors,duration,year_x,country
31891,30,Something's Gotta Give,"Comedy, Drama, Romance",Nancy Meyers,"Jack Nicholson, Diane Keaton, Keanu Reeves, Fr...",128,2003.0,USA
41640,77,Congo,"Action, Adventure, Mystery",Frank Marshall,"Laura Linney, Dylan Walsh, Ernie Hudson, Tim C...",109,1995.0,USA
52389,83,Silkwood,"Biography, Drama, History",Mike Nichols,"Meryl Streep, Kurt Russell, Cher, Craig T. Nel...",131,1983.0,USA
60360,108,Spartan,"Action, Crime, Drama",David Mamet,"Tia Texada, Derek Luke, Val Kilmer, Jeremie Ca...",106,2004.0,"Germany, USA"
73110,118,Rambo: First Blood Part II,"Action, Adventure, Thriller",George P. Cosmatos,"Sylvester Stallone, Richard Crenna, Charles Na...",96,1985.0,"USA, Mexico"
...,...,...,...,...,...,...,...,...
25994418,17697,New York Minute,"Comedy, Crime, Family",Dennie Gordon,"Ashley Olsen, Mary-Kate Olsen, Eugene Levy, An...",91,2004.0,USA
26009297,17703,Hulk,"Action, Sci-Fi",Ang Lee,"Eric Bana, Jennifer Connelly, Sam Elliott, Jos...",138,2003.0,USA
26031466,17709,A River Runs Through It,Drama,Robert Redford,"Craig Sheffer, Brad Pitt, Tom Skerritt, Brenda...",123,1992.0,USA
26049507,17762,Gattaca,"Drama, Sci-Fi, Thriller",Andrew Niccol,"Ethan Hawke, Uma Thurman, Gore Vidal, Xander B...",106,1997.0,USA


In [None]:
# Remove the comma separators so every multi-valued field becomes a plain
# space-separated string. regex=False makes the literal match explicit.
text_columns = ['genre', 'actors', 'director', 'country']
movies = movies.assign(**{
    column: movies[column].astype(str).str.replace(",", '', regex=False)
    for column in text_columns
})

# Bucket the numeric fields into coarse categorical tokens in one vectorised
# pass (np.select takes the first matching condition). Comparisons against NaN
# are False, so a missing duration falls through to 'long' and a missing year
# to 'new'.
movies = movies.assign(
    duration2=np.select(
        [movies.duration <= 90, movies.duration <= 120],
        ['short', 'medium'],
        default='long',
    ),
    year2=np.select(
        [movies.year_x < 1950, movies.year_x < 1990],
        ['older', 'old'],
        default='new',
    ),
).drop(columns=['year_x', 'duration'])
movies


Unnamed: 0,movieId,original_title,genre,director,actors,country,duration2,year2
31891,30,Something's Gotta Give,Comedy Drama Romance,Nancy Meyers,Jack Nicholson Diane Keaton Keanu Reeves Franc...,USA,long,new
41640,77,Congo,Action Adventure Mystery,Frank Marshall,Laura Linney Dylan Walsh Ernie Hudson Tim Curr...,USA,medium,new
52389,83,Silkwood,Biography Drama History,Mike Nichols,Meryl Streep Kurt Russell Cher Craig T. Nelson...,USA,long,old
60360,108,Spartan,Action Crime Drama,David Mamet,Tia Texada Derek Luke Val Kilmer Jeremie Campb...,Germany USA,medium,new
73110,118,Rambo: First Blood Part II,Action Adventure Thriller,George P. Cosmatos,Sylvester Stallone Richard Crenna Charles Napi...,USA Mexico,medium,old
...,...,...,...,...,...,...,...,...
25994418,17697,New York Minute,Comedy Crime Family,Dennie Gordon,Ashley Olsen Mary-Kate Olsen Eugene Levy Andy ...,USA,medium,new
26009297,17703,Hulk,Action Sci-Fi,Ang Lee,Eric Bana Jennifer Connelly Sam Elliott Josh L...,USA,long,new
26031466,17709,A River Runs Through It,Drama,Robert Redford,Craig Sheffer Brad Pitt Tom Skerritt Brenda Bl...,USA,long,new
26049507,17762,Gattaca,Drama Sci-Fi Thriller,Andrew Niccol,Ethan Hawke Uma Thurman Gore Vidal Xander Berk...,USA,medium,new


In [None]:
# Concatenate the descriptive fields into one text document per movie.
# The columns are named explicitly (same order as before) rather than taken
# positionally with columns[2:], so a reordered or extra column cannot
# silently change the feature text.
feature_columns = ['genre', 'director', 'actors', 'country', 'duration2', 'year2']
movies = movies.assign(
    features=movies[feature_columns].apply(
        lambda x: ' '.join(x.dropna().astype(str)),
        axis=1
    )
)
# .copy() detaches the result from the wider frame, so later column
# assignments do not raise SettingWithCopyWarning.
movies = movies[['movieId','original_title','features']].copy()
movies

Unnamed: 0,movieId,original_title,features
31891,30,Something's Gotta Give,Comedy Drama Romance Nancy Meyers Jack Nichols...
41640,77,Congo,Action Adventure Mystery Frank Marshall Laura ...
52389,83,Silkwood,Biography Drama History Mike Nichols Meryl Str...
60360,108,Spartan,Action Crime Drama David Mamet Tia Texada Dere...
73110,118,Rambo: First Blood Part II,Action Adventure Thriller George P. Cosmatos S...
...,...,...,...
25994418,17697,New York Minute,Comedy Crime Family Dennie Gordon Ashley Olsen...
26009297,17703,Hulk,Action Sci-Fi Ang Lee Eric Bana Jennifer Conne...
26031466,17709,A River Runs Through It,Drama Robert Redford Craig Sheffer Brad Pitt T...
26049507,17762,Gattaca,Drama Sci-Fi Thriller Andrew Niccol Ethan Hawk...


In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
import re
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Download the NLTK resources referenced in this cell: the 'punkt' tokenizer
# models, the stop-word lists and the WordNet data for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# English stop words, with 'not' taken out of the list so it survives filtering.
stop_words = stopwords.words('english')
stop_words.remove('not')
lemmatizer = WordNetLemmatizer()
def data_preprocessing(text):
  """Normalise a raw feature string into space-separated, lemmatised tokens.

  Removes ``<...>`` tags, replaces every run of non-alphanumeric characters
  with a single space, lower-cases the text, tokenises it with NLTK, drops the
  tokens found in the notebook-level ``stop_words`` list and lemmatises the
  remaining ones with the notebook-level ``lemmatizer``.
  """
  without_tags = re.sub('<.*?>', '', text)
  alphanumeric = re.sub('[^A-Za-z0-9]+', ' ', without_tags)
  tokens = nltk.word_tokenize(alphanumeric.lower())
  kept = [token for token in tokens if token not in stop_words]
  return ' '.join(lemmatizer.lemmatize(token) for token in kept)
# assign() builds a new frame instead of writing into `movies`, which is a
# slice of a wider frame; the in-place column assignment used here before
# triggered pandas' SettingWithCopyWarning.
movies = movies.assign(features=movies.features.apply(data_preprocessing))
movies

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,movieId,original_title,features
31891,30,Something's Gotta Give,comedy drama romance nancy meyers jack nichols...
41640,77,Congo,action adventure mystery frank marshall laura ...
52389,83,Silkwood,biography drama history mike nichols meryl str...
60360,108,Spartan,action crime drama david mamet tia texada dere...
73110,118,Rambo: First Blood Part II,action adventure thriller george p cosmatos sy...
...,...,...,...
25994418,17697,New York Minute,comedy crime family dennie gordon ashley olsen...
26009297,17703,Hulk,action sci fi ang lee eric bana jennifer conne...
26031466,17709,A River Runs Through It,drama robert redford craig sheffer brad pitt t...
26049507,17762,Gattaca,drama sci fi thriller andrew niccol ethan hawk...


In [None]:
from sklearn.naive_bayes import MultinomialNB
import scipy
from sklearn.feature_extraction.text import CountVectorizer

# from sklearn.metrics import accuracy_score

def delete_rows_csr(mat, indices):
    """Return ``mat`` without the rows whose positions are listed in ``indices``.

    Args:
        mat: a ``scipy.sparse.csr_matrix``.
        indices: iterable of integer row positions to remove.

    Returns:
        A matrix holding the remaining rows in their original order.

    Raises:
        ValueError: if ``mat`` is not a CSR matrix.
    """
    if isinstance(mat, scipy.sparse.csr_matrix):
        # Start from "keep every row" and switch off the unwanted positions.
        keep = np.ones(mat.shape[0], dtype=bool)
        keep[list(indices)] = False
        return mat[keep]
    raise ValueError("works only for CSR format -- use .tocsr() first")

# One feature document per UM column, aligned by movieId rather than by
# position, so a movie that is missing from (or ordered differently in)
# `movies` cannot shift every feature row by one. A movie without metadata
# gets an empty document.
movieId = list(UM.columns)
features = (
    movies.set_index('movieId')['features']
    .reindex(movieId)
    .fillna('')
    .tolist()
)
# Bag-of-words matrix shared by all users: row i describes movie movieId[i].
vect = CountVectorizer(min_df=10)
x_bow = vect.fit_transform(features)

MAX_USERS = 10000      # only the first MAX_USERS rows of UM are filled in
PROGRESS_EVERY = 500   # print a progress line every this many users
num = 0
for index, row in UM.iterrows():
    num += 1
    if num > MAX_USERS:
        break
    if num % PROGRESS_EVERY == 0:
        print(num)

    rated = row.notna().to_numpy()
    # Nothing to predict (everything rated) or nothing to learn from.
    if rated.all() or not rated.any():
        continue

    # Per-user classifier: learn this user's ratings from the features of the
    # movies they rated, then predict all of their unrated movies in one call.
    clf = MultinomialNB()
    clf.fit(x_bow[rated], row.to_numpy()[rated])
    # A single .loc write; the chained UM[movie][index] = ... form used
    # previously is not guaranteed to modify UM itself.
    UM.loc[index, ~rated] = clf.predict(x_bow[~rated])

In [None]:
# Load a previously saved user x movie matrix from Drive.
# NOTE(review): no cell visible in this part of the notebook writes
# final_UM.csv — presumably it is an export of the filled-in UM; confirm and
# add the export step so a fresh Run All can reproduce it.
final_UM = pd.read_csv('./drive/MyDrive/final_UM.csv')

In [None]:
# Discard the two unnamed columns that came along with the CSV.
final_UM = final_UM.drop(columns=['Unnamed: 0','Unnamed: 0.1'])

In [None]:
# Use userId as the row index, matching the layout of UM.
# Re-running this cell fails because 'userId' is no longer a column afterwards.
final_UM = final_UM.set_index('userId')

In [None]:
# Display the loaded user x movie matrix.
final_UM

Unnamed: 0_level_0,30,77,83,108,118,143,148,175,187,189,197,199,241,252,257,269,273,285,295,299,305,311,312,313,329,330,331,334,348,353,357,367,406,413,424,442,443,457,482,483,...,17321,17324,17330,17339,17346,17355,17358,17381,17387,17398,17405,17412,17424,17426,17441,17472,17479,17482,17506,17508,17513,17526,17533,17537,17541,17558,17560,17574,17580,17589,17621,17622,17627,17672,17692,17697,17703,17709,17762,17764
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
6,3.0,3.0,4.0,3.0,3.0,3.0,4.0,5.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,4.0,3.0,5.0,3.0,3.0,...,3.0,3.0,3.0,4.0,3.0,3.0,4.0,3.0,5.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,5.0,3.0,4.0,3.0,2.0,5.0,4.0,4.0
7,5.0,4.0,5.0,4.0,4.0,4.0,5.0,5.0,4.0,5.0,5.0,3.0,3.0,4.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,4.0,5.0,4.0,3.0,4.0,3.0,5.0,3.0,4.0,4.0,5.0,5.0,4.0,5.0,4.0,3.0,...,5.0,5.0,3.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,4.0,5.0,4.0,5.0,4.0,5.0,5.0,3.0,5.0,4.0,4.0,4.0,5.0,2.0,4.0,3.0,4.0,4.0,5.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,3.0,5.0,3.0,3.0
79,3.0,3.0,3.0,3.0,3.0,3.0,1.0,4.0,5.0,4.0,5.0,4.0,4.0,4.0,5.0,1.0,4.0,3.0,3.0,5.0,4.0,5.0,1.0,3.0,4.0,4.0,3.0,4.0,4.0,2.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0,4.0,3.0,4.0,...,4.0,5.0,4.0,4.0,5.0,2.0,4.0,3.0,4.0,4.0,4.0,5.0,3.0,3.0,5.0,3.0,4.0,4.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,5.0,4.0,3.0,2.0,5.0,2.0,4.0,4.0,4.0,1.0,4.0,4.0,3.0,4.0
134,4.0,4.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,5.0,3.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,4.0,5.0
188,4.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,3.0,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,3.0,3.0,5.0,4.0,3.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649257,4.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,5.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,3.0,4.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,5.0,2.0,...,3.0,3.0,3.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0,4.0,5.0,4.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0
2649267,5.0,4.0,4.0,3.0,4.0,4.0,5.0,4.0,5.0,4.0,5.0,3.0,3.0,3.0,5.0,4.0,4.0,4.0,3.0,5.0,3.0,4.0,4.0,5.0,3.0,4.0,3.0,5.0,4.0,4.0,4.0,5.0,5.0,2.0,5.0,4.0,3.0,3.0,3.0,3.0,...,4.0,5.0,4.0,4.0,5.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,5.0,3.0,4.0,4.0,4.0,3.0,4.0,4.0,5.0,3.0,4.0,3.0,4.0,5.0,4.0,4.0,5.0,4.0,3.0,5.0,4.0,4.0,3.0,3.0,3.0,5.0,4.0,4.0
2649285,3.0,2.0,2.0,4.0,2.0,2.0,2.0,3.0,2.0,1.0,3.0,3.0,2.0,4.0,4.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,4.0,4.0,2.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,4.0,3.0,4.0,3.0,4.0,...,2.0,4.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,4.0,2.0,4.0,3.0,2.0,2.0,4.0,3.0,2.0,3.0,3.0,2.0,3.0,4.0,3.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,4.0,3.0,2.0,
2649296,3.0,3.0,3.0,4.0,3.0,4.0,3.0,3.0,4.0,2.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,3.0,4.0,5.0,5.0,3.0,3.0,3.0,3.0,4.0,...,3.0,4.0,3.0,5.0,3.0,3.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,3.0,4.0,3.0,3.0,4.0,3.0,3.0,3.0,5.0,2.0,3.0,3.0,3.0,3.0,3.0,


In [None]:
# Movie-to-movie similarity: DataFrame.corr() correlates every pair of columns
# of final_UM (Pearson by default, ignoring rows where either value is NaN).
# The columns of final_UM are movie ids, so the result is movie x movie.
corrMatrix2 = final_UM.corr()

In [None]:
# Label both axes of the similarity matrix; after .corr() rows and columns are
# both movie ids, so the same axis name is used twice.
corrMatrix2.index.name = "movieId"
corrMatrix2.columns.name = "movieId"

movieId
30       1.000000
77       0.199538
83       0.284426
108      0.293205
118      0.260082
           ...   
17697    0.265051
17703    0.212356
17709    0.242375
17762    0.224124
17764    0.189952
Name: 30, Length: 1478, dtype: float64

In [None]:
def predict_score(movie_of_interest, user_of_interest,
                  num_sim_movies_to_consider=10,
                  corr_matrix=None, user_movie_matrix=None):
  """Predict a user's rating for a movie from the movies most similar to it.

  The prediction is the similarity-weighted average of the ratings the user
  gave to the `num_sim_movies_to_consider` most-correlated *other* movies
  that the user has a rating for.

  Parameters
  ----------
  movie_of_interest : column label of the target movie in `corr_matrix`.
  user_of_interest : row label of the user in `user_movie_matrix`.
  num_sim_movies_to_consider : int, maximum number of neighbours to average.
  corr_matrix : movie x movie similarity DataFrame; defaults to the
      notebook-level `corrMatrix2`.
  user_movie_matrix : user x movie rating DataFrame; defaults to the
      notebook-level `final_UM`.

  Returns
  -------
  float
      The weighted-average rating, or NaN when no rated neighbour exists or
      the similarity weights sum to zero. Callers that aggregate predictions
      should filter NaN values out.

  Raises
  ------
  ValueError
      If `num_sim_movies_to_consider` is smaller than 1.
  KeyError
      If the movie or user label is missing from the matrices.
  """
  if num_sim_movies_to_consider < 1:
    raise ValueError("num_sim_movies_to_consider must be at least 1")
  if corr_matrix is None:
    corr_matrix = corrMatrix2
  if user_movie_matrix is None:
    user_movie_matrix = final_UM

  # Remove the target movie explicitly. The previous version popped the first
  # collected entry, which discarded the best neighbour whenever the user had
  # no rating for the target movie itself.
  similarities = corr_matrix[movie_of_interest].drop(
      labels=movie_of_interest, errors='ignore')
  # Undefined (NaN) similarities would turn the whole weighted sum into NaN.
  similarities = similarities.dropna().sort_values(ascending=False)

  weighted_sum = 0.0
  weight_total = 0.0
  neighbours_used = 0
  # Series.items() replaces iteritems(), which pandas 2.0 removed.
  for other_movie, sim in similarities.items():
    rating = user_movie_matrix.at[user_of_interest, other_movie]
    if np.isnan(rating):
      continue  # the user has no rating for this neighbour
    weighted_sum += rating * sim
    weight_total += sim
    neighbours_used += 1
    if neighbours_used == num_sim_movies_to_consider:
      break

  if neighbours_used == 0 or weight_total == 0:
    return float('nan')
  return weighted_sum / weight_total
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Parameters
    ----------
    predictions, labels : iterables of numbers, paired element by element.

    Raises
    ------
    ValueError
        If the two inputs differ in length (zip would otherwise silently drop
        the unmatched tail) or if they are empty.
    """
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels differ in length: {} vs {}".format(
                len(predictions), len(labels)))
    if not predictions:
        raise ValueError("cannot compute the MSE of empty inputs")
    squared_errors = [(p - y) ** 2 for p, y in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)
    

In [None]:
# Score every (movie, user) pair of the held-out test_df and report the MSE
# against the true ratings.
rating_label = test_df['rating'].tolist()
movie_id = test_df['movieId'].tolist()
user_id = test_df['userId'].tolist()
predictions = []

# Report progress occasionally; printing a line per prediction floods the
# notebook output.
PROGRESS_EVERY = 100000
num = 0
for num, (movie, user) in enumerate(zip(movie_id, user_id), start=1):
  # The movie id is passed as a string -- presumably the matrix columns are
  # string labels; confirm against how final_UM was built.
  predictions.append(predict_score(str(movie), user))
  if num % PROGRESS_EVERY == 0:
    print('{} / {} predictions'.format(num, len(rating_label)))

print(MSE(predictions, rating_label))

In [None]:
# Re-print the MSE for the predictions/labels currently in memory, without
# re-running the prediction loop.
print(MSE(predictions, rating_label))

0.7398384046728764


## Addressing the Cold-Start Problem

In [None]:
# Preview the ratings frame `a` (the rendered output shows movieId, userId and
# rating columns).
a

Unnamed: 0,movieId,userId,rating
0,30,900816,3
1,30,1990901,4
2,30,1402412,4
3,30,1601783,3
4,30,306466,3
...,...,...,...
26077004,17764,1011294,4
26077005,17764,2549935,4
26077006,17764,2357910,4
26077007,17764,1331785,5


In [None]:
# Distinct movie ids present in `a`. Going through set() removes duplicates;
# the list order is whatever the set yields, not the order in the frame.
list_of_movies = list(set(a.movieId.values))

In [None]:
# Number of entries in `movies`.
# NOTE(review): this inspects `movies`, not the `list_of_movies` built in the
# previous cell -- confirm which one was meant.
len(movies)

1478

In [None]:
import random

# Choose the movies that will be treated as "cold" (all ratings hidden).
# A dedicated, seeded generator keeps the cold/hot split identical across
# kernel restarts; sorting first makes the draw independent of set ordering.
COLD_SAMPLE_SEED = 42
COLD_MOVIE_COUNT = 147  # roughly 10% of the movies (len(movies) printed 1478)
cold_movies = random.Random(COLD_SAMPLE_SEED).sample(
    sorted(list_of_movies), COLD_MOVIE_COUNT)

In [None]:
# Partition the ratings: rows whose movie was sampled as cold vs. all others.
# The membership mask is evaluated once and reused for both halves.
is_cold_rating = a['movieId'].isin(cold_movies)
cold_df = a[is_cold_rating]
hot_df = a[~is_cold_rating]


In [None]:
# Work on a copy so cold_df keeps the true ratings.
cold_df2 = cold_df.copy() 

In [None]:
# Hide every rating of the cold movies by overwriting the column with NaN.
cold_df2.rating = np.nan

In [None]:
# Stack the blanked cold rows on top of the untouched hot rows. No
# ignore_index is passed, so the original row labels are kept.
a2= pd.concat([cold_df2, hot_df])
a2

Unnamed: 0,movieId,userId,rating
474841,334,1128809,
474842,334,1513370,
474843,334,1778851,
474844,334,754082,
474845,334,2541216,
...,...,...,...
26077004,17764,1011294,4.0
26077005,17764,2549935,4.0
26077006,17764,2357910,4.0
26077007,17764,1331785,5.0


In [None]:
# Reshape the long ratings table into a user x movie matrix (one row per
# userId, one column per movieId). pivot_table aggregates duplicate
# (user, movie) pairs with the mean by default.
# NOTE(review): pivot_table defaults to dropna=True, which drops columns whose
# entries are all NaN. The rendered UM2 has no column for movie 334 (one of the
# blanked cold movies), which is consistent with the cold movies being dropped
# here -- confirm this is intended, or pass dropna=False to keep them.
UM2 = pd.pivot_table(a2,values='rating',index='userId',columns='movieId')

In [None]:
# Preview the pivoted user x movie matrix.
UM2

movieId,30,77,83,108,118,143,148,175,187,189,197,199,241,252,257,269,273,285,295,299,305,311,312,313,329,330,331,348,353,357,367,406,413,424,442,443,457,482,483,501,...,17295,17299,17302,17303,17308,17319,17321,17324,17330,17339,17346,17355,17358,17387,17398,17405,17424,17426,17441,17472,17479,17482,17506,17508,17526,17533,17537,17558,17560,17574,17580,17589,17621,17622,17627,17672,17692,17697,17709,17764
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
6,3.0,,,,,,,5.0,,,3.0,,3.0,,,,,,4.0,3.0,,,,,4.0,,,,,,,,,,,,5.0,,,3.0,...,5.0,,,,3.0,2.0,,3.0,,4.0,,3.0,,,,3.0,4.0,,,,4.0,,,,,,,,3.0,,3.0,,,,5.0,,4.0,,5.0,
7,5.0,,5.0,,,,,5.0,,,,,,,5.0,,4.0,5.0,,4.0,,,,5.0,4.0,,,4.0,,5.0,,,,,,,5.0,,,5.0,...,,,,,3.0,4.0,,5.0,,5.0,5.0,5.0,5.0,,,4.0,,,5.0,,5.0,3.0,,,4.0,4.0,2.0,3.0,,,,,,,,,,,,3.0
79,3.0,,,,,,1.0,,,,,,,,5.0,,,3.0,,5.0,,,1.0,,,,3.0,,,,,4.0,,,,,,,4.0,,...,,,4.0,,4.0,,,4.0,,,5.0,2.0,,,,4.0,3.0,,5.0,3.0,,4.0,2.0,,,,,,5.0,,3.0,2.0,5.0,2.0,4.0,4.0,4.0,1.0,,4.0
134,,4.0,,,5.0,5.0,5.0,,,,,,,,,,5.0,,5.0,,,,,,4.0,5.0,,,,,,,,,,,3.0,,5.0,,...,5.0,5.0,5.0,,5.0,,5.0,,5.0,,,5.0,3.0,,,5.0,,,,,5.0,5.0,,5.0,,,,5.0,5.0,,,4.0,5.0,,,,,,,
188,3.0,,,3.0,3.0,,,,,4.0,,4.0,,,,,,3.0,4.0,,,,3.0,2.0,4.0,,,,,,,,,,,,4.0,,3.0,,...,,,3.0,,3.0,3.0,,3.0,,5.0,,3.0,,,,3.0,,,,,,,,,3.0,,4.0,4.0,4.0,,3.0,,,,,,,,3.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649257,4.0,,,,,4.0,3.0,,,,3.0,,,,,,,,4.0,5.0,3.0,,3.0,3.0,3.0,,3.0,,,,,,,,,,,5.0,2.0,,...,3.0,,,4.0,4.0,,,3.0,,4.0,,,,,,,,,4.0,,3.0,,,,,,4.0,,4.0,,3.0,4.0,,4.0,4.0,,,,4.0,4.0
2649267,5.0,,,,,4.0,5.0,,,,5.0,,,4.0,5.0,,,,3.0,5.0,3.0,,,5.0,3.0,4.0,3.0,,3.0,,,,2.0,,,,,,3.0,,...,2.0,,,,4.0,3.0,4.0,5.0,,,,3.0,,3.0,,3.0,,,,,4.0,3.0,4.0,,3.0,,,,,4.0,5.0,,,5.0,4.0,,3.0,,,4.0
2649285,3.0,,,,,,2.0,,2.0,1.0,,,,,,,,,,5.0,,,5.0,4.0,4.0,2.0,5.0,,,,,,,,,,,3.0,,,...,,,4.0,,3.0,,2.0,4.0,,,,3.0,,4.0,,4.0,,3.0,2.0,,,3.0,2.0,,,,,,3.0,,,3.0,,3.0,3.0,,2.0,,,4.0
2649296,3.0,4.0,,,,4.0,,3.0,,2.0,3.0,,,,,,,,,3.0,,,,4.0,4.0,3.0,,,,4.0,,,,,,,3.0,,4.0,,...,4.0,,,,3.0,,,4.0,,5.0,,3.0,,,,3.0,,3.0,3.0,,4.0,,,,,,,,4.0,,,3.0,3.0,,5.0,,3.0,,,


In [None]:
# Preview the movie metadata frame; its `features` column is the text that is
# vectorised in the next cell.
movies

Unnamed: 0,movieId,original_title,features
31891,30,Something's Gotta Give,Comedy Drama Romance Nancy Meyers Jack Nichols...
41640,77,Congo,Action Adventure Mystery Frank Marshall Laura ...
52389,83,Silkwood,Biography Drama History Mike Nichols Meryl Str...
60360,108,Spartan,Action Crime Drama David Mamet Tia Texada Dere...
73110,118,Rambo: First Blood Part II,Action Adventure Thriller George P. Cosmatos S...
...,...,...,...
25994418,17697,New York Minute,Comedy Crime Family Dennie Gordon Ashley Olsen...
26009297,17703,Hulk,Action Sci-Fi Ang Lee Eric Bana Jennifer Conne...
26031466,17709,A River Runs Through It,Drama Robert Redford Craig Sheffer Brad Pitt T...
26049507,17762,Gattaca,Drama Sci-Fi Thriller Andrew Niccol Ethan Hawk...


In [None]:
from sklearn.naive_bayes import MultinomialNB
import scipy
from sklearn.feature_extraction.text import CountVectorizer

# from sklearn.metrics import accuracy_score

def delete_rows_csr(mat, indices):
    """Return a copy of the CSR matrix `mat` without the rows in `indices`.

    `indices` is any iterable of row positions. A ValueError is raised when
    `mat` is not a `scipy.sparse.csr_matrix`.
    """
    is_csr = isinstance(mat, scipy.sparse.csr_matrix)
    if not is_csr:
        raise ValueError("works only for CSR format -- use .tocsr() first")
    rows_to_drop = list(indices)
    # Start by keeping every row, then switch off the ones to remove.
    keep_row = np.full(mat.shape[0], True, dtype=bool)
    keep_row[rows_to_drop] = False
    return mat[keep_row]

# Content-based imputation of missing ratings.
# For each user, fit a Multinomial Naive Bayes classifier on the bag-of-words
# features of the movies that user rated (class label = the rating) and use it
# to fill in that user's unrated movies in final_UM.
# NOTE(review): this overwrites final_UM in place, so re-running the cell on an
# already-filled matrix finds nothing left to impute.
MAX_USERS_TO_FILL = 10000  # only the first 10000 rows of final_UM are imputed
PROGRESS_EVERY = 500       # print progress occasionally, not once per user

features = list(movies.features)
movieId = list(final_UM.columns)
# Ratings and feature rows are paired by position, so the two must line up.
# NOTE(review): equal length is checked here; that `movies` is in the same
# order as the columns of final_UM is assumed -- confirm.
if len(features) != len(movieId):
    raise ValueError(
        "movies has {} feature rows but final_UM has {} movie columns".format(
            len(features), len(movieId)))

# Bag-of-words matrix, one row per movie; terms present in fewer than 10
# feature strings are ignored (min_df=10).
vect = CountVectorizer(min_df=10)
x_bow = vect.fit_transform(features)

num = 0
for row_pos, (index, row) in enumerate(final_UM.iterrows()):
    if row_pos >= MAX_USERS_TO_FILL:
        break
    num = row_pos + 1
    if num % PROGRESS_EVERY == 0:
        print('{} users processed'.format(num))

    ratings = row.to_numpy(dtype=float)
    unrated = np.isnan(ratings)
    # Nothing to fill, or nothing to learn from: leave this user untouched.
    if not unrated.any() or unrated.all():
        continue
    rated_pos = np.flatnonzero(~unrated)
    unrated_pos = np.flatnonzero(unrated)

    clf = MultinomialNB()
    clf.fit(x_bow[rated_pos], ratings[rated_pos])
    # Predict all of this user's missing movies in one call and write them back
    # with a single positional assignment. The earlier chained form
    # final_UM[movie][index] = result can write to a temporary object and
    # stored a one-element array instead of a scalar.
    predicted = clf.predict(x_bow[unrated_pos])
    final_UM.iloc[row_pos, unrated_pos] = predicted

In [None]:
# Recompute the movie x movie correlation matrix from final_UM, which the
# imputation cell above modified by writing predicted ratings into it.
corrMatrix3 = final_UM.corr()

In [None]:
def predict_score(movie_of_interest, user_of_interest,
                  num_sim_movies_to_consider=10,
                  corr_matrix=None, user_movie_matrix=None):
  """Predict a user's rating for a movie from the movies most similar to it.

  This definition replaces the earlier `predict_score` in the notebook; by
  default it reads similarities from `corrMatrix3` instead of `corrMatrix2`.

  The prediction is the similarity-weighted average of the ratings the user
  gave to the `num_sim_movies_to_consider` most-correlated *other* movies
  that the user has a rating for.

  Parameters
  ----------
  movie_of_interest : column label of the target movie in `corr_matrix`.
  user_of_interest : row label of the user in `user_movie_matrix`.
  num_sim_movies_to_consider : int, maximum number of neighbours to average.
  corr_matrix : movie x movie similarity DataFrame; defaults to the
      notebook-level `corrMatrix3`.
  user_movie_matrix : user x movie rating DataFrame; defaults to the
      notebook-level `final_UM`.

  Returns
  -------
  float
      The weighted-average rating, or NaN when no rated neighbour exists or
      the similarity weights sum to zero. Callers that aggregate predictions
      should filter NaN values out.

  Raises
  ------
  ValueError
      If `num_sim_movies_to_consider` is smaller than 1.
  KeyError
      If the movie or user label is missing from the matrices.
  """
  if num_sim_movies_to_consider < 1:
    raise ValueError("num_sim_movies_to_consider must be at least 1")
  if corr_matrix is None:
    corr_matrix = corrMatrix3
  if user_movie_matrix is None:
    user_movie_matrix = final_UM

  # Remove the target movie explicitly. The previous version popped the first
  # collected entry, which discarded the best neighbour whenever the user had
  # no rating for the target movie itself.
  similarities = corr_matrix[movie_of_interest].drop(
      labels=movie_of_interest, errors='ignore')
  # Undefined (NaN) similarities would turn the whole weighted sum into NaN.
  similarities = similarities.dropna().sort_values(ascending=False)

  weighted_sum = 0.0
  weight_total = 0.0
  neighbours_used = 0
  # Series.items() replaces iteritems(), which pandas 2.0 removed.
  for other_movie, sim in similarities.items():
    rating = user_movie_matrix.at[user_of_interest, other_movie]
    if np.isnan(rating):
      continue  # the user has no rating for this neighbour
    weighted_sum += rating * sim
    weight_total += sim
    neighbours_used += 1
    if neighbours_used == num_sim_movies_to_consider:
      break

  if neighbours_used == 0 or weight_total == 0:
    return float('nan')
  return weighted_sum / weight_total
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Parameters
    ----------
    predictions, labels : iterables of numbers, paired element by element.

    Raises
    ------
    ValueError
        If the two inputs differ in length (zip would otherwise silently drop
        the unmatched tail) or if they are empty.
    """
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels differ in length: {} vs {}".format(
                len(predictions), len(labels)))
    if not predictions:
        raise ValueError("cannot compute the MSE of empty inputs")
    squared_errors = [(p - y) ** 2 for p, y in zip(predictions, labels)]
    return sum(squared_errors) / len(squared_errors)

In [None]:
# Preview the cold-movie rows with their true ratings; these serve as the
# labels in the evaluation cell below.
cold_df


Unnamed: 0,movieId,userId,rating
474841,334,1128809,5
474842,334,1513370,4
474843,334,1778851,5
474844,334,754082,3
474845,334,2541216,4
...,...,...,...
26049503,17762,2646249,3
26049504,17762,1272199,5
26049505,17762,633738,1
26049506,17762,1331785,3


In [None]:
# Score every (movie, user) pair of the cold-start rows and report the MSE
# against the true ratings kept in cold_df.
rating_label = cold_df['rating'].tolist()
movie_id = cold_df['movieId'].tolist()
user_id = cold_df['userId'].tolist()
predictions = []
print(len(rating_label))

# Report progress occasionally; printing a line per prediction produced
# millions of lines and made Colab truncate the streamed output.
PROGRESS_EVERY = 100000
num = 0
for num, (movie, user) in enumerate(zip(movie_id, user_id), start=1):
  # The movie id is passed as a string -- presumably the matrix columns are
  # string labels; confirm against how final_UM was built.
  predictions.append(predict_score(str(movie), user))
  if num % PROGRESS_EVERY == 0:
    print('{} / {} predictions'.format(num, len(rating_label)))

print(MSE(predictions, rating_label))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2418110
2418111
2418112
2418113
2418114
2418115
2418116
2418117
2418118
2418119
2418120
2418121
2418122
2418123
2418124
2418125
2418126
2418127
2418128
2418129
2418130
2418131
2418132
2418133
2418134
2418135
2418136
2418137
2418138
2418139
2418140
2418141
2418142
2418143
2418144
2418145
2418146
2418147
2418148
2418149
2418150
2418151
2418152
2418153
2418154
2418155
2418156
2418157
2418158
2418159
2418160
2418161
2418162
2418163
2418164
2418165
2418166
2418167
2418168
2418169
2418170
2418171
2418172
2418173
2418174
2418175
2418176
2418177
2418178
2418179
2418180
2418181
2418182
2418183
2418184
2418185
2418186
2418187
2418188
2418189
2418190
2418191
2418192
2418193
2418194
2418195
2418196
2418197
2418198
2418199
2418200
2418201
2418202
2418203
2418204
2418205
2418206
2418207
2418208
2418209
2418210
2418211
2418212
2418213
2418214
2418215
2418216
2418217
2418218
2418219
2418220
2418221
2418222
2418223
2418224
2418225
2418226