In [1]:
import pandas as pd
import pickle
import numpy as np

from sklearn.impute import SimpleImputer

from sklearn.decomposition import NMF

# ignore NMF warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
with open('./data/full_df.bin', 'rb') as f:
    df = pickle.load(f)

In [3]:
with open('./data/movie_dict.bin', 'rb') as f:
    movie_dict = pickle.load(f)

In [4]:
movie_titles = [i for i in movie_dict.keys()]

In [5]:
df.shape

(61559, 13)

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating,title,year,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7
0,1,1,4.0,Toy Story (1995),1995.0,Adventure,Animation,Children,Comedy,Fantasy,,,
1,5,1,4.0,Toy Story (1995),1995.0,Adventure,Animation,Children,Comedy,Fantasy,,,
2,7,1,4.5,Toy Story (1995),1995.0,Adventure,Animation,Children,Comedy,Fantasy,,,
3,15,1,2.5,Toy Story (1995),1995.0,Adventure,Animation,Children,Comedy,Fantasy,,,
4,17,1,4.5,Toy Story (1995),1995.0,Adventure,Animation,Children,Comedy,Fantasy,,,


In [7]:
# Create a dataframe with only the relevant columns
df_long = df[["userId", "movieId", "rating"]]
df_long

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5
...,...,...,...
100825,610,147657,4.0
100826,610,147662,3.0
100830,610,158721,3.5
100831,610,160341,2.5


In [8]:
# Turn long into wide format
ratings = df_long.pivot(index="userId", columns='movieId', values='rating')
ratings.columns = movie_titles
ratings.head(5)

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,The Shining (1997),Serving in Silence: The Margarethe Cammermeyer Story (1995),Hare-um Scare-um (1939),Porky in Wackyland (1938),Porky's Hare Hunt (1938),The Tale of the Bunny Picnic (1986),Patti Rocks (1988),De platte jungle (1978),Bunny (1998),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [9]:
ratings.shape

(610, 4927)

In [10]:
ratings = ratings.transpose().fillna(round(ratings.mean(axis=1), 1)).transpose().head()
ratings

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,The Shining (1997),Serving in Silence: The Margarethe Cammermeyer Story (1995),Hare-um Scare-um (1939),Porky in Wackyland (1938),Porky's Hare Hunt (1938),The Tale of the Bunny Picnic (1986),Patti Rocks (1988),De platte jungle (1978),Bunny (1998),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,4.4,4.0,4.4,4.4,4.0,4.4,4.4,4.4,4.4,...,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4,4.4
2,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,...,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8,3.8
3,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,...,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6,2.6
4,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,...,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6
5,4.0,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,...,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6,3.6


imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
ratings = pd.DataFrame(imputer.fit_transform(ratings.transpose()).transpose(),
                       index = ratings.index,
                       columns = ratings.columns)
ratings.head(5)

In [11]:
n_components = 25
nmf= NMF(n_components=n_components)

In [12]:
nmf.fit(ratings)

In [13]:
# Extract Q
Q = nmf.components_
Q

array([[0.38264544, 0.28738154, 0.44836428, ..., 0.48746363, 0.43099172,
        0.16334102],
       [0.52077973, 0.40391372, 0.27268162, ..., 0.07355028, 0.24215644,
        0.0559164 ],
       [0.60805014, 0.12269384, 0.18390977, ..., 0.17252792, 0.17861691,
        0.81494756],
       ...,
       [0.23788565, 0.26582218, 0.44624525, ..., 0.35954545, 0.26738605,
        0.50337099],
       [0.21961067, 0.99457897, 0.41154011, ..., 0.27363405, 0.30276069,
        0.21785214],
       [0.01233965, 0.34832839, 0.22528572, ..., 0.36046389, 0.70103534,
        0.85443067]])

In [14]:
# Turn Q into a dataframe
Q = pd.DataFrame(Q,
                 columns=movie_titles,
                 index=[f"feature_{i+1}" for i in range(n_components)])
Q

Unnamed: 0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,The Shining (1997),Serving in Silence: The Margarethe Cammermeyer Story (1995),Hare-um Scare-um (1939),Porky in Wackyland (1938),Porky's Hare Hunt (1938),The Tale of the Bunny Picnic (1986),Patti Rocks (1988),De platte jungle (1978),Bunny (1998),Andrew Dice Clay: Dice Rules (1991)
feature_1,0.382645,0.287382,0.448364,0.22437,0.227458,0.509375,0.246373,0.285215,0.101983,0.369596,...,0.240391,0.169799,0.291376,0.477693,0.371999,0.065514,0.361298,0.487464,0.430992,0.163341
feature_2,0.52078,0.403914,0.272682,0.771089,0.492445,0.215784,0.136226,0.055156,0.536353,0.210549,...,0.331934,0.20799,0.410192,0.12057,0.395669,0.311979,0.470129,0.07355,0.242156,0.055916
feature_3,0.60805,0.122694,0.18391,0.078283,0.186876,0.118873,0.311827,0.200991,0.245017,0.05931,...,0.071951,0.129043,0.291744,0.43431,0.115185,0.412094,0.451387,0.172528,0.178617,0.814948
feature_4,0.48695,0.337745,0.572718,0.198388,0.645183,0.14241,0.531345,0.407286,0.196655,0.424492,...,0.518111,0.852065,0.063392,0.555468,0.352993,0.162893,0.268368,0.059326,0.610005,0.312416
feature_5,0.139047,0.781183,0.025323,0.435606,0.355391,0.141394,0.487792,0.456961,0.159717,0.093143,...,0.550092,0.517414,0.397749,0.155595,0.595765,0.882777,0.30705,0.00054,0.659701,0.270853
feature_6,0.000288,0.277792,0.228893,0.377629,0.220027,0.246261,0.166764,0.580923,0.303969,0.112368,...,0.253989,0.105103,0.436417,0.527701,0.367676,0.354679,0.208773,0.303638,0.626047,0.551324
feature_7,0.334018,0.231301,0.164655,0.068124,0.440894,0.298034,0.131926,0.380183,0.227521,0.117698,...,0.130483,0.634555,0.447851,0.078946,0.039776,0.113044,0.300546,0.346995,0.030083,0.317343
feature_8,0.167133,0.0,0.236878,0.368277,0.366545,0.099992,0.150948,0.231618,0.388301,0.093177,...,0.313284,0.142085,0.279168,0.286512,0.186559,0.077821,0.259009,0.50901,0.23551,0.596241
feature_9,0.416542,0.208395,0.144764,0.090348,0.700007,0.457786,0.474154,0.302415,0.269666,0.37566,...,0.110087,0.240579,0.391262,0.463315,0.528924,0.64793,0.056256,0.314788,0.349557,0.718729
feature_10,0.035045,0.563108,0.074044,0.305237,0.633608,0.38444,0.5364,0.19619,0.414464,0.013467,...,0.288034,0.190666,0.622315,0.01742,0.496888,0.722761,0.126065,0.514951,0.390399,0.453157


In [15]:
P = pd.DataFrame(nmf.transform(ratings),
                 index=ratings.index,
                 columns = [f"feature_{i+1}" for i in range(n_components)])
P

Unnamed: 0_level_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.491982,1.639716,0.804497,0.345405,0.563055,0.261429,0.37705,0.450626,0.065082,0.38938,...,0.536206,0.531437,0.210139,0.326999,0.566973,0.234711,0.165367,0.105238,0.412285,0.700406
2,3.07777,1.280821,0.515814,0.413292,0.495588,0.247416,0.622056,0.550394,0.353836,0.190495,...,0.33534,0.293014,0.234797,0.348079,0.147277,0.121166,0.387306,0.146761,0.470815,0.398107
3,1.799992,0.774008,0.204171,0.508865,0.397011,0.293425,0.160608,0.289412,0.180267,0.387896,...,0.0,0.243013,0.299826,0.3754,0.0,0.480842,0.33109,0.0,0.256058,0.210921
4,2.309115,0.855788,0.68067,0.714586,0.141401,0.785525,0.318489,0.069325,0.000158,0.750341,...,0.380996,0.740775,0.354653,0.519323,0.203418,0.00247,0.374639,0.133419,0.4547,0.0
5,3.406791,1.611993,0.907021,0.406843,0.352314,0.080097,0.340983,0.11005,0.426617,0.309419,...,0.066405,0.262039,0.515079,0.269563,0.194577,0.113774,0.152567,0.520395,0.311107,0.163435


In [16]:
# E.9 Reconstructing R - (Finding r_hat)
r_hat = pd.DataFrame(np.dot(P, Q),
                     index=ratings.index,
                     columns=movie_titles)
r_hat

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,The Shining (1997),Serving in Silence: The Margarethe Cammermeyer Story (1995),Hare-um Scare-um (1939),Porky in Wackyland (1938),Porky's Hare Hunt (1938),The Tale of the Bunny Picnic (1986),Patti Rocks (1988),De platte jungle (1978),Bunny (1998),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.997233,4.399687,3.999246,4.39952,4.399662,3.999559,4.400299,4.400268,4.400421,4.401311,...,4.400125,4.400163,4.399854,4.39997,4.399567,4.400502,4.399381,4.40027,4.399068,4.400226
2,3.801689,3.79984,3.799749,3.800134,3.801249,3.800434,3.800165,3.799689,3.799224,3.799287,...,3.799797,3.800276,3.79999,3.80007,3.800262,3.799848,3.800553,3.799663,3.800105,3.799917
3,2.59955,2.600366,2.600345,2.600344,2.599572,2.599922,2.600185,2.599952,2.600321,2.600455,...,2.600473,2.600249,2.599749,2.599661,2.600099,2.600049,2.599909,2.600128,2.59997,2.599428
4,3.599722,3.599743,3.599845,3.600011,3.600536,3.600165,3.600167,3.600125,3.599948,3.599566,...,3.600054,3.600338,3.600022,3.599884,3.599991,3.600219,3.599909,3.599923,3.600388,3.600327
5,3.999349,3.599741,3.59948,3.599778,3.600302,3.599968,3.600438,3.600175,3.600128,3.60058,...,3.600035,3.600385,3.59981,3.599912,3.599828,3.600311,3.599721,3.600082,3.599486,3.600089


In [17]:
# E.10 Calculate the differences between 𝑅 and r_hat
abs(ratings - r_hat)

Unnamed: 0_level_0,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,The Shining (1997),Serving in Silence: The Margarethe Cammermeyer Story (1995),Hare-um Scare-um (1939),Porky in Wackyland (1938),Porky's Hare Hunt (1938),The Tale of the Bunny Picnic (1986),Patti Rocks (1988),De platte jungle (1978),Bunny (1998),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.002767,0.000313,0.000754,0.00048,0.000338,0.000441,0.000299,0.000268,0.000421,0.001311,...,0.000125,0.000163,0.000146,3e-05,0.000433,0.000502,0.000619,0.00027,0.000932,0.000226
2,0.001689,0.00016,0.000251,0.000134,0.001249,0.000434,0.000165,0.000311,0.000776,0.000713,...,0.000203,0.000276,1e-05,7e-05,0.000262,0.000152,0.000553,0.000337,0.000105,8.3e-05
3,0.00045,0.000366,0.000345,0.000344,0.000428,7.8e-05,0.000185,4.8e-05,0.000321,0.000455,...,0.000473,0.000249,0.000251,0.000339,9.9e-05,4.9e-05,9.1e-05,0.000128,3e-05,0.000572
4,0.000278,0.000257,0.000155,1.1e-05,0.000536,0.000165,0.000167,0.000125,5.2e-05,0.000434,...,5.4e-05,0.000338,2.2e-05,0.000116,9e-06,0.000219,9.1e-05,7.7e-05,0.000388,0.000327
5,0.000651,0.000259,0.00052,0.000222,0.000302,3.2e-05,0.000438,0.000175,0.000128,0.00058,...,3.5e-05,0.000385,0.00019,8.8e-05,0.000172,0.000311,0.000279,8.2e-05,0.000514,8.9e-05


In [18]:
# Extract the reconstruction error
nmf.reconstruction_err_

0.19893167070822423

In [19]:
with open('./data/nmf.sav', 'wb') as f:
    pickle.dump(nmf, f)

In [20]:
with open('./data/ratings.sav', 'wb') as f:
    pickle.dump(ratings, f)