# Matrix completion with Netflix rating data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load data
data = pd.read_csv('archive/Netflix_Dataset_Rating.csv')

In [3]:
ids = data.User_ID.unique()

import random
random.seed(42)

ids = random.sample(list(ids), 1000)
data = data[data.User_ID.isin(ids)]

In [4]:
# Data into a dataframe
df = pd.pivot(data=data, index='User_ID', columns='Movie_ID', values='Rating')

In [5]:
# count missing values
print(f'Number of missing values: {df.isnull().sum().sum()}')
print(f'Observing probability: {1- df.isnull().sum().sum() / (df.shape[0] * df.shape[1])}')

Number of missing values: 1228821
Observing probability: 0.08976222222222219


In [16]:
# Inverse probability weighting
p = 0.1
r = 1

M_star = df.to_numpy()

In [17]:
P_M_star = df.fillna(0).values

In [18]:
M = (1 / p) * P_M_star

In [19]:
# rank r=5 SVD of M
U, Sigma, V_T = np.linalg.svd(M, full_matrices=False)

U = U[:, :r]
Sigma = Sigma[:r]
V = V_T.T
V = V[:, :r]

In [20]:
# Reconstruction
M = U @ np.diag(Sigma) @ V.T

In [22]:
df

Movie_ID,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9597,,,,3.0,,,5.0,4.0,,,...,,,,,,,,,,
11186,,,,,,,,4.0,,,...,,,,,,,,,,
11502,,,,,,,5.0,,,,...,,,,,,,,,,
13061,,,,,5.0,,3.0,,,,...,,,,,,4.0,,,,
19026,,,,,,,,5.0,,,...,5.0,,,,,5.0,,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2638967,,,,,,,5.0,5.0,,,...,2.0,,,,,4.0,,,,
2641316,,,,,,,,,,,...,,,,4.0,,4.0,,,,
2642897,,,,,,,,,,,...,,,,,,,,,,
2647351,,,,,,,,3.0,,5.0,...,3.0,,,5.0,,4.0,,,,


In [23]:
# Prediction output

pd.DataFrame(M, index=df.index, columns=df.columns)

Movie_ID,3,8,16,17,18,26,28,30,32,33,...,4472,4474,4478,4479,4485,4488,4490,4492,4493,4496
User_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9597,0.466357,2.480216,1.046366,1.367686,3.633661,1.137234,12.532817,26.392558,0.866414,2.100113,...,23.433076,1.571947,1.025750,8.627691,1.660625,11.194300,2.357783,2.184228,1.556859,2.654256
11186,0.447846,2.381770,1.004833,1.313398,3.489431,1.092094,12.035355,25.344964,0.832024,2.016754,...,22.502952,1.509552,0.985035,8.285234,1.594710,10.749967,2.264196,2.097530,1.495063,2.548902
11502,0.169256,0.900151,0.379760,0.496378,1.318774,0.412739,4.548566,9.578716,0.314450,0.762199,...,8.504624,0.570511,0.372278,3.131269,0.602695,4.062775,0.855716,0.792727,0.565035,0.963316
13061,0.261381,1.390098,0.586462,0.766553,2.036574,0.637391,7.024324,14.792354,0.485603,1.177060,...,13.133640,0.881036,0.574907,4.835600,0.930738,6.274119,1.321477,1.224204,0.872580,1.487643
19026,0.310651,1.652129,0.697009,0.911047,2.420465,0.757538,8.348397,17.580689,0.577138,1.398934,...,15.609310,1.047110,0.683275,5.747103,1.106180,7.456780,1.570574,1.454965,1.037059,1.768061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2638967,0.293964,1.563380,0.659567,0.862107,2.290443,0.716845,7.899939,16.636291,0.546136,1.323786,...,14.770810,0.990861,0.646571,5.438380,1.046759,7.056217,1.486206,1.376807,0.981351,1.673085
2641316,0.306514,1.630127,0.687726,0.898914,2.388231,0.747449,8.237218,17.346559,0.569452,1.380303,...,15.401434,1.033165,0.674176,5.670566,1.091449,7.357475,1.549658,1.435588,1.023248,1.744515
2642897,0.121453,0.645919,0.272503,0.356184,0.946308,0.296168,3.263901,6.873371,0.225639,0.546929,...,6.102638,0.409380,0.267134,2.246895,0.432474,2.915313,0.614034,0.568835,0.405450,0.691244
2647351,0.338667,1.801127,0.759869,0.993210,2.638756,0.825857,9.101302,19.166213,0.629188,1.525097,...,17.017043,1.141544,0.744897,6.265408,1.205942,8.129274,1.712217,1.586181,1.130587,1.927515


### >> How to handle the overflow error ? 