In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD, randomized_svd

In [2]:
sub_df = pd.read_csv('Datasets/small_ratings.csv')
wide_df = sub_df.pivot(index='userId', columns='movieId', values='rating')
wide_df

movieId,1,2,3,8,10,11,16,19,21,22,...,128832,129354,132660,133419,134368,134393,134853,135887,139747,149354
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35,,,,,,4.0,,,5.0,,...,,,,,,,,,,
46,5.0,,,,3.0,,,,,,...,,,,,,,,,,
226,3.5,3.0,3.5,,4.0,,4.5,3.5,,2.0,...,,,,,,,,,,
390,,,,,,,,,,,...,,,,,,,,,,
405,,,,,,,4.0,,,,...,,,,,,,,,,
432,2.5,4.0,,,,,,2.0,,,...,,,,,,,,,,
435,,,,,,,,,,,...,,,,,,,,,,
436,4.0,4.0,,3.0,,,,,,,...,,,,,,,,,,
563,,2.5,,,,,,,,,...,3.5,3.0,3.5,,4.5,4.5,4.0,3.5,3.5,
583,,,,,,,,,,,...,,,,4.0,,,,,,4.0


In [3]:
col_means = np.matrix(wide_df.mean().values)
row_means = np.matrix(wide_df.mean(axis=1).values).T
# Creating custom values to fill NaN in wide_df
fill_matrix = np.dot(row_means, col_means)
na_indx = wide_df.isna()
wide_array = np.array(wide_df)
fill_sub = np.array(fill_matrix[na_indx]).reshape(-1)
wide_array[na_indx] = fill_sub
wide_array

array([[15.32608696, 13.79347826, 14.30434783, ..., 14.30434783,
        14.30434783, 16.34782609],
       [ 5.        , 13.5       , 14.        , ..., 14.        ,
        14.        , 16.        ],
       [ 3.5       ,  3.        ,  3.5       , ..., 12.16715976,
        12.16715976, 13.90532544],
       ...,
       [ 4.        ,  4.        , 11.12735849, ..., 11.12735849,
        11.12735849, 12.71698113],
       [12.37676056,  2.5       , 11.55164319, ...,  3.5       ,
         3.5       , 13.20187793],
       [12.35491071, 11.11941964, 11.53125   , ..., 11.53125   ,
        11.53125   ,  4.        ]])

In [4]:
Z = wide_array
r = 20
svd=TruncatedSVD(n_components=r)
svd.fit(Z)
Sigma2=np.diag(svd.singular_values_)
VT=svd.components_
W=svd.transform(Z) / svd.singular_values_
H = np.dot(Sigma2,VT)

In [5]:
U, Sigma, VT = randomized_svd(Z,
                              n_components=15,
                              n_iter=5,
                              random_state=None)

In [6]:
np.dot(np.dot(U, np.diag(Sigma)), VT)

array([[15.32608696, 13.79347826, 14.30434783, ..., 14.30434783,
        14.30434783, 16.34782609],
       [ 5.        , 13.5       , 14.        , ..., 14.        ,
        14.        , 16.        ],
       [ 3.5       ,  3.        ,  3.5       , ..., 12.16715976,
        12.16715976, 13.90532544],
       ...,
       [ 4.        ,  4.        , 11.12735849, ..., 11.12735849,
        11.12735849, 12.71698113],
       [12.37676056,  2.5       , 11.55164319, ...,  3.5       ,
         3.5       , 13.20187793],
       [12.35491071, 11.11941964, 11.53125   , ..., 11.53125   ,
        11.53125   ,  4.        ]])

In [7]:
sub_df.groupby('userId').size().reset_index(name='counts')

Unnamed: 0,userId,counts
0,35,23
1,46,42
2,226,507
3,390,81
4,405,120
5,432,260
6,435,42
7,436,106
8,563,213
9,583,56


In [34]:
from sklearn.model_selection import train_test_split

xtrain, xtest = train_test_split(sub_df, train_size=0.90, stratify=sub_df['userId'])

In [35]:
xtrain.groupby('userId').size().reset_index(name='counts')
xtest.groupby('userId').size().reset_index(name='counts')
xtrain.drop('Unnamed: 0', inplace=True, axis=1)
xtest.drop('Unnamed: 0', inplace=True, axis=1)

In [36]:
train_wide = xtrain.pivot(index='userId', columns='movieId', values='rating')
test_wide = xtest.pivot(index='userId', columns='movieId', values='rating')
test_wide.sort_index(axis=1, inplace=True)
# test_array = np.array(test_wide.to_records(index=False))
# test_array = np.array(test_wide)

In [38]:
                                     ## Rearranging training and test data sets to have columns from the whole data set ##
# Movie ids in the whole dataset
movies_ids = np.unique(sub_df.movieId)
# Movie ids in the training dataset
movies_ids_train = np.unique(xtrain.movieId)
# Movie ids that are not in training data
movies_ids_miss = np.setdiff1d(movies_ids, movies_ids_train)
# Adding columns with movies not in train set but which appear in the whole data set
nas_array = np.empty((train_wide.shape[0], movies_ids_miss.shape[0],))
nas_array[:] = np.nan
missing_df = pd.DataFrame(nas_array)
missing_df.columns = movies_ids_miss
missing_df.index = train_wide.index
train_df = pd.concat([train_wide, missing_df], axis=1)
# Sorting column names to control positions of movieId
train_df.sort_index(axis=1, inplace=True)

# Same for test
movies_ids_test = np.unique(xtest.movieId)
# Movie ids that are not in training data
movies_ids_miss_test = np.setdiff1d(movies_ids, movies_ids_test)
# Adding columns with movies not in train set but which appear in the whole data set
nas_array2 = np.empty((test_wide.shape[0], movies_ids_miss_test.shape[0],))
nas_array2[:] = np.nan
missing_df2 = pd.DataFrame(nas_array2)
missing_df2.columns = movies_ids_miss_test
missing_df2.index = train_wide.index
test_df = pd.concat([test_wide, missing_df2], axis=1)
# Sorting column names to control positions of movieId
test_df.sort_index(axis=1, inplace=True)
test_array = np.array(test_df)

In [None]:
                                                                      ## Inputation ##
train_df.fillna(0, inplace=True)
# train_array= np.array(train_df.to_records(index=False))
train_array= np.array(train_df)

In [28]:
train_array

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ],
       [3.5, 3. , 3.5, ..., 0. , 0. , 0. ],
       ...,
       [4. , 4. , 0. , ..., 0. , 0. , 0. ],
       [0. , 2.5, 0. , ..., 3.5, 3.5, 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 4. ]])

In [29]:
r = 5
svd = TruncatedSVD(n_components=r)
svd.fit(train_array)
Sigma2=np.diag(svd.singular_values_)
VT=svd.components_
W=svd.transform(train_array) / svd.singular_values_
H = np.dot(Sigma2,VT)
Z_tilde = np.dot(W, H)

In [42]:
diff = test_array - Z_tilde
num_of_vals = (~np.isnan(diff)).sum()
# Delete not NaN values to further summing
diff= diff[~np.isnan(diff)]
RMSE = np.sqrt(1 / np.abs(num_of_vals) * ((diff ** 2).sum()))

In [45]:
RMSE

1.7440056672097703