In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD, randomized_svd
from sklearn.model_selection import train_test_split

In [2]:
# Read the long-format ratings table; the rows carry at least userId, movieId and rating.
ratings_path = 'Datasets/small_ratings.csv'
sub_df = pd.read_csv(ratings_path)
# Reshape to one row per user and one column per movie; user/movie pairs that
# have no rating in the file become NaN.
wide_df = pd.pivot(sub_df, index='userId', columns='movieId', values='rating')
wide_df

movieId,1,2,3,8,10,11,16,19,21,22,...,128832,129354,132660,133419,134368,134393,134853,135887,139747,149354
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35,,,,,,4.0,,,5.0,,...,,,,,,,,,,
46,5.0,,,,3.0,,,,,,...,,,,,,,,,,
226,3.5,3.0,3.5,,4.0,,4.5,3.5,,2.0,...,,,,,,,,,,
390,,,,,,,,,,,...,,,,,,,,,,
405,,,,,,,4.0,,,,...,,,,,,,,,,
432,2.5,4.0,,,,,,2.0,,,...,,,,,,,,,,
435,,,,,,,,,,,...,,,,,,,,,,
436,4.0,4.0,,3.0,,,,,,,...,,,,,,,,,,
563,,2.5,,,,,,,,,...,3.5,3.0,3.5,,4.5,4.5,4.0,3.5,3.5,
583,,,,,,,,,,,...,,,,4.0,,,,,,4.0


In [3]:
# Impute every missing rating with (that user's mean rating) * (that movie's mean rating).
# Plain 2-D ndarrays are used instead of np.matrix, which NumPy no longer
# recommends; the row-vector / column-vector shapes are kept so the product
# below is still a users x movies array.
col_means = wide_df.mean().to_numpy()[np.newaxis, :]        # shape (1, n_movies)
row_means = wide_df.mean(axis=1).to_numpy()[:, np.newaxis]  # shape (n_users, 1)
# Creating custom values to fill NaN in wide_df
# NOTE(review): the product of two mean ratings is on a squared scale (the
# displayed fills are ~11-16 while observed ratings are <= 5) -- confirm this
# is the intended imputation before trusting the downstream factorisation.
fill_matrix = np.dot(row_means, col_means)
na_indx = wide_df.isna()
# Explicit boolean ndarray so NumPy indexing does not depend on how a DataFrame
# is coerced when used as an index.
na_mask = na_indx.to_numpy()
# Work on a float copy so wide_df itself keeps its NaNs.
wide_array = wide_df.to_numpy(dtype=float, copy=True)
fill_sub = fill_matrix[na_mask]   # 1-D, row-major order of the missing cells
wide_array[na_mask] = fill_sub
wide_array

array([[15.32608696, 13.79347826, 14.30434783, ..., 14.30434783,
        14.30434783, 16.34782609],
       [ 5.        , 13.5       , 14.        , ..., 14.        ,
        14.        , 16.        ],
       [ 3.5       ,  3.        ,  3.5       , ..., 12.16715976,
        12.16715976, 13.90532544],
       ...,
       [ 4.        ,  4.        , 11.12735849, ..., 11.12735849,
        11.12735849, 12.71698113],
       [12.37676056,  2.5       , 11.55164319, ...,  3.5       ,
         3.5       , 13.20187793],
       [12.35491071, 11.11941964, 11.53125   , ..., 11.53125   ,
        11.53125   ,  4.        ]])

In [4]:
# Truncated SVD of the imputed rating matrix, arranged as Z ~= W @ H.
Z = wide_array
r = 20
# A matrix has at most min(n_rows, n_cols) singular triplets, so cap the
# requested rank at that bound instead of relying on the solver to return fewer.
n_components = min(r, min(Z.shape))
# Fixed random_state: the default solver is randomized, so an unseeded fit is
# not repeatable across kernel restarts.
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(Z)
# Diagonal matrix of the singular values (not squared, despite the name).
Sigma2 = np.diag(svd.singular_values_)
# Right singular vectors, one component per row.
VT = svd.components_
# Projection of Z onto the components, rescaled column-wise by the singular values.
W = svd.transform(Z) / svd.singular_values_
H = np.dot(Sigma2, VT)

In [5]:
# Randomized SVD of Z through the function-level API, returning the three
# factors directly. Note that this rebinds VT.
# The rank is capped at the smaller matrix dimension (no more singular triplets
# exist than that), and random_state is fixed so the factors are repeatable.
U, Sigma, VT = randomized_svd(Z,
                              n_components=min(15, min(Z.shape)),
                              n_iter=5,
                              random_state=42)

In [6]:
# Rebuild the approximation of Z from its factors: U * diag(Sigma) * VT,
# multiplied left to right.
# NOTE(review): the displayed result agrees with the imputed matrix to every
# printed digit, which suggests the requested rank is not below the matrix
# rank -- confirm that a lossless reconstruction is what is wanted here.
U @ np.diag(Sigma) @ VT

array([[15.32608696, 13.79347826, 14.30434783, ..., 14.30434783,
        14.30434783, 16.34782609],
       [ 5.        , 13.5       , 14.        , ..., 14.        ,
        14.        , 16.        ],
       [ 3.5       ,  3.        ,  3.5       , ..., 12.16715976,
        12.16715976, 13.90532544],
       ...,
       [ 4.        ,  4.        , 11.12735849, ..., 11.12735849,
        11.12735849, 12.71698113],
       [12.37676056,  2.5       , 11.55164319, ...,  3.5       ,
         3.5       , 13.20187793],
       [12.35491071, 11.11941964, 11.53125   , ..., 11.53125   ,
        11.53125   ,  4.        ]])

In [7]:
# Number of ratings per user, shown as a two-column frame (userId, counts).
ratings_per_user = sub_df.groupby('userId').size()
ratings_per_user.rename('counts').reset_index()

Unnamed: 0,userId,counts
0,35,23
1,46,42
2,226,507
3,390,81
4,405,120
5,432,260
6,435,42
7,436,106
8,563,213
9,583,56


In [38]:
# Hold out 10% of the ratings for testing. Stratifying on userId keeps each
# user's share of ratings roughly equal in both parts; the fixed random_state
# makes the split, and everything derived from it, reproducible across kernel
# restarts.
xtrain, xtest = train_test_split(sub_df,
                                 train_size=0.90,
                                 stratify=sub_df['userId'],
                                 random_state=42)

In [39]:
# Remove the 'Unnamed: 0' column (presumably an index written out with the CSV)
# from both splits. Rebinding instead of inplace=True avoids mutating the
# objects handed back by train_test_split, and errors='ignore' lets the cell be
# re-run after the column is already gone instead of raising KeyError.
xtrain = xtrain.drop(columns='Unnamed: 0', errors='ignore')
xtest = xtest.drop(columns='Unnamed: 0', errors='ignore')
# Ratings per user in each split, side by side, as the cell's displayed output
# (previously both counts were computed and then discarded).
pd.DataFrame({'train': xtrain.groupby('userId').size(),
              'test': xtest.groupby('userId').size()})

In [81]:
# Pivot each split into a user x movie rating table using the same layout.
pivot_spec = dict(index='userId', columns='movieId', values='rating')
train_wide = xtrain.pivot(**pivot_spec)
test_wide = xtest.pivot(**pivot_spec)
# Convert the test table to NumPy via records, dropping the userId index.
# NOTE(review): going through to_records yields a 1-D structured array (one
# record per user, one field per movie), not a 2-D float matrix -- confirm that
# is what later code expects.
test_records = test_wide.to_records(index=False)
test_array = np.array(test_records)

In [82]:
# Goal: give the training table a column for every movie in the full dataset,
# including movies whose ratings all landed in the test split.

# Sorted unique movie ids: whole dataset vs. training split only.
movies_ids = np.unique(sub_df.movieId)
movies_ids_train = np.unique(xtrain.movieId)

# One-column frame of all ids plus a same-shaped boolean frame marking the ids
# that occur in the training split.
movies_ids_df = pd.DataFrame(movies_ids)
if_in_training_vec = movies_ids_df.isin(movies_ids_train)

# Ids never seen in training, as a flat array.
movie_ids_miss = movies_ids_df.loc[~if_in_training_vec[0], 0].to_numpy()

# All-NaN block with one row per training user and one column per unseen movie.
nas_array = np.full((train_wide.shape[0], movie_ids_miss.shape[0]), np.nan)
missing_df = pd.DataFrame(nas_array,
                          index=train_wide.index,
                          columns=movie_ids_miss)

# Append the unseen-movie columns to the right of the observed ones.
train_df = pd.concat([train_wide, missing_df], axis=1)
train_df

Unnamed: 0_level_0,1,2,3,10,11,16,19,21,25,32,...,68135,72011,84152,84189,97921,99117,114662,119145,133419,149354
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35,,,,,4.0,,,5.0,,,...,,,,,,,,,,
46,5.0,,,3.0,,,,,,4.0,...,,,,,,,,,,
226,,3.0,3.5,4.0,,,3.5,,,4.0,...,,,,,,,,,,
390,,,,,,,,,,,...,,,,,,,,,,
405,,,,,,4.0,,,4.0,4.5,...,,,,,,,,,,
432,2.5,4.0,,,,,2.0,,4.5,,...,,,,,,,,,,
435,,,,,,,,,,,...,,,,,,,,,,
436,4.0,4.0,,,,,,,,,...,,,,,,,,,,
563,,,,,,,,,,,...,,,,,,,,,,
583,,,,,,,,,,,...,,,,,,,,,,
