In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data is contained in '../Data/Processed' directory
# Output data is written to '../Data/Processed' directory
# This cell lists all files under the input directory

import os
# Inputs are read from, and outputs written to, the same folder:
# <parent of the current working directory>/Data/Processed
_processed_dir = os.path.join(os.path.dirname(os.getcwd()), 'Data', 'Processed')
INPUT_DIR = _processed_dir
OUTPUT_DIR = _processed_dir

# Walk the input directory recursively and print the full path of every file.
_input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(INPUT_DIR)
    for name in names
)
for _path in _input_files:
    print(_path)

c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\average_ratings_by_user.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Books_valid_ISBN_known_year_no_images.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\normalized_user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\popular_books_with_descriptions.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\ratings_for_popular_books.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Ratings_valid_ISBN.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix_normalized_withna.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\users_valid_age_with_country.csv


In this notebook we use matrix factorization to predict the missing values in the user-book matrix, and we write the resulting matrix to a new CSV file.

First, we load the normalized user-book matrix with NaNs.

In [2]:
# Load the normalized user-book matrix in which unrated (user, book) pairs are NaN.
# The first CSV column is used as the row index.
user_book_withna = pd.read_csv(os.path.join(INPUT_DIR, 'user-book_matrix_normalized_withna.csv'), index_col=0)
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
user_book_withna.info()
print(user_book_withna.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 26 to 278844
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0060392452  158 non-null    float64
 1   0060502258  166 non-null    float64
 2   0060928336  320 non-null    float64
 3   0060930535  218 non-null    float64
 4   0060934417  159 non-null    float64
 5   0060938455  154 non-null    float64
 6   0060976845  186 non-null    float64
 7   0060987103  156 non-null    float64
 8   006101351X  118 non-null    float64
 9   014028009X  131 non-null    float64
 10  0140293248  165 non-null    float64
 11  0142000205  129 non-null    float64
 12  0142001740  307 non-null    float64
 13  0312195516  383 non-null    float64
 14  0312278586  226 non-null    float64
 15  0312291639  167 non-null    float64
 16  0312305060  129 non-null    float64
 17  0316096199  133 non-null    float64
 18  0316284955  171 non-null    float64
 19  0316601950  272 non-nul

Let us check how sparse this matrix is. We compute the ratio of missing values.

In [3]:
# Sparsity = share of cells in the user-book matrix that hold no rating (NaN).
_nan_mask = user_book_withna.isna().to_numpy()
missing_values = _nan_mask.sum()
total_values = user_book_withna.size
sparsity = missing_values / total_values
print(sparsity)

0.9823486044244766


So, less than 2 percent of the matrix is filled. We will use matrix factorization to predict the missing values.

We load the normalized matrix for which missing values are filled with zeros.

In [4]:
# Load the normalized user-book matrix whose missing ratings were filled with zeros.
# The first CSV column is used as the row index.
user_book_normalized = pd.read_csv(os.path.join(INPUT_DIR, 'normalized_user-book_matrix.csv'), index_col=0)
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
user_book_normalized.info()
print(user_book_normalized.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 26 to 278844
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0060392452  10637 non-null  float64
 1   0060502258  10637 non-null  float64
 2   0060928336  10637 non-null  float64
 3   0060930535  10637 non-null  float64
 4   0060934417  10637 non-null  float64
 5   0060938455  10637 non-null  float64
 6   0060976845  10637 non-null  float64
 7   0060987103  10637 non-null  float64
 8   006101351X  10637 non-null  float64
 9   014028009X  10637 non-null  float64
 10  0140293248  10637 non-null  float64
 11  0142000205  10637 non-null  float64
 12  0142001740  10637 non-null  float64
 13  0312195516  10637 non-null  float64
 14  0312278586  10637 non-null  float64
 15  0312291639  10637 non-null  float64
 16  0312305060  10637 non-null  float64
 17  0316096199  10637 non-null  float64
 18  0316284955  10637 non-null  float64
 19  0316601950  10637 non-n

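Now we apply a truncated singular value decomposition (SVD) to the zero-filled matrix. `svds` computes only the `k` largest singular values and their singular vectors (`k = 6` by default).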

In [6]:
from scipy.sparse.linalg import svds

# Number of singular triplets (latent factors) to keep. 6 is the svds default and
# therefore what this cell always used; it is spelled out so that the rank of the
# approximation is visible and easy to tune.
N_LATENT_FACTORS = 6

# Truncated SVD of the zero-filled, normalized user-book matrix.
#   U     : one row per matrix row, one column per latent factor
#   sigma : the singular values (1-D here, expanded to a diagonal matrix below)
#   Vt    : one row per latent factor, one column per matrix column
# The frame is handed over as a float array because svds works on floating-point data.
U, sigma, Vt = svds(user_book_normalized.to_numpy(dtype=float), k=N_LATENT_FACTORS)
# Later cells multiply U, sigma and Vt as matrices, so sigma becomes a (k, k) diagonal matrix.
sigma = np.diag(sigma)
print(U)
print(sigma)
print(Vt)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.33193481e-03 -2.71467221e-03  2.62317904e-03  1.41927888e-03
   4.87638585e-04  9.30602458e-05]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
[[19.48556162  0.          0.          0.          0.          0.        ]
 [ 0.         20.02611235  0.          0.          0.          0.        ]
 [ 0.          0.         21.20315984  0.          0.          0.        ]
 [ 0.          0.          0.         23.69271751  0.          0.        ]
 [ 0.          0.          0.          0.         25.22058888  0.        ]
 [ 0.        

Now we can compute our predictions.

In [7]:
# Low-rank reconstruction of the normalized matrix: (U . sigma) . Vt
_scaled_user_factors = U.dot(sigma)
predicted_ratings = _scaled_user_factors.dot(Vt)
print(predicted_ratings)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 ...
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.00235152 -0.00241238  0.00322214 ... -0.00259287  0.00684641
   0.00053094]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


It remains to add each user's average rating to the predictions.

We load the average rating for each user.

In [8]:
# Load the per-user average ratings; the first CSV column is used as the index.
user_average_ratings = pd.read_csv(os.path.join(INPUT_DIR, 'average_ratings_by_user.csv'), index_col=0)
# Turn the single-column frame into a Series. Only the column axis is squeezed:
# a bare squeeze() would also collapse a one-row frame into a scalar.
user_average_ratings = user_average_ratings.squeeze("columns")
print(user_average_ratings)

User-ID
26        10.000000
51         9.000000
91         9.000000
114       10.000000
165        9.000000
            ...    
278755     1.000000
278773     8.000000
278798     7.000000
278843     8.333333
278844     7.000000
Name: 0, Length: 10637, dtype: float64


We now add them to the predictions.

In [9]:
# Row i of the factorization corresponds to user_book_normalized.index[i]. Align the
# per-user averages to that order by label instead of relying on both CSV files
# listing the users in the same order.
aligned_user_averages = user_average_ratings.reindex(user_book_normalized.index)
if aligned_user_averages.isna().any():
    raise ValueError("No average rating available for some users of the user-book matrix")

# Rebuild the low-rank reconstruction from its factors rather than updating
# predicted_ratings in place, so that re-running this cell cannot add the averages twice.
# reshape(-1, 1) makes the averages a column vector that broadcasts across all columns.
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + aligned_user_averages.to_numpy().reshape(-1, 1)
print(predicted_ratings)

[[10.         10.         10.         ... 10.         10.
  10.        ]
 [ 9.          9.          9.         ...  9.          9.
   9.        ]
 [ 9.          9.          9.         ...  9.          9.
   9.        ]
 ...
 [ 7.          7.          7.         ...  7.          7.
   7.        ]
 [ 8.33098182  8.33092095  8.33655547 ...  8.33074047  8.34017974
   8.33386428]
 [ 7.          7.          7.         ...  7.          7.
   7.        ]]


We create a dataframe from the numpy array of predictions.

In [12]:
# Wrap the prediction array in a DataFrame. The labels are taken from
# user_book_normalized, the matrix that was actually factorized, so rows and columns
# are labelled correctly by construction rather than by assuming that the NaN
# variant of the matrix has the same ordering.
predicted_ratings_df = pd.DataFrame(predicted_ratings,
                                    index=user_book_normalized.index,
                                    columns=user_book_normalized.columns)
print(predicted_ratings_df.head())

         0060392452  0060502258  0060928336  0060930535  0060934417  \
User-ID                                                               
26             10.0        10.0        10.0        10.0        10.0   
51              9.0         9.0         9.0         9.0         9.0   
91              9.0         9.0         9.0         9.0         9.0   
114            10.0        10.0        10.0        10.0        10.0   
165             9.0         9.0         9.0         9.0         9.0   

         0060938455  0060976845  0060987103  006101351X  014028009X  ...  \
User-ID                                                              ...   
26             10.0        10.0        10.0        10.0        10.0  ...   
51              9.0         9.0         9.0         9.0         9.0  ...   
91              9.0         9.0         9.0         9.0         9.0  ...   
114            10.0        10.0        10.0        10.0        10.0  ...   
165             9.0         9.0         9.0   

We write this dataframe of predictions to 'matrix_factorization.csv'.

In [13]:
# Persist the predictions; with to_csv defaults the index and the column labels are written too.
_output_path = os.path.join(OUTPUT_DIR, 'matrix_factorization.csv')
predicted_ratings_df.to_csv(_output_path)