In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data is contained in '../Data/Processed' directory
# Output data is written to '../Data/Processed' directory
# This cell lists all files under the input directory

import os
# Input and output both resolve to <parent of the current working directory>/Data/Processed.
_parent_of_cwd = os.path.dirname(os.getcwd())
INPUT_DIR = os.path.join(_parent_of_cwd, 'Data', 'Processed')
OUTPUT_DIR = os.path.join(_parent_of_cwd, 'Data', 'Processed')

# Print the full path of every file found under the input directory (recursively).
for folder, _subdirs, names in os.walk(INPUT_DIR):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\average_ratings_by_user.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Books_valid_ISBN_known_year_no_images.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\normalized_user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\popular_books_with_descriptions.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\ratings_for_popular_books.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Ratings_valid_ISBN.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix_normalized_withna.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\users_valid_age_with_country.csv


In this notebook we create a user-book matrix containing the rating each user gave to each book, or NaN if they didn't review the book. We also create a normalized matrix in which each user's ratings are centered so that they have mean 0.

We first load the ratings dataframe.

In [21]:
# Load the ratings table (columns: User-ID, ISBN, Book-Rating).
# ISBN is pinned to str so identifiers keep their leading zeros and trailing
# 'X' check digit even if type inference would otherwise choose a numeric dtype.
ratings_df = pd.read_csv(
    os.path.join(INPUT_DIR, 'ratings_for_popular_books.csv'),
    dtype={'ISBN': str},
)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
ratings_df.info()
print(ratings_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17837 entries, 0 to 17836
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   User-ID      17837 non-null  int64 
 1   ISBN         17837 non-null  object
 2   Book-Rating  17837 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 418.2+ KB
None
   User-ID        ISBN  Book-Rating
0   276788  043935806X            7
1   276925  0385504209            8
2   276953  0446310786           10
3   276964  0440220602            9
4   277042  0971880107            2


To create a user-book matrix, we pivot our dataframe.

In [22]:
# Reshape the long ratings table into a wide user x book matrix: one row per
# User-ID, one column per ISBN, each cell holding the Book-Rating (NaN where the
# user has no rating for that book).
# Note: pivot() raises ValueError if any (User-ID, ISBN) pair occurs more than once.
user_ratings_pivot = ratings_df.pivot(index='User-ID', columns='ISBN', values='Book-Rating')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
user_ratings_pivot.info()
print(user_ratings_pivot.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 26 to 278844
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0060392452  158 non-null    float64
 1   0060502258  166 non-null    float64
 2   0060928336  320 non-null    float64
 3   0060930535  218 non-null    float64
 4   0060934417  159 non-null    float64
 5   0060938455  154 non-null    float64
 6   0060976845  186 non-null    float64
 7   0060987103  156 non-null    float64
 8   006101351X  118 non-null    float64
 9   014028009X  131 non-null    float64
 10  0140293248  165 non-null    float64
 11  0142000205  129 non-null    float64
 12  0142001740  307 non-null    float64
 13  0312195516  383 non-null    float64
 14  0312278586  226 non-null    float64
 15  0312291639  167 non-null    float64
 16  0312305060  129 non-null    float64
 17  0316096199  133 non-null    float64
 18  0316284955  171 non-null    float64
 19  0316601950  272 non-nul

We write this matrix to a user-book_matrix.csv file.

In [23]:
# Persist the raw (un-normalized, NaN-containing) user-book matrix.
user_book_matrix_path = os.path.join(OUTPUT_DIR, 'user-book_matrix.csv')
user_ratings_pivot.to_csv(user_book_matrix_path)

Now we have our (very sparse) matrix with the ratings that users gave to books, or NaN where they didn't leave a review. To get rid of the NaNs, we will first center each row so that the average rating each user gives is zero (so some ratings will now be negative). To do that, we subtract each user's average rating from all of their ratings. We will then fill the NaNs with zeroes, which amounts to treating an unrated book as if it had received that user's average rating.

In [24]:
# Rebuild the raw matrix from ratings_df instead of reading user_ratings_pivot:
# this cell overwrites user_ratings_pivot below, so computing the means from it
# would make a re-run average the already-centered values (giving ~0 for every
# user) and silently corrupt user_average_ratings. Deriving from ratings_df
# keeps the cell idempotent.
raw_user_ratings = ratings_df.pivot(index='User-ID', columns='ISBN', values='Book-Rating')

# Per-user mean over the books they actually rated (mean skips NaN by default).
user_average_ratings = raw_user_ratings.mean(axis=1)
print(user_average_ratings)

# Center each row: subtract the user's own mean from every rating in that row.
# Cells that were NaN stay NaN.
user_ratings_pivot = raw_user_ratings.sub(user_average_ratings, axis=0)
print(user_ratings_pivot.head())

User-ID
26        10.000000
51         9.000000
91         9.000000
114       10.000000
165        9.000000
            ...    
278755     1.000000
278773     8.000000
278798     7.000000
278843     8.333333
278844     7.000000
Length: 10637, dtype: float64
ISBN     0060392452  0060502258  0060928336  0060930535  0060934417  \
User-ID                                                               
26              NaN         NaN         NaN         NaN         NaN   
51              NaN         NaN         NaN         NaN         NaN   
91              NaN         NaN         NaN         NaN         NaN   
114             NaN         NaN         NaN         NaN         NaN   
165             NaN         NaN         NaN         NaN         NaN   

ISBN     0060938455  0060976845  0060987103  006101351X  014028009X  ...  \
User-ID                                                              ...   
26              NaN         NaN         NaN         NaN         NaN  ...   
51              

We write our normalized matrix to 'user-book_matrix_normalized_withna.csv'. 

In [25]:
# Persist the current user_ratings_pivot (NaNs are written as empty fields).
normalized_withna_path = os.path.join(OUTPUT_DIR, 'user-book_matrix_normalized_withna.csv')
user_ratings_pivot.to_csv(normalized_withna_path)

It remains to remove NaNs from the matrix.

In [26]:
# Replace every missing entry with 0.
user_ratings_pivot = user_ratings_pivot.fillna(value=0)

# Show the first rows of the filled matrix, then the per-row means.
for preview in (user_ratings_pivot.head(), user_ratings_pivot.mean(axis=1)):
    print(preview)

ISBN     0060392452  0060502258  0060928336  0060930535  0060934417  \
User-ID                                                               
26              0.0         0.0         0.0         0.0         0.0   
51              0.0         0.0         0.0         0.0         0.0   
91              0.0         0.0         0.0         0.0         0.0   
114             0.0         0.0         0.0         0.0         0.0   
165             0.0         0.0         0.0         0.0         0.0   

ISBN     0060938455  0060976845  0060987103  006101351X  014028009X  ...  \
User-ID                                                              ...   
26              0.0         0.0         0.0         0.0         0.0  ...   
51              0.0         0.0         0.0         0.0         0.0  ...   
91              0.0         0.0         0.0         0.0         0.0  ...   
114             0.0         0.0         0.0         0.0         0.0  ...   
165             0.0         0.0         0.0   

We write this normalized user-book matrix to 'normalized_user-book_matrix.csv'. We will also write the average ratings for each user to a separate 'average_ratings_by_user.csv' file.

In [27]:
# Write the final matrix and the per-user average ratings, in that order.
final_outputs = (
    (user_ratings_pivot, 'normalized_user-book_matrix.csv'),
    (user_average_ratings, 'average_ratings_by_user.csv'),
)
for table, csv_name in final_outputs:
    table.to_csv(os.path.join(OUTPUT_DIR, csv_name))