In [1]:
import numpy as np
import pandas as pd
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import json
import os
import requests
from io import BytesIO  # Import BytesIO from the io module

In [2]:
# Define the http adress where the files are recorded.
git_path = 'https://github.com/gunony/ML_Book_Valuations/raw/main/datasets/'

## PART 1 - CONSTITUTION OF THE DATASET BEFORE ANALYSIS AND MACHINE LEARNING
As explained in the readme file, the aim of the project is to predict the rating that would be given to a book based on a certain amount of input data. This first part consists of preparing the data and creating the dataset to be used for the ML algorithm, which constitutes the second part of the project.

### STEP 1 - LOAD THE FILE
As first dataset called 'books.csv' is a curation of Goodreads books. 

In [3]:
# Download the initial dataset of Books list
url = git_path+'books.csv'
df_Books = pd.read_csv(url,sep=",")

In [4]:
df_Books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14485 entries, 0 to 14484
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              14485 non-null  int64  
 1   title               14485 non-null  object 
 2   authors             14485 non-null  object 
 3   average_rating      14485 non-null  float64
 4   isbn                14485 non-null  object 
 5   isbn13              14485 non-null  object 
 6   language_code       14485 non-null  object 
 7   num_pages           14485 non-null  int64  
 8   ratings_count       14485 non-null  int64  
 9   text_reviews_count  14485 non-null  int64  
 10  publication_date    14485 non-null  object 
 11  publisher           14485 non-null  object 
dtypes: float64(1), int64(4), object(7)
memory usage: 1.3+ MB


### Step 2 : Adding book's genre in the data frame Books

The genre of the book (history, novel, poetry, etc.) is a factor that readers take into account when making their choice. We can assume that a philosophy book will be read less than a Harry Potter book. This popularity should have an impact on the rating given by readers.

The genre is not included in the initial file. The objective of this step is to therefore add the genre of each books. Goodreads determines a book's genre by crowd-sourcing user shelves. If a number of users shelve a book as "science," for example, then that genre is assigned to the book in their algorithm. This isn't a perfect system, as sometimes users might shelve something as "science" when it's actually "science fiction," and so on. For more details : https://help.goodreads.com/s/article/How-can-I-set-my-book-s-genres. 

The genres of book have been grouped into 10 categories with the score given by each reader. These categories and their respective scores will be added to the initial dataset.

The data set related to book’s genres comes from the great work made by Mengting Wan and Julian McAuley. 
• Mengting Wan, Julian McAuley, "Item Recommendation on Monotonic Behavior Chains", in RecSys'18. [bibtex] 
• Mengting Wan, Rishabh Misra, Ndapa Nakashole, Julian McAuley, "Fine-Grained Spoiler Detection from Large-Scale Review Corpora", in ACL'19. [bibtex]

All files can be found at this adress https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/

In [5]:
# Function to load the gz file and transform into JSON file 
def load_data(url):
    response = requests.get(url)
    with gzip.GzipFile(fileobj=BytesIO(response.content), mode='rb') as fin:
        data = [json.loads(line.decode('utf-8')) for line in fin]
    return data

# Assuming the function is defined
URL = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/'
books = load_data(os.path.join(URL, 'goodreads_book_genres_initial.json.gz'))

In [6]:
# Transform the nested JSON data structure books into a dataframe called df_Genre
df_Genre = pd.json_normalize(books)

# Rename the column with book number - will be necessary for the next steps
df_Genre.rename(columns={'book_id': 'bookID'}, inplace=True)

In [7]:
display(df_Genre)
df_Genre.info()

Unnamed: 0,bookID,"genres.history, historical fiction, biography",genres.fiction,"genres.fantasy, paranormal","genres.mystery, thriller, crime",genres.poetry,genres.romance,genres.non-fiction,genres.children,genres.young-adult,"genres.comics, graphic"
0,5333265,1.0,,,,,,,,,
1,1333909,5.0,219.0,,,,,,,,
2,7327624,,8.0,31.0,1.0,1.0,,,,,
3,6066819,,555.0,,10.0,,23.0,,,,
4,287140,,,,,,,3.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
2360650,3084038,7.0,,,,,,5.0,,,
2360651,26168430,,1.0,,4.0,,,,1.0,,
2360652,2342551,,,,,14.0,,1.0,7.0,1.0,
2360653,22017381,,,,2.0,,13.0,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 11 columns):
 #   Column                                         Dtype  
---  ------                                         -----  
 0   bookID                                         object 
 1   genres.history, historical fiction, biography  float64
 2   genres.fiction                                 float64
 3   genres.fantasy, paranormal                     float64
 4   genres.mystery, thriller, crime                float64
 5   genres.poetry                                  float64
 6   genres.romance                                 float64
 7   genres.non-fiction                             float64
 8   genres.children                                float64
 9   genres.young-adult                             float64
 10  genres.comics, graphic                         float64
dtypes: float64(10), object(1)
memory usage: 198.1+ MB


#### Remark:
For each book was assigned a score for each of the 10 literary genres. The book's genre with the highest score will be recorded in a specific column. For books without genre the code genres.missing will be given. The NaN will be replaced by 0.

In [8]:
# Replace NaN with 0 in the dataframe Genre
df_Genre= df_Genre.fillna(0)
# Define a function to get the title of the column with the highest note
def get_highest_note_column(row):
    max_note = row[['genres.history, historical fiction, biography', 'genres.fiction', 'genres.fantasy, paranormal',
                    'genres.mystery, thriller, crime', 'genres.poetry', 'genres.romance', 'genres.non-fiction',
                    'genres.children', 'genres.young-adult', 'genres.comics, graphic']].max()
    return row.index[row == max_note][0]
# Add a new column 'book_genre' to the dataframe
df_Genre['main_genre'] = df_Genre.apply(get_highest_note_column, axis=1)
# Replace 0 with 'genres.missing' in the 'book_genre' column
df_Genre['main_genre'] = df_Genre['main_genre'].replace(0, 'genres.missing')
# Look at the updated dataframe
display(df_Genre)
df_Genre.info()

Unnamed: 0,bookID,"genres.history, historical fiction, biography",genres.fiction,"genres.fantasy, paranormal","genres.mystery, thriller, crime",genres.poetry,genres.romance,genres.non-fiction,genres.children,genres.young-adult,"genres.comics, graphic",main_genre
0,5333265,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"genres.history, historical fiction, biography"
1,1333909,5.0,219.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,genres.fiction
2,7327624,0.0,8.0,31.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,"genres.fantasy, paranormal"
3,6066819,0.0,555.0,0.0,10.0,0.0,23.0,0.0,0.0,0.0,0.0,genres.fiction
4,287140,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,genres.non-fiction
...,...,...,...,...,...,...,...,...,...,...,...,...
2360650,3084038,7.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,"genres.history, historical fiction, biography"
2360651,26168430,0.0,1.0,0.0,4.0,0.0,0.0,0.0,1.0,0.0,0.0,"genres.mystery, thriller, crime"
2360652,2342551,0.0,0.0,0.0,0.0,14.0,0.0,1.0,7.0,1.0,0.0,genres.poetry
2360653,22017381,0.0,0.0,0.0,2.0,0.0,13.0,0.0,0.0,0.0,0.0,genres.romance


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 12 columns):
 #   Column                                         Dtype  
---  ------                                         -----  
 0   bookID                                         object 
 1   genres.history, historical fiction, biography  float64
 2   genres.fiction                                 float64
 3   genres.fantasy, paranormal                     float64
 4   genres.mystery, thriller, crime                float64
 5   genres.poetry                                  float64
 6   genres.romance                                 float64
 7   genres.non-fiction                             float64
 8   genres.children                                float64
 9   genres.young-adult                             float64
 10  genres.comics, graphic                         float64
 11  main_genre                                     object 
dtypes: float64(10), object(2)
memory usage: 21

In [9]:
# Now we're going to include the data relating to book's genres in the books dataset.

# Convert the bookID column in dataframe Genre from object to int64
df_Genre['bookID'] = df_Genre['bookID'].astype('int64')
# Merge both DataFrames based on the book ID
df_Books = pd.merge(df_Books, df_Genre, left_on='bookID', right_on='bookID', how='left')

In [10]:
# check the final result of the books dataframe after adding book's genre.
df_Books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14485 entries, 0 to 14484
Data columns (total 23 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   bookID                                         14485 non-null  int64  
 1   title                                          14485 non-null  object 
 2   authors                                        14485 non-null  object 
 3   average_rating                                 14485 non-null  float64
 4   isbn                                           14485 non-null  object 
 5   isbn13                                         14485 non-null  object 
 6   language_code                                  14485 non-null  object 
 7   num_pages                                      14485 non-null  int64  
 8   ratings_count                                  14485 non-null  int64  
 9   text_reviews_count                             144

#### Remarks :
In the books dataframe there 14 485 books. Only 12664 books have a book's genre. The remaining 1821 books don't have one. 
We will replace the NaN of these 1821 books with the number 0.

How to explain the absence of this information, there are two possibilities. Either these books are new and readers have not had time to read or categorise them. Or these books have simply never been read. This information will be confirmed by the book's publication date.

For the rest of the project, we are going to replace the 'NaN' of these 1824 books with the number zero. We will still be able to find these books by summing the scores of the 10 columns representing the book genres. Or by selecting the 'main_genre' column if there is no result. 

### Step 3 - Calculate the seniority of the books
From the Publication date until today.

In [11]:
from datetime import datetime

# Check if column publication_date is correct 
for index, row in df_Books.iterrows():
    try:
        date_obj = datetime.strptime(row['publication_date'], '%m/%d/%Y')
    except ValueError as e:
        print(f"Error in row {index}: {e}")
        print("Row details:", row)

In [12]:
# Convert 'publication_date' to datetime with US format
df_Books['publication_date'] = pd.to_datetime(df_Books['publication_date'], format='%m/%d/%Y')

In [13]:
# Calculate the number of months from the Publication date until today
today = datetime.today()
df_Books['Months_Until_Today'] = (today.year - df_Books['publication_date'].dt.year)*12+(today.month - df_Books['publication_date'].dt.month)

In [14]:
df_Books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14485 entries, 0 to 14484
Data columns (total 24 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   bookID                                         14485 non-null  int64         
 1   title                                          14485 non-null  object        
 2   authors                                        14485 non-null  object        
 3   average_rating                                 14485 non-null  float64       
 4   isbn                                           14485 non-null  object        
 5   isbn13                                         14485 non-null  object        
 6   language_code                                  14485 non-null  object        
 7   num_pages                                      14485 non-null  int64         
 8   ratings_count                                  14485 non

### Step 4 : Book's language

In [15]:
df_Books['language_code'].value_counts()

eng      11276
en-US     1789
en-GB      377
spa        288
fre        255
ger        186
ita         82
jpn         64
por         25
en-CA       20
mul         19
zho         15
nl          13
grc         11
swe          9
pol          7
dan          7
tur          5
lat          3
rus          3
gre          3
enm          3
hin          2
srp          2
ind          2
ara          2
cat          2
cze          1
urd          1
est          1
rum          1
tam          1
wel          1
fin          1
kor          1
mar          1
ale          1
gla          1
nor          1
glg          1
msa          1
nob          1
Name: language_code, dtype: int64

#### Remarks : 
There are miscellaneous languages. But english is the most representative as Good reads is a US web site.
We are going to group English-language books under a single code eng.

In [16]:
df_Books['language_code'].replace(['en-CA', 'en-GB', 'en-US', 'enm'],'eng', inplace=True)

### Step 5 : Work on the authors
The authors also have a rating which we will retrieve and add to the books dataset.
In the authors column of the Books dataset, some books have several names. The first corresponds to the writer (e.g. Harry Potter: 'J.K. Rowling/Mary GrandPrÃ©'). The other names may be illustrators (in the case of Mary GrandPrÃ©), translators or co-authors. 
Unfortunately, this is information is not available. For the rest of the project, we will therefore keep only the first name that corresponds to the author or one of the authors. This decision was validated after testing a few books.
Information on the authors is available on the following site: https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/

In [17]:
# Split the author column to keep only the name of the writer
df_Books[['authors','co-authors']] = df_Books['authors'].str.split(pat='/', n=1, expand=True)

In [18]:
# Function to load the gz file and transform into JSON file 
def load_data(url):
    response = requests.get(url)
    with gzip.GzipFile(fileobj=BytesIO(response.content), mode='rb') as fin:
        data = [json.loads(line.decode('utf-8')) for line in fin]
    return data

# Assuming the function is defined
URL = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/'
books = load_data(os.path.join(URL, 'goodreads_book_authors.json.gz'))

In [19]:
# Transform the nested JSON data structure books into a dataframe called df_Genre
df_Authors = pd.json_normalize(books)

In [20]:
# A look at the dataframe
df_Authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829529 entries, 0 to 829528
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   average_rating      829529 non-null  object
 1   author_id           829529 non-null  object
 2   text_reviews_count  829529 non-null  object
 3   name                829529 non-null  object
 4   ratings_count       829529 non-null  object
dtypes: object(5)
memory usage: 31.6+ MB


In [21]:
# Create a mapping dictionnary containing only the columns name and average_rating
auteur_map = dict(zip(df_Authors['name'], df_Authors['average_rating']))

# Adding Authors rating to df-Books
df_Books['author_rating'] = df_Books['authors'].map(auteur_map)

# For the authors not in the csv file, we give a note of zero.
df_Books['author_rating'] = df_Books['author_rating'].fillna(0)

In [22]:
df_Books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14485 entries, 0 to 14484
Data columns (total 26 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   bookID                                         14485 non-null  int64         
 1   title                                          14485 non-null  object        
 2   authors                                        14485 non-null  object        
 3   average_rating                                 14485 non-null  float64       
 4   isbn                                           14485 non-null  object        
 5   isbn13                                         14485 non-null  object        
 6   language_code                                  14485 non-null  object        
 7   num_pages                                      14485 non-null  int64         
 8   ratings_count                                  14485 non

In [23]:
# change Dtype of author rating
df_Books['author_rating'] = df_Books['author_rating'].astype('float64')

In [24]:
# check if there is NaN in the column author rating
Has_Nan = df_Books['author_rating'].isnull().sum()
print(Has_Nan)

0


### Step 6 : Adding format of the books
We add the format of the book (paper, audio,...) from the the file 'goodreads_books.json.gz' which is located on the web site : https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/

As the file is large (1.9 GB), it was uploaded and converted into an easier-to-load csv file separately containing only the prinicpal information needed : bookID + Format + isebook. 

In [25]:
# Download the Book's format csv file
df_BooksFormat = pd.read_csv(r"C:\Users\gunon\Documents\bootcamp-main\3-projects\ML_Book_Valuations\datasets\BooksFormat.csv", sep=",")
# What is inside the file
df_BooksFormat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   book_id   int64 
 1   format    object
 2   is_ebook  object
dtypes: int64(1), object(2)
memory usage: 54.0+ MB


  df_BooksFormat = pd.read_csv(r"C:\Users\gunon\Documents\bootcamp-main\3-projects\ML_Book_Valuations\datasets\BooksFormat.csv", sep=",")


In [26]:
# solve the alert message above
df_BooksFormat['format'] = df_BooksFormat['format'].astype (object)

In [27]:
df_BooksFormat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2360655 entries, 0 to 2360654
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   book_id   int64 
 1   format    object
 2   is_ebook  object
dtypes: int64(1), object(2)
memory usage: 54.0+ MB


In [28]:
# Create a mapping dictionary containing only BookID and Format from the file BooksList.csv
book_format = dict(zip(df_BooksFormat['book_id'], df_BooksFormat['format']))

# Add format to the Books dataframe using the map function
df_Books['book_format'] = df_Books['bookID'].map(book_format)

# Group book's format into two families: audio / books
df_Books['book_format'] = df_Books['book_format'].fillna('Unknown Binding')
df_Books['book_format'] = df_Books.apply(lambda row: 'audio' if 'audio' in row['publisher'].lower() and
                        row['book_format'] == 'Unknown Binding' else row['book_format'], axis=1)
df_Books['book_format'] = df_Books['book_format'].replace(['Audio', 'Audio CD', 'audio Cassette', 'Audio Cassette'
                        ,'Audiobook', 'MP3 CD'], 'audio')
df_Books['book_format'] = df_Books['book_format'].replace(['Unknown Binding'], 'book')
df_Books['book_format'] = df_Books['book_format'].apply(lambda x: 'book' if x != 'audio' else x)

# get_dummies tranforms categorical data to numbers 0=audio / 1=book
df_Books["audio 0 book 1"] = pd.get_dummies(df_Books.book_format, drop_first=True).values

In [29]:
df_Books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14485 entries, 0 to 14484
Data columns (total 28 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   bookID                                         14485 non-null  int64         
 1   title                                          14485 non-null  object        
 2   authors                                        14485 non-null  object        
 3   average_rating                                 14485 non-null  float64       
 4   isbn                                           14485 non-null  object        
 5   isbn13                                         14485 non-null  object        
 6   language_code                                  14485 non-null  object        
 7   num_pages                                      14485 non-null  int64         
 8   ratings_count                                  14485 non

In [30]:
# check the new columns
df_Books['book_format'].value_counts()

book     14232
audio      253
Name: book_format, dtype: int64

In [31]:
df_Books['audio 0 book 1'].value_counts()

1    14232
0      253
Name: audio 0 book 1, dtype: int64

In [32]:
### Step 7 : Save the final file

In [33]:
# Saving Books dataframe with all data on github
df_Books.to_csv(r"C:\Users\gunon\Documents\bootcamp-main\3-projects\ML_Book_Valuations\datasets\Books_Final.csv", index=False)