## 1. Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


## 2. Load data

In [2]:
# Some corrections (bad columns' names, tabulation, ...) have been done to the data before loading it
# During exporting, we let Python choose the right data types for each column for now...
data = pd.read_csv(r"books_new.csv")

## 3. Explore data for more corrections

In [3]:
# Number of rows, and columns
data.shape

(11127, 12)

In [4]:
# View of the first two rows
data.head(2)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.


In [5]:
# View of the data types chosen for each column by Python, and eventually number of NA (missing data)
data.info()
# The data types chosen for each column is correct.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  float64
 6   language_code       11127 non-null  object 
 7   num_pages           11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 1.0+ MB


In [6]:
data.describe()
# We can't have average_rate, num_pages or ratings_count equal to zero,
# we must investigate and eventually drop the corresponding rows, they can be outliers ;

# But text_reviews_count can be equal to zero.

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759178000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442865000000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780000000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780000000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780000000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790000000000.0,6576.0,4597666.0,94265.0


In [7]:
columns_to_drop = ["bookID", "isbn", "isbn13"]

# We decided to drop these columns because they are IDs and they don't really determine the average rating: they are just identifiers
data1 = data.drop(columns_to_drop, axis=1, inplace=False)
data1.head(2)

Unnamed: 0,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,eng,870,2153167,29221,9/1/2004,Scholastic Inc.


In [8]:
# Get the publication year, instead of having the long date (this format of long date isn't really important)
data1["publication_date"] = data1["publication_date"].apply(lambda x: int(x.split("/")[-1]))
data1.rename(columns={"publication_date" : "publication_year"}, inplace=True)
data1.head(2)

Unnamed: 0,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_year,publisher
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,eng,652,2095690,27591,2006,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,eng,870,2153167,29221,2004,Scholastic Inc.


In [9]:
# Inspect the langage code, by proceeding with data1
data1["language_code"].value_counts()
# Books written in english were the most rated, but there are also some langages (like arabic, turkish) in minority...

language_code
eng      8911
en-US    1409
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
enm         3
lat         3
swe         2
rus         2
srp         1
nl          1
msa         1
glg         1
wel         1
ara         1
nor         1
tur         1
gla         1
ale         1
Name: count, dtype: int64

In [10]:
# Drop language_code
del data1["language_code"]

In [11]:
data1.describe()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
count,11127.0,11127.0,11127.0,11127.0,11127.0
mean,3.933631,336.376921,17936.41,541.854498,2000.16752
std,0.352445,241.127305,112479.4,2576.176608,8.248836
min,0.0,0.0,0.0,0.0,1900.0
25%,3.77,192.0,104.0,9.0,1998.0
50%,3.96,299.0,745.0,46.0,2003.0
75%,4.135,416.0,4993.5,237.5,2005.0
max,5.0,6576.0,4597666.0,94265.0,2020.0


In [12]:
# Investigate the oldest book (written in 1900)
data1[data1["publication_year"]==1900]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
9375,Consider the Lilies,Iain Crichton Smith / Isobel Murray,3.88,144,332,33,1900,Polygon


In [13]:
# Investigate the publication_year distibution
data1["publication_year"].value_counts()

# It seems that most of the rated books are recent.

publication_year
2006    1700
2005    1260
2004    1071
2003     931
2002     798
        ... 
1947       1
1922       1
1929       1
1919       1
1940       1
Name: count, Length: 87, dtype: int64

In [14]:
data1["publication_year"].unique()

array([2006, 2004, 2003, 2005, 2002, 1996, 2001, 2000, 1990, 1993, 1997,
       1991, 2012, 1994, 1982, 1986, 1999, 1987, 1984, 2007, 1965, 1998,
       1995, 1964, 1970, 2011, 1955, 1988, 1985, 1989, 1963, 2008, 2009,
       1976, 1975, 1980, 1992, 1973, 2019, 1954, 2015, 1919, 1921, 1923,
       1969, 1968, 1961, 1953, 1958, 1983, 1978, 1929, 1977, 1979, 1922,
       1981, 2010, 1950, 1971, 1960, 1959, 2013, 2017, 2016, 1972, 1947,
       1943, 1974, 1957, 2014, 2018, 1952, 1935, 1956, 1966, 1925, 1962,
       1949, 1913, 1928, 1914, 1948, 1967, 1900, 2020, 1931, 1940],
      dtype=int64)

In [15]:
# Number of rows, and columns (columns'number must change since we delete "bookID", "isbn", "language_code" and "isbn13")
data1.shape

(11127, 8)

In [19]:
# Explore the number of unique values for each column
data1.nunique()

# The columns title and authors which are categorical variables have the most different values.

title                 10352
authors                6643
average_rating          209
num_pages               997
ratings_count          5294
text_reviews_count     1822
publication_year         87
publisher              2292
dtype: int64

In [20]:
# Be sure that none row has missing year
data1["publication_year"].isnull().sum()
# data["publication_year"].isna().sum()

0

In [21]:
# Analyze the rows with average_rating == 0
data1[data1["average_rating"]==0]

# In generally, the rows with average_rating == 0, have also ratings_count == 0
# That's normal since there weren't reviewed by people, so 0 is a default value in these cases.
# This 0 can be seen like some kind of default value for books which haven't be reviewed.

# Keeping these rows can be problematic !!!

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
265,Out to Eat London 2002 (Lonely Planet Out to Eat),Lonely Planet / Mark Honan,0.0,295,0,0,2001,Lonely Planet
375,Juiced Official Strategy Guide,Doug Walsh,0.0,112,0,0,2005,BradyGames
987,Open City 6: The Only Woman He Ever Left,Open City Magazine / James Purdy / Daniel Pinc...,0.0,200,0,0,2000,Grove Press Open City Books
2532,How To Meditate: An Anthology Of Talks On Medi...,Frederick P. Lenz,0.0,228,0,0,2004,Frederick P. Lenz Foundation for American Budd...
2533,Insights: Talks On The Nature Of Existence,Frederick P. Lenz,0.0,304,0,0,2003,Frederick P. Lenz Foundation for American Budd...
2758,Venac sonetnih venaca; Puževa srma,Dobrica Erić,0.0,222,0,0,1996,"Izdavačka agencija ""Draganić"""
3493,Brodie's notes on Aldous Huxley's brave new world,Graham Handley,0.0,71,0,0,1992,Macmillan
4242,American Film Guide,Frank N. Magill,0.0,5,0,0,1983,Salem Press Inc
4678,The Man and the Author: John Milton: Twentieth...,J. Martin Evans,0.0,386,0,0,2002,Routledge
5325,Canopy: A Work for Voice and Light in Harvard ...,David Ward / Parveen Adams / Seamus Heaney /...,0.0,63,0,0,1997,Arts Publications


In [22]:
# Drop the rows having average_rating == 0
data2 = data1[~(data1["average_rating"]==0)]
# data2 contains only rows with average_rating != 0

data2.shape

(11101, 8)

In [23]:
# Analyze the rows with ratings_count == 0
# We can't have rows for which at the same time ratings_count is equal to 0, but average_rating is different from 0  since the 
# average_rating is the average of all ratings based on the ratings_count.
data2[(data2["ratings_count"]==0)]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
264,Lonely Planet Londres,Lonely Planet / Sarah Johnstone / Tom Masters,4.03,480,0,0,2006,Geoplaneta
525,American Government: Continuity and Change Al...,Karen O'Connor / Larry J. Sabato,2.83,664,0,0,2005,Longman Publishing Group
526,Essentials of American and Texas Government: C...,Karen O'Connor / Larry J. Sabato,3.5,854,0,0,2005,Longman Publishing Group
624,Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...,Aristophanes / F.W. Hall / W.M. Geldart,5.0,364,0,0,1922,Oxford University Press USA
747,Melville and the politics of identity: From *K...,Julian Markels,3.33,164,0,0,1993,University of Illinois Press
935,April May und June,Elizabeth von Arnim,3.88,88,0,0,1995,Insel Frankfurt
1110,Dr No / Moonraker / Thunderball / From Russia ...,Ian Fleming,3.98,862,0,0,1984,Heinemann-Octopus
1681,V.S. Naipaul,Bruce Alvin King,2.0,240,0,0,2003,Palgrave Macmillan
2051,The Baby Emergency (Tennengarrah Clinic #1),Carol Marinelli,3.6,285,0,0,2004,Mills & Boon
2255,Animales No Se Visten Los (Animals Should Def...,Judi Barrett / Ron Barrett,4.11,32,0,0,1991,Live Oak Media


In [24]:
# Rows with average_rating != 0 but ratings_count == 0 must be dropped
data3 = data2[~(data2['ratings_count'] == 0)]
# data3 contains only rows with average_rating != 0 and ratings_count != 0

data3.shape

(11046, 8)

In [25]:
# Analyze rows with num_pages == 0
# We can't have rows for which at the same time num_pages == 0, but average_rating != 0 since at least pages of these books must be read before
# giving a rating.
data3[(data3["num_pages"]==0)]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
306,The 5 Love Languages / The 5 Love Languages Jo...,Gary Chapman,4.70,0,22,4,2005,Moody Publishers
853,The Tragedy of Pudd'nhead Wilson,Mark Twain / Michael Prichard,3.79,0,3,0,2003,Tantor Media
1061,Murder by Moonlight & Other Mysteries (New Adv...,NOT A BOOK,4.00,0,7,2,2006,Simon Schuster Audio
1064,The Unfortunate Tobacconist & Other Mysteries ...,NOT A BOOK,3.50,0,12,1,2003,Simon & Schuster Audio
1230,The Da Vinci Code (Robert Langdon #2),Dan Brown / Paul Michael,3.84,0,91,16,2006,Random House Audio
...,...,...,...,...,...,...,...,...
10030,The Chessmen of Mars (Barsoom #5),Edgar Rice Burroughs / John Bolen,3.83,0,5147,157,2005,Tantor Media
10192,Fine Lines (One-Eyed Mack #6),Jim Lehrer,3.23,0,17,4,1995,Random House Value Publishing
10624,Stowaway and Milk Run: Two Unabridged Stories ...,Mary Higgins Clark / Jan Maxwell,3.49,0,64,2,1999,Simon & Schuster Audio
10926,The Mask of the Enchantress,Victoria Holt,3.85,0,21,1,1981,Ivy Books


In [26]:
# Rows with average_rating != 0 but num_pages == 0 must be dropped
data4 = data3[~(data3["num_pages"]==0)]
# data4 contains only rows with average_rating != 0, ratings_count != 0 and num_pages != 0

data4.shape

(10971, 8)

In [27]:
data4.describe()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
count,10971.0,10971.0,10971.0,10971.0,10971.0
mean,3.943235,339.06499,18188.83,549.457205,2000.183939
std,0.294536,240.173871,113256.2,2593.630924,8.200245
min,1.0,1.0,1.0,0.0,1900.0
25%,3.78,197.0,113.0,10.0,1998.0
50%,3.96,302.0,781.0,48.0,2003.0
75%,4.14,416.0,5138.0,244.0,2005.0
max,5.0,6576.0,4597666.0,94265.0,2020.0


In [28]:
# Delete unecessary data
del data, data1, data2, data3

# Rename data4 as data, then delete data4
data = data4.copy(deep=True)
del data4

# We will be working with only one dataframe named data which is a clean version of our original data.

In [29]:
# Analyze correlations between quantitative variables
data[['average_rating', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']].corr()

# No high correlation were found ... (Correlation lesser than 0.5 or greater than -0.5)
# Using only these quantitative variables to explain average_rating won't give us models which can perform so well.

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
average_rating,1.0,0.170746,0.040815,0.033764,-0.046053
num_pages,0.170746,1.0,0.032975,0.035104,-0.022084
ratings_count,0.040815,0.032975,1.0,0.865931,0.044832
text_reviews_count,0.033764,0.035104,0.865931,1.0,0.067372
publication_year,-0.046053,-0.022084,0.044832,0.067372,1.0


In [125]:
# Create copies of data
data_copy1 = data.copy(deep=True)
data_copy2 = data.copy(deep=True)

In [126]:
# Normalize the num_pages column
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Scaler1 = StandardScaler()
Scaler2 = MinMaxScaler()

data_copy1[['num_pages']] = Scaler1.fit_transform(data_copy1[['num_pages']])
data_copy2[['num_pages']] = Scaler2.fit_transform(data_copy2[['num_pages']])

In [42]:
data_copy1[['num_pages']].describe()

Unnamed: 0,num_pages
count,10971.0
mean,4.922181e-17
std,1.000046
min,-1.407649
25%,-0.5915359
50%,-0.1543327
75%,0.3203451
max,25.9696


In [43]:
data_copy2[['num_pages']].describe()
# data_copy2 is the better format to use since having positive values for num_pages is more understandable.

Unnamed: 0,num_pages
count,10971.0
mean,0.051417
std,0.036528
min,0.0
25%,0.02981
50%,0.045779
75%,0.063118
max,1.0


## 4. Transform all the qualitative data into quantitative data

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [128]:
# Instantiate the TF-IDF vectorizer
publisher_tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'publisher' column
data_copy2_publisher_tfidf = publisher_tfidf_vectorizer.fit_transform(data_copy2['publisher'])
# Convert the TF-IDF matrices to DataFrames
data_copy2_publisher_df = pd.DataFrame(data_copy2_publisher_tfidf.toarray(), columns=publisher_tfidf_vectorizer.get_feature_names_out())
# data_copy2_publisher_df


In [129]:
# Instantiate the TF-IDF vectorizers
title_tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'title' column
data_copy2_title_tfidf = title_tfidf_vectorizer.fit_transform(data_copy2['title'])
# Convert the TF-IDF matrices to DataFrames
data_copy2_title_df = pd.DataFrame(data_copy2_title_tfidf.toarray(), columns=title_tfidf_vectorizer.get_feature_names_out())
# data_copy2_title_df

In [155]:
data_copy3 = pd.concat([data_copy2.drop(['publisher', 'title'], axis=1), data_copy2_publisher_df, data_copy2_title_df], axis=1, join='inner')
del data_copy3['authors']

data_copy3 = pd.concat([data_copy3, data_copy2['authors']], axis=1, join='inner')

In [157]:
# Instantiate the TF-IDF vectorizers
authors_tfidf_vectorizer = TfidfVectorizer()


In [159]:
# Fit and transform the 'authors' column
data_copy3_authors_tfidf = authors_tfidf_vectorizer.fit_transform(data_copy3['authors'])
# Convert the TF-IDF matrices to DataFrames
data_copy3_authors_df = pd.DataFrame(data_copy3_authors_tfidf.toarray(), columns=authors_tfidf_vectorizer.get_feature_names_out())
# data_copy3_authors_df
data_copy4 = pd.concat([data_copy3.drop(['authors'], axis=1), data_copy3_authors_df], axis=1, join='inner')
data_copy4


Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,10,18,1976,1st,2000,...,朱學恆,林靜華,橋口,皇冠編譯組,神尾葉子,章博,維人,羅琳,荒川弘,趙丕慧
0,4.57,0.099011,2095690,27591,2006,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.49,0.132167,2153167,29221,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.42,0.053384,6333,244,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.56,0.066008,2339585,36325,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.78,0.408973,41428,164,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10812,3.63,0.023270,406,45,2006,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10813,3.77,0.024183,7,1,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10814,3.93,0.036958,1780,220,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10815,4.08,0.061293,44,7,1994,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Prepare data for eventually more advanced models like NLP
data["text"] = data.apply(lambda row: "the book {} written by {} and published by {} in {}".format(
    row["title"], row["authors"], row["publisher"], row["publication_year"]), axis=1)
data['text2'] = data.apply(lambda row:"the book {} written by {} and published by {} in {} with {} pages rated by {} persons and reviewed by {} people".format(
    row["title"], row["authors"], row["publisher"], row["publication_year"], row['num_pages'], row['ratings_count'], row['text_reviews_count']), axis=1)

In [28]:
# View 
data["text"][0]

'the book Harry Potter and the Half-Blood Prince (Harry Potter  #6) written by J.K. Rowling / Mary GrandPré and published by Scholastic Inc. in 2006'

In [29]:
# View 
data["text2"][0]

'the book Harry Potter and the Half-Blood Prince (Harry Potter  #6) written by J.K. Rowling / Mary GrandPré and published by Scholastic Inc. in 2006 with 652 pages rated by 2095690 persons and reviewed by 27591 people'

## 3. Choose Models for prediction

### 3.1. Regression Linear Models

Split data

In [160]:
data = data_copy4

# Split data into training and test 
target = data[['average_rating']]
features = data[data.columns[~(data.columns.isin(target.columns))].tolist()]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=123)

# To be sure, we're still working with dataframes for X_train, X_test and we have arrays for y_train and y_test
X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

del target, features

Linear Regression

In [164]:
# We will use only quantitative variables to predict avarage_rating
# quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']

# In our linear model, we will take into account the interaction among the quant_variables since we
# already know that taking independantly, they don't have enough informations to predict average_rating.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Create all polynomial combination of features having a degree less or equal to two
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly.fit_transform(X_train) # X_train2 = poly.fit_transform(X_train[quant_variables])
X_test2 = poly.fit_transform(X_test) # X_test2 = poly.fit_transform(X_test[quant_variables])

# Fit our linear regression model on the train, then evaluate the test
model = LinearRegression()
model.fit(X_train2, y_train)

# Evaluate the model
train_score = model.score(X_train2, y_train)
test_score = model.score(X_test2, y_test)
print(f"Training R^2 score: {train_score:.4f}")
print(f"Testing R^2 score: {test_score:.4f}")

# We obtain a R-squared of 3.7% on the test set.
# We will look for more advanced models in order to have better results.

del X_train2, X_test2

MemoryError: Unable to allocate 20.1 TiB for an array with shape (7466, 369933601) and data type float64

Random Forest

In [163]:
# We will use only quantitative variables to predict avarage_rating
# quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train #  X_train2 = X_train[quant_variables]
X_test2 = X_test # X_test2 = X_test[quant_variables]

# We will use a RandomForest Regressor which is more advanced than our linear model with interactions
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Create a Random Forest regressor
rf = RandomForestRegressor(random_state=123)

# Define the parameter grid to search to optimize the hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [5, 10, 15],   # Maximum depth of the tree
    'min_samples_split': [5, 10]  # Minimum number of samples required to split a node
}

# Perform Grid Search Cross-Validation
scorer = make_scorer(r2_score)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring=scorer, n_jobs=-1)
grid_search.fit(X_train2, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_r2 = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best r2_score:", best_r2)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_r2 = r2_score(y_test, best_model.predict(X_test2))
print("r2-score on Test Set using Best Model:", test_r2)

# The Random Forest gives a better result compared to the linear regression,
# But to achieve this 9.6% r_squared which is an augmentation of 1.54 campared to the 3.7% r_squared,
# we need to train at least 1000 trees.

MemoryError: Unable to allocate 474. MiB for an array with shape (5973, 10395) and data type float64

### 3.1. BERT Model for Regression

### 3.2. Classification Models

Create classes and split data

In [34]:
# We treat this problem as a classification model : we need to determine the right cut to seperate average_rating
# into different groups.

# We decide to create two groups for average_rating :
# the group with average_rating lesser than or equal to 4 will be qualified low_medium,
# the group with average_rating greater than 4 will be qualified high.
# This separation has been chosen in order to have similar number in each group.

target = data[['average_rating']]
features = data[data.columns[~(data.columns.isin(target.columns))].tolist()]
Rating_category = target['average_rating'].apply(lambda x: 'high' if x>4 else 'low_medium')

# Split data into training and test 
X_train, X_test, y_train, y_test = train_test_split(features, Rating_category, test_size=0.3, random_state=123)

# To be sure, we're still working with dataframes for X_train, X_test and we have arrays for y_train and y_test
X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test)

del target, features, Rating_category


In [35]:
# Verify if the proportion of low_medium and high is the same in y_train and in y_test
print("The distibution of low and high in the test data is : \n"
      , y_test.value_counts()/sum(y_test.value_counts()))
print('\n')
print("The distibution of low and high in the train data is : \n"
      , y_train.value_counts()/sum(y_train.value_counts()))

The distibution of low and high in the test data is : 
 average_rating
low_medium    0.568044
high          0.431956
Name: count, dtype: float64


The distibution of low and high in the train data is : 
 average_rating
low_medium    0.575856
high          0.424144
Name: count, dtype: float64


In [36]:
# Train a logistic regression to predict upper or lower
from sklearn.linear_model import LogisticRegression

# We will use only quantitative variables to predict avarage_rating
quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train[quant_variables]
X_test2 = X_test[quant_variables]

# We normalize each feature in X_train2 and X_test2
from sklearn.preprocessing import StandardScaler
Transformer = StandardScaler().fit(X_train2)
X_train3 = pd.DataFrame(Transformer.transform(X_train2))
X_test3 = pd.DataFrame(Transformer.transform(X_test2))
del X_train2, X_test2

# Create a Logistic Regression model
model = LogisticRegression(random_state=123)

# Train the model using the training data
model.fit(X_train3, y_train)

# Calculate the f1-score on the training and the test set
from sklearn.metrics import f1_score
f1_train = f1_score(y_train, model.predict(X_train3), average='weighted')
print(f"The F1-score on the training is : {f1_train}")
f1_test = f1_score(y_test, model.predict(X_test3), average='weighted')
print(f"The F1-score on the test is : {f1_test}")

The F1-score on the training is : 0.529439716067725
The F1-score on the test is : 0.5534189925263948


In [37]:
# Create a Logistic Regression model in which we take into account the interaction between the features
model2 = LogisticRegression(random_state=123)

# Create all polynomial combination of features having a degree less or equal to two
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train4 = poly.fit_transform(X_train3)
X_test4 = poly.fit_transform(X_test3)

# Train the model using the training data
model2.fit(X_train4, y_train)

# Calculate the f1-score on the training and the test set
f1_train = f1_score(y_train, model2.predict(X_train4), average='weighted')
print(f"The F1-score on the training is : {f1_train}")
f1_test = f1_score(y_test, model2.predict(X_test4), average='weighted')
print(f"The F1-score on the test is : {f1_test}")

# When we try to take into account the different interactions between variables, the F1-score of the new model
# isn't better than this of the previous one.

The F1-score on the training is : 0.5358595347504272
The F1-score on the test is : 0.5500117592064663


Random Forest

In [38]:
# We will use only quantitative variables to predict avarage_rating
quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train[quant_variables]
X_test2 = X_test[quant_variables]

# We will use a RandomForest Classifier which is more advanced than the logistic regression with interactions
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest regressor
rf = RandomForestClassifier(random_state=123)

# Define the parameter grid to search to optimize the hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],  # Number of trees in the forest
    'max_depth': [5, 10, 15],   # Maximum depth of the tree
    'min_samples_split': [5, 10]  # Minimum number of samples required to split a node
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train3, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best F1-score:", best_score)

# Evaluate the best model on the test set
best_model2 = grid_search.best_estimator_
test_f1 = f1_score(y_test, best_model2.predict(X_test3), average='weighted')
print("F1-score on Test Set using Best Model:", test_f1)

# Even for a classification problem, using an optimized random forest of 500 trees based on only the quantitative features 
# doesn't give us more than 62% of F1-score.
# So, using the qualitative variables or trying to look for more quantitatives features through some links is necessary if we
# want better models.

Best Parameters: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 200}
Best F1-score: 0.6182291956189468
F1-score on Test Set using Best Model: 0.6114765872676544
