## 1. Load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


## 2. Load data

In [2]:
# Some corrections (bad columns' names, tabulation, ...) have been done to the data before loading it
# During exporting, we let Python choose the right data types for each column for now...
data = pd.read_csv(r"books_new.csv")

## 3. Explore data for more corrections

In [3]:
# Number of rows, and columns
data.shape

(11127, 12)

In [3]:
# View of the first two rows
data.head(2)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,439785960,9780000000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,439358078,9780000000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.


In [4]:
# View of the data types chosen for each column by Python, and eventually number of NA (missing data)
data.info()
# The data types chosen for each column is correct.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  float64
 6   language_code       11127 non-null  object 
 7   num_pages           11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(2), int64(4), object(6)
memory usage: 1.0+ MB


In [5]:
data.describe()
# We can't have average_rate, num_pages or ratings_count equal to zero,
# we must investigate and eventually drop the corresponding rows, they can be outliers ;

# But text_reviews_count can be equal to zero.

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759178000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442865000000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780000000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780000000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780000000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790000000000.0,6576.0,4597666.0,94265.0


In [6]:
columns_to_drop = ["bookID", "isbn", "isbn13"]

# We decided to drop these columns because they are IDs and they don't really determine the average rating: they are just identifiers
data1 = data.drop(columns_to_drop, axis=1, inplace=False)
data1.head(2)

Unnamed: 0,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,eng,870,2153167,29221,9/1/2004,Scholastic Inc.


In [7]:
# Get the publication year, instead of having the long date (this format of long date isn't really important)
data1["publication_date"] = data1["publication_date"].apply(lambda x: int(x.split("/")[-1]))
data1.rename(columns={"publication_date" : "publication_year"}, inplace=True)
data1.head(2)

Unnamed: 0,title,authors,average_rating,language_code,num_pages,ratings_count,text_reviews_count,publication_year,publisher
0,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling / Mary GrandPré,4.57,eng,652,2095690,27591,2006,Scholastic Inc.
1,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling / Mary GrandPré,4.49,eng,870,2153167,29221,2004,Scholastic Inc.


In [8]:
# Inspect the langage code, by proceeding with data1
data1["language_code"].value_counts()
# Books written in english were the most rated, but there are also some langages (like arabic, turkish) in minority...

language_code
eng      8911
en-US    1409
spa       218
en-GB     214
fre       144
ger        99
jpn        46
mul        19
zho        14
grc        11
por        10
en-CA       7
ita         5
enm         3
lat         3
swe         2
rus         2
srp         1
nl          1
msa         1
glg         1
wel         1
ara         1
nor         1
tur         1
gla         1
ale         1
Name: count, dtype: int64

In [9]:
# Drop language_code
del data1["language_code"]

In [10]:
data1.describe()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
count,11127.0,11127.0,11127.0,11127.0,11127.0
mean,3.933631,336.376921,17936.41,541.854498,2000.16752
std,0.352445,241.127305,112479.4,2576.176608,8.248836
min,0.0,0.0,0.0,0.0,1900.0
25%,3.77,192.0,104.0,9.0,1998.0
50%,3.96,299.0,745.0,46.0,2003.0
75%,4.135,416.0,4993.5,237.5,2005.0
max,5.0,6576.0,4597666.0,94265.0,2020.0


In [11]:
# Investigate the oldest book (written in 1900)
data1[data1["publication_year"]==1900]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
9375,Consider the Lilies,Iain Crichton Smith / Isobel Murray,3.88,144,332,33,1900,Polygon


In [12]:
# Investigate the publication_year distibution
data1["publication_year"].value_counts()

# It seems that most of the rated books are recent.

publication_year
2006    1700
2005    1260
2004    1071
2003     931
2002     798
        ... 
1947       1
1922       1
1929       1
1919       1
1940       1
Name: count, Length: 87, dtype: int64

In [13]:
data1["publication_year"].unique()

array([2006, 2004, 2003, 2005, 2002, 1996, 2001, 2000, 1990, 1993, 1997,
       1991, 2012, 1994, 1982, 1986, 1999, 1987, 1984, 2007, 1965, 1998,
       1995, 1964, 1970, 2011, 1955, 1988, 1985, 1989, 1963, 2008, 2009,
       1976, 1975, 1980, 1992, 1973, 2019, 1954, 2015, 1919, 1921, 1923,
       1969, 1968, 1961, 1953, 1958, 1983, 1978, 1929, 1977, 1979, 1922,
       1981, 2010, 1950, 1971, 1960, 1959, 2013, 2017, 2016, 1972, 1947,
       1943, 1974, 1957, 2014, 2018, 1952, 1935, 1956, 1966, 1925, 1962,
       1949, 1913, 1928, 1914, 1948, 1967, 1900, 2020, 1931, 1940],
      dtype=int64)

In [14]:
# Number of rows, and columns (columns'number must change since we delete "bookID", "isbn", "language_code" and "isbn13")
data1.shape

(11127, 8)

In [15]:
# Explore the number of unique values for each column
data1.nunique()

# The columns title and authors which are categorical variables have the most different values.

title                 10352
authors                6643
average_rating          209
num_pages               997
ratings_count          5294
text_reviews_count     1822
publication_year         87
publisher              2292
dtype: int64

In [16]:
# Be sure that none row has missing year
data1["publication_year"].isnull().sum()
# data["publication_year"].isna().sum()

0

In [17]:
# Analyze the rows with average_rating == 0
data1[data1["average_rating"]==0]

# In generally, the rows with average_rating == 0, have also ratings_count == 0
# That's normal since there weren't reviewed by people, so 0 is a default value in these cases.
# This 0 can be seen like some kind of default value for books which haven't be reviewed.

# Keeping these rows can be problematic !!!

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
265,Out to Eat London 2002 (Lonely Planet Out to Eat),Lonely Planet / Mark Honan,0.0,295,0,0,2001,Lonely Planet
375,Juiced Official Strategy Guide,Doug Walsh,0.0,112,0,0,2005,BradyGames
987,Open City 6: The Only Woman He Ever Left,Open City Magazine / James Purdy / Daniel Pinc...,0.0,200,0,0,2000,Grove Press Open City Books
2532,How To Meditate: An Anthology Of Talks On Medi...,Frederick P. Lenz,0.0,228,0,0,2004,Frederick P. Lenz Foundation for American Budd...
2533,Insights: Talks On The Nature Of Existence,Frederick P. Lenz,0.0,304,0,0,2003,Frederick P. Lenz Foundation for American Budd...
2758,Venac sonetnih venaca; Puževa srma,Dobrica Erić,0.0,222,0,0,1996,"Izdavačka agencija ""Draganić"""
3493,Brodie's notes on Aldous Huxley's brave new world,Graham Handley,0.0,71,0,0,1992,Macmillan
4242,American Film Guide,Frank N. Magill,0.0,5,0,0,1983,Salem Press Inc
4678,The Man and the Author: John Milton: Twentieth...,J. Martin Evans,0.0,386,0,0,2002,Routledge
5325,Canopy: A Work for Voice and Light in Harvard ...,David Ward / Parveen Adams / Seamus Heaney /...,0.0,63,0,0,1997,Arts Publications


In [18]:
# Drop the rows having average_rating == 0
data2 = data1[~(data1["average_rating"]==0)]
# data2 contains only rows with average_rating != 0

data2.shape

(11101, 8)

In [19]:
# Analyze the rows with ratings_count == 0
# We can't have rows for which at the same time ratings_count is equal to 0, but average_rating is different from 0  since the 
# average_rating is the average of all ratings based on the ratings_count.
data2[(data2["ratings_count"]==0)]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
264,Lonely Planet Londres,Lonely Planet / Sarah Johnstone / Tom Masters,4.03,480,0,0,2006,Geoplaneta
525,American Government: Continuity and Change Al...,Karen O'Connor / Larry J. Sabato,2.83,664,0,0,2005,Longman Publishing Group
526,Essentials of American and Texas Government: C...,Karen O'Connor / Larry J. Sabato,3.5,854,0,0,2005,Longman Publishing Group
624,Comoediae 1: Acharenses/Equites/Nubes/Vespae/P...,Aristophanes / F.W. Hall / W.M. Geldart,5.0,364,0,0,1922,Oxford University Press USA
747,Melville and the politics of identity: From *K...,Julian Markels,3.33,164,0,0,1993,University of Illinois Press
935,April May und June,Elizabeth von Arnim,3.88,88,0,0,1995,Insel Frankfurt
1110,Dr No / Moonraker / Thunderball / From Russia ...,Ian Fleming,3.98,862,0,0,1984,Heinemann-Octopus
1681,V.S. Naipaul,Bruce Alvin King,2.0,240,0,0,2003,Palgrave Macmillan
2051,The Baby Emergency (Tennengarrah Clinic #1),Carol Marinelli,3.6,285,0,0,2004,Mills & Boon
2255,Animales No Se Visten Los (Animals Should Def...,Judi Barrett / Ron Barrett,4.11,32,0,0,1991,Live Oak Media


In [20]:
# Rows with average_rating != 0 but ratings_count == 0 must be dropped
data3 = data2[~(data2['ratings_count'] == 0)]
# data3 contains only rows with average_rating != 0 and ratings_count != 0

data3.shape

(11046, 8)

In [21]:
# Analyze rows with num_pages == 0
# We can't have rows for which at the same time num_pages == 0, but average_rating != 0 since at least pages of these books must be read before
# giving a rating.
data3[(data3["num_pages"]==0)]

Unnamed: 0,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,publisher
306,The 5 Love Languages / The 5 Love Languages Jo...,Gary Chapman,4.70,0,22,4,2005,Moody Publishers
853,The Tragedy of Pudd'nhead Wilson,Mark Twain / Michael Prichard,3.79,0,3,0,2003,Tantor Media
1061,Murder by Moonlight & Other Mysteries (New Adv...,NOT A BOOK,4.00,0,7,2,2006,Simon Schuster Audio
1064,The Unfortunate Tobacconist & Other Mysteries ...,NOT A BOOK,3.50,0,12,1,2003,Simon & Schuster Audio
1230,The Da Vinci Code (Robert Langdon #2),Dan Brown / Paul Michael,3.84,0,91,16,2006,Random House Audio
...,...,...,...,...,...,...,...,...
10030,The Chessmen of Mars (Barsoom #5),Edgar Rice Burroughs / John Bolen,3.83,0,5147,157,2005,Tantor Media
10192,Fine Lines (One-Eyed Mack #6),Jim Lehrer,3.23,0,17,4,1995,Random House Value Publishing
10624,Stowaway and Milk Run: Two Unabridged Stories ...,Mary Higgins Clark / Jan Maxwell,3.49,0,64,2,1999,Simon & Schuster Audio
10926,The Mask of the Enchantress,Victoria Holt,3.85,0,21,1,1981,Ivy Books


In [22]:
# Rows with average_rating != 0 but num_pages == 0 must be dropped
data4 = data3[~(data3["num_pages"]==0)]
# data4 contains only rows with average_rating != 0, ratings_count != 0 and num_pages != 0

data4.shape

(10971, 8)

In [23]:
data4.describe()

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
count,10971.0,10971.0,10971.0,10971.0,10971.0
mean,3.943235,339.06499,18188.83,549.457205,2000.183939
std,0.294536,240.173871,113256.2,2593.630924,8.200245
min,1.0,1.0,1.0,0.0,1900.0
25%,3.78,197.0,113.0,10.0,1998.0
50%,3.96,302.0,781.0,48.0,2003.0
75%,4.14,416.0,5138.0,244.0,2005.0
max,5.0,6576.0,4597666.0,94265.0,2020.0


In [24]:
# Delete unecessary data
del data, data1, data2, data3

# Rename data4 as data, then delete data4
data = data4.copy(deep=True)
del data4

# We will be working with only one dataframe named data which is a clean version of our original data.

In [25]:
# Analyze correlations between quantitative variables
data[['average_rating', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']].corr()

# No high correlation were found ... (Correlation lesser than 0.5 or greater than -0.5)
# Using only these quantitative variables to explain average_rating won't give us models which can perform so well.

Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year
average_rating,1.0,0.170746,0.040815,0.033764,-0.046053
num_pages,0.170746,1.0,0.032975,0.035104,-0.022084
ratings_count,0.040815,0.032975,1.0,0.865931,0.044832
text_reviews_count,0.033764,0.035104,0.865931,1.0,0.067372
publication_year,-0.046053,-0.022084,0.044832,0.067372,1.0


In [40]:
data["average_rating"].value_counts()

average_rating
4.00    215
3.96    194
4.02    177
3.94    176
4.07    171
       ... 
3.10      1
2.81      1
2.55      1
4.91      1
2.33      1
Name: count, Length: 207, dtype: int64

In [29]:
# Create copies of data
data_copy1 = data.copy(deep=True)
data_copy2 = data.copy(deep=True)

In [30]:
# Normalize the num_pages column
from sklearn.preprocessing import StandardScaler, MinMaxScaler

Scaler1 = StandardScaler()
Scaler2 = MinMaxScaler()

data_copy1[['num_pages']] = Scaler1.fit_transform(data_copy1[['num_pages']])
data_copy2[['num_pages']] = Scaler2.fit_transform(data_copy2[['num_pages']])

In [30]:
data_copy1[['num_pages']].describe()

Unnamed: 0,num_pages
count,10971.0
mean,2.789472e-17
std,1.000046
min,-1.407649
25%,-0.5915359
50%,-0.1543327
75%,0.3203451
max,25.9696


In [31]:
data_copy2[['num_pages']].describe()
# data_copy2 is the better format to use since having positive values for num_pages is more understandable.

Unnamed: 0,num_pages
count,10971.0
mean,0.051417
std,0.036528
min,0.0
25%,0.02981
50%,0.045779
75%,0.063118
max,1.0


## 4. Transform all the qualitative data into quantitative data

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [128]:
# Instantiate the TF-IDF vectorizer
publisher_tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'publisher' column
data_copy2_publisher_tfidf = publisher_tfidf_vectorizer.fit_transform(data_copy2['publisher'])
# Convert the TF-IDF matrices to DataFrames
data_copy2_publisher_df = pd.DataFrame(data_copy2_publisher_tfidf.toarray(), columns=publisher_tfidf_vectorizer.get_feature_names_out())
# data_copy2_publisher_df


In [129]:
# Instantiate the TF-IDF vectorizers
title_tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'title' column
data_copy2_title_tfidf = title_tfidf_vectorizer.fit_transform(data_copy2['title'])
# Convert the TF-IDF matrices to DataFrames
data_copy2_title_df = pd.DataFrame(data_copy2_title_tfidf.toarray(), columns=title_tfidf_vectorizer.get_feature_names_out())
# data_copy2_title_df

In [155]:
data_copy3 = pd.concat([data_copy2.drop(['publisher', 'title'], axis=1), data_copy2_publisher_df, data_copy2_title_df], axis=1, join='inner')
del data_copy3['authors']

data_copy3 = pd.concat([data_copy3, data_copy2['authors']], axis=1, join='inner')

In [157]:
# Instantiate the TF-IDF vectorizers
authors_tfidf_vectorizer = TfidfVectorizer()


In [159]:
# Fit and transform the 'authors' column
data_copy3_authors_tfidf = authors_tfidf_vectorizer.fit_transform(data_copy3['authors'])
# Convert the TF-IDF matrices to DataFrames
data_copy3_authors_df = pd.DataFrame(data_copy3_authors_tfidf.toarray(), columns=authors_tfidf_vectorizer.get_feature_names_out())
# data_copy3_authors_df
data_copy4 = pd.concat([data_copy3.drop(['authors'], axis=1), data_copy3_authors_df], axis=1, join='inner')
data_copy4


Unnamed: 0,average_rating,num_pages,ratings_count,text_reviews_count,publication_year,10,18,1976,1st,2000,...,朱學恆,林靜華,橋口,皇冠編譯組,神尾葉子,章博,維人,羅琳,荒川弘,趙丕慧
0,4.57,0.099011,2095690,27591,2006,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.49,0.132167,2153167,29221,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.42,0.053384,6333,244,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.56,0.066008,2339585,36325,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.78,0.408973,41428,164,2004,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10812,3.63,0.023270,406,45,2006,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10813,3.77,0.024183,7,1,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10814,3.93,0.036958,1780,220,2003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10815,4.08,0.061293,44,7,1994,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Prepare data for eventually more advanced models like NLP
data["text"] = data.apply(lambda row: "the book {} written by {} and published by {} in {}".format(
    row["title"], row["authors"], row["publisher"], row["publication_year"]), axis=1)
data['text2'] = data.apply(lambda row:"the book {} written by {} and published by {} in {} with {} pages rated by {} persons and reviewed by {} people".format(
    row["title"], row["authors"], row["publisher"], row["publication_year"], row['num_pages'], row['ratings_count'], row['text_reviews_count']), axis=1)

In [28]:
# View 
data["text"][0]

'the book Harry Potter and the Half-Blood Prince (Harry Potter  #6) written by J.K. Rowling / Mary GrandPré and published by Scholastic Inc. in 2006'

In [29]:
# View 
data["text2"][0]

'the book Harry Potter and the Half-Blood Prince (Harry Potter  #6) written by J.K. Rowling / Mary GrandPré and published by Scholastic Inc. in 2006 with 652 pages rated by 2095690 persons and reviewed by 27591 people'

## 3. Choose Models for prediction

### 3.1. Regression Linear Models

Split data

In [160]:
data = data_copy4

# Split data into training and test 
target = data[['average_rating']]
features = data[data.columns[~(data.columns.isin(target.columns))].tolist()]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=123)

# To be sure, we're still working with dataframes for X_train, X_test and we have arrays for y_train and y_test
X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

del target, features

Linear Regression

In [164]:
# We will use only quantitative variables to predict avarage_rating
# quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']

# In our linear model, we will take into account the interaction among the quant_variables since we
# already know that taking independantly, they don't have enough informations to predict average_rating.
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Create all polynomial combination of features having a degree less or equal to two
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train2 = poly.fit_transform(X_train) # X_train2 = poly.fit_transform(X_train[quant_variables])
X_test2 = poly.fit_transform(X_test) # X_test2 = poly.fit_transform(X_test[quant_variables])

# Fit our linear regression model on the train, then evaluate the test
model = LinearRegression()
model.fit(X_train2, y_train)

# Evaluate the model
train_score = model.score(X_train2, y_train)
test_score = model.score(X_test2, y_test)
print(f"Training R^2 score: {train_score:.4f}")
print(f"Testing R^2 score: {test_score:.4f}")

# We obtain a R-squared of 3.7% on the test set.
# We will look for more advanced models in order to have better results.

del X_train2, X_test2

MemoryError: Unable to allocate 20.1 TiB for an array with shape (7466, 369933601) and data type float64

Random Forest

In [163]:
# We will use only quantitative variables to predict avarage_rating
# quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train #  X_train2 = X_train[quant_variables]
X_test2 = X_test # X_test2 = X_test[quant_variables]

# We will use a RandomForest Regressor which is more advanced than our linear model with interactions
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

# Create a Random Forest regressor
rf = RandomForestRegressor(random_state=123)

# Define the parameter grid to search to optimize the hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [5, 10, 15],   # Maximum depth of the tree
    'min_samples_split': [5, 10]  # Minimum number of samples required to split a node
}

# Perform Grid Search Cross-Validation
scorer = make_scorer(r2_score)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring=scorer, n_jobs=-1)
grid_search.fit(X_train2, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_r2 = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best r2_score:", best_r2)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
test_r2 = r2_score(y_test, best_model.predict(X_test2))
print("r2-score on Test Set using Best Model:", test_r2)

# The Random Forest gives a better result compared to the linear regression,
# But to achieve this 9.6% r_squared which is an augmentation of 1.54 campared to the 3.7% r_squared,
# we need to train at least 1000 trees.

MemoryError: Unable to allocate 474. MiB for an array with shape (5973, 10395) and data type float64

### 3.1. BERT Model for Regression

### 3.2. Classification Models

Create classes and split data

In [34]:
# We treat this problem as a classification model : we need to determine the right cut to seperate average_rating
# into different groups.

# We decide to create two groups for average_rating :
# the group with average_rating lesser than or equal to 4 will be qualified low_medium,
# the group with average_rating greater than 4 will be qualified high.
# This separation has been chosen in order to have similar number in each group.

target = data[['average_rating']]
features = data[data.columns[~(data.columns.isin(target.columns))].tolist()]
Rating_category = target['average_rating'].apply(lambda x: 'high' if x>4 else 'low_medium')

# Split data into training and test 
X_train, X_test, y_train, y_test = train_test_split(features, Rating_category, test_size=0.3, random_state=123)

# To be sure, we're still working with dataframes for X_train, X_test and we have arrays for y_train and y_test
X_train = pd.DataFrame(X_train) 
X_test = pd.DataFrame(X_test)

del target, features, Rating_category


In [35]:
# Verify if the proportion of low_medium and high is the same in y_train and in y_test
print("The distibution of low and high in the test data is : \n"
      , y_test.value_counts()/sum(y_test.value_counts()))
print('\n')
print("The distibution of low and high in the train data is : \n"
      , y_train.value_counts()/sum(y_train.value_counts()))

The distibution of low and high in the test data is : 
 average_rating
low_medium    0.568044
high          0.431956
Name: count, dtype: float64


The distibution of low and high in the train data is : 
 average_rating
low_medium    0.575856
high          0.424144
Name: count, dtype: float64


In [36]:
# Train a logistic regression to predict upper or lower
from sklearn.linear_model import LogisticRegression

# We will use only quantitative variables to predict avarage_rating
quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train[quant_variables]
X_test2 = X_test[quant_variables]

# We normalize each feature in X_train2 and X_test2
from sklearn.preprocessing import StandardScaler
Transformer = StandardScaler().fit(X_train2)
X_train3 = pd.DataFrame(Transformer.transform(X_train2))
X_test3 = pd.DataFrame(Transformer.transform(X_test2))
del X_train2, X_test2

# Create a Logistic Regression model
model = LogisticRegression(random_state=123)

# Train the model using the training data
model.fit(X_train3, y_train)

# Calculate the f1-score on the training and the test set
from sklearn.metrics import f1_score
f1_train = f1_score(y_train, model.predict(X_train3), average='weighted')
print(f"The F1-score on the training is : {f1_train}")
f1_test = f1_score(y_test, model.predict(X_test3), average='weighted')
print(f"The F1-score on the test is : {f1_test}")

The F1-score on the training is : 0.529439716067725
The F1-score on the test is : 0.5534189925263948


In [37]:
# Create a Logistic Regression model in which we take into account the interaction between the features
model2 = LogisticRegression(random_state=123)

# Create all polynomial combination of features having a degree less or equal to two
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train4 = poly.fit_transform(X_train3)
X_test4 = poly.fit_transform(X_test3)

# Train the model using the training data
model2.fit(X_train4, y_train)

# Calculate the f1-score on the training and the test set
f1_train = f1_score(y_train, model2.predict(X_train4), average='weighted')
print(f"The F1-score on the training is : {f1_train}")
f1_test = f1_score(y_test, model2.predict(X_test4), average='weighted')
print(f"The F1-score on the test is : {f1_test}")

# When we try to take into account the different interactions between variables, the F1-score of the new model
# isn't better than this of the previous one.

The F1-score on the training is : 0.5358595347504272
The F1-score on the test is : 0.5500117592064663


In [1]:
import tensorflow as tf

print(tf.__version__)


2.15.0


Random Forest

In [38]:
# We will use only quantitative variables to predict avarage_rating
quant_variables = ['num_pages', 'ratings_count', 'text_reviews_count', 'publication_year']
X_train2 = X_train[quant_variables]
X_test2 = X_test[quant_variables]

# We will use a RandomForest Classifier which is more advanced than the logistic regression with interactions
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a Random Forest regressor
rf = RandomForestClassifier(random_state=123)

# Define the parameter grid to search to optimize the hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300, 500, 1000],  # Number of trees in the forest
    'max_depth': [5, 10, 15],   # Maximum depth of the tree
    'min_samples_split': [5, 10]  # Minimum number of samples required to split a node
}

# Perform Grid Search Cross-Validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search.fit(X_train3, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best Parameters:", best_params)
print("Best F1-score:", best_score)

# Evaluate the best model on the test set
best_model2 = grid_search.best_estimator_
test_f1 = f1_score(y_test, best_model2.predict(X_test3), average='weighted')
print("F1-score on Test Set using Best Model:", test_f1)

# Even for a classification problem, using an optimized random forest of 500 trees based on only the quantitative features 
# doesn't give us more than 62% of F1-score.
# So, using the qualitative variables or trying to look for more quantitatives features through some links is necessary if we
# want better models.

Best Parameters: {'max_depth': 15, 'min_samples_split': 10, 'n_estimators': 200}
Best F1-score: 0.6182291956189468
F1-score on Test Set using Best Model: 0.6114765872676544


BERT MODEL TEST

In [26]:
df1 = data.copy()
df1['text'] = df1.apply(lambda row:"the book {} written by {} and published by {} in {} with {} pages rated by {} persons and reviewed by {} people.".format(
    row["title"], row["authors"], row["publisher"], row["publication_year"], row['num_pages'], row['ratings_count'], row['text_reviews_count']), axis=1)

In [27]:
a = "J.K. Rowling / Mary GrandPré / brad"
y = "b"
# b = a[:-1]
# c = a[-1:]
# print(b)
# print(c)

def parse_authors(authors):
    list_auth = [x.lstrip().rstrip() for x in authors.split('/')]
    n = len(list_auth)
    if n > 1:
        auth1 = list_auth[:-1]
        auth2 = list_auth[-1:]
        return  ', '.join(auth1) + ' and '+ auth2[0].lstrip().rstrip()
    else:
        return authors

In [69]:
parse_authors(a)
#parse_authors(y)

'J.K. Rowling, Mary GrandPré and brad'

In [28]:
df2 = data.copy(deep=True)
def parse_authors(authors):
    list_auth = [x.lstrip().rstrip() for x in authors.split('/')]
    n = len(list_auth)
    if n > 1:
        auth1 = list_auth[:-1]
        auth2 = list_auth[-1:]
        return  ", ".join(auth1) + " and "+ auth2[0]
    else:
        return authors
df2['authors'] = df2['authors'].apply(parse_authors)

In [29]:
df2['text'] = df2.apply(lambda row:"The book '{}' with {} pages, written by {}, published by '{}' in {} got rated by {} people and reviewed by {} people.".format(
    row["title"], row['num_pages'], row["authors"], row["publisher"], row["publication_year"], row['ratings_count'], row['text_reviews_count']), axis=1)

In [30]:
df3 = df2.copy()
df3['text'] = df2.apply(lambda row:"The book '{}' written by {} got published by {}.".format(
    row["title"], row["authors"], row["publisher"]), axis=1)

In [79]:
df2['text'][1056]

"The book 'The New Annotated Sherlock Holmes: The Complete Short Stories' with 1878 pages, written by Arthur Conan Doyle and Leslie S. Klinger, published by 'W. W. Norton & Company' in 2004 got rated by 1411 people and reviewed by 54 people."

In [83]:
df3['text'][1056]

"The book 'The New Annotated Sherlock Holmes: The Complete Short Stories' written by Arthur Conan Doyle and Leslie S. Klinger got published by W. W. Norton & Company."

In [27]:
df4 = df1[["text", "average_rating"]]
#df = data.drop(["text2", "publication_year"], axis=1)

In [29]:
df1["text"] = df1["text"].apply(lambda x: x.replace('/',','))
df1 = df1.drop(["title", "authors", "publisher", "publication_year"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["text"] = df1["text"].apply(lambda x: x.replace('/',','))


In [29]:
df1["text"][0]

'the book Harry Potter and the Half-Blood Prince (Harry Potter  #6) written by J.K. Rowling , Mary GrandPré and published by Scholastic Inc. in 2006 with 652 pages rated by 2095690 persons and reviewed by 27591 people.'

In [104]:
df2 = df2.drop(["title", "authors", "publisher", "publication_year"], axis=1)

In [105]:
df2["text"][0]

"The book 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)' with 652 pages, written by J.K. Rowling  and  Mary GrandPré, published by 'Scholastic Inc.' in 2006 got rated by 2095690 people and reviewed by 27591 people."

In [31]:
numerical_features = ['num_pages', 'ratings_count', 'text_reviews_count']

In [38]:

import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#df = data_copy2

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test sets
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = tf.concat([extract_bert_embeddings(text_tokens) for text_tokens in X_train_text], axis=0)
X_test_text_embeddings = tf.concat([extract_bert_embeddings(text_tokens) for text_tokens in X_test_text], axis=0)

# Select numerical features
numerical_features = ['num_pages', 'ratings_count', 'text_reviews_count']

# Define a column transformer for numerical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
    ])

# Combine BERT embeddings and numerical features
X_train_combined = tf.concat([X_train_text_embeddings, preprocessor.fit_transform(train_data[numerical_features])], axis=1)
X_test_combined = tf.concat([X_test_text_embeddings, preprocessor.transform(test_data[numerical_features])], axis=1)

# Train a linear regression model
regression_model = LinearRegression()
regression_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = regression_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07580131243840232
Mean Absolute Error: 0.20049456884431952
R-squared: 0.14152150132819352


In [52]:
df["average_rating"].describe()

count    10971.000000
mean         3.943235
std          0.294536
min          1.000000
25%          3.780000
50%          3.960000
75%          4.140000
max          5.000000
Name: average_rating, dtype: float64

In [39]:
# Install the required libraries
# !pip install transformers tensorflow

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate

# Assuming you have a DataFrame called 'df' with your dataset

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test sets
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = tf.concat([extract_bert_embeddings(text_tokens) for text_tokens in X_train_text], axis=0)
X_test_text_embeddings = tf.concat([extract_bert_embeddings(text_tokens) for text_tokens in X_test_text], axis=0)

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Build a combined model for text and numerical features
input_text = Input(shape=(X_train_text_embeddings.shape[1],), name='text_input')
input_numerical = Input(shape=(len(numerical_features),), name='numerical_input')

text_branch = Dense(128, activation='relu')(input_text)
numerical_branch = Dense(128, activation='relu')(input_numerical)

merged = Concatenate()([text_branch, numerical_branch])
output = Dense(1, activation='linear')(merged)

model = Model(inputs=[input_text, input_numerical], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit([X_train_text_embeddings, X_train_numerical], train_data['average_rating'], epochs=5, batch_size=32, validation_split=0.1)

# Make predictions on the test set
predictions = model.predict([X_test_text_embeddings, X_test_numerical])

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w


Epoch 1/5

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Mean Squared Error: 0.08054326595353845
Mean Absolute Error: 0.21524880369703428
R-squared: 0.0878170864111878


In [37]:
# Install the required libraries
# !pip install transformers xgboost tensorflow

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from tqdm import tqdm

# Assuming you have a DataFrame called 'df' with your dataset

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test sets
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = xgb_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07575095626484431
Mean Absolute Error: 0.20022113594107313
R-squared: 0.14209180401668686


In [33]:
# Install the required libraries
# !pip install transformers scikit-learn

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

# Assuming you have a DataFrame called 'df' with your dataset

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test sets
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = rf_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07283465022779044
Mean Absolute Error: 0.19613179954441914
R-squared: 0.17512007157329412


In [33]:
# Install the required libraries
# !pip install transformers scikit-learn

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

# Assuming you have a DataFrame called 'df' with your dataset

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test sets
train_data, test_data = train_test_split(df1, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100,criterion='friedman_mse')
rf_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = rf_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07225782266970388
Mean Absolute Error: 0.19582533029612756
R-squared: 0.18165286157559235


In [108]:
# Install the required libraries
# !pip install transformers scikit-learn

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm

# Split the data into train and test sets
train_data, test_data = train_test_split(df2, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train Random Forest model
rf_model2 = RandomForestRegressor(n_estimators=100, criterion='friedman_mse', random_state=42)
rf_model2.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = rf_model2.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')





  from .autonotebook import tqdm as notebook_tqdm





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07182913859681093
Mean Absolute Error: 0.19496938496583144
R-squared: 0.18650787064421104


In [32]:
# Install the required libraries
# !pip install transformers xgboost tensorflow

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from tqdm import tqdm

#Split the data into train and test sets
train_data, test_data = train_test_split(df3, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = TFBertModel.from_pretrained('bert-large-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = xgb_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = xgb_model.predict(X_train_combined)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')




  from .autonotebook import tqdm as notebook_tqdm





Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07759803129857874
Mean Absolute Error: 0.20424978572523947
R-squared: 0.12117298149388334
Training set ------------------------------------------------------------------------------
Mean Squared Error: 0.002207902756146978
Mean Absolute Error: 0.03506574565807905
R-squared: 0.974428311275299


In [97]:
# Install the required libraries
# !pip install transformers xgboost tensorflow

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from tqdm import tqdm

#Split the data into train and test sets
train_data, test_data = train_test_split(df3, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = xgb_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = xgb_model.predict(X_train_combined)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Mean Squared Error: 0.07357641346376258
Mean Absolute Error: 0.1980206516630829
R-squared: 0.16671932271152778
Training set ------------------------------------------------------------------------------
Mean Squared Error: 0.001836766758927898
Mean Absolute Error: 0.03206074743892458
R-squared: 0.9787267678848555


In [99]:
# Install the required libraries
# !pip install transformers xgboost tensorflow

# import numpy as np
# import tensorflow as tf
# from transformers import BertTokenizer, TFBertModel
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.preprocessing import StandardScaler
# from xgboost import XGBRegressor
# from tqdm import tqdm

# #Split the data into train and test sets
# train_data, test_data = train_test_split(df3, test_size=0.2, random_state=42)

# # Define the BERT model and tokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# # Tokenize the text data
# def tokenize_text(text):
#     return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

# X_train_text = train_data['text'].apply(tokenize_text)
# X_test_text = test_data['text'].apply(tokenize_text)

# # Extract BERT embeddings for text data
# def extract_bert_embeddings(text_tokens):
#     return bert_model(**text_tokens).last_hidden_state[:, 0, :]

# X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
# X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# # Standardize numerical features
# scaler = StandardScaler()
# X_train_numerical = scaler.fit_transform(train_data[numerical_features])
# X_test_numerical = scaler.transform(test_data[numerical_features])

# # Combine BERT embeddings with numerical features
# X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
# X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train_text_embeddings, train_data['average_rating'])

# Make predictions on the test set
predictions = xgb_model.predict(X_test_text_embeddings)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = xgb_model.predict(X_train_text_embeddings)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')

Mean Squared Error: 0.08129823090318869
Mean Absolute Error: 0.20952855350433558
R-squared: 0.07926682315483247
Training set ------------------------------------------------------------------------------
Mean Squared Error: 0.002313236686283981
Mean Absolute Error: 0.035639110393380725
R-squared: 0.9732083451938611


In [54]:
!pip install torch

Collecting torch
  Downloading torch-2.1.2-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
     ---------------------------------------- 0.0/5.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/5.7 MB 991.0 kB/s eta 0:00:06
      --------------------------------------- 0.1/5.7 MB 1.1 MB/s eta 0:00:06
      --------------------------------------- 0.1/5.7 MB 1.1 MB/s eta 0:00:06
     - -------------------------------------- 0.2/5.7 MB 1.1 MB/s eta 0:00:06
     - -------------------------------------- 0.3/5.7 MB 1.1 MB/s eta 0:00:05
     -- ------------------------------------- 0.3/5.7 MB 1.1 MB/s eta 0:00:05
     -- ------------------------------------- 0.4/5.7 MB 1.1 MB/s eta 0:00:05
     -- ------------------------------------- 0.4/5.7 MB 1.2 MB/s eta 0:00:05
     --- ------------------------------------ 0.5/5.7 MB 1.2 MB/s eta 0:00:05
     --- ------------------------------------ 0.5/5.7 MB 1

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\brad\AppData\Roaming\Python\Python311\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\brad\AppData\Roaming\Python\Python311\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\brad\AppData\Roaming\Python\Python311\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\brad\AppData\Roaming\Python\Python311\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 98, in read
    data: bytes = self.__fp.read(amt)
                  ^^^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\http\client.py", line 466, in read
    s = self.fp.read(amt)
        ^^^^^^^^^^^^^^^^^
  File "C:\Python311\Lib\socket.py", line 706, in readinto

In [57]:
import torch

In [59]:
# Install the required libraries
# !pip install transformers xgboost tensorflow

import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from tqdm import tqdm

# Fine-tune BERT
from transformers import TFBertForSequenceClassification, BertConfig
from transformers import TFTrainer, TrainingArguments

#Split the data into train and test sets
train_data, test_data = train_test_split(df2, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer for fine-tuning
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Define the training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
    output_dir='./results',
    overwrite_output_dir=True,
    evaluation_strategy="epoch"
)

# Define the trainer
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=X_train_text,  # Pass the tokenized text as training dataset
)

# Fine-tune the model
trainer.train()

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return model.bert(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Train XGBoost model
xgb_model = XGBRegressor()
xgb_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = xgb_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = xgb_model.predict(X_train_combined)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: 
TrainingArguments requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFTrainingArguments".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [114]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tqdm import tqdm

# Split the data into train and test sets
train_data, test_data = train_test_split(df2, test_size=0.2, random_state=42)

# Define the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_model = TFBertModel.from_pretrained('bert-large-uncased')

# Tokenize the text data
def tokenize_text(text):
    return tokenizer(text, return_tensors='tf', truncation=True, padding=True)

X_train_text = train_data['text'].apply(tokenize_text)
X_test_text = test_data['text'].apply(tokenize_text)

# Extract BERT embeddings for text data
def extract_bert_embeddings(text_tokens):
    return bert_model(**text_tokens).last_hidden_state[:, 0, :]

X_train_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_train_text, desc="Tokenizing Train Data")])
X_test_text_embeddings = np.vstack([extract_bert_embeddings(text_tokens).numpy() for text_tokens in tqdm(X_test_text, desc="Tokenizing Test Data")])

# Standardize numerical features
scaler = StandardScaler()
numerical_features = ['num_pages', 'ratings_count', 'text_reviews_count']  # Define your numerical features here
X_train_numerical = scaler.fit_transform(train_data[numerical_features])
X_test_numerical = scaler.transform(test_data[numerical_features])

# Combine BERT embeddings with numerical features
X_train_combined = np.hstack([X_train_text_embeddings, X_train_numerical])
X_test_combined = np.hstack([X_test_text_embeddings, X_test_numerical])

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7]
}

xgb_model1 = XGBRegressor()
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train_combined, train_data['average_rating'])

best_xgb_model = grid_search.best_estimator_

# Make predictions on the test set
predictions = best_xgb_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')

# Evaluate on training set
predictions_train = best_xgb_model.predict(X_train_combined)
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print('Training set ------------------------------------------------------------------------------')
print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] END ......................max_depth=3, n_estimators=100; total time=   5.8s
[CV] END ......................max_depth=3, n_estimators=100; total time=   5.5s
[CV] END ......................max_depth=3, n_estimators=100; total time=   5.4s
[CV] END ......................max_depth=3, n_estimators=200; total time=  10.1s
[CV] END ......................max_depth=3, n_estimators=200; total time=  10.2s
[CV] END ......................max_depth=3, n_estimators=200; total time=  10.2s
[CV] END ......................max_depth=3, n_estimators=300; total time=  15.2s
[CV] END ......................max_depth=3, n_estimators=300; total time=  15.6s
[CV] END ......................max_depth=3, n_estimators=300; total time=  15.3s
[CV] END ......................max_depth=5, n_estimators=100; total time=  12.0s
[CV] END ......................max_depth=5, n_estimators=100; total time=  12.2s
[CV] END ......................max_depth=5, n_est

In [35]:

import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
#df = data_copy2

# Construct the text column
#df['text'] = df.apply(lambda row: f"The book {row['title']} written by {row['authors']} and published by {row['publisher']} in {row['publication_year']} with {row['num_pages']} pages rated by {row['ratings_count']} people and reviewed by {row['text_reviews_count']} people.", axis=1)

# Split the data into train and test set

# Define the BERT model and tokenizer

regression_model = LinearRegression()
regression_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = regression_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = regression_model.predict(X_train_combined)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')


Mean Squared Error: 0.07723046213916958
Mean Absolute Error: 0.20567632875444875
R-squared: 0.12533584108004248
Training set ------------------------------------------------------------------------------
Mean Squared Error: 0.059152667632517884
Mean Absolute Error: 0.18412563229225032
R-squared: 0.3149002601119345


In [41]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
regression_model = AdaBoostRegressor()
regression_model.fit(X_train_combined, train_data['average_rating'])

# Make predictions on the test set
predictions = regression_model.predict(X_test_combined)

# Evaluate the model
mse = mean_squared_error(test_data['average_rating'], predictions)
mae = mean_absolute_error(test_data['average_rating'], predictions)
r2 = r2_score(test_data['average_rating'], predictions)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R-squared: {r2}')
print('Training set ------------------------------------------------------------------------------')
predictions_train = regression_model.predict(X_train_combined)

# Evaluate the model
mse_train = mean_squared_error(train_data['average_rating'], predictions_train)
mae_train = mean_absolute_error(train_data['average_rating'], predictions_train)
r2_train = r2_score(train_data['average_rating'], predictions_train)

print(f'Mean Squared Error: {mse_train}')
print(f'Mean Absolute Error: {mae_train}')
print(f'R-squared: {r2_train}')


Mean Squared Error: 0.09574084219202969
Mean Absolute Error: 0.23859231964882388
R-squared: -0.08430120564704491
Training set ------------------------------------------------------------------------------
Mean Squared Error: 0.08424875214922245
Mean Absolute Error: 0.23445721252497517
R-squared: 0.0242401485271857


In [124]:
from sklearn.model_selection import cross_val_score

# Assuming 'X_train_combined', 'y_train' are your training data and labels
# 'rf_model' is your current RandomForestRegressor model

# Cross-validation on the training set
cv_scores = cross_val_score(rf_model2, X_train_combined, train_data['average_rating'], cv=5, scoring='neg_mean_squared_error')

# Display cross-validation scores
print(f'Cross-Validation Mean Squared Error: {-cv_scores.mean()}')


Cross-Validation Mean Squared Error: 0.0696000097154216


In [69]:
# Install the required library
# !pip install joblib

import joblib

# Save the model
joblib.dump(rf_model, 'book_pred.joblib')

# Load the model later if needed
#loaded_model = joblib.load('random_forest_model.joblib')


['book_pred.joblib']

In [90]:
# Assuming 'df1' is your dataset and 'rf_model', 'tokenizer', 'scaler' are already defined

# Select the specific row
#line = df1.iloc[9000]

# Construct the text for the single line
text = line['text']
# Tokenize the text
tokenized_text = tokenizer(text, return_tensors='tf', truncation=True, padding=True)

# Extract BERT embeddings for text data
text_embedding = extract_bert_embeddings(tokenized_text).numpy()

# Standardize numerical features
numerical_features = scaler.transform(line[['num_pages', 'ratings_count', 'text_reviews_count']].values.reshape(1, -1))

# Combine BERT embeddings with numerical features
input_data = np.hstack([text_embedding, numerical_features])

# Make predictions
prediction = rf_model.predict(input_data.reshape(1, -1))
loaded_model = joblib.load('book_pred.joblib')
prediction2 = np.round(loaded_model.predict(input_data.reshape(1, -1)), 2)

# Print the prediction
print(f'Predicted Average Rating: {prediction[0]}')
print(f'Predicted Average Rating 2: {prediction2[0]}')


Predicted Average Rating: 4.358100000000004
Predicted Average Rating 2: 4.36


  should set `reset=False`.


In [80]:
import numpy as np

In [53]:
# Assuming 'df1' is your dataset and 'rf_model', 'tokenizer', 'scaler' are already defined

# Select the specific row
line = df1.iloc[8532]

# Construct the text for the single line
text = line['text']
# Tokenize the text
tokenized_text = tokenizer(text, return_tensors='tf', truncation=True, padding=True)

# Extract BERT embeddings for text data
text_embedding = extract_bert_embeddings(tokenized_text).numpy()

# Standardize numerical features
numerical_features = scaler.transform(line[numerical_features].values.reshape(1, -1))[0]

# Combine BERT embeddings with numerical features
input_data = np.hstack([text_embedding, numerical_features])

# Make predictions
prediction = rf_model.predict(input_data)

# Print the prediction
print(f'Predicted Average Rating: {prediction[0]}')


ValueError: Cannot index with multidimensional key

In [None]:
# Install the required library
# !pip install matplotlib

import matplotlib.pyplot as plt

# Assuming you have predictions from different models
linear_regression_predictions = linear_regression_model.predict(X_test_combined)
xgboost_predictions = xgboost_model.predict(X_test_combined)
random_forest_predictions = random_forest_model.predict(X_test_combined)

# Create a scatter plot
plt.figure(figsize=(12, 8))

plt.scatter(test_data['average_rating'], linear_regression_predictions, label='Linear Regression', alpha=0.5)
plt.scatter(test_data['average_rating'], xgboost_predictions, label='XGBoost', alpha=0.5)
plt.scatter(test_data['average_rating'], random_forest_predictions, label='Random Forest', alpha=0.5)

plt.title('Actual vs Predicted Ratings')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.legend()
plt.grid(True)
plt.show()
