In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2

### Load the data

In [2]:
# Read the labelled training set; the first CSV row supplies the column names.
train_data = pd.read_csv(
    "../data/book_rating_train.csv",
    index_col=False,
    delimiter=",",
    header=0,
)

In [3]:
# Every column except the last is treated as a feature; the last column is the label.
train_data_X, train_data_y = train_data.iloc[:, :-1], train_data.iloc[:, -1]

In [4]:
# Show the (rows, columns) shape of the raw training frame.
train_data.shape

(23063, 10)

In [5]:
# Positional (unshuffled) hold-out split: the leading 80% of the rows are used
# for fitting, the remaining rows are held out.
train_test_ratio = 0.8
split_position = int(len(train_data) * train_test_ratio)
y_train, y_test = (
    train_data_y.iloc[:split_position],
    train_data_y.iloc[split_position:],
)

# Preprocess

### Dealing with Missing Data

In [6]:
# Fraction of missing entries per feature column; display only the columns
# that actually contain missing values.
missing_df = train_data_X.isna().sum().div(train_data_X.shape[0])
missing_df[missing_df > 0]

Publisher    0.006417
Language     0.745870
dtype: float64

In [7]:
# Substitute the placeholder string "NA" for missing publishers, then show the
# rows that received the placeholder.
publisher_col = train_data["Publisher"]
train_data["Publisher"] = publisher_col.where(publisher_col.notna(), "NA")
train_data[train_data["Publisher"].eq("NA")]

Unnamed: 0,Name,Authors,PublishYear,PublishMonth,PublishDay,Publisher,Language,pagesNumber,Description,rating_label
305,UNIX System V release 4 : an introduction for ...,Kenneth H. Rosen,1996,2,1,,,1175,An informative introductory text to the UNIX o...,3.0
573,El abismo en el tiempo,H.P. Lovecraft,1998,2,28,,spa,100,“The Shadow out of Time” is H. P. Lovecraft’s ...,4.0
574,Anne of Windy Willows,L.M. Montgomery,1994,5,1,,,240,Anne Shirley has left Redmond College behind t...,4.0
579,Poor Liza,Nikolay Karamzin,2010,10,15,,,80,Nikolaj Karamzin (1766-1826) is vooral bekend ...,3.0
626,Adventures in Odyssey: Darkness Before Dawn,AIO Team,2004,11,4,,,4,An unprecedented wave of criminal activity has...,5.0
...,...,...,...,...,...,...,...,...,...,...
22187,Seasons of the Wild: A Year of Nature's Magic ...,Sy Montgomery,1996,3,1,,,152,"""Award-winning author Sy Montgomery takes read...",4.0
22491,"P.S. We'll Miss You (Friends-4-Ever, #1)",Deirdre Corey,1990,4,10,,eng,121,Don't forget to write!<br /><br />Molly has ju...,4.0
22630,The Iron Man with the Adventures of Dennis Dorgan,Robert E. Howard,1983,10,1,,eng,232,"Robert E. Howard, creator of <b>CONAN THE BARB...",4.0
22631,Day of Descent,Judith Reeves-Stevens,1993,12,31,,,404,"The long-awaited first, original novel based o...",4.0


<li> In the feature _"Language"_, nearly 75% of the values are missing. Let us compare the distribution of the rating label over the whole training data with its distribution over the rows where the language is missing:

In [8]:
# Percentage of each rating label over the complete training set.
full_label = train_data["rating_label"].value_counts().div(train_data.shape[0]).mul(100)

# Percentage of each rating label among the rows whose "Language" is missing.
train_lan_na = train_data.loc[train_data["Language"].isna(), "rating_label"]
train_lan_na_unique = train_lan_na.value_counts()
na_lan = train_lan_na_unique.div(train_lan_na.shape[0]).mul(100)

# Side-by-side comparison, aligned on the rating label.
label_dist = pd.DataFrame({"Full": full_label, "Missing": na_lan})

In [9]:
# Add one column per language that occurs at least 100 times, holding the
# percentage of each rating label among the books in that language.
train_lan = train_data["Language"]
lan_count = train_lan.value_counts()

for lan in lan_count[lan_count >= 100].index:
    lan_labels = train_data.loc[train_data["Language"] == lan, "rating_label"]
    label_dist[lan] = lan_labels.value_counts() / lan_labels.shape[0] * 100

In [10]:
# Display the per-category label percentages assembled above.
label_dist

Unnamed: 0,Full,Missing,eng,fre,spa
4.0,70.277067,67.672364,78.12844,73.376623,79.194631
3.0,25.426007,27.467736,19.357798,22.077922,18.791946
5.0,4.296926,4.8599,2.513761,4.545455,2.013423


As we can see, the label distributions are similar across all categories (the full data, the rows with a missing language, and each frequent language). Hence, knowing the value of the feature _"Language"_ gives us little information about the class label in this training data.
<br> We therefore exclude this feature from the dataset.

In [11]:
# Remove the mostly-missing "Language" feature from the feature frame.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise a KeyError
# once the column is already gone.
train_data_X = train_data_X.drop(columns=["Language"], errors="ignore")
train_data_X

Unnamed: 0,Name,Authors,PublishYear,PublishMonth,PublishDay,Publisher,pagesNumber,Description
0,Best of Dr Jean: Reading & Writing,Jean R. Feldman,2005,6,1,Teaching Resources,48,Teachers will turn to this treasury of ideas a...
1,Here All Dwell Free,Gertrud Mueller Nelson,1991,10,1,DoubleDay,364,Every human being lives a fairy tale -- an unc...
2,Boomer's Big Surprise,Constance W. McGeorge,2005,3,31,Chronicle Books,32,<i>Boomer's Big Surprise</i> will have special...
3,"I'll Go and Do More: Annie Dodge Wauneka, Nava...",Carolyn Niethammer,2004,9,1,Bison Books,293,<i>I'll Go and Do More</i> is the story of Ann...
4,Us,Richard Mason,2005,7,7,Penguin Books Ltd,352,"Since their days at Oxford, they've gone their..."
...,...,...,...,...,...,...,...,...
23058,Black Coffee Blues,Henry Rollins,1997,8,1,2.13.61,120,"""If I lose the light of the sun, I will write ..."
23059,America's Champion Swimmer: Gertrude Ederle,David A. Adler,2005,6,1,HMH Books for Young Readers,32,"Trudy Ederle loved to swim, and she was determ..."
23060,Crime and Custom in Savage Society,Bronisław Malinowski,1989,2,15,Rowman & Littlefield Publishers,132,Bronislaw Malinowski achieved international re...
23061,The Name and Nature of Poetry and Other Select...,A.E. Housman,1998,4,21,New Amsterdam Books,136,Lovers of Housman's poetry and admirers of his...


### Data Types Conversion

##### Authors & Publisher

Convert _"Authors"_ and _"Publisher"_ attributes to numerical type using one-hot encoding

In [12]:
# Single-column frames holding the two categorical features to be encoded.
authors_df = train_data["Authors"].to_frame()
publisher_df = train_data["Publisher"].to_frame()

In [13]:
# Authors
# handle_unknown="ignore": the fitted encoder is reused later in the notebook
# on the test set, which may contain authors that never occur in the training
# data. With the default ("error") that transform raises a ValueError; with
# "ignore" an unseen author is encoded as an all-zero row. The encoding of the
# training data itself is unaffected.
authors_ohe = OneHotEncoder(handle_unknown="ignore")

# Fit on the training authors and densify the encoded matrix so it can be
# wrapped in a DataFrame with one named column per author.
authors_trans_ohe = authors_ohe.fit_transform(authors_df).toarray()
authors_trans_df = pd.DataFrame(authors_trans_ohe, columns=authors_ohe.get_feature_names_out())

In [14]:
# Publisher
# handle_unknown="ignore": the fitted encoder is reused later in the notebook
# on the test set, which may contain publishers that never occur in the
# training data. With the default ("error") that transform raises a
# ValueError; with "ignore" an unseen publisher is encoded as an all-zero row.
# The encoding of the training data itself is unaffected.
publisher_ohe = OneHotEncoder(handle_unknown="ignore")

# Fit on the training publishers and densify the encoded matrix so it can be
# wrapped in a DataFrame with one named column per publisher.
publisher_trans_ohe = publisher_ohe.fit_transform(publisher_df).toarray()
publisher_trans_df = pd.DataFrame(publisher_trans_ohe, columns=publisher_ohe.get_feature_names_out())

Select 10 best features from the one-hot encoded _"Authors"_ and _"Publisher"_ features.

<li> Authors

In [15]:
# Authors: chi-squared selection of 10 one-hot columns. The selector is fitted
# on the leading (training) rows only and then applied to both partitions.
authors_x2 = SelectKBest(score_func=chi2, k=10)
authors_train = authors_trans_df.iloc[:split_position]
authors_test = authors_trans_df.iloc[split_position:]

authors_x2.fit(authors_train, y_train)
authors_train_x2 = authors_x2.transform(authors_train)
authors_test_x2 = authors_x2.transform(authors_test)

In [16]:
# Column names of the one-hot features retained by the authors selector.
authors_ohe_names = authors_ohe.get_feature_names_out()
authors_features = [
    authors_ohe_names[idx] for idx in authors_x2.get_support(indices=True)
]

In [17]:
# Display the names of the selected author columns.
authors_features

['Authors_Anonymous',
 'Authors_Carole Mortimer',
 'Authors_Charles Fillmore',
 'Authors_Creflo A. Dollar',
 'Authors_M. Fethullah Gülen',
 'Authors_Paramahansa Yogananda',
 'Authors_Paul Metcalf',
 'Authors_Peter Egan',
 'Authors_Quino',
 'Authors_W.E. Vine']

In [18]:
# Wrap the selected author columns of both partitions in labelled frames and
# stack them back together (training rows first) under a fresh 0..n-1 index.
authors_train_df, authors_test_df = (
    pd.DataFrame(selected, columns=authors_features)
    for selected in (authors_train_x2, authors_test_x2)
)
best_authors_df = pd.concat([authors_train_df, authors_test_df], ignore_index=True)

<li> Publisher

In [19]:
# Publisher: chi-squared selection of 10 one-hot columns. The selector is
# fitted on the leading (training) rows only and then applied to both partitions.
publisher_x2 = SelectKBest(score_func=chi2, k=10)
publisher_train = publisher_trans_df.iloc[:split_position]
publisher_test = publisher_trans_df.iloc[split_position:]

publisher_x2.fit(publisher_train, y_train)
publisher_train_x2 = publisher_x2.transform(publisher_train)
publisher_test_x2 = publisher_x2.transform(publisher_test)

In [20]:
# Column names of the one-hot features retained by the publisher selector.
publisher_ohe_names = publisher_ohe.get_feature_names_out()
publisher_features = [
    publisher_ohe_names[idx] for idx in publisher_x2.get_support(indices=True)
]

In [21]:
# Display the names of the selected publisher columns.
publisher_features

['Publisher_Banner of Truth',
 'Publisher_Hal Leonard Publishing Corporation',
 'Publisher_Harrison House',
 'Publisher_ICS Publications',
 'Publisher_Light, Inc.',
 'Publisher_Museum of Modern Art',
 'Publisher_Self-Realization Fellowship Publishers',
 'Publisher_TAN Books',
 'Publisher_TFH Publications',
 'Publisher_VIZ Media']

In [22]:
# Wrap the selected publisher columns of both partitions in labelled frames and
# stack them back together (training rows first) under a fresh 0..n-1 index.
publisher_train_df, publisher_test_df = (
    pd.DataFrame(selected, columns=publisher_features)
    for selected in (publisher_train_x2, publisher_test_x2)
)
best_publisher_df = pd.concat([publisher_train_df, publisher_test_df], ignore_index=True)

##### Name & Description

Convert the _"Name"_ and _"Description"_ attributes to numerical features using the provided, already-fitted CountVectorizer objects

In [23]:
import pickle

# Load the two pickled vectorizers (one for "Name", one for "Description").
# The files are opened in `with` blocks so the handles are closed as soon as
# loading finishes instead of being left open for the life of the kernel.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
with open("../data/book_text_features_countvec/book_text_features_countvec/train_name_countvectorizer.pkl", "rb") as name_pkl:
    name_countVec = pickle.load(name_pkl)
with open("../data/book_text_features_countvec/book_text_features_countvec/train_desc_countvectorizer.pkl", "rb") as desc_pkl:
    desc_countVec = pickle.load(desc_pkl)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [24]:
# Vectorize the training titles and descriptions with the loaded vectorizers;
# each resulting column is named after the vocabulary term it represents.
name_counts = name_countVec.transform(train_data_X["Name"])
name_df = pd.DataFrame(name_counts.toarray(), columns=name_countVec.get_feature_names_out())

desc_counts = desc_countVec.transform(train_data_X["Description"])
desc_df = pd.DataFrame(desc_counts.toarray(), columns=desc_countVec.get_feature_names_out())

Select 10 best features from the vectorized _"Name"_ and _"Description"_ features.

<li> Description

In [25]:
# Description: chi-squared selection of 10 term columns. The selector is fitted
# on the leading (training) rows only and then applied to both partitions.
desc_x2 = SelectKBest(score_func=chi2, k=10)
desc_train = desc_df.iloc[:split_position]
desc_test = desc_df.iloc[split_position:]

desc_x2.fit(desc_train, y_train)
desc_train_x2 = desc_x2.transform(desc_train)
desc_test_x2 = desc_x2.transform(desc_test)

In [26]:
# Names of the retained description terms, prefixed with "Desc_" so they stay
# distinguishable from the title terms.
desc_terms = desc_countVec.get_feature_names_out()
desc_features = [
    "Desc_" + desc_terms[idx] for idx in desc_x2.get_support(indices=True)
]

In [27]:
# Display the names of the selected description columns.
desc_features

['Desc_bible',
 'Desc_concordance',
 'Desc_god',
 'Desc_herriot',
 'Desc_liszt',
 'Desc_nahuatl',
 'Desc_objectivist',
 'Desc_rinpoche',
 'Desc_web',
 'Desc_yogananda']

In [28]:
# Wrap the selected description columns of both partitions in labelled frames
# and stack them back together (training rows first) under a fresh 0..n-1 index.
desc_train_df, desc_test_df = (
    pd.DataFrame(selected, columns=desc_features)
    for selected in (desc_train_x2, desc_test_x2)
)
best_desc_df = pd.concat([desc_train_df, desc_test_df], ignore_index=True)

<li> Names

In [29]:
# Names: chi-squared selection of 10 term columns. The selector is fitted on
# the leading (training) rows only and then applied to both partitions.
name_x2 = SelectKBest(score_func=chi2, k=10)
name_train = name_df.iloc[:split_position]
name_test = name_df.iloc[split_position:]

name_x2.fit(name_train, y_train)
name_train_x2 = name_x2.transform(name_train)
name_test_x2 = name_x2.transform(name_test)

In [30]:
# Names of the retained title terms, prefixed with "Name_" so they stay
# distinguishable from the description terms.
name_terms = name_countVec.get_feature_names_out()
name_features = [
    "Name_" + name_terms[idx] for idx in name_x2.get_support(indices=True)
]

In [31]:
# Display the names of the selected title columns.
name_features

['Name_bible',
 'Name_holy',
 'Name_mafalda',
 'Name_nausicaä',
 'Name_niv',
 'Name_novel',
 'Name_pokemon',
 'Name_st',
 'Name_vine',
 'Name_yotsuba']

In [32]:
# Wrap the selected title columns of both partitions in labelled frames and
# stack them back together (training rows first) under a fresh 0..n-1 index.
name_train_df, name_test_df = (
    pd.DataFrame(selected, columns=name_features)
    for selected in (name_train_x2, name_test_x2)
)
best_name_df = pd.concat([name_train_df, name_test_df], ignore_index=True)

<li> Combine the selected features into the preprocessed dataset

In [33]:
# Attach the four groups of selected columns to the remaining features, then
# drop the raw text/categorical columns they were derived from.
feature_blocks = [train_data_X, best_authors_df, best_publisher_df, best_desc_df, best_name_df]
processed_train_data_X = pd.concat(feature_blocks, axis=1)

raw_columns = ["Name", "Authors", "Publisher", "Description"]
p_train_data_X = processed_train_data_X.drop(columns=raw_columns)
p_train_data_X

Unnamed: 0,PublishYear,PublishMonth,PublishDay,pagesNumber,Authors_Anonymous,Authors_Carole Mortimer,Authors_Charles Fillmore,Authors_Creflo A. Dollar,Authors_M. Fethullah Gülen,Authors_Paramahansa Yogananda,...,Name_bible,Name_holy,Name_mafalda,Name_nausicaä,Name_niv,Name_novel,Name_pokemon,Name_st,Name_vine,Name_yotsuba
0,2005,6,1,48,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1991,10,1,364,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2005,3,31,32,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2004,9,1,293,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2005,7,7,352,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23058,1997,8,1,120,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
23059,2005,6,1,32,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
23060,1989,2,15,132,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
23061,1998,4,21,136,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Output to file

In [34]:
# Persist the preprocessed feature matrix without the row index.
p_train_data_X.to_csv(
    path_or_buf="../data/preprocessed/preprocessed_book_rating_train_X.csv",
    index=False,
)

In [35]:
# Persist the label column without the row index.
train_data_y.to_csv(
    path_or_buf="../data/preprocessed/preprocessed_book_rating_train_y.csv",
    index=False,
)

# Test case

In [None]:
# Read the test set; the first CSV row supplies the column names.
test_data = pd.read_csv(
    r"../data/book_rating_test.csv",
    index_col=False,
    delimiter=",",
    header=0,
)
test_data

In [None]:
# Substitute the same "NA" placeholder used for the training data wherever the
# test publisher is missing.
test_publisher = test_data["Publisher"]
test_data["Publisher"] = test_publisher.where(test_publisher.notna(), "NA")

In [None]:
# Single-column frames holding the test-set categorical features.
# Note: these names are shared with the training section and are rebound here.
authors_df = test_data["Authors"].to_frame()
publisher_df = test_data["Publisher"].to_frame()

In [None]:
# Encode the test authors and publishers with the encoders fitted on the
# training data, converting each result to a dense array.
authors_encoded = authors_ohe.transform(authors_df)
authors_trans_ohe = authors_encoded.toarray()
publisher_encoded = publisher_ohe.transform(publisher_df)
publisher_trans_ohe = publisher_encoded.toarray()

In [None]:
# Label the encoded test matrices with the encoders' output column names.
authors_columns = authors_ohe.get_feature_names_out()
publisher_columns = publisher_ohe.get_feature_names_out()
authors_trans_df = pd.DataFrame(authors_trans_ohe, columns=authors_columns)
publisher_trans_df = pd.DataFrame(publisher_trans_ohe, columns=publisher_columns)

In [None]:
# Reduce the test author columns with the already-fitted selector and collect
# the names of the columns it keeps.
authors_test_x2 = authors_x2.transform(authors_trans_df)
authors_ohe_names = authors_ohe.get_feature_names_out()
authors_features = [
    authors_ohe_names[idx] for idx in authors_x2.get_support(indices=True)
]

In [None]:
# Reduce the test publisher columns with the already-fitted selector and
# collect the names of the columns it keeps.
publisher_test_x2 = publisher_x2.transform(publisher_trans_df)
publisher_ohe_names = publisher_ohe.get_feature_names_out()
publisher_features = [
    publisher_ohe_names[idx] for idx in publisher_x2.get_support(indices=True)
]

In [None]:
# Vectorize the test titles and descriptions with the loaded vectorizers; each
# resulting column is named after the vocabulary term it represents.
name_counts = name_countVec.transform(test_data["Name"])
name_df = pd.DataFrame(name_counts.toarray(), columns=name_countVec.get_feature_names_out())

desc_counts = desc_countVec.transform(test_data["Description"])
desc_df = pd.DataFrame(desc_counts.toarray(), columns=desc_countVec.get_feature_names_out())

In [None]:
# Apply the already-fitted selectors to the vectorized test text.
desc_test_x2, name_test_x2 = (
    desc_x2.transform(desc_df),
    name_x2.transform(name_df),
)

In [None]:
# Prefixed names of the description terms retained by the selector.
desc_terms = desc_countVec.get_feature_names_out()
desc_features = [
    "Desc_" + desc_terms[idx] for idx in desc_x2.get_support(indices=True)
]

# Prefixed names of the title terms retained by the selector.
name_terms = name_countVec.get_feature_names_out()
name_features = [
    "Name_" + name_terms[idx] for idx in name_x2.get_support(indices=True)
]

In [None]:
# Wrap each selected test matrix in a frame labelled with its feature names.
name_test_df, desc_test_df, publisher_test_df, authors_test_df = (
    pd.DataFrame(values, columns=labels)
    for values, labels in (
        (name_test_x2, name_features),
        (desc_test_x2, desc_features),
        (publisher_test_x2, publisher_features),
        (authors_test_x2, authors_features),
    )
)

In [None]:
# Append the selected feature groups in the same order used for the training
# frame (authors, publisher, description, name). The previous order (name,
# description, publisher, authors) produced a test frame whose columns did not
# line up with the preprocessed training features, which breaks or silently
# corrupts predictions from a model fitted on the training output.
processed_test_data = pd.concat(
    [test_data, authors_test_df, publisher_test_df, desc_test_df, name_test_df],
    axis=1,
)

In [None]:
# Drop the raw text/categorical columns and the unused "Language" feature.
# errors="ignore" keeps this cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise a KeyError
# once the columns are already gone.
processed_test_data = processed_test_data.drop(
    columns=["Name", "Authors", "Publisher", "Description", "Language"],
    errors="ignore",
)
processed_test_data