# COMP30027 Machine Learning Project 2

In [1]:
import pandas as pd
import numpy as np


### Load the data

In [2]:
# Training data: read the labelled book-rating table, taking column names
# from the first row and not using any column as the index.
train_data = pd.read_csv(
    r"project_data_files/book_rating_train.csv",
    index_col=False,
    delimiter=',',
    header=0,
)

In [3]:
# The last column holds the class label; every column before it is a feature.
n_feature_columns = train_data.shape[1] - 1
train_data_X = train_data.iloc[:, :n_feature_columns]
train_data_y = train_data.iloc[:, n_feature_columns]

## 1. Data Preprocessing

### Dealing with Missing Data

In [4]:
# Fraction of missing entries per feature; show only the features that
# actually have missing values.
n_rows = train_data_X.shape[0]
missing_df = train_data_X.isna().sum().div(n_rows)
missing_df[missing_df > 0]

Publisher    0.006417
Language     0.745870
dtype: float64

<li> In the feature _"Publisher"_, less than 1% of the values are missing. Moreover, this is a categorical attribute, so we replace the missing values with a new category: _"NA"_.

In [5]:
# Missing publishers become their own explicit category, "NA".
train_data["Publisher"] = train_data["Publisher"].fillna("NA")

# train_data_X was sliced out of train_data in an earlier cell, so replacing
# the column on train_data above does not reach it. Fill it here as well,
# otherwise the feature frame used by every later step keeps its NaN values.
train_data_X = train_data_X.assign(
    Publisher=train_data_X["Publisher"].fillna("NA")
)

# Show the rows whose publisher was filled in.
train_data.loc[train_data["Publisher"] == "NA"]

Unnamed: 0,Name,Authors,PublishYear,PublishMonth,PublishDay,Publisher,Language,pagesNumber,Description,rating_label
305,UNIX System V release 4 : an introduction for ...,Kenneth H. Rosen,1996,2,1,,,1175,An informative introductory text to the UNIX o...,3.0
573,El abismo en el tiempo,H.P. Lovecraft,1998,2,28,,spa,100,“The Shadow out of Time” is H. P. Lovecraft’s ...,4.0
574,Anne of Windy Willows,L.M. Montgomery,1994,5,1,,,240,Anne Shirley has left Redmond College behind t...,4.0
579,Poor Liza,Nikolay Karamzin,2010,10,15,,,80,Nikolaj Karamzin (1766-1826) is vooral bekend ...,3.0
626,Adventures in Odyssey: Darkness Before Dawn,AIO Team,2004,11,4,,,4,An unprecedented wave of criminal activity has...,5.0
...,...,...,...,...,...,...,...,...,...,...
22187,Seasons of the Wild: A Year of Nature's Magic ...,Sy Montgomery,1996,3,1,,,152,"""Award-winning author Sy Montgomery takes read...",4.0
22491,"P.S. We'll Miss You (Friends-4-Ever, #1)",Deirdre Corey,1990,4,10,,eng,121,Don't forget to write!<br /><br />Molly has ju...,4.0
22630,The Iron Man with the Adventures of Dennis Dorgan,Robert E. Howard,1983,10,1,,eng,232,"Robert E. Howard, creator of <b>CONAN THE BARB...",4.0
22631,Day of Descent,Judith Reeves-Stevens,1993,12,31,,,404,"The long-awaited first, original novel based o...",4.0


<li> In the feature _"Language"_, nearly 75% of the values are missing. To decide how to handle it, we compare the distribution of the rating label over the whole training data with its distribution over the rows where _"Language"_ is missing:

In [6]:
# Rating-label percentages over the whole training set.
n_all_rows = train_data.shape[0]
full_label = train_data["rating_label"].value_counts() / n_all_rows * 100

# Rating-label percentages over the rows whose "Language" is missing.
language_is_missing = train_data["Language"].isna()
train_lan_na = train_data.loc[language_is_missing, "rating_label"]
train_lan_na_unique = train_lan_na.value_counts()
na_lan = train_lan_na_unique / train_lan_na.shape[0] * 100

# One column per subset, one row per rating label.
label_dist = pd.DataFrame({'Full': full_label, 'Missing': na_lan})

In [7]:
# Add one column to label_dist for every language that has at least 100
# books; languages with fewer books are skipped.
train_lan = train_data["Language"]
lan_count = train_lan.value_counts()

for lan in lan_count.index:
    if lan_count[lan] >= 100:
        is_current_language = train_data["Language"] == lan
        train_currLan = train_data.loc[is_current_language, "rating_label"]
        # Percentage of each rating label among the books in this language.
        currLan_sr = train_currLan.value_counts() / train_currLan.shape[0] * 100
        label_dist[lan] = currLan_sr

In [8]:
# Percentage of each rating label (rows) within each subset of the data (columns).
label_dist

Unnamed: 0,Full,Missing,eng,fre,spa
4.0,70.277067,67.672364,78.12844,73.376623,79.194631
3.0,25.426007,27.467736,19.357798,22.077922,18.791946
5.0,4.296926,4.8599,2.513761,4.545455,2.013423


As we can see, the label distributions of the full data, of the rows with missing _"Language"_, and of each frequent language are similar to one another. Hence, knowing the value of the feature _"Language"_ gives us little information about the class label in this training data.
<br> We therefore exclude this feature from the dataset.

In [9]:
# DataFrame.drop returns a new frame, so the result must be assigned back;
# otherwise "Language" silently stays in the feature set.
# errors="ignore" keeps the cell safe to re-run after the column is gone.
train_data_X = train_data_X.drop(columns=["Language"], errors="ignore")
train_data_X

Unnamed: 0,Name,Authors,PublishYear,PublishMonth,PublishDay,Publisher,pagesNumber,Description
0,Best of Dr Jean: Reading & Writing,Jean R. Feldman,2005,6,1,Teaching Resources,48,Teachers will turn to this treasury of ideas a...
1,Here All Dwell Free,Gertrud Mueller Nelson,1991,10,1,DoubleDay,364,Every human being lives a fairy tale -- an unc...
2,Boomer's Big Surprise,Constance W. McGeorge,2005,3,31,Chronicle Books,32,<i>Boomer's Big Surprise</i> will have special...
3,"I'll Go and Do More: Annie Dodge Wauneka, Nava...",Carolyn Niethammer,2004,9,1,Bison Books,293,<i>I'll Go and Do More</i> is the story of Ann...
4,Us,Richard Mason,2005,7,7,Penguin Books Ltd,352,"Since their days at Oxford, they've gone their..."
...,...,...,...,...,...,...,...,...
23058,Black Coffee Blues,Henry Rollins,1997,8,1,2.13.61,120,"""If I lose the light of the sun, I will write ..."
23059,America's Champion Swimmer: Gertrude Ederle,David A. Adler,2005,6,1,HMH Books for Young Readers,32,"Trudy Ederle loved to swim, and she was determ..."
23060,Crime and Custom in Savage Society,Bronisław Malinowski,1989,2,15,Rowman & Littlefield Publishers,132,Bronislaw Malinowski achieved international re...
23061,The Name and Nature of Poetry and Other Select...,A.E. Housman,1998,4,21,New Amsterdam Books,136,Lovers of Housman's poetry and admirers of his...


### Text Preprocessing
 We will use the provided doc2vec embeddings for the string-typed features _"Name"_ and _"Description"_.

In [10]:
# Name feature: doc2vec vectors for the book titles (files have no header row).
train_name = pd.read_csv(
    r"project_data_files/book_text_features_doc2vec/book_text_features_doc2vec/train_name_doc2vec100.csv",
    index_col=False,
    delimiter=',',
    header=None,
)
test_name = pd.read_csv(
    r"project_data_files/book_text_features_doc2vec/book_text_features_doc2vec/test_name_doc2vec100.csv",
    index_col=False,
    delimiter=',',
    header=None,
)

# Description feature: doc2vec vectors for the book descriptions.
train_desc = pd.read_csv(
    r"project_data_files/book_text_features_doc2vec/book_text_features_doc2vec/train_desc_doc2vec100.csv",
    index_col=False,
    delimiter=',',
    header=None,
)
test_desc = pd.read_csv(
    r"project_data_files/book_text_features_doc2vec/book_text_features_doc2vec/test_desc_doc2vec100.csv",
    index_col=False,
    delimiter=',',
    header=None,
)

In [11]:
def columns_change(df, string):
    """Relabel the columns of ``df`` as ``<string>1 .. <string>N``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are overwritten. It is modified in place.
    string : str
        Prefix placed in front of each 1-based column position.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the new column labels.
    """
    new_labels = []
    for position in range(df.shape[1]):
        # Labels are 1-based: the first column becomes "<string>1".
        new_labels.append(string + str(position + 1))
    df.columns = new_labels
    return df

In [12]:
# Give the doc2vec columns readable, collision-free labels
# ("name_1".."name_N", "desc_1".."desc_N").
train_name = columns_change(train_name, "name_")
train_desc = columns_change(train_desc, "desc_")

# Relabel the test embeddings the same way so that train and test frames
# share identical column names when they are joined to the other features.
test_name = columns_change(test_name, "name_")
test_desc = columns_change(test_desc, "desc_")

In [13]:
# train_data_X = pd.concat([train_data_X, train_name, train_desc], axis=1)
# train_data_X = train_data_X.drop("Name", axis=1)
# train_data_X = train_data_X.drop("Description", axis=1)
# train_data_X

### Data Types Conversion

We convert the nominal features _"Publisher"_ and _"Authors"_ into numeric features using one-hot encoding.

In [28]:
# One indicator column per distinct publisher.
publisher_dummies = pd.get_dummies(train_data_X.loc[:, "Publisher"])

In [29]:
# One indicator column per distinct value of the "Authors" field.
author_dummies = pd.get_dummies(train_data_X.loc[:, "Authors"])

In [25]:
# train_data_X = pd.concat([train_data_X, dummies], axis=1)
# train_data_X = train_data_X.drop("Publisher", axis=1)
# train_data_X

#### Feature Filtering

We use Mutual Information to select the best k features from each of the above processed datasets.

In [None]:
from functools import partial

# SelectKBest was used without being imported, which raises NameError.
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# mutual_info_classif needs a numeric matrix, but train_data_X still holds
# raw string columns (e.g. "Name", "Authors"), so score only the numeric
# columns here.
numeric_X = train_data_X.select_dtypes(include="number")

# k may not exceed the number of available features; the scorer is seeded so
# that repeated runs select the same columns.
mi = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=0),
    k=min(10, numeric_X.shape[1]),
)

X_train_mi = mi.fit_transform(numeric_X, train_data_y)


### Split the training data into train and test dataset

In [15]:
# Split the training data into train and test set
# Ordered (unshuffled) hold-out split: the first `split_ratio` of the rows
# are used for training and the remaining rows for evaluation.
split_ratio = 0.8
# Use the named ratio rather than repeating the literal, so changing
# split_ratio actually changes the split.
split_position = int(split_ratio * train_data.shape[0])

# .iloc makes the row slicing explicitly positional.
train_X = train_data_X.iloc[:split_position]
train_y = train_data_y.iloc[:split_position]

test_X = train_data_X.iloc[split_position:]
test_y = train_data_y.iloc[split_position:]

# Every row must land in exactly one of the two partitions.
assert len(train_X) + len(test_X) == len(train_data_X)
assert len(train_y) + len(test_y) == len(train_data_y)

In [16]:
# Write the feature frame to disk without the row index.
train_data_X.to_csv(path_or_buf="Preprocessed_data.csv", index=False)

## 2. Modelling

## (a) Baseline model (0-R)

In [17]:
from sklearn.dummy import DummyClassifier

# 0-R baseline: always predicts the most frequent label seen during fit.
zero_r = DummyClassifier(strategy='most_frequent')
zero_r.fit(X=train_X, y=train_y)

In [18]:
# Baseline predictions for the held-out rows.
zr_predict = zero_r.predict(X=test_X)
zr_predict

array([4., 4., 4., ..., 4., 4., 4.])

In [19]:
# Accuracy of the 0-R baseline on the held-out rows.
zero_r.score(X=test_X, y=test_y)

0.7053977888575764

## (b) One-R