## Decision Tree

In [1]:
# Importing the required packages and libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier



In [2]:
# Reading test CSV file directly from Web, and store it in a pandas DataFrame:
test_df=pd.read_csv("D:/Fauzan/Study PhD/Semester 1/Machine Learning/Homework/HW4/spooky-author-identification/spooky-author-identification/test/test.csv")

# Print first 5 rows
test_df.head()

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [19]:
# Find unique value counts of each author
test_df.value_counts()

id       text                                                                                                                                                                                                         
id00008  And then there was 'The Man in the Bell,' a paper by the by, Miss Zenobia, which I cannot sufficiently recommend to your attention.                                                                              1
id18869  "Good God" I whispered, "can you do that for any time?"                                                                                                                                                          1
id18864  "While I improved in speech, I also learned the science of letters as it was taught to the stranger, and this opened before me a wide field for wonder and delight.                                              1
id18863  It is possible indeed it is far more than probable that he was innocent of all participation in the bloody transacti

In [3]:
# Reading train CSV file directly from Web, and store it in a pandas DataFrame:
train_df = pd.read_csv('D:/Fauzan/Study PhD/Semester 1/Machine Learning/Homework/HW4/spooky-author-identification/spooky-author-identification/train/train.csv')

# Print first 5 rows
train_df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# Find unique value counts of each author
train_df['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [5]:
# Split into features and labels
X_train = train_df['text']
y_train = train_df['author']
X_test = test_df['text']

In [6]:
# Transform text data into vectors to run it through machine learning model
vectorizer = CountVectorizer(stop_words='english')
corpus = pd.concat([train_df['text'], test_df['text']])
vectorizer.fit(corpus)

CountVectorizer(stop_words='english')

In [7]:
# Randomly splitting the original dataset into training set and testing set
X_tr, X_tt, y_tr, y_tt = train_test_split(X_train, y_train, test_size=0.3, random_state=2)

X_tr_new = vectorizer.transform(X_tr)
X_tt_new = vectorizer.transform(X_tt)

### Experiment 1

In [8]:
# "my_decisiontree" is instantiated as an "object" of DecisionTreeClassifier "class" 
dt = DecisionTreeClassifier()
dt.fit(X_tr_new, y_tr)

DecisionTreeClassifier()

In [9]:
# Predicting the Binary Label:
y_predict_dt = dt.predict(X_tt_new)
print(y_predict_dt)

['EAP' 'EAP' 'HPL' ... 'EAP' 'HPL' 'EAP']


In [10]:
# Estimating the probability (likelihood) of Each Label: 
y_pred_proba_dt = dt.predict_proba(X_tt_new)
y_pred_proba_dt

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [11]:
# Find accuracy score
score_dt = accuracy_score(y_tt, y_predict_dt)
print("Decision tree accuracy:", score_dt)

Decision tree accuracy: 0.5902281239359891


### Experiment 2: Hyperparameter Tuning from GridSearch

In [12]:
# Creating a dictionary of parameters to use in GridSearchCV
from sklearn.model_selection import GridSearchCV

params = {
    'criterion':  ['gini', 'entropy'],
    'max_depth':  [None, 2, 4, 6, 8, 10],
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.4, 0.6, 0.8],
    'splitter': ['best', 'random']
}

clf = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=params,
    cv=5,
    n_jobs=5,
    verbose=1,
)

clf.fit(X_tr_new, y_tr)
print(clf.best_params_)

Fitting 5 folds for each of 168 candidates, totalling 840 fits
{'criterion': 'entropy', 'max_depth': None, 'max_features': 0.4, 'splitter': 'random'}


In [13]:
# Predicting the Binary Label:
y_predict_dt_gs = clf.predict(X_tt_new)
print(y_predict_dt_gs)

['EAP' 'EAP' 'HPL' ... 'EAP' 'EAP' 'MWS']


In [14]:
# Find accuracy score
score_dt_gs1 = accuracy_score(y_tt, y_predict_dt_gs)
print("Decision tree accuracy:", score_dt_gs1)

Decision tree accuracy: 0.5948246510044263


### Experiment 3: Hyperparameter Tuning from GridSearch with Certain Parameters

In [15]:
# Using the Parameters from GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf_gs2 = DecisionTreeClassifier(max_depth=4, criterion='entropy', max_features=0.6, splitter='best')
clf_gs2.fit(X_tr_new, y_tr)

DecisionTreeClassifier(criterion='entropy', max_depth=4, max_features=0.6)

In [16]:
predictions = clf_gs2.predict(X_tt_new)
print(y_predict_dt_gs)

['EAP' 'EAP' 'HPL' ... 'EAP' 'EAP' 'MWS']


In [17]:
score_dt_gs2 = accuracy_score(y_tt, predictions)
print("Decision tree accuracy:", score_dt_gs2)

Decision tree accuracy: 0.4453524004085802
