# Machine Learning Models Evaluation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

from models.utils.evaluation import print_metrics

Load train and test datasets.

In [2]:
X_train = pd.read_csv("../features/weebit_train_with_features.csv", index_col=0)
X_test = pd.read_csv("../features/weebit_test_with_features.csv", index_col=0)

# get Y
y_train = X_train["Level"]
y_test = X_test["Level"]

# remove Y and Text columns 
X_train.drop(columns=['Text', 'Level'], inplace=True)
X_test.drop(columns=['Text', 'Level'], inplace=True)

# whole set; used in cross-validation
X = pd.concat([X_train, X_test]).reset_index(drop=True)
y = pd.concat([y_train, y_test]).reset_index(drop=True)

For scoring, we will use __Spearman correlation__.

In [3]:
scoring_function = lambda y_true, y_pred: spearmanr(y_true, y_pred)[0]

## 1. Random Forest Regression

In [4]:
from models.random_forest import RandomForest
from models.utils.hyperparemeter_optimization import grid_search_cv_for_ensembles

Firstly, we need to __find the best hyperparameters.__ We will do this using grid search.

In [5]:
# set the hyperparameter grid
max_depth_values = [5, 10, 15, 20]
n_estimators_values = [10, 50, 100]

# perform hyperparameter search
max_depth, n_estimators = grid_search_cv_for_ensembles(RandomForest(), max_depth_values, n_estimators_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("Best hyperparemeters are: max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

score=0.6783827484395081 | max_depth=5 n_estimators=10
score=0.6792103203707903 | max_depth=5 n_estimators=50
score=0.6846287855430528 | max_depth=5 n_estimators=100
score=0.708638451981671 | max_depth=10 n_estimators=10
score=0.7431187239243666 | max_depth=10 n_estimators=50
score=0.7347961957166792 | max_depth=10 n_estimators=100
score=0.7165482445521892 | max_depth=15 n_estimators=10
score=0.7435249110167789 | max_depth=15 n_estimators=50
score=0.7435577542109802 | max_depth=15 n_estimators=100
score=0.7060044666039577 | max_depth=20 n_estimators=10
score=0.7373928691717807 | max_depth=20 n_estimators=50
score=0.7424822070044348 | max_depth=20 n_estimators=100

Best hyperparemeters are: max_depth=15 n_estimators=100


In [6]:
rf = RandomForest(max_depth=max_depth, n_estimators=n_estimators, save_model=True)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [7]:
print_metrics(y_test, y_pred)

Spearman's correlation coef: 0.776516575205192
-----------
R^2 = 0.590388561593963
R = 0.7683674652104702
-----------


## 2. XGBoost Regression

In [8]:
from models.xgboost import XGBoost

In [9]:
# xgboost is showing a particular meaningless warning, we will ignore it
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Firstly, we need to __find the best hyperparameters.__ We will do this using grid search.

In [10]:
## set the hyperparameter grid
max_depth_values = [5, 10, 15, 20, 30]
n_estimators_values = [10, 50, 100, 200]

# perform hyperparameter search
max_depth, n_estimators = grid_search_cv_for_ensembles(XGBoost(), max_depth_values, n_estimators_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("Best hyperparemeters are: max_depth=" + str(max_depth) + " n_estimators=" + str(n_estimators))

score=0.6419889844170271 | max_depth=5 n_estimators=10
score=0.7483377034002218 | max_depth=5 n_estimators=50
score=0.756044230026724 | max_depth=5 n_estimators=100
score=0.7639983987917685 | max_depth=5 n_estimators=200
score=0.6646530489341059 | max_depth=10 n_estimators=10
score=0.7148517933037155 | max_depth=10 n_estimators=50
score=0.7183191508186861 | max_depth=10 n_estimators=100
score=0.7220409705323259 | max_depth=10 n_estimators=200
score=0.638643504903206 | max_depth=15 n_estimators=10
score=0.7066639837196066 | max_depth=15 n_estimators=50
score=0.7075090389794395 | max_depth=15 n_estimators=100
score=0.6996964025662665 | max_depth=15 n_estimators=200
score=0.6380222543381513 | max_depth=20 n_estimators=10
score=0.6924828556627783 | max_depth=20 n_estimators=50
score=0.7200795801507662 | max_depth=20 n_estimators=100
score=0.6937403812075035 | max_depth=20 n_estimators=200
score=0.6403684577690353 | max_depth=30 n_estimators=10
score=0.7158550750074734 | max_depth=30 n_esti

In [11]:
xgboost = XGBoost(save_model=True)

xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

In [12]:
print_metrics(y_test, y_pred)

Spearman's correlation coef: 0.7319301576049824
-----------
R^2 = 0.5230648822711715
R = 0.7232322464265345
-----------


Find mean Spearman correlation over k folds.

## 3. Support Vector Machine

In [13]:
from models.support_vector_machine import SupportVectorMachine
from models.utils.hyperparemeter_optimization import find_best_C

Firstly, we need to __find the best hyperparameter C.__ 

In [14]:
## set the hyperparameter grid
c_values = [1.0, 2.0, 5.0, 10.0, 20.0]

# perform hyperparameter search
best_c = find_best_C(SupportVectorMachine(), c_values, X_train, y_train, scoring_function, k=3, verbose=1)

print()
print("Best C is " + str(best_c))

score=0.701074722023504 | C=1.0
score=0.7183889095741917 | C=2.0
score=0.7240213074869728 | C=5.0
score=0.7210584680276391 | C=10.0
score=0.7248240205447445 | C=20.0

Best C is 20.0


In [15]:
svm = SupportVectorMachine(C=best_c, save_model=True)

svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [16]:
print_metrics(y_test, y_pred)

Spearman's correlation coef: 0.7588666565532721
-----------
R^2 = 0.5478683430743052
R = 0.7401812906810772
-----------


## 4. Multilayer Perceptron

In [17]:
from models.multilayer_perceptron import MultilayerPerceptron

Using TensorFlow backend.


In [18]:
mlp = MultilayerPerceptron(input_dim=X_train.shape[1], save_model=True, verbose=0)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

In [19]:
print_metrics(y_test, y_pred)

Spearman's correlation coef: 0.7852631715901716
-----------
R^2 = 0.5783411663467266
R = 0.7604874531159121
-----------
