# Machine Learning Models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.evaluation import print_metrics

Load train and test datasets.

In [2]:
X_train = pd.read_csv("../features/weebit_train_with_features.csv", index_col=0)
X_test = pd.read_csv("../features/weebit_test_with_features.csv", index_col=0)

# get Y
y_train = X_train["Level"]
y_test = X_test["Level"]

# remove Y and Text columns 
X_train.drop(columns=['Text', 'Level'], inplace=True)
X_test.drop(columns=['Text', 'Level'], inplace=True)

## 1. Random Forest Classifier

In [3]:
from models.random_forest import RandomForest

In [4]:
rf = RandomForest()

In [5]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [6]:
print_metrics(y_test, y_pred, classification=True)

Spearman's correlation coef: 0.7615175332600173
-----------
R^2 = 0.5294429150491202
-----------
Confusion matrix:
[[ 80  28   7   5   2]
 [ 17 107  22   5   7]
 [  1  25 113   4  17]
 [  6   9   7  82  24]
 [  1   9  17  22 111]]
-----------
Accuracy: 0.6771978021978022
F1(micro)0.6771978021978022
F1(macro)0.6786515124676925
-----------


## 2. XGBoost Classifier

In [3]:
from models.xgboost import XGBoost

In [4]:
xgboost = XGBoost()

In [5]:
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

In [7]:
print_metrics(y_test, y_pred, classification=True)

Spearman's correlation coef: 0.7890308371249726
-----------
R^2 = 0.5811758475813705
-----------
Confusion matrix:
[[ 80  31   5   4   2]
 [ 17 112  18   5   6]
 [  3  27 114   3  13]
 [  5   7   6  82  28]
 [  2   7  10  27 114]]
-----------
Accuracy: 0.6895604395604396
F1(micro)0.6895604395604396
F1(macro)0.6893224224903831
-----------


## 3. Support Vector Classifier

In [3]:
from sklearn.svm import LinearSVC

Perform hyperparameter optimization to find best C.

In [7]:
kernel = 'linear'
Cs = [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0, 5.0, 10.0, 50.0]
bestC = find_best_C_for_SVC(X_train, y_train, kernel, Cs)

Train and evaluate the model.

In [17]:
svc = LinearSVC(C=10.0, max_iter=20000)

In [18]:
svc.fit(X_train, y_train)



LinearSVC(C=10.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=20000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [20]:
y_pred = svc.predict(X_test)
    
print_metrics(y_test, y_pred, classification=True)

Spearman's correlation coef: 0.7566728905246934
-----------
R^2 = 0.4996787620853598
-----------
Confusion matrix:
[[ 78  32   4   6   2]
 [ 26 102  18   8   4]
 [  6  28  95   6  25]
 [  8   7   8  72  33]
 [  4   3  10  30 113]]
-----------
Accuracy: 0.6318681318681318
F1(micro)0.6318681318681318
F1(macro)0.6296434044975326
-----------


## 4. Multilayer Perceptron

In [3]:
from models.multilayer_perceptron import MultilayerPerceptron

Using TensorFlow backend.


In [4]:
mlp = MultilayerPerceptron(input_dim=X_train.shape[1])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [5]:
mlp.fit(X_train, y_train)

Instructions for updating:
Use tf.cast instead.
Train on 2328 samples, validate on 583 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoc

In [6]:
y_pred_cat = mlp.predict(X_test)

In [7]:
y_pred = np.argmax(y_pred_cat, axis=1)

In [8]:
print_metrics(y_test, y_pred, classification=True)

Spearman's correlation coef: 0.7921287528133497
-----------
R^2 = 0.593223242828607
-----------
Confusion matrix:
[[ 72  40   4   5   1]
 [ 16 115  22   3   2]
 [  0  33 110   6  11]
 [  5   8   9  77  29]
 [  4   4  16  27 109]]
-----------
Accuracy: 0.6634615384615384
F1(micro)0.6634615384615384
F1(macro)0.6620169622932531
-----------
