# Machine Learning Models

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.evaluation import get_r, print_metrics

  return f(*args, **kwds)
  return f(*args, **kwds)


Load train and test datasets.

In [5]:
X_train = pd.read_csv("../features/weebit_train_with_features.csv", index_col=0)
X_test = pd.read_csv("../features/weebit_test_with_features.csv", index_col=0)

# get Y
y_train = X_train["Level"]
y_test = X_test["Level"]

# remove Y and Text columns 
X_train.drop(columns=['Text', 'Level'], inplace=True)
X_test.drop(columns=['Text', 'Level'], inplace=True)

## 1. Linear Regression

In [33]:
from sklearn.linear_model import Ridge

In [36]:
lin_reg = Ridge(alpha=0.1)

lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

print_metrics(y_test, y_pred)

R^2 = 0.5189337005930125
R = 0.7203705300697777
Pearson's correlation coef: 0.7252042670577996


## 2. Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rfc = RandomForestClassifier(max_depth=20, n_estimators=100)

In [27]:
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [43]:
print_metrics(y_test, y_pred, classification=True)

Confusion matrix:
[[ 80  31   5   4   2]
 [ 17 112  18   5   6]
 [  3  27 114   3  13]
 [  5   7   6  82  28]
 [  2   7  10  27 114]]
Accuracy: 0.6895604395604396
F1(micro)0.6895604395604396
F1(macro)0.6893224224903831
R^2 = 0.5811758475813705
R = 0.7623489014758076
Pearson's correlation coef: 0.7884321525669032


## 3. XGBoost Classifier

In [6]:
from xgboost import XGBClassifier

In [40]:
xgboost = XGBClassifier(max_depth=30, n_estimators=200, learning_rate=0.1)

In [41]:
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

In [42]:
print_metrics(y_test, y_pred)

R^2 = 0.5811758475813705
R = 0.7623489014758076
Pearson's correlation coef: 0.7884321525669032


## 4. Support Vector Classifier

In [6]:
from sklearn.svm import SVC
from utils.hyperparameter_optimization import find_best_C_for_SVC

Perform hyperparameter optimization to find best C.

In [7]:
kernel = 'linear'
Cs = [0.1, 0.3, 0.5, 0.7, 1.0, 1.5, 2.0, 5.0, 10.0, 50.0]
bestC = find_best_C_for_SVC(X_train, y_train, kernel, Cs)

Train and evaluate the model.

In [9]:
svc = SVC(kernel=kernel, C=bestC)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
    
print_metrics(y_test, y_pred, classification=True)

R^2 = 0.5634590898648464
R = 0.750639120926192
Pearson's correlation coef: 0.7810546559623373


## 5. Simple Feedforward Network

In [101]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

In [159]:
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [160]:
y_train_cat = to_categorical(y_train, num_classes=5)
y_test_cat = to_categorical(y_test, num_classes=5)

In [161]:
model.fit(X_train, y_train_cat,
          epochs=150,
          batch_size=64,
          verbose=1)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f6296c0deb8>

In [162]:
y_pred_cat = model.predict(X_test)

In [163]:
y_pred = np.argmax(y_pred_cat, axis=1)

In [164]:
print_metrics(y_test, y_pred, classification=True)

Confusion matrix:
[[ 75  37   3   6   1]
 [ 20 115  14   4   5]
 [  2  33  99   4  22]
 [  2  10   6  79  31]
 [  3   2  13  27 115]]
Accuracy: 0.6634615384615384
F1(micro)0.6634615384615384
F1(macro)0.6628872300898104
R^2 = 0.5840105288160145
R = 0.7642058157433863
Pearson's correlation coef: 0.7924956207300548
