In [16]:
!pip install logitboost;



In [17]:
# imports
import pandas as pd
import numpy as np
import sklearn as sk
import time

from sklearn.ensemble import RandomForestClassifier
from logitboost import LogitBoost
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

from sklearn.metrics import classification_report, roc_auc_score
# seed
np.random.seed(0)

In [18]:
# where data is located
!ls ./drive/MyDrive/Thesis\ Workspace/Notebooks/data/sb15-set3
DATA_PATH = "./drive/MyDrive/Thesis Workspace/Notebooks/data/sb15-set3/"

dev.csv  test.csv  train.csv


In [19]:
# load user data
df_train = pd.read_csv(DATA_PATH + 'train.csv')
df_test = pd.read_csv(DATA_PATH + 'test.csv')
print('Shapes', df_train.shape, df_test.shape)

Shapes (8278, 6) (1183, 6)


In [20]:
X_train = df_train.drop('label', axis=1).values
y_train = df_train['label'].values

X_test = df_test.drop('label', axis=1).values
y_test = df_test['label'].values

print([x.shape for x in [X_train, y_train, X_test, y_test]])

[(8278, 5), (8278,), (1183, 5), (1183,)]


## Random Forest
10 trees with unlimited depth

In [21]:
random_forest_model = RandomForestClassifier(
    n_estimators = 10,
    max_depth = None,
    criterion = 'gini'
)
random_forest_model.fit(X_train,y_train)
pred = random_forest_model.predict(X_test)
print(classification_report(pred, y_test, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_test))

              precision    recall  f1-score   support

           0     0.6409    0.5939    0.6165       586
           1     0.6281    0.6734    0.6500       597

    accuracy                         0.6340      1183
   macro avg     0.6345    0.6336    0.6332      1183
weighted avg     0.6344    0.6340    0.6334      1183

ROC acc:  0.6336117447304783



## Logit Boost
- Base estimator is decision stump

In [22]:
logit_boost_model = LogitBoost(
    base_estimator = None, #default None equals decision stump
    n_estimators=30, #just a number i choose
)
logit_boost_model.fit(X_train,y_train)
pred = logit_boost_model.predict(X_test)
print(classification_report(pred, y_test, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_test))

              precision    recall  f1-score   support

           0     0.5046    0.6049    0.5502       453
           1     0.7203    0.6315    0.6730       730

    accuracy                         0.6213      1183
   macro avg     0.6125    0.6182    0.6116      1183
weighted avg     0.6377    0.6213    0.6260      1183

ROC acc:  0.6181816807281744


## SVM Poly
- Complexity param c = 1
- Paper state that data is to be normalized when used with SVM

In [23]:
x_train_norm, x_test_norm = [normalize(X, norm='l2') for X in [X_train, X_test]]
svm_poly_model = SVC(
    C = 1,
    kernel = 'poly',
    degree = 3
)
svm_poly_model.fit(x_train_norm,y_train)
pred = svm_poly_model.predict(x_test_norm)
print(classification_report(pred, y_test, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_test))

              precision    recall  f1-score   support

           0     0.0387    0.5385    0.0722        39
           1     0.9719    0.5437    0.6973      1144

    accuracy                         0.5435      1183
   macro avg     0.5053    0.5411    0.3847      1183
weighted avg     0.9411    0.5435    0.6767      1183

ROC acc:  0.541083916083916


## SVM RBF
- Similar

In [24]:

svm_rbf_model = SVC(
    C = 1,
    kernel = 'rbf',
)
svm_rbf_model.fit(x_train_norm,y_train)
pred = svm_rbf_model.predict(x_test_norm)
print(classification_report(pred, y_test, digits=4))
print('ROC acc: ',roc_auc_score(pred, y_test))

              precision    recall  f1-score   support

           0     0.0276    0.5357    0.0525        28
           1     0.9797    0.5429    0.6986      1155

    accuracy                         0.5427      1183
   macro avg     0.5037    0.5393    0.3756      1183
weighted avg     0.9572    0.5427    0.6833      1183

ROC acc:  0.5392857142857143


## Multilayer Perceptron 10
- Gradient Decent
- Normalization
- 500 train epoch
- Learn rate 0.3
- Momentum 0.2
- Hidden layer with 10 nodes
- Sigmoid activation
- Validation threshold 20

### NOTE:
validation threshold is the number of traing epochs that we allow the model to not improve, if error does not decrease within this validation threshold epochs, we terminate training. This param in sklearn is `n_iter_no_change`

In [25]:
mlp_10_model = MLPClassifier(
    hidden_layer_sizes=(10,), #means only one hidden layer with 10 nodes
    learning_rate='constant',
    learning_rate_init=0.3,
    max_iter=500,
    solver='sgd',
    activation='logistic', #this is sigmoid
    momentum=0.2,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20
)
mlp_10_model.fit(x_train_norm,y_train)
pred = mlp_10_model.predict(x_test_norm)
print(classification_report(pred, y_test, digits=4))
try:
  print('ROC acc: ',roc_auc_score(pred, y_test))
except:
  print('ROC acc:', 'Undefined due to prediction only having one class')

              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000         0
           1     1.0000    0.5410    0.7021      1183

    accuracy                         0.5410      1183
   macro avg     0.5000    0.2705    0.3511      1183
weighted avg     1.0000    0.5410    0.7021      1183

ROC acc: Undefined due to prediction only having one class


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



## Multilayer Perceptron 20
Same but hidden layer has 20 nodes

In [26]:
mlp_20_model = MLPClassifier(
    hidden_layer_sizes=(20,), #means only one hidden layer with 10 nodes
    learning_rate='constant',
    learning_rate_init=0.3,
    max_iter=500,
    solver='sgd',
    activation='logistic', #this is sigmoid
    momentum=0.2,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20
)
mlp_20_model.fit(x_train_norm,y_train)
pred = mlp_20_model.predict(x_test_norm)
print(classification_report(pred, y_test, digits=4))
try:
  print('ROC acc: ',roc_auc_score(pred, y_test))
except:
  print('ROC acc:', 'Undefined due to prediction only having one class')

              precision    recall  f1-score   support

           0     0.2615    0.5772    0.3599       246
           1     0.8375    0.5720    0.6798       937

    accuracy                         0.5731      1183
   macro avg     0.5495    0.5746    0.5199      1183
weighted avg     0.7177    0.5731    0.6133      1183

ROC acc:  0.5746370964243259


## Conclude

**All things goo downnn hill!!!**

Random 4Rest > LogiBoost > MLP20  > SVM-P > SVM-R

MLP10 -> ??? worst??