# Various sklearn models

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import accuracy_score, f1_score

Xtrain = np.load(os.getcwd()+'/../feature_representations/feature_representation_2_train.npy')
ytrain = np.load(os.getcwd()+'/../feature_representations/ytrain.npy')
Xtest = np.load(os.getcwd()+'/../feature_representations/feature_representation_2_test.npy')
ytest = np.load(os.getcwd()+'/../feature_representations/ytest.npy')

In [3]:
Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

((124800, 17), (124800,), (20800, 17), (20800,))

I have noticed better performance so far by removing the last 10 features for FR1.

In [3]:
# Xtrain = Xtrain[:,:8]
# Xtest = Xtest[:,:8]
# Xtrain.shape, ytrain.shape, Xtest.shape, ytest.shape

In [4]:
from sklearn.decomposition import PCA

pca = PCA(n_components=17)
pca.fit(Xtrain)
Xtrain = pca.transform(Xtrain)

# KNN

In [5]:
from sklearn.neighbors import KNeighborsClassifier

k = 3
model = KNeighborsClassifier(n_neighbors=k)
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [6]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.027019230769230768
F1 score is 0.022578150667278994


# Naive Bayes

In [7]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [8]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.021298076923076923
F1 score is 0.01717086915485448


# Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [10]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.027644230769230768
F1 score is 0.023423089424563663


# Support Vector Machine

This may take a little while to run...

In [11]:
from sklearn.svm import SVC

model = SVC()
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [12]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.03120192307692308
F1 score is 0.02548824846797005


In [13]:
model = SVC(kernel='linear')
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [14]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.02278846153846154
F1 score is 0.023242453894584715


In [15]:
model = SVC(kernel='poly')
model.fit(Xtrain, ytrain)
yhat = model.predict(Xtest)

In [16]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.03485576923076923
F1 score is 0.01850774466042029


# Neural Network

In [17]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(
    hidden_layer_sizes = [25,50,100,50],
    activation='logistic',
    solver='adam',
    batch_size=16,
    shuffle=True,
    random_state=4622,
)
model.fit(Xtrain, ytrain)



MLPClassifier(activation='logistic', batch_size=16,
              hidden_layer_sizes=[25, 50, 100, 50], random_state=4622)

In [18]:
acc = accuracy_score(ytest, yhat)
f1 = f1_score(ytest, yhat, average='macro')
print(f'Accuracy is {acc}')
print(f'F1 score is {f1}')

Accuracy is 0.03485576923076923
F1 score is 0.01850774466042029
