In [2]:
from sklearn.model_selection import train_test_split
import numpy as np
import data_loader

### setup

In [3]:
# Load the calibrated YPR data through data_loader (resampled=True, flatten=True)
# and convert both returned sequences to NumPy arrays.
# NOTE(review): xs / ys are not referenced by any later cell visible in this
# notebook — confirm whether this (apparently expensive) load is still needed.
xs, ys = data_loader.verified_subjects_calibrated_yprs(resampled=True, flatten=True)
xs = np.array(xs)
ys = np.array(ys)
# Train / dev / test features and labels as returned by data_loader.
# NOTE(review): in the visible cells devx / devy are only used for a shape
# check; every model is scored on testx / testy.
trainx, devx, testx, trainy, devy, testy = data_loader.load_all_classic_random_split()

Processing albert
Processing canon_12_5
Processing daniel
Processing haobin_11_22
Processing isa_12_5
Processing janet
Processing joanne
Processing jq_12_6
Processing kelly_11_7
Processing kevin_11_7
Processing ruocheng
Processing russell_11_20_stand
Processing russell_11_7
Processing russell_random_12_7
Processing solomon
Processing wenzhou_12_5
Processing yiheng_11_30
Processing yiheng_12_5
Processing yongxu_11_30
Processing albert
Processing canon_12_5
Processing daniel
Processing haobin_11_22
Processing isa_12_5
Processing janet
Processing joanne
Processing jq_12_6
Processing kelly_11_7
Processing kevin_11_7
Processing ruocheng
Processing russell_11_20_stand
Processing russell_11_7
Processing russell_random_12_7
Processing solomon
Processing wenzhou_12_5
Processing yiheng_11_30
Processing yiheng_12_5
Processing yongxu_11_30
Splitting out test set
Splitting out dev and train set


In [4]:
# Replace the training split with the result of data_loader.augment_train_set.
# NOTE(review): trainx / trainy are overwritten in place, so re-running this
# cell without re-running the load cell feeds already-augmented data back into
# augment_train_set — presumably augmenting it a second time; confirm.
trainx, trainy = data_loader.augment_train_set(trainx, trainy)
# Last expression of the cell: the shapes of all six arrays, shown as output.
trainx.shape, devx.shape, testx.shape, trainy.shape, devy.shape, testy.shape

Augmenting TRAIN set with proportion 1


((15336, 300), (959, 300), (959, 300), (15336,), (959,), (959,))

In [ ]:
# train_subjects = data_loader.VERIFIED_SUBJECTS[:-2]
# test_subjects = data_loader.VERIFIED_SUBJECTS[-2:]
# train_subjects , test_subjects

In [ ]:
# trainx, trainy = data_loader.verified_subjects_calibrated_yprs(resampled=True, flatten=True, subjects=train_subjects)
# testx, testy = data_loader.verified_subjects_calibrated_yprs(resampled=True, flatten=True, subjects=test_subjects)
# trainx = np.array(trainx)
# trainy = np.array(trainy)
# testx = np.array(testx)
# testy = np.array(testy)

In [5]:
# trainx.shape, trainy.shape, testx.shape, testy.shape

In [6]:
def get_acc(pred, testy):
    """Return the fraction of predictions that exactly match the labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    testy : array-like
        Ground-truth labels; must have the same shape as ``pred``.

    Returns
    -------
    numpy.float64
        Number of matching entries divided by ``pred.shape[0]``.

    Raises
    ------
    ValueError
        If ``pred`` and ``testy`` do not have the same shape.
    """
    pred = np.asarray(pred)
    testy = np.asarray(testy)

    # Without this check NumPy would silently broadcast e.g. (n,) against
    # (n, 1) into an (n, n) comparison and report a meaningless accuracy.
    if pred.shape != testy.shape:
        raise ValueError(
            f'shape mismatch: pred {pred.shape} vs testy {testy.shape}'
        )

    # Compare with == rather than subtracting, so non-numeric labels
    # (strings, booleans) are handled as well.
    return np.sum(pred == testy) / pred.shape[0]

### SVM

In [13]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler

In [14]:
# Kernels to evaluate; svc_acc maps each kernel name to its test accuracy.
kernels = ['poly']
svc_acc = {}

for kernel in kernels:
    print(f'Training svc with {kernel} kernel')
    # NOTE(review): no scaler is applied in the visible cells even though
    # StandardScaler is imported in the cell above — confirm whether the
    # features were meant to be standardised before fitting.
    clf = svm.SVC(kernel=kernel, gamma='auto', max_iter=25000).fit(trainx, trainy)
    pred = clf.predict(testx)
    svc_acc[kernel] = get_acc(pred, testy)

Training svc with poly kernel


In [15]:
svc_acc

{'poly': 0.43274244004171014}

### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

In [11]:
# Test accuracy of a k-nearest-neighbours classifier for k = 2..9, keyed by k.
# NOTE(review): k is swept against testx / testy; devx / devy are not used
# here — confirm whether the sweep should use the dev split instead.
test_acc = {}

for num_neighbor in range(2, 10):
    print(f'Running KNN with n={num_neighbor}')
    clf = KNeighborsClassifier(n_neighbors=num_neighbor).fit(trainx, trainy)
    pred = clf.predict(testx)
    test_acc[num_neighbor] = get_acc(pred, testy)

Running KNN with n=2
Running KNN with n=3
Running KNN with n=4
Running KNN with n=5
Running KNN with n=6
Running KNN with n=7
Running KNN with n=8
Running KNN with n=9


In [12]:
test_acc

{2: 0.7382690302398331,
 3: 0.7539103232533889,
 4: 0.7497393117831074,
 5: 0.7403545359749739,
 6: 0.7299270072992701,
 7: 0.7174139728884255,
 8: 0.7257559958289885,
 9: 0.7174139728884255}

### Nearest Centroid

In [ ]:
# Import from the public sklearn.neighbors namespace: the
# sklearn.neighbors.nearest_centroid module path is no longer importable in
# recent scikit-learn releases, while this form works on old and new versions.
from sklearn.neighbors import NearestCentroid

In [ ]:
# Fit a NearestCentroid classifier on the training split.
clf = NearestCentroid()
# fit() is the cell's last expression, so its return value is what the cell displays.
clf.fit(trainx, trainy)

In [ ]:
# Score the fitted classifier on the test split with the shared get_acc helper
# (defined earlier in this notebook) rather than repeating its formula inline,
# so every model in the notebook is scored the same way.
pred = clf.predict(testx)
acc = get_acc(pred, testy)
acc

### Lasso: linear regression

In [ ]:
from sklearn import linear_model
# Fit Lasso as a regressor on trainy, then round each continuous prediction to
# the nearest integer so it can be compared against the labels by get_acc.
# NOTE(review): rounding a regression output can yield values that are not
# valid labels — confirm this baseline is intended.
clf = linear_model.Lasso(alpha=0.1, max_iter=5000).fit(trainx, trainy)
pred = np.rint(clf.predict(testx))
acc = get_acc(pred, testy)

In [ ]:
acc

### Gaussian Naive Bayes

In [ ]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes baseline: fit on the training split, score on the test split.
clf = GaussianNB().fit(trainx, trainy)
pred = clf.predict(testx)
acc = get_acc(pred, testy)

In [ ]:
acc

### Forest of randomized trees

In [ ]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline: fit on the training split, score on the test split.
# random_state is pinned so that the forest itself is built identically on
# every Restart & Run All; with the unseeded default the model (and therefore
# the reported accuracy) changes from run to run.
clf = RandomForestClassifier(random_state=0)
clf.fit(trainx, trainy)
pred = clf.predict(testx)
acc = get_acc(pred, testy)

In [ ]:
acc

### Basic NN (multi-layer perceptron)

In [ ]:
from sklearn.neural_network import MLPClassifier

In [ ]:
# Hidden-layer layouts to try; each tuple is passed as hidden_layer_sizes.
structures = [
    (100,50),
    (100,100,50),
    (200,50),
    (200,100,50),
]

# Activation functions to try.
activations = ['logistic', 'tanh', 'relu']

# Values passed as MLPClassifier's alpha: 0.0001 scaled by 1, 5 and 10.
# These floats are also formatted into the result keys by the training loop.
l2_reg_const = [0.0001 * x for x in [1, 5, 10]]

# Accuracy per configuration, keyed by the name built in the training loop.
nn_acc = {}

In [ ]:
# Train one MLP per (layout, activation, alpha) combination and record its
# test accuracy in nn_acc under a descriptive key.
for structure in structures:
    for act in activations:
        for alpha in l2_reg_const:
            name = f'nn-{str(structure)}-{act}-{alpha}'
            print(name)

            clf = MLPClassifier(
                hidden_layer_sizes=structure,
                activation=act,
                alpha=alpha,
                max_iter=5000,
                # Pin weight initialisation / shuffling so that differences
                # between configurations are not just seed noise and the
                # results are reproducible on Restart & Run All.
                random_state=0,
            )
            clf.fit(trainx, trainy)
            pred = clf.predict(testx)

            nn_acc[name] = get_acc(pred, testy)

In [ ]:
nn_acc