In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from typing import List

In [2]:
parent_dir = './Data/In-lab/'
paths = np.array(list(os.listdir(parent_dir)))
np.random.shuffle(paths)
training_paths = paths[:-4]
validation_paths = paths[-4:]

def load_data_from_volunteer(path:str):
    df = pd.read_csv(path.rstrip('/')+'/labeledfeatures.csv')
    df = df[~df['EventType'].isna()]
    df = df.loc[:, 'mean':"EventType"]
    label_map = {r'[.]*[Rr][Ee][Ss][Tt][.]*': 0,
                 r'[.]*': 1}
    df['EventType'] = df['EventType'].replace(regex=label_map)
    return df.to_numpy()


def load_data(paths: List[str]):
    data = []
    for path in paths:
        data.append(load_data_from_volunteer(parent_dir + path))
    return np.concatenate(data, axis=0)

trainX = load_data(training_paths)
valX = load_data(validation_paths)

trainX, trainY = trainX[:, :-1], trainX[:, -1]
valX, valY = valX[:, :-1], valX[:, -1]

In [3]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(trainX)
trainX = scaler.transform(trainX)
valX = scaler.transform(valX)


In [4]:
from sklearn.svm import SVC


svm_clf = SVC()
svm_clf.fit(trainX, trainY)
pred_train_y = svm_clf.predict(trainX)
pred_val_y = svm_clf.predict(valX)

acc_train = accuracy_score(trainY, pred_train_y)
acc_val = accuracy_score(valY, pred_val_y)

print(f'Training Accuracy: {acc_train} | Validation Accuracy: {acc_val}')

Training Accuracy: 0.7455882352941177 | Validation Accuracy: 0.6506550218340611


In [5]:
from sklearn.neighbors import KNeighborsClassifier


knn_clf = KNeighborsClassifier()
knn_clf.fit(trainX, trainY)
pred_train_y = knn_clf.predict(trainX)
pred_val_y = knn_clf.predict(valX)

acc_train = accuracy_score(trainY, pred_train_y)
acc_val = accuracy_score(valY, pred_val_y)

print(f'Training Accuracy: {acc_train} | Validation Accuracy: {acc_val}')

Training Accuracy: 0.8220588235294117 | Validation Accuracy: 0.6703056768558951


In [6]:
from sklearn.linear_model import LogisticRegression


LR_clf = LogisticRegression()
LR_clf.fit(trainX, trainY)
pred_train_y = LR_clf.predict(trainX)
pred_val_y = LR_clf.predict(valX)

acc_train = accuracy_score(trainY, pred_train_y)
acc_val = accuracy_score(valY, pred_val_y)

print(f'Training Accuracy: {acc_train} | Validation Accuracy: {acc_val}')

Training Accuracy: 0.7235294117647059 | Validation Accuracy: 0.648471615720524


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier()
rf_clf.fit(trainX, trainY)
pred_train_y = rf_clf.predict(trainX)
pred_val_y = rf_clf.predict(valX)

acc_train = accuracy_score(trainY, pred_train_y)
acc_val = accuracy_score(valY, pred_val_y)

print(f'Training Accuracy: {acc_train} | Validation Accuracy: {acc_val}')

Training Accuracy: 1.0 | Validation Accuracy: 0.6506550218340611
