In [1]:
from utils.htil_toolbox import Subject, load_files, NumpyDatasetGroupSplit, nn_eval, SimpleLinear
import torch
#import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
import math
torch.manual_seed(42)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold,GridSearchCV,cross_val_score,cross_validate, GroupShuffleSplit, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVR, SVR
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB


def getBasicPipeline(clf):
    """Wrap *clf* in a two-step pipeline: standard scaling, then the classifier.

    The steps are named ``'scaler'`` and ``'classifier'`` so that grid-search
    parameter names such as ``classifier__C`` can address the estimator.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('classifier', clf),
    ]
    return Pipeline(steps)

def getPolynomialPipeline(clf, degree=2):
    """Build a scaling + polynomial-expansion + classifier pipeline.

    Previously this function returned exactly the same pipeline as
    ``getBasicPipeline`` (no polynomial step at all), so selecting it had no
    effect. It now inserts a ``PolynomialFeatures`` step between the scaler
    and the classifier.

    Args:
        clf: The estimator to place in the final ``'classifier'`` step.
        degree: Degree of the polynomial feature expansion. Defaults to 2.

    Returns:
        A ``Pipeline`` with steps ``'scaler'``, ``'poly'`` and ``'classifier'``.
    """
    return Pipeline(
        [
            ('scaler', StandardScaler()),
            # include_bias=False: skip the constant column of ones that
            # PolynomialFeatures would otherwise add.
            ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
            ('classifier', clf),
        ]
    )


def evalML(data_array, label_array, group_array, pipelineFunc=getBasicPipeline, splits=5, classifier=None, n_jobs=16):
    """Fit a pipeline around *classifier* with group-aware cross-validation.

    Args:
        data_array: Training samples passed to ``GridSearchCV.fit``.
        label_array: Training labels passed to ``GridSearchCV.fit``.
        group_array: Group label per sample; forwarded as ``groups`` so that
            ``GroupKFold`` keeps each group inside a single fold.
        pipelineFunc: Callable taking the classifier and returning the
            pipeline to evaluate.
        splits: Number of ``GroupKFold`` folds.
        classifier: Estimator to evaluate. ``None`` (the default) means a
            fresh ``LogisticRegression()`` is created for this call, instead
            of one instance built at definition time and shared by all calls.
        n_jobs: Number of parallel jobs handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    if classifier is None:
        classifier = LogisticRegression()

    gkf = GroupKFold(n_splits=splits)
    # The grid is intentionally empty, so the search evaluates only the
    # pipeline's own settings. To tune, supply entries keyed by step name,
    # e.g. {'classifier__C': [0.01, 0.05, 0.1, 0.5, 1, 2, 3, 4, 5, 8, 10, 12, 15]}.
    param_grid = {}
    pipe = pipelineFunc(classifier)
    gscv = GridSearchCV(pipe, param_grid, cv=gkf, n_jobs=n_jobs)
    gscv.fit(data_array, label_array, groups=group_array)
    # NOTE: best_score_ is the mean cross-validated score of the best
    # candidate, not accuracy measured on the data the model was fit on.
    print("Training acc: ", gscv.best_score_)
    return gscv
    
def validation(x_train, y_train, x_validate, y_validate, groups):
    """Evaluate the selected classifiers on a train/validation split.

    Args:
        x_train: Training samples.
        y_train: Training labels.
        x_validate: Held-out validation samples.
        y_validate: Held-out validation labels.
        groups: Group label for each training sample, used for group-aware
            cross-validation during fitting.

    Returns:
        Whatever ``testClassifiers`` returns for the selected classifiers.
    """
    # Only the estimators that are actually evaluated are constructed.
    # Decision-tree, random-forest, AdaBoost and Gaussian naive Bayes
    # candidates used to be built here as well but were never passed on;
    # the AdaBoost one also relied on algorithm="SAMME.R", which newer
    # scikit-learn releases deprecate, so building it was a latent failure.
    classifiers = [
        SVC(C=5, kernel='linear'),
        LinearDiscriminantAnalysis(solver='svd'),
        KNeighborsClassifier(n_neighbors=4),
    ]

    return testClassifiers(classifiers, x_train, y_train, groups, x_validate, y_validate)

def testClassifiers(classifiers, train_data, train_labels, groups, test_data, test_labels):
    """Fit each classifier via ``evalML`` and report its validation accuracy.

    For every estimator in *classifiers* this prints the estimator, fits it
    with 10 group-aware folds on the training data, then prints the accuracy
    of its predictions on the test data.

    Returns:
        The list of fitted objects returned by ``evalML``, in input order.
    """
    fitted_searches = []
    for estimator in classifiers:
        print(estimator)
        # Fit on the training portion.
        search = evalML(
            train_data,
            train_labels,
            groups,
            pipelineFunc=getBasicPipeline,
            splits=10,
            classifier=estimator,
        )
        fitted_searches.append(search)
        # Score on the held-out portion.
        predictions = search.predict(test_data)
        print("validation acc", accuracy_score(test_labels, predictions))
    return fitted_searches

def split(x, y, groups):
    """Split the data into train and validation parts without splitting groups.

    Uses a single ``GroupShuffleSplit`` with ``test_size=0.2`` and a fixed
    ``random_state`` of 16.

    Returns:
        ``(x_train, y_train, x_validate, y_validate, groups_train)``.
    """
    shuffler = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=16)
    # n_splits=1, so the first (and only) index pair is all that is needed.
    train_idx, validate_idx = next(shuffler.split(x, groups=groups))
    return (
        x[train_idx],
        y[train_idx],
        x[validate_idx],
        y[validate_idx],
        groups[train_idx],
    )

def main():
    """Load the experiment-1 arrays, split them, and run classifier validation.

    Prints the shapes of the split arrays, then flattens every sample to one
    feature vector before handing the data to ``validation``.
    """
    features = np.load('data/dataframes/x_raw_exp1.npy')
    labels = np.load('data/dataframes/y_raw_exp1.npy')
    group_ids = np.load('data/dataframes/group_raw_exp1.npy')

    x_train, y_train, x_validate, y_validate, groups_train = split(features, labels, group_ids)
    print(x_train.shape, y_train.shape, x_validate.shape, y_validate.shape)

    # Collapse all trailing dimensions so each sample becomes one row.
    n_train = x_train.shape[0]
    n_validate = x_validate.shape[0]
    flat_train = x_train.reshape(n_train, -1)
    flat_validate = x_validate.reshape(n_validate, -1)

    # The fitted models are returned by validation() but not used here.
    validation(flat_train, y_train, flat_validate, y_validate, groups_train)

#main()

  from .autonotebook import tqdm as notebook_tqdm


(3166, 4, 23) (3166,) (890, 4, 23) (890,)
SVC(C=5, kernel='linear')
Training acc:  0.5264707990771986
validation acc 0.503370786516854
LinearDiscriminantAnalysis()
Training acc:  0.5252714768329625
validation acc 0.5112359550561798
KNeighborsClassifier(n_neighbors=4)
Training acc:  0.4732749485462337
validation acc 0.4764044943820225


In [1]:
import matplotlib as mpl
mpl.__version__

'3.6.2'