In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import argparse

from sklearn.preprocessing import StandardScaler
import sklearn.svm
import sklearn.neighbors
import sklearn.linear_model
import sklearn.neural_network
import sklearn.ensemble

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.model_selection import permutation_test_score
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from imblearn.over_sampling import SMOTE,RandomOverSampler
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

import lightgbm as lgbm

import seaborn as sns
from itertools import product

In [161]:
def load_data():
    """Build the feature matrix and label vector from the CSV files on disk.

    Reads ``data_hursts_{x,y,z}_large_scales.csv`` (first column as index),
    appends ``_<axis>`` to every column name, joins the three tables
    column-wise and drops every column that contains a NaN. Labels come from
    the ``CDR`` column of ``data_labels.csv``: rows with ``CDR == 2`` or a
    missing ``CDR`` are discarded, and the remaining values are doubled and
    cast to ``int``.

    Returns:
        tuple: ``(X, y)`` as NumPy arrays. ``X`` holds the feature rows
        selected by the index of the surviving labels, ``y`` the matching
        integer labels.
    """
    per_axis_frames = []
    for axis in ('x', 'y', 'z'):
        frame = pd.read_csv(f'data_hursts_{axis}_large_scales.csv', index_col=0)
        # Tag each column with its axis so the three tables can sit side by side.
        per_axis_frames.append(frame.rename(columns=lambda name: name + '_' + axis))
    features = clean_data_nans(pd.concat(per_axis_frames, axis=1))

    labels = pd.read_csv('data_labels.csv', index_col=0)
    cdr = labels.loc[labels['CDR'] != 2, 'CDR'].dropna()

    y = (cdr.to_numpy() * 2).astype(int)
    # Label-based lookup: every remaining label index must exist in the features.
    X = features.loc[cdr.index].to_numpy()

    return X, y

def clean_data_nans(df):
    """Return ``df`` restricted to the columns that contain no missing values.

    Args:
        df: DataFrame to filter.

    Returns:
        DataFrame with every row of ``df`` but only its NaN-free columns.
    """
    complete_columns = df.notna().all(axis=0)
    return df.loc[:, complete_columns]

def smooth_X(X, window_size = 3):
    """Smooth ``X`` along its last axis with an unweighted moving average.

    Every output value is the mean of ``window_size`` consecutive entries of
    the last axis, so that axis shrinks by ``window_size - 1`` while all
    leading axes are kept as they are.

    Args:
        X: Array-like whose last axis is smoothed.
        window_size: Number of consecutive entries averaged per output value.

    Returns:
        numpy.ma.MaskedArray of shape ``X.shape[:-1] + (X.shape[-1] - window_size + 1,)``.
    """
    # Window only along the last axis. The previous (1, window_size) window
    # followed by a bare squeeze() also removed genuine length-1 axes, which
    # broke single-row inputs, windows as wide as X, and window_size == 1.
    windows = np.lib.stride_tricks.sliding_window_view(X, window_shape=window_size, axis=-1)
    weights = np.full(window_size, 1/window_size)
    return np.ma.average(windows, axis=-1, weights=weights)

def pipeline_param_grid(param_grid,model_name):
    """Prefix every hyper-parameter name with ``<model_name>__``.

    Args:
        param_grid: Iterable of dicts mapping parameter names to candidate
            values.
        model_name: Name of the pipeline step the parameters belong to.

    Returns:
        list: New dicts, one per input dict and in the same order, whose keys
        are ``model_name + '__' + key`` and whose values are the original
        value objects.
    """
    return [
        {model_name + '__' + param: candidates for param, candidates in grid.items()}
        for grid in param_grid
    ]

def train_model_nestgrid(model,
                         X,y,
                         hparams,
                         rebalance_type='none',
                         cv_param_folds=5,
                         cv_select_folds=5,
                         random_seed=12234,
                         verbose=None,
                         n_jobs=None):
    """Score ``model`` with nested, stratified cross-validation.

    The estimator is wrapped in an imblearn ``Pipeline`` made of an optional
    resampler, a ``StandardScaler`` and the model (step name ``'model'``).
    An inner ``GridSearchCV`` searches ``hparams`` and refits on the Matthews
    correlation coefficient; an outer ``cross_validate`` then scores that
    search object on accuracy, macro F1 and MCC.

    Args:
        model: Estimator placed in the final pipeline step.
        X: Feature matrix passed to ``cross_validate``.
        y: Class labels passed to ``cross_validate``.
        hparams: Parameter grid for ``GridSearchCV``; keys must address the
            pipeline steps (e.g. ``'model__C'``).
        rebalance_type: One of ``'none'``, ``'over_smote'``,
            ``'over_random'``, ``'combine_smotenn'`` or ``'under_random'``.
        cv_param_folds: Number of inner (hyper-parameter search) folds.
        cv_select_folds: Number of outer (evaluation) folds.
        random_seed: Seed for the resampler and for both fold splitters.
        verbose: Verbosity forwarded to ``GridSearchCV`` and
            ``cross_validate``. ``None`` falls back to a notebook-level
            ``verbose`` variable if one exists, otherwise ``False``.
        n_jobs: Parallelism forwarded to ``cross_validate``. ``None`` falls
            back to a notebook-level ``n_jobs`` variable if one exists,
            otherwise it is passed on as ``None``; pass ``1`` explicitly to
            force a serial run.

    Returns:
        tuple: ``(grid, cv_select_results)``. ``grid`` is the configured
        ``GridSearchCV`` object; this function never calls ``fit`` on it
        directly. ``cv_select_results`` is the dict returned by
        ``cross_validate`` (requested with ``return_estimator=True`` and
        ``return_train_score=True``).

    Raises:
        ValueError: If ``rebalance_type`` is not one of the supported names.
    """
    pipeline = []

    if rebalance_type == 'over_smote':
        pipeline.append(('resampler',SMOTE(random_state = random_seed)))
    elif rebalance_type == 'combine_smotenn':
        pipeline.append(('resampler',SMOTEENN(random_state = random_seed)))
    elif rebalance_type == 'under_random':
        pipeline.append(('resampler',RandomUnderSampler(random_state = random_seed)))
    elif rebalance_type == 'over_random':
        pipeline.append(('resampler',RandomOverSampler(random_state = random_seed)))
    elif rebalance_type == 'none':
        pass
    else:
        # A misspelled name used to fall through and train without any
        # resampling while the results were still labelled with that name.
        raise ValueError(
            f"Unknown rebalance_type {rebalance_type!r}; expected one of "
            "'none', 'over_smote', 'over_random', 'combine_smotenn', 'under_random'."
        )

    # Earlier versions read `verbose` and `n_jobs` straight from the notebook
    # namespace. Keep honouring those globals when the caller does not pass
    # the arguments, so existing cells behave exactly as before.
    if verbose is None:
        verbose = globals().get('verbose', False)
    if n_jobs is None:
        n_jobs = globals().get('n_jobs')

    pipeline.append(('scaler', StandardScaler()))
    pipeline.append(('model', model))
    pipeline = Pipeline(pipeline)

    inner_fold = StratifiedKFold(n_splits=cv_param_folds,
                                 shuffle=True,
                                 random_state=random_seed)

    mcc = sklearn.metrics.make_scorer(sklearn.metrics.matthews_corrcoef)
    scoring = dict(accuracy='accuracy',
                   f1_macro='f1_macro',
                   mcc=mcc)

    grid = GridSearchCV(estimator = pipeline,
                        param_grid = hparams,
                        cv = inner_fold,
                        verbose = verbose,
                        scoring = scoring,
                        refit = 'mcc')

    outer_cv = StratifiedKFold(n_splits=cv_select_folds,
                               shuffle=True,
                               random_state=random_seed)

    # The whole grid search is the estimator here, so hyper-parameters are
    # re-tuned inside every outer training split.
    cv_select_results = cross_validate(estimator = grid,
                                       X = X,
                                       y = y,
                                       cv = outer_cv,
                                       n_jobs = n_jobs,
                                       verbose = verbose,
                                       return_estimator = True,
                                       return_train_score = True,
                                       scoring = scoring)
    return grid, cv_select_results

# Keys copied out of a cross_validate result: train/test variants of each score.
metrics = [f'{split}_{metric}'
           for metric in ('accuracy','f1_macro','mcc')
           for split in ('train','test')]

def calc_results(cv_select_results, metrics = metrics):
    """Pick the requested score entries out of a cross-validation result.

    Args:
        cv_select_results: Mapping that contains every key in ``metrics``.
        metrics: Keys to copy; defaults to the module-level ``metrics`` list
            as it was when this function was defined.

    Returns:
        dict: ``{key: cv_select_results[key]}`` for each key, in the order
        given by ``metrics``.

    Raises:
        KeyError: If a requested key is missing from ``cv_select_results``.
    """
    return {f'{name}': cv_select_results[name] for name in metrics}


In [162]:
# Load the feature matrix X and the integer label vector y returned by load_data().
X,y = load_data()

## single model tests

In [44]:
# Run settings. Later cells also read verbose, random_seed and n_jobs from here.
verbose = False
rebalance_type = 'none'
cv_param_folds, cv_select_folds = 5, 5
random_seed = 1234
n_jobs = -1

# Support-vector classifier and the raw grid searched for it.
model = sklearn.svm.SVC(verbose=verbose)
param_grid = [dict(C=10.**np.arange(-3,3,2),
                   gamma=10.**np.arange(-3,3,1),
                   kernel=['rbf', 'sigmoid'],
                   decision_function_shape=['ovo','ovr'])]
# Address the parameters to the pipeline step named 'model'.
hparams = pipeline_param_grid(param_grid,'model')

grid, cv_select_results = train_model_nestgrid(
    model,
    X,y,
    hparams,
    rebalance_type = rebalance_type,
    cv_param_folds = cv_param_folds,
    cv_select_folds = cv_select_folds,
    random_seed = random_seed,
)

# Keep the score arrays and tag them with the configuration that produced them.
results = calc_results(cv_select_results)
results.update(model='svc', rebalance_type=rebalance_type)


## cross model tests

In [166]:
rebalance_type_list = ['over_smote','over_random','combine_smotenn','under_random','none']

# One (short name, estimator, raw parameter grid) entry per candidate model.
# verbose, random_seed and n_jobs are read from the notebook namespace, so
# they must already be defined when this cell runs.
_model_specs = [
    ('sgd',
     sklearn.linear_model.SGDClassifier(verbose=verbose,random_state=random_seed),
     [{'loss':['log_loss'],'alpha':np.linspace(0.1,1,10)}]),

    ('svc',
     sklearn.svm.SVC(verbose=verbose),
     [{'C': 10.**np.arange(-3,3,2),
       'gamma': 10.**np.arange(-3,3,1),
       'kernel': [ 'rbf', 'sigmoid'],
       'decision_function_shape': ['ovo','ovr']}]),

    ('knn',
     sklearn.neighbors.KNeighborsClassifier(),
     [{'n_neighbors': range(2,41,1),
       'weights': ['uniform']}]),

    ('lgbm',
     # 2*int(verbose)-1 evaluates to -1 when verbose is False and 1 when it is True.
     lgbm.LGBMClassifier(verbose=2*int(verbose)-1, n_jobs=n_jobs, random_state=random_seed),
     [{'num_leaves': [2,3,4,5],
       'max_depth': [2,3],
       'n_estimators': range(35,75,5),
       'min_child_samples':[4,5,6]
       }]),

    ('mlp',
     sklearn.neural_network.MLPClassifier(hidden_layer_sizes=(80,40,10,),
                                          alpha=20,
                                          max_iter=5000,
                                          random_state=random_seed),
     [{'alpha':np.linspace(30,50,5)}]),

    ('rf',
     sklearn.ensemble.RandomForestClassifier(random_state=random_seed),
     [{'n_estimators':[90,120,150],
       'min_samples_split':[5,10],
       'max_leaf_nodes':[5,10]}]),
]

# Three parallel lists, index-aligned: estimator, its short name, its pipeline grid.
model_list = []
model_name_list = []
hparams_list = []

for _name, _estimator, param_grid in _model_specs:
    model_list.append(_estimator)
    model_name_list.append(_name)
    hparams_list.append(pipeline_param_grid(param_grid,'model'))

In [167]:
verbose = False
cv_param_folds = 5
cv_select_folds = 5
random_seed = 1234

n_jobs = -1

df_results = None
# One small frame per (resampler, model) run. They are concatenated once after
# the loop, because calling pd.concat inside the loop re-copies every row
# collected so far on each iteration.
result_frames = []

for rebalance_type, model_tuple in product(rebalance_type_list,zip(model_list,model_name_list,hparams_list)):

    model, model_name, hparams = model_tuple
    print(model_name,rebalance_type)

    grid, cv_select_results = train_model_nestgrid(model,
                                                   X,y,
                                                   hparams,
                                                   rebalance_type = rebalance_type,
                                                   cv_param_folds = cv_param_folds,
                                                   cv_select_folds = cv_select_folds,
                                                   random_seed = random_seed)
    results = calc_results(cv_select_results)
    results['model'] = model_name
    results['rebalance_type'] = rebalance_type

    # The scalar model / rebalance_type entries are broadcast over the score rows.
    result_frames.append(pd.DataFrame(results))

# Same frame the incremental concat produced; stays None if nothing was run.
df_results = pd.concat(result_frames) if result_frames else None

sgd over_smote
svc over_smote
knn over_smote
lgbm over_smote
mlp over_smote
rf over_smote
sgd over_random
svc over_random
knn over_random
lgbm over_random
mlp over_random
rf over_random
sgd combine_smotenn
svc combine_smotenn
knn combine_smotenn
lgbm combine_smotenn
mlp combine_smotenn
rf combine_smotenn
sgd under_random
svc under_random
knn under_random
lgbm under_random
mlp under_random
rf under_random
sgd none
svc none
knn none
lgbm none
mlp none
rf none


In [168]:
# Write the collected scores to results.csv; the results section below reads this file back.
df_results.to_csv('results.csv')

## results

In [249]:
# Reload the saved scores, using the first CSV column as the index.
df_results = pd.read_csv('results.csv',index_col=0)

In [250]:
# Average every score per (model, rebalance_type) pair, rank by test MCC, show the top rows.
mean_scores = df_results.groupby(['model','rebalance_type']).mean().reset_index()
ranked_scores = mean_scores.sort_values(by='test_mcc',ascending=False)
ranked_scores.head().loc[:,['rebalance_type','model','test_accuracy','test_f1_macro','test_mcc']]

Unnamed: 0,rebalance_type,model,test_accuracy,test_f1_macro,test_mcc
8,over_smote,lgbm,0.56272,0.483093,0.261954
18,over_smote,rf,0.532747,0.456586,0.246613
28,over_smote,svc,0.523497,0.434067,0.236747
12,over_random,mlp,0.549491,0.4797,0.232935
17,over_random,rf,0.541073,0.448499,0.228025
