In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from math import sqrt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

import torch
from torch import nn

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from multiclass_performanceMetrics import *
from dataMining_functions import *
from neuralNet_functions import *
from nested_design_analysis import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from imblearn.over_sampling import SMOTE

In [4]:
# load csv files containing slope of wavelet decomposition extracted from data based on discrete wavelet transform
slope_m = pd.read_csv('features/window512_neuron_slope_rowwindow_Molino.csv', header=None)
slope_p = pd.read_csv('features/window512_neuron_slope_rowwindow_Pachon.csv', header=None)
slope_s = pd.read_csv('features/window512_neuron_slope_rowwindow_Surface.csv', header=None)

In [5]:
slope_m

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,869,870,871,872,873,874,875,876,877,878
0,-0.067605,-0.118922,-0.094129,0.066032,-0.238939,0.254376,-0.040246,-0.523493,0.087673,0.015258,...,0,0,0,0,0,0,0,0,0,0
1,0.332007,-0.134249,-0.199915,0.160182,-0.087102,0.085284,-0.115161,-0.078553,-0.028346,0.131704,...,0,0,0,0,0,0,0,0,0,0
2,-0.038976,-0.040227,0.055008,-0.038261,-0.154654,0.070836,-0.114923,0.036480,0.135799,0.022447,...,0,0,0,0,0,0,0,0,0,0
3,0.212578,0.135834,-0.017293,-0.067745,0.269292,-0.100228,0.107695,-0.215125,0.118761,-0.020157,...,0,0,0,0,0,0,0,0,0,0
4,-0.162624,-0.076423,-0.017957,0.056792,-0.081956,-0.101262,-0.147040,0.138392,0.080331,-0.032050,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,-0.066847,0.015115,-0.005959,0.140707,-0.069607,-0.107607,-0.848645,-0.110150,-0.414189,0.010325,...,0,0,0,0,0,0,0,0,0,0
172,0.016731,-0.081127,0.099292,-0.113153,0.166376,0.124942,-0.647028,-0.153044,-0.502469,-0.102382,...,0,0,0,0,0,0,0,0,0,0
173,0.065406,-0.255316,-0.078730,-0.139135,-0.008821,-0.238510,-0.553786,-0.415466,-0.494167,-0.203211,...,0,0,0,0,0,0,0,0,0,0
174,-0.180683,-0.158076,-0.078359,-0.168406,0.140557,-0.090916,-0.817421,-0.217100,0.018712,0.150379,...,0,0,0,0,0,0,0,0,0,0


In [6]:
min_col = 275  # minimum number of neurons of a fish in the data

# slicing the data set
slope_m = slope_m.iloc[:, :min_col]
slope_p = slope_p.iloc[:, :min_col]
slope_s = slope_s.iloc[:, :min_col]

# compute Hurst exponent
slope_m = (slope_m + 1)/2
slope_p = (slope_p + 1)/2
slope_s = (slope_s + 1)/2


# class lables
# molino = 0, pachon = 1, surface = 2
slope_m['class'] = 0
slope_p['class'] = 1
slope_s['class'] = 2
num_class = 3





In [7]:
slope_m.shape, slope_p.shape, slope_s.shape

((176, 276), (176, 276), (121, 276))

In [8]:
# get the number of nonoverlapping window obtained from the neural signal
num_window = slope_m.shape[0] / 16  # 16 is the number of fishes in molino group in the dataset
num_window

11.0

In [9]:
# combine all fish classes datasets
df = pd.concat([slope_m, slope_p], axis=0)
df = pd.concat([df, slope_s], axis=0)

In [10]:
# apply balanced nested design to the dataset
# to break the dependency caused by subjects(fishes) on the neural signals of a fish
df = balanced_nested_design_sampling(df, num_window=num_window)

# Classifications

In [11]:
# shuffle data set for more randomness during training
df = df.sample(frac=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,266,267,268,269,270,271,272,273,274,class
180,0.622941,0.513346,0.639624,0.571883,0.468604,0.212209,0.450531,0.18384,0.595156,0.612845,...,-0.764694,0.590716,0.085538,0.233663,0.660313,-0.061753,-0.294556,-0.003977,-0.128442,1
155,0.472821,0.428433,0.513067,0.480952,0.276771,0.467638,0.276713,0.425258,0.499106,0.554421,...,0.482482,0.72396,0.538948,0.503018,0.331014,0.529846,0.575509,0.333,0.565632,1
333,0.488498,0.456501,0.535857,0.296304,0.457634,0.437827,0.526757,0.437954,0.467341,0.368531,...,0.546228,-0.007945,0.022178,0.435559,0.531495,-0.262726,0.576122,0.02423,0.437662,2
135,0.41127,0.342418,0.371153,0.440253,0.554291,0.507482,0.221412,0.49507,0.319191,0.318409,...,0.314481,0.418021,0.371473,0.384378,0.361355,0.517646,-0.134003,0.468478,0.258481,1
169,0.523019,0.520932,0.440093,0.423419,0.455561,0.401018,0.451781,0.449161,0.404028,0.326113,...,0.405144,0.214405,0.435032,0.423786,0.427322,0.197284,0.117755,0.284588,0.123334,1


In [12]:
# train and test data split
X, y = df.drop(labels='class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
# Apply SMOTE to balance the training data
# sometimes random train and test data split causes minor data imbalance
# not a big issue, but used SMOTE just in case
sm = SMOTE(random_state=42, k_neighbors=12)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [14]:
y_train.value_counts()

2    101
0    101
1    101
Name: class, dtype: int64

In [15]:
# Logistic Regression
lr_mod = LogisticRegression(max_iter=700)
lr_mod.fit(X_train, y_train)
train_pred = lr_mod.predict(X_train)
test_pred = lr_mod.predict(X_test)

In [16]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       101
      Pachon       1.00      1.00      1.00       101
     Surface       1.00      1.00      1.00       101

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



array([[101,   0,   0],
       [  0, 101,   0],
       [  0,   0, 101]])

In [17]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       0.96      1.00      0.98        26
      Pachon       1.00      1.00      1.00        20
     Surface       1.00      0.96      0.98        27

    accuracy                           0.99        73
   macro avg       0.99      0.99      0.99        73
weighted avg       0.99      0.99      0.99        73



array([[26,  0,  0],
       [ 0, 20,  0],
       [ 1,  0, 26]])

In [18]:
def SMOTE_oversample(data):
    # param: data is a dataframe (normally training data) where the target feature is in the last column
    # return balanced data by applying SMOTE oversampling technique
    # this function will be used in repeat_sampling_and_training function
    #             to apply data preprocessing for every sample
    X, y = data.iloc[:,:-1], data.iloc[:,-1]
    sm = SMOTE(random_state=42, k_neighbors=12)
    X, y = sm.fit_resample(X, y)
    df = pd.concat([X, y], axis=1)
    df = df.sample(frac=1)
    return df

In [19]:
def build_logisticRegression(**kwargs):
    # return an untrained logistic regression model
    return LogisticRegression(max_iter=500)

model_f = build_logisticRegression
predictors = list(df.drop(labels='class', axis=1).columns)  # names of the features
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

In [20]:
# print out model performance and confusion matrix (averaged over all sampling repetitions)
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.9633027522935776
Standard deviation of testing_accuracy: 3.3306690738754696e-16
Mean of AUC_score: 0.9982482482482483
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 0.9729729729729729
Standard deviation of recall_class1: 1.1102230246251565e-16
Mean of recall_class2: 0.9166666666666667
Standard deviation of recall_class2: 1.1102230246251565e-16
Mean of precision_class0: 0.9729729729729729
Standard deviation of precision_class0: 1.1102230246251565e-16
Mean of precision_class1: 0.923076923076923
Standard deviation of precision_class1: 1.1102230246251565e-16
Mean of precision_class2: 1.0
Standard deviation of precision_class2: 0.0
Mean of f1_class0: 0.986301369863014
Standard deviation of f1_class0: 2.220446049250313e-16
Mean of f1_class1: 0.9473684210526316
Standard deviation of f1_class1: 1.110223024625156

Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,36.0,0.0,0.0
Actual Class 1,1.0,36.0,0.0
Actual Class 2,0.0,3.0,33.0


In [21]:
# L1 penalty(Lasso) Logistic Regression
# to check if some neurons are responding more to the stimulation in vision
# if yes, then the corresponding column index of a neuron responds less to the stimulus
#               will have 0 coefficient in the weight matrix of model
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation based grid search
# to find the best hyper parameters for L1 penalty Logistic Regression
parameters = {'solver':['liblinear', 'saga'], 'C':[10, 1, 0.1, 0.01]}
model = LogisticRegression(max_iter=1000, penalty='l1')
gs = GridSearchCV(model, parameters, n_jobs=-1, cv=5)
gs.fit(X_train, y_train)
gs.best_params_



{'C': 10, 'solver': 'saga'}

In [22]:
# train a model based on the best hyperparameter found in the given hyperparameter space
lr_mod = LogisticRegression(max_iter=1000, penalty='l1', C=10, solver='saga')
lr_mod.fit(X_train, y_train)
train_pred = lr_mod.predict(X_train)
test_pred = lr_mod.predict(X_test)



In [23]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       101
      Pachon       1.00      1.00      1.00       101
     Surface       1.00      1.00      1.00       101

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



array([[101,   0,   0],
       [  0, 101,   0],
       [  0,   0, 101]])

In [24]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       0.96      0.96      0.96        26
      Pachon       1.00      1.00      1.00        20
     Surface       0.96      0.96      0.96        27

    accuracy                           0.97        73
   macro avg       0.97      0.97      0.97        73
weighted avg       0.97      0.97      0.97        73



array([[25,  0,  1],
       [ 0, 20,  0],
       [ 1,  0, 26]])

In [25]:
w = lr_mod.coef_
eps = 1e-4
count = 0
for j in range(w.shape[1]):
    if all(w[:, j] < eps):
        count += 1
print(f"The number of components of the weight less than {eps} is {count} out of {w.shape[1]} number of features")

The number of components of the weight less than 0.0001 is 191 out of 275 number of features


The L1 penalty logistic regression model still classify eyeless fishes well. However, as we can see from the weight matrix of the model, 172 neurons out of 275 don't contribute in distinguishing fish types.

In [26]:
# KNN classifier without data balancing
knn_mod = KNeighborsClassifier()
knn_mod.fit(X_train, y_train)
train_pred = knn_mod.predict(X_train)
test_pred = knn_mod.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [27]:
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       101
      Pachon       0.99      1.00      1.00       101
     Surface       1.00      0.99      1.00       101

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



array([[101,   0,   0],
       [  0, 101,   0],
       [  0,   1, 100]])

In [28]:
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        26
      Pachon       0.95      1.00      0.98        20
     Surface       1.00      0.96      0.98        27

    accuracy                           0.99        73
   macro avg       0.98      0.99      0.99        73
weighted avg       0.99      0.99      0.99        73



array([[26,  0,  0],
       [ 0, 20,  0],
       [ 0,  1, 26]])

In [29]:
# KNN
def build_knn(**kwargs):
    # return an untrained KNN model
    return KNeighborsClassifier()

model_f = build_knn
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [30]:
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.981651376146789
Standard deviation of testing_accuracy: 0.0
Mean of AUC_score: 0.9992492492492492
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 1.0
Standard deviation of recall_class1: 0.0
Mean of recall_class2: 0.9444444444444444
Standard deviation of recall_class2: 0.0
Mean of precision_class0: 1.0
Standard deviation of precision_class0: 0.0
Mean of precision_class1: 0.9487179487179485
Standard deviation of precision_class1: 2.220446049250313e-16
Mean of precision_class2: 1.0
Standard deviation of precision_class2: 0.0
Mean of f1_class0: 1.0
Standard deviation of f1_class0: 0.0
Mean of f1_class1: 0.9736842105263156
Standard deviation of f1_class1: 2.220446049250313e-16
Mean of f1_class2: 0.9714285714285715
Standard deviation of f1_class2: 1.1102230246251565e-16


Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,36.0,0.0,0.0
Actual Class 1,0.0,37.0,0.0
Actual Class 2,0.0,2.0,34.0


In [31]:
# SVM
svm_mod = SVC()
svm_mod.fit(X_train, y_train)
train_pred = svm_mod.predict(X_train)
test_pred = svm_mod.predict(X_test)

In [32]:
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       101
      Pachon       1.00      1.00      1.00       101
     Surface       1.00      1.00      1.00       101

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



array([[101,   0,   0],
       [  0, 101,   0],
       [  0,   0, 101]])

In [33]:
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        26
      Pachon       1.00      1.00      1.00        20
     Surface       1.00      1.00      1.00        27

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73



array([[26,  0,  0],
       [ 0, 20,  0],
       [ 0,  0, 27]])

In [34]:
# SVM

def build_svm(**kwargs):
    # return an untrained SVM model
    return SVC(probability=True)

model_f = build_svm
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

In [35]:
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.981651376146789
Standard deviation of testing_accuracy: 0.0
Mean of AUC_score: 1.0
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 1.0
Standard deviation of recall_class1: 0.0
Mean of recall_class2: 0.9444444444444444
Standard deviation of recall_class2: 0.0
Mean of precision_class0: 1.0
Standard deviation of precision_class0: 0.0
Mean of precision_class1: 0.9487179487179485
Standard deviation of precision_class1: 2.220446049250313e-16
Mean of precision_class2: 1.0
Standard deviation of precision_class2: 0.0
Mean of f1_class0: 1.0
Standard deviation of f1_class0: 0.0
Mean of f1_class1: 0.9736842105263156
Standard deviation of f1_class1: 2.220446049250313e-16
Mean of f1_class2: 0.9714285714285715
Standard deviation of f1_class2: 1.1102230246251565e-16


Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,36.0,0.0,0.0
Actual Class 1,0.0,37.0,0.0
Actual Class 2,0.0,2.0,34.0
