In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from math import sqrt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA

import torch
from torch import nn

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from multiclass_performanceMetrics import *
from dataMining_functions import *
from neuralNet_functions import *
from nested_design_analysis import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from imblearn.over_sampling import SMOTE

In [4]:
# load csv files containing entropy extracted from data based on discrete wavelet transform (two finest levels)
entropy_m = pd.read_csv('features/window512_neuron_entropy_rowwindow_Molino.csv', header=None)
entropy_p = pd.read_csv('features/window512_neuron_entropy_rowwindow_Pachon.csv', header=None)
entropy_s = pd.read_csv('features/window512_neuron_entropy_rowwindow_Surface.csv', header=None)

In [5]:
entropy_m

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,869,870,871,872,873,874,875,876,877,878
0,7.893035,6.539838,8.013306,7.729771,7.895328,7.904673,7.777006,2.092313,7.206554,7.775234,...,0,0,0,0,0,0,0,0,0,0
1,7.888959,7.481470,7.961513,7.758067,7.923035,7.882443,7.908902,7.665921,7.761139,8.042748,...,0,0,0,0,0,0,0,0,0,0
2,7.323106,7.173376,7.899712,7.980307,7.830376,7.952646,8.005411,7.939777,7.904558,7.959187,...,0,0,0,0,0,0,0,0,0,0
3,7.646228,7.791536,7.905634,7.774594,7.885257,7.999103,7.782272,6.043083,7.570184,7.514908,...,0,0,0,0,0,0,0,0,0,0
4,7.874129,7.687619,7.830016,7.672146,7.996142,7.895995,7.407081,7.692267,7.947519,7.166222,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,7.991577,7.734572,7.921799,7.928110,7.919026,7.853644,3.308346,7.914074,7.015459,7.973388,...,0,0,0,0,0,0,0,0,0,0
172,7.917425,7.828190,7.899973,5.127672,7.962174,7.854050,5.380408,4.102701,4.132467,7.940133,...,0,0,0,0,0,0,0,0,0,0
173,7.893928,5.378387,7.872902,4.426261,8.080024,7.930582,3.236722,5.033111,3.782166,7.636636,...,0,0,0,0,0,0,0,0,0,0
174,7.941442,7.748443,7.906529,6.403298,7.948029,7.843256,4.618738,3.950022,6.467974,7.901544,...,0,0,0,0,0,0,0,0,0,0


In [6]:
min_col = 275  # minimum number of neurons of a fish in the data

# slicing the data set
entropy_m = entropy_m.iloc[:, :min_col]
entropy_p = entropy_p.iloc[:, :min_col]
entropy_s = entropy_s.iloc[:, :min_col]

# class lables
# molino = 0, pachon = 1, surface = 2
entropy_m['class'] = 0
entropy_p['class'] = 1
entropy_s['class'] = 2
num_class = 3

# combine all fish classes datasets
df = pd.concat([entropy_m, entropy_p], axis=0)
df = pd.concat([df, entropy_s], axis=0)




In [7]:
# get the number of nonoverlapping window obtained from the neural signal
num_window = entropy_m.shape[0] / 16  # 16 is the number of fishes in molino group in the dataset
num_window

11.0

In [8]:
# apply balanced nested design to the dataset
# to break the dependency caused by subjects(fishes) on the neural signals of a fish
df = balanced_nested_design_sampling(df, num_window=num_window)


# Classifications

In [9]:
# shuffle data set for more randomness during training
df = df.sample(frac=1)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,266,267,268,269,270,271,272,273,274,class
64,8.000075,8.134985,7.975749,8.029195,8.143746,7.39104,8.103675,8.033819,8.073322,7.972685,...,7.966996,8.110592,7.692527,8.056752,8.118324,8.135099,5.506548,7.944978,7.64974,0
274,7.516678,7.605565,7.589559,7.519959,7.666327,7.464143,7.729748,7.647938,7.713722,7.628415,...,7.490236,7.624943,7.543152,7.27125,7.601057,7.35299,7.654594,7.248154,6.982921,2
149,7.34982,5.90607,7.36885,7.371147,7.245899,7.385132,7.407041,7.370028,4.744099,7.557321,...,4.247311,7.590659,4.35686,7.500441,5.947522,4.733771,7.524336,6.881733,5.374411,1
287,8.361956,8.408396,8.384787,8.161379,8.312564,8.446175,8.387452,8.302162,8.314788,8.256542,...,8.304216,8.383763,8.180644,6.184561,6.634781,8.38779,8.179131,7.600551,7.432013,2
306,4.665079,7.482879,7.634192,7.66738,7.503673,7.703136,7.727295,7.051296,7.60265,7.619707,...,7.110573,7.689824,3.054284,7.595081,7.291121,7.024674,6.83675,7.638719,7.524175,2


In [10]:
# train and test data split
X, y = df.drop(labels='class', axis=1), df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
# Apply SMOTE to balance the training data
# sometimes random train and test data split causes minor data imbalance
# not a big issue, but used SMOTE just in case
sm = SMOTE(random_state=42, k_neighbors=12)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [12]:
y_train.value_counts()

1    102
2    102
0    102
Name: class, dtype: int64

In [13]:
# Logistic Regression
lr_mod = LogisticRegression(max_iter=700)
lr_mod.fit(X_train, y_train)
train_pred = lr_mod.predict(X_train)
test_pred = lr_mod.predict(X_test)

In [14]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       102
      Pachon       1.00      1.00      1.00       102
     Surface       1.00      1.00      1.00       102

    accuracy                           1.00       306
   macro avg       1.00      1.00      1.00       306
weighted avg       1.00      1.00      1.00       306



array([[102,   0,   0],
       [  0, 102,   0],
       [  0,   0, 102]])

In [15]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        34
      Pachon       1.00      1.00      1.00        19
     Surface       1.00      1.00      1.00        20

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73



array([[34,  0,  0],
       [ 0, 19,  0],
       [ 0,  0, 20]])

In [16]:
def SMOTE_oversample(data):
    # param: data is a dataframe (normally training data) where the target feature is in the last column
    # return balanced data by applying SMOTE oversampling technique
    # this function will be used in repeat_sampling_and_training function
    #             to apply data preprocessing for every sample
    X, y = data.iloc[:,:-1], data.iloc[:,-1]
    sm = SMOTE(random_state=42, k_neighbors=12)
    X, y = sm.fit_resample(X, y)
    df = pd.concat([X, y], axis=1)
    df = df.sample(frac=1)
    return df

In [17]:
def build_logisticRegression(**kwargs):
    # return an untrained logistic regression model
    return LogisticRegression(max_iter=500)

model_f = build_logisticRegression
predictors = list(df.drop(labels='class', axis=1).columns)  # names of the features
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
# print out model performance and confusion matrix (averaged over all sampling repetitions)
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm  # 0.42, 0.57  (3)

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.981651376146789
Standard deviation of testing_accuracy: 0.0
Mean of AUC_score: 0.9994073020388811
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 0.9736842105263156
Standard deviation of recall_class1: 2.220446049250313e-16
Mean of recall_class2: 0.9729729729729729
Standard deviation of recall_class2: 1.1102230246251565e-16
Mean of precision_class0: 0.9714285714285715
Standard deviation of precision_class0: 1.1102230246251565e-16
Mean of precision_class1: 1.0
Standard deviation of precision_class1: 0.0
Mean of precision_class2: 0.9729729729729729
Standard deviation of precision_class2: 1.1102230246251565e-16
Mean of f1_class0: 0.9855072463768119
Standard deviation of f1_class0: 3.3306690738754696e-16
Mean of f1_class1: 0.9866666666666665
Standard deviation of f1_class1: 1.1102230246251565e-16
Mean of f1_c

Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,34.0,0.0,0.0
Actual Class 1,0.0,37.0,1.0
Actual Class 2,1.0,0.0,36.0


In [19]:
# L1 penalty(Lasso) Logistic Regression
# to check if some neurons are responding more to the stimulation in vision
# if yes, then the corresponding column index of a neuron responds less to the stimulus
#               will have 0 coefficient in the weight matrix of model
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation based grid search
# to find the best hyper parameters for L1 penalty Logistic Regression
parameters = {'solver':['liblinear', 'saga'], 'C':[10, 1, 0.1, 0.01]}
model = LogisticRegression(max_iter=1000, penalty='l1')
gs = GridSearchCV(model, parameters, n_jobs=-1, cv=5)
gs.fit(X_train, y_train)
gs.best_params_



{'C': 10, 'solver': 'saga'}

In [20]:
# train a model based on the best hyperparameter found in the given hyperparameter space
lr_mod = LogisticRegression(max_iter=1000, penalty='l1', C=10, solver='saga')
lr_mod.fit(X_train, y_train)
train_pred = lr_mod.predict(X_train)
test_pred = lr_mod.predict(X_test)



In [21]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       102
      Pachon       1.00      1.00      1.00       102
     Surface       1.00      1.00      1.00       102

    accuracy                           1.00       306
   macro avg       1.00      1.00      1.00       306
weighted avg       1.00      1.00      1.00       306



array([[102,   0,   0],
       [  0, 102,   0],
       [  0,   0, 102]])

In [22]:
target_names = ['Molino', 'Pachon', 'Surface']
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        34
      Pachon       1.00      1.00      1.00        19
     Surface       1.00      1.00      1.00        20

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73



array([[34,  0,  0],
       [ 0, 19,  0],
       [ 0,  0, 20]])

In [23]:
w = lr_mod.coef_
eps = 1e-4
count = 0
for j in range(w.shape[1]):
    if all(w[:, j] < eps):
        count += 1
print(f"The number of components of the weight less than {eps} is {count} out of {w.shape[1]} number of features")

The number of components of the weight less than 0.0001 is 6 out of 275 number of features


The L1 penalty logistic regression model still classify eyeless fishes well. Unlike Hurst exponent features, very few neurons don't contribute in distinguishing fish types as we can see from the weight matrix of the model.

In [24]:
# KNN classifier without data balancing
knn_mod = KNeighborsClassifier()
knn_mod.fit(X_train, y_train)
train_pred = knn_mod.predict(X_train)
test_pred = knn_mod.predict(X_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [25]:
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       0.99      1.00      1.00       102
      Pachon       1.00      0.99      1.00       102
     Surface       1.00      1.00      1.00       102

    accuracy                           1.00       306
   macro avg       1.00      1.00      1.00       306
weighted avg       1.00      1.00      1.00       306



array([[102,   0,   0],
       [  1, 101,   0],
       [  0,   0, 102]])

In [26]:
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        34
      Pachon       1.00      1.00      1.00        19
     Surface       1.00      1.00      1.00        20

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73



array([[34,  0,  0],
       [ 0, 19,  0],
       [ 0,  0, 20]])

In [27]:
# KNN after data balancing with multiple repetition of downsampling data for data balancing
def build_knn(**kwargs):
    # return an untrained KNN model
    return KNeighborsClassifier()

model_f = build_knn
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [28]:
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.981651376146789
Standard deviation of testing_accuracy: 0.0
Mean of AUC_score: 0.9997629208155523
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 0.9473684210526316
Standard deviation of recall_class1: 1.1102230246251565e-16
Mean of recall_class2: 1.0
Standard deviation of recall_class2: 0.0
Mean of precision_class0: 1.0
Standard deviation of precision_class0: 0.0
Mean of precision_class1: 1.0
Standard deviation of precision_class1: 0.0
Mean of precision_class2: 0.9487179487179485
Standard deviation of precision_class2: 2.220446049250313e-16
Mean of f1_class0: 1.0
Standard deviation of f1_class0: 0.0
Mean of f1_class1: 0.9729729729729729
Standard deviation of f1_class1: 1.1102230246251565e-16
Mean of f1_class2: 0.9736842105263156
Standard deviation of f1_class2: 2.220446049250313e-16


Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,34.0,0.0,0.0
Actual Class 1,0.0,36.0,2.0
Actual Class 2,0.0,0.0,37.0


In [29]:
# SVM
svm_mod = SVC()
svm_mod.fit(X_train, y_train)
train_pred = svm_mod.predict(X_train)
test_pred = svm_mod.predict(X_test)

In [30]:
print(classification_report(y_train, train_pred, target_names=target_names))
confusion_matrix(y_train, train_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00       102
      Pachon       1.00      1.00      1.00       102
     Surface       1.00      1.00      1.00       102

    accuracy                           1.00       306
   macro avg       1.00      1.00      1.00       306
weighted avg       1.00      1.00      1.00       306



array([[102,   0,   0],
       [  0, 102,   0],
       [  0,   0, 102]])

In [31]:
print(classification_report(y_test, test_pred, target_names=target_names))
confusion_matrix(y_test, test_pred)

              precision    recall  f1-score   support

      Molino       1.00      1.00      1.00        34
      Pachon       1.00      1.00      1.00        19
     Surface       1.00      1.00      1.00        20

    accuracy                           1.00        73
   macro avg       1.00      1.00      1.00        73
weighted avg       1.00      1.00      1.00        73



array([[34,  0,  0],
       [ 0, 19,  0],
       [ 0,  0, 20]])

In [32]:
# SVM after data balancing with multiple repetition of downsampling data for data balancing

def build_svm(**kwargs):
    # return an untrained SVM model
    return SVC(probability=True)

model_f = build_svm
# random sampling and train a new model
# to more accurately test our model performace indepedent of particular samples
# Similar to the idea of K-fold cross validation
res = repeat_sampling_and_training(model_f, [], df,
                                  'class', predictors, num_repeat=100, doMinMaxScaling=False,
                                  num_window=num_window, data_processing_f=balanced_nested_design_sampling,
                                  is_oversample=True, oversample_f=SMOTE_oversample)

In [33]:
performance_dict, avg_cm = res
print_performance_metrics(performance_dict)
avg_cm

Mean of training_accuracy: 1.0
Standard deviation of training_accuracy: 0.0
Mean of testing_accuracy: 0.9908256880733949
Standard deviation of testing_accuracy: 3.3306690738754696e-16
Mean of AUC_score: 0.9997629208155524
Standard deviation of AUC_score: 0.0
Mean of recall_class0: 1.0
Standard deviation of recall_class0: 0.0
Mean of recall_class1: 1.0
Standard deviation of recall_class1: 0.0
Mean of recall_class2: 0.9729729729729729
Standard deviation of recall_class2: 1.1102230246251565e-16
Mean of precision_class0: 1.0
Standard deviation of precision_class0: 0.0
Mean of precision_class1: 0.9743589743589745
Standard deviation of precision_class1: 1.1102230246251565e-16
Mean of precision_class2: 1.0
Standard deviation of precision_class2: 0.0
Mean of f1_class0: 1.0
Standard deviation of f1_class0: 0.0
Mean of f1_class1: 0.987012987012987
Standard deviation of f1_class1: 1.1102230246251565e-16
Mean of f1_class2: 0.986301369863014
Standard deviation of f1_class2: 2.220446049250313e-16


Unnamed: 0,Predicted Class 0,Predicted Class 1,Predicted Class 2
Actual Class 0,34.0,0.0,0.0
Actual Class 1,0.0,38.0,0.0
Actual Class 2,0.0,1.0,36.0
