In [1]:
# Standard library.
import os
# The ABC aliases in `collections` were removed in Python 3.10; `Sequence`
# must be imported from `collections.abc` (available since Python 3.3).
from collections.abc import Sequence

# Third-party.
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit

# Project-local modules.
import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms


In [2]:
def _read_table(filename):
    """Read one CSV file from the local ``data`` directory into a DataFrame."""
    return pd.read_csv(os.path.join('data', filename))


# Three feature tables plus the table that holds the target columns.
d_clinical = _read_table('dataset_clinical_cleaned.csv')
d_genetic = _read_table('dataset_genetic_cleaned_noOHE.csv')
d_vampire = _read_table('dataset_vampire_cleaned.csv')
outputs = _read_table('outputs_cleaned.csv')

## Dementia

In [3]:
# Target vector: the "dement_fail" column of the outputs table.
y = outputs["dement_fail"].values
# Feature matrices as plain NumPy arrays, one per data source.
C, G, V = (table.values for table in (d_clinical, d_genetic, d_vampire))

In [4]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples.
# test_size=0.75 sends 75% of the rows to `ts_idx`, so only the remaining 25%
# (`tr_idx`) are used by the cells that follow. The split is stratified on `y`.
# random_state is pinned so that Restart & Run All reproduces the same subset;
# without it every run draws a different split.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.75, random_state=0).split(C, y)
)

In [5]:
# Slice the target and every feature matrix with the same index arrays so that
# rows stay aligned across sources. Trailing underscore = reduced training
# subset, `_test` = held-out rows.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Parallel lists: position i of each list refers to the same data source.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list, ds_test = [C_, G_, V_], [C_test, G_test, V_test]

## Basic approach

In [7]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 7],
        'gaussian': [0.1, 0.5, 0.7],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Baseline run: no extra preprocessing options are passed to the sampler.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=0.25)

In [8]:
# Run the sampler over the grids in `kernel_type` on the reduced training
# subset (ds_list, y_); verbose=True prints per-split / per-fold progress.
# NOTE(review): the saved log below lists smaller grids (e.g. 'polynomial':
# [2, 3]) than the ones defined in the previous cell, so this output appears
# to predate the last edit of that cell -- re-run to refresh it.
result1 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.5]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 3, 0.5], [0.5, 3, 0.5], [0.5, 3, 0.5]]
	Working on config 2 of 2: {'linear': [0.2], 'polynomial': [4, 5], 'gaussian': [0.7, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 4, 1], [0.2, 4, 1], [0.2, 4, 0.7]]
	Result of 1:
CA: 0.0399761715551632
Accuracy: 0.5174418604651163
Precision: 0.19540229885057472
Recall: 0.5666666666666667
[linear:0.5, polynomial:3, gaussian:0.5, ]
[linear:0.5, polynomial:3, gaussian:0.5, ]
[linear:0.5, polynomial:3, gaussian:0.5, ]

eta vector: [ 2.57480709e-07 -8.52177600e-14  5.22118522e-04 -3.82422497e-07
  1.85829455e-05  7.07149120e-01  6.64650747e-13 -3.55783374e-26
 -7.07064247e-01]

CA: 0.0384034381927207
Accuracy: 0.5174418604651163
Precision: 0.19540229885057472
Recall: 0.5666666666666667
[linear:0.2, polynomial:4, gaussian:1, ]
[line

In [9]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is used here. testConfigurations then prints accuracy, precision
# and recall for each dictionary setting (presumably fitting on ds_list / y_
# and scoring on ds_test / y_test -- confirm in Utils).
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for 1 dictionary settings:
	Accuracy: 0.45330739299610895
	Precision: 0.1702493551160791
	Recall: 0.5546218487394958
Perfomances computed for 2 dictionary settings:
	Accuracy: 0.5778210116731517
	Precision: 0.17369093231162197
	Recall: 0.38095238095238093


In [10]:
# Prints one statistics dict per configuration (keys seen in the saved output:
# config, CA, Accuracy, Precision, Recall, eta).
result1.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[3, 3, 3], [3, 3, 3], [3, 3, 2]], 'gaussian': [[0.5, 0.5, 0.5], [0.5, 0.1, 0.5], [0.5, 0.5, 0.1]]}, 'CA': (0.03849912120315535, 4.889496208389129e-06), 'Accuracy': (0.5116279069767442, 0.0004732287723093553), 'Precision': (0.18877419235807458, 0.00021284184563042405), 'Recall': (0.5444444444444444, 0.000987654320987654), 'eta': (array([ 1.85428717e-04, -5.94012937e-11,  3.48307376e-01, -1.00486870e-04,
        1.70036936e-03,  7.07095620e-01,  6.81158656e-10,  3.77623781e-16,
       -5.28840010e-01]), array([2.76343494e-08, 2.93380603e-21, 7.13750348e-02, 3.28149818e-08,
       3.43554041e-05, 1.00989434e-04, 1.57962823e-18, 2.85199535e-31,
       2.75126237e-02]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[4, 4, 4], [5, 5, 5], [5, 5, 5]], 'gaussian': [[1, 1, 0.7], [1, 1, 0.7], [1, 1, 0.7]]

## Kernel normalization

In [11]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3],
        'gaussian': [0.1, 0.3, 0.6],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Same sampler as the basic approach, with kernel normalization switched on.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=0.25, normalize_kernels=True)

In [12]:
# Run the kernel-normalized sampler over the grids in `kernel_type` on the
# reduced training subset; verbose=True prints per-split / per-fold progress.
result3 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
Fold no. 2
		Perfomances computed for 200
Fold no. 3
		Perfomances computed for 200
Validation complete, config selected:[[0.5, 3, 0.1], [0.5, 3, 0.1], [0.5, 2, 0.3]]
	Working on config 2 of 2: {'linear': [0.2], 'polynomial': [4, 5, 8], 'gaussian': [0.7, 1]}
Fold no. 1
		Perfomances computed for 200
Fold no. 2
		Perfomances computed for 200
Fold no. 3
		Perfomances computed for 200
Validation complete, config selected:[[0.2, 8, 1], [0.2, 5, 0.7], [0.2, 4, 1]]
	Result of 1:
CA: 0.042805566029267335
Accuracy: 0.5
Precision: 0.22549019607843138
Recall: 0.7666666666666667
[linear:0.5, polynomial:3, gaussian:0.1, ]
[linear:0.5, polynomial:3, gaussian:0.1, ]
[linear:0.5, polynomial:2, gaussian:0.3, ]

eta vector: [ 9.86165562e-01 -1.65609681e-01  9.49288530e-04  2.43229746e-03
 -2.30412445e-03  3.36560197e-03  3.71844754e-03 -2.0978385

In [13]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is used here. testConfigurations then prints accuracy, precision
# and recall for each dictionary setting (presumably fitting on ds_list / y_
# and scoring on ds_test / y_test -- confirm in Utils).
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for 1 dictionary settings:
	Accuracy: 0.5729571984435797
	Precision: 0.2544769085768143
	Recall: 0.7563025210084033
Perfomances computed for 2 dictionary settings:
	Accuracy: 0.44455252918287935
	Precision: 0.17154811715481172
	Recall: 0.5742296918767507


In [14]:
# Prints one statistics dict per configuration (keys seen in the saved output:
# config, CA, Accuracy, Precision, Recall, eta).
result3.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[3, 3, 2], [2, 2, 3], [3, 2, 3]], 'gaussian': [[0.1, 0.1, 0.3], [0.1, 0.1, 0.6], [0.1, 0.6, 0.1]]}, 'CA': (0.03796231665018907, 1.186538762000716e-05), 'Accuracy': (0.5174418604651163, 0.0002028123309897236), 'Precision': (0.20886352393882943, 0.0006501443665972419), 'Recall': (0.6444444444444445, 0.016543209876543213), 'eta': (array([ 9.82724896e-01, -1.71667103e-01, -5.98617436e-05,  3.54135191e-03,
       -2.65876304e-03,  2.18464610e-02,  2.24796070e-03, -9.97245668e-04,
       -1.99719522e-02]), array([5.24701838e-05, 2.22771775e-03, 1.46490247e-06, 7.75897190e-06,
       1.76458321e-05, 8.30380273e-04, 2.20926400e-06, 1.01040591e-06,
       7.39719139e-04]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[8, 5, 4], [4, 4, 8], [4, 5, 8]], 'gaussian': [[1, 0.7, 1], [1, 0.7, 1], [0.7, 0.7, 1]

## Normalized data

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 7],
        'gaussian': [0.1, 0.5, 0.7],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: only `normalizing` is enabled.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=0.25, normalizing=True)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
# NOTE(review): every other run in this notebook that passes exclusion_list
# uses [5,6,7,9,10,13,15,16,17,18,19] for the clinical dataset, including the
# other normalizing=True runs; confirm [0, 3, 8, 13, 14] is intended here.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [0, 3, 8, 13, 14],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the normalized-data run.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 7],
        'gaussian': [0.1, 0.5, 0.7],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: only `centering` is enabled.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=0.25, centering=True)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the centering run.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 7],
        'gaussian': [0.1, 0.5, 0.7],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: `centering` and `normalizing` together.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    centering=True,
    normalizing=True,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the centering +
# normalization run.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 4],
        'gaussian': [0.1, 0.3, 0.6],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: `normalizing` plus kernel normalization.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the normalization +
# kernel-normalization run.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 4],
        'gaussian': [0.1, 0.3, 0.6],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: `centering` plus kernel normalization.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    centering=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the centering +
# kernel-normalization run.
result11.performancesFeatures()

## Centering, Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 4],
        'gaussian': [0.1, 0.3, 0.6],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: centering, normalizing and kernel
# normalization all enabled.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the centering +
# normalization + kernel-normalization run.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Two candidate grids; each maps a kernel name to the parameter values to try.
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3, 4],
        'gaussian': [0.1, 0.3, 0.6],
    },
    {
        'linear': [0.2],
        'polynomial': [4, 5, 8],
        'gaussian': [0.7, 1],
    },
]

# Sampler variant for this section: `lamb` (the L2 penalty named in the section
# heading, presumably its strength) plus centering.
# NOTE(review): the section heading says "K-Normalization" but
# normalize_kernels is explicitly False here -- confirm which one is intended.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the L2-penalty run.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [None]:
# Re-derive the full arrays from the loaded tables so this section can draw a
# fresh (smaller) subsample, independent of the split used above.
y = outputs["dement_fail"].values
C, G, V = (table.values for table in (d_clinical, d_genetic, d_vampire))

In [None]:
# test_size=0.95 sends 95% of the rows to `ts_idx`, leaving only 5% in
# `tr_idx` for this section. The split is stratified on `y`.
# random_state is pinned so that Restart & Run All reproduces the same subset;
# without it every run draws a different split.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.95, random_state=0).split(C, y)
)

In [None]:
# Slice the target and every feature matrix with the same index arrays so that
# rows stay aligned across sources. This overwrites the subsets created for
# the earlier sections.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [None]:
# Parallel lists rebuilt from the new subsample: position i of each list
# refers to the same data source.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list, ds_test = [C_, G_, V_], [C_test, G_test, V_test]

In [None]:
kernel_names = ['linear', 'polynomial', 'gaussian']

# Only one grid is searched in this section. The second grid used by the other
# sections is disabled here:
#   {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
kernel_type = [
    {
        'linear': [0.5],
        'polynomial': [2, 3],
        'gaussian': [0.1, 0.3],
    },
]

# Sampler variant for this section: `sparsity` plus centering and kernel
# normalization.
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    sparsity=0.3,
    centering=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list has three entries, apparently one per dataset in ds_list
# (clinic, genetic, vampire): the listed clinical columns, every genetic column
# except the last three, and nothing for vampire. Presumably these columns are
# skipped by the sampler's preprocessing -- confirm in mySampler.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# votingOverCA returns its selection in two forms (dict and list); only the
# list form is passed on. testConfigurations reports accuracy, precision and
# recall per dictionary setting (presumably scored on ds_test / y_test --
# confirm in Utils).
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Print the per-configuration statistics collected for the sparsity run.
result8.performancesFeatures()