In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# All input tables live in the local "data" folder: three cleaned feature tables
# (clinical, genetic, vampire) plus the table holding the outcome columns.
DATA_DIR = 'data'
d_clinical = pd.read_csv(os.path.join(DATA_DIR, 'dataset_clinical_cleaned.csv'))
d_genetic = pd.read_csv(os.path.join(DATA_DIR, 'dataset_genetic_cleaned_noOHE.csv'))
d_vampire = pd.read_csv(os.path.join(DATA_DIR, 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(os.path.join(DATA_DIR, 'outputs_cleaned.csv'))

## Dementia

In [3]:
# Target for this part of the notebook: the "dement_fail" outcome column.
y = outputs["dement_fail"].values
# One raw feature matrix per data source, in the order clinical, genetic, vampire.
C, G, V = (frame.values for frame in (d_clinical, d_genetic, d_vampire))

In [4]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
# Keep a stratified 50% of the rows as the working set; the other 50% becomes the
# held-out test set. random_state is pinned so that the split, and every metric
# computed from it below, is identical on each Restart & Run All.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
tr_idx, ts_idx = next(splitter.split(C, y))

In [5]:
# Apply the split indices to the target and to every feature matrix.
# Trailing underscore = working half (tr_idx); "_test" suffix = held-out half (ts_idx).
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Bundle the three data sources so they can be passed around together.
# The three lists share one ordering: clinical, genetic, vampire.
ds_list = [C_, G_, V_]                      # working half
ds_test = [C_test, G_test, V_test]          # held-out half
ds_names = ['clinic', 'genetic', 'vampire'] # labels, same order as above

## Basic approach

In [None]:
# Baseline experiment setup.
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; each maps a kernel name to the parameter values
# handed to the sampler (only the gaussian grid differs between the two).
kernel_type = [{'linear':[1], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# None of the optional flags used in later sections (centering, normalizing,
# normalize_kernels, lamb, sparsity) is passed here.
sampler = ms.mySampler(n_splits=3, test_size=.35)

In [None]:
# Run the sampler over both kernel configurations on the working half of the data.
# No exclusion_list is passed in this section.
result1 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

In [None]:
# Derive the kernel settings from result1 via votingOverCA, then call
# testConfigurations with both the working half and the held-out half
# (y_test / ds_test) so each setting is scored.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Display the dictionary returned by votingOverCA in the previous cell.
w_dict

In [None]:
# Show the per-configuration statistics collected in result1.
result1.performancesFeatures()

## Kernel normalization

In [None]:
# Same kernel grids as the baseline section.
kernel_names = ['linear', 'gaussian']
kernel_type = [{'linear':[1], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# Only change with respect to the baseline: normalize_kernels is switched on.
sampler = ms.mySampler(n_splits=3, test_size=.35, normalize_kernels = True)

In [None]:
# Run the kernel-normalizing sampler over both configurations; no exclusion_list is passed.
result3 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

In [None]:
# Derive the kernel settings from result3 and score them, passing the held-out half.
# Note: w_dict / w_list are overwritten by every section of the notebook.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result3.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same kernel grids as the baseline section.
kernel_names = ['linear', 'gaussian']
kernel_type = [{'linear':[1], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# Only change with respect to the baseline: normalizing is switched on.
sampler = ms.mySampler(n_splits=3, test_size=.35, normalizing = True)

In [None]:
# Run the normalizing sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of the normalization step -- TODO confirm against mySampler.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [0, 3, 8, 13, 14],            # first dataset (clinic)
        list(range(G.shape[1] - 3)),  # second dataset (genetic): every column index except the last three
        [],                           # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result4 and score them, passing the held-out half.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result4.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same kernel grids as the baseline section.
kernel_names = ['linear', 'gaussian']
kernel_type = [{'linear':[1], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# Only change with respect to the baseline: centering is switched on.
sampler = ms.mySampler(n_splits=3, test_size=.35, centering = True)

In [None]:
# Run the centering sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of the centering step -- TODO confirm against mySampler.
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result5 and score them, passing the held-out half.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result5.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
kernel_names = ['linear', 'gaussian']
# NOTE(review): from this section on the first gaussian grid is [0.3, 0.7], while
# the earlier sections use [0.4, 0.7]. Confirm the change is intentional, since it
# makes results not directly comparable across sections.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# Both centering and normalizing are switched on.
sampler = ms.mySampler(n_splits=3, test_size=.35, centering = True, normalizing = True)

In [None]:
# Run the centering + normalizing sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of preprocessing -- TODO confirm against mySampler.
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result6 and score them, passing the held-out half.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result6.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; only the gaussian grid differs between them.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# normalizing and normalize_kernels are switched on; centering is not passed.
sampler = ms.mySampler(n_splits=3, test_size=.35, normalizing = True, normalize_kernels = True)

In [None]:
# Run the normalizing + kernel-normalizing sampler over both kernel configurations.
# NOTE(review): this section uses normalizing without centering, yet the first
# exclusion entry is the one used by the centering sections, not the
# [0, 3, 8, 13, 14] list of the "Normalized data" section -- confirm which is meant.
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result10 and score them, passing the held-out half.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result10.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [7]:
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; only the gaussian grid differs between them.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# centering and normalize_kernels are switched on; normalizing is not passed.
sampler = ms.mySampler(n_splits=3, test_size=.35, centering = True, normalize_kernels = True)

In [8]:
# Run the centering + kernel-normalizing sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of the centering step -- TODO confirm against mySampler.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'gaussian': [0.3, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[1, 0.7], [1, 0.7], [1, 0.7]]
	Result of 1:
CA: 0.026914340188291957
Accuracy: 0.48541666666666666
Precision: 0.3559870550161812
Recall: 0.6962025316455697
[linear:1, gaussian:0.7, linear:1, ]
[gaussian:0.7, linear:1, gaussian:0.7, ]

eta vector: [  0.04577721   1.01492512 -28.86103881  38.17489174  -0.62449439
   1.63459075]


	Completed in 0.21666666666666667 minutes
	Working on config 2 of 2: {'linear': [1], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[1, 1], [1, 1], [1, 1]]
	Result of 1:
CA: 0.026914340188291957
Accuracy: 0.48541666666666666
Precision: 0.3559870550161812
Recall: 0.6962025316455697
[linear:1, gaussian:0.7, linear:1, ]
[gaussian:0.7, linear:1, gaussian:0.7, ]

eta vector: [  0.04577721   1.01492512 -28.86103881  38.17489174  -0.62449439
   1.63459075]

CA: 0.028

In [9]:
# Derive the kernel settings from result11, then print accuracy / precision /
# recall for each setting; the held-out half (y_test / ds_test) is passed in.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5047410649161196
	Precision: 0.35124508519003933
	Recall: 0.5929203539823009
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5054704595185996
	Precision: 0.356234096692112
	Recall: 0.6194690265486725


In [10]:
# Print, for each configuration, the chosen kernel parameters and a pair of
# numbers for each of CA, Accuracy, Precision, Recall and eta
# (presumably mean and variance across splits -- TODO confirm).
result11.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1, 1, 1], [1, 1, 1], [1, 1, 1]], 'gaussian': [[0.7, 0.7, 0.7], [0.3, 0.3, 0.3], [0.3, 0.7, 0.7]]}, 'CA': (0.02407714566118607, 8.69825733655828e-06), 'Accuracy': (0.47708333333333336, 0.00032407407407407423), 'Precision': (0.3454446580416917, 0.0002211297384966867), 'Recall': (0.6582278481012659, 0.001148320247823532), 'eta': (array([  0.07334879,   1.27132833, -27.31586484,  49.66340022,
        -0.18640718,   0.57969543]), array([2.89963870e-02, 2.45507701e-01, 2.52094203e+01, 1.70592345e+02,
       9.67524816e-02, 5.56815346e-01]))}
statistics of configuration 2
{'config': {'linear': [[1, 1, 1], [1, 1, 1], [1, 1, 1]], 'gaussian': [[1, 1, 1], [0.5, 0.5, 0.5], [0.5, 1, 1]]}, 'CA': (0.025970381398108016, 8.448496442297286e-06), 'Accuracy': (0.5597222222222222, 0.002634066358024691), 'Precision': (0.29718648473034437, 0.002214662145701896), 'Recall': (0.31645569620253167, 0.06761736901137638), 'eta': (array([ 1.39895637e-01,  7.83041

## Centering, Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; only the gaussian grid differs between them.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# All three preprocessing flags are switched on together.
sampler = ms.mySampler(n_splits=3, test_size=.35, centering = True, normalizing = True, normalize_kernels = True)

In [None]:
# Run the sampler with centering, normalizing and kernel normalization enabled.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of preprocessing -- TODO confirm against mySampler.
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result7 and score them, passing the held-out half.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result7.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [None]:
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; only the gaussian grid differs between them.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# lamb = 0.5 is the setting this section adds (the section heading calls it an L2 penalty).
# NOTE(review): the section heading says "K-Normalization", but normalize_kernels
# is explicitly False here (the sparsity section passes True). Either the flag or
# the heading is wrong -- confirm before comparing these results with the others.
sampler = ms.mySampler(n_splits=3, test_size=.35, lamb = 0.5, centering = True, normalize_kernels = False)

In [None]:
# Run the lamb = 0.5 sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of the centering step -- TODO confirm against mySampler.
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)

In [None]:
# Derive the kernel settings from result9 and score them, passing the held-out half.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
# Show the per-configuration statistics collected in result9.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [3]:
# NOTE: the target changes here. Everything above used "dement_fail"; from this
# cell on y is the "cvd_fail" outcome, and y / C / G / V are rebound.
y = outputs["cvd_fail"].values
# One raw feature matrix per data source, in the order clinical, genetic, vampire.
C, G, V = (frame.values for frame in (d_clinical, d_genetic, d_vampire))

In [4]:
# Stratified 50/50 split on the new target: half of the rows become the working
# set, the other half the held-out test set. random_state is pinned so that the
# split, and every metric computed from it below, is identical on each run.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
tr_idx, ts_idx = next(splitter.split(C, y))

In [5]:
# Apply the split indices to the target and to every feature matrix.
# Trailing underscore = working half (tr_idx); "_test" suffix = held-out half (ts_idx).
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Rebuild the dataset bundles from the new split.
# The three lists share one ordering: clinical, genetic, vampire.
ds_list = [C_, G_, V_]                      # working half
ds_test = [C_test, G_test, V_test]          # held-out half
ds_names = ['clinic', 'genetic', 'vampire'] # labels, same order as above

In [7]:
kernel_names = ['linear', 'gaussian']
# Two candidate configurations; only the gaussian grid differs between them.
kernel_type = [{'linear':[1], 'gaussian':[0.3, 0.7]},
               {'linear':[1], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# sparsity = 0.5 is the setting this section adds, on top of centering and
# kernel normalization.
sampler = ms.mySampler(n_splits=3, test_size=.35, sparsity = 0.5, centering = True, normalize_kernels = True)

In [8]:
# Run the sparsity = 0.5 sampler over both kernel configurations.
# exclusion_list has three entries, which appear to line up with ds_list
# (clinic, genetic, vampire); presumably they are column indices the sampler
# leaves out of the centering step -- TODO confirm against mySampler.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # first dataset (clinic)
        list(range(G.shape[1] - 3)),               # second dataset (genetic): every column index except the last three
        [],                                        # third dataset (vampire): nothing listed
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'gaussian': [0.3, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[1, 0.7], [1, 0.3], [1, 0.3]]
	Result of 1:
CA: 0.019485386161298163
Accuracy: 0.5979166666666667
Precision: 0.4279835390946502
Recall: 0.6582278481012658
[linear:1, gaussian:0.7, linear:1, ]
[gaussian:0.3, linear:1, gaussian:0.3, ]

eta vector: [8.73449399e-01 4.86835170e-01 8.02580970e-03 3.14506912e-03
 1.76909964e-03 4.79374361e-04]


	Completed in 0.18333333333333332 minutes
	Working on config 2 of 2: {'linear': [1], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[1, 1], [1, 0.5], [1, 0.5]]
	Result of 1:
CA: 0.019485386161298163
Accuracy: 0.5979166666666667
Precision: 0.4279835390946502
Recall: 0.6582278481012658
[linear:1, gaussian:0.7, linear:1, ]
[gaussian:0.3, linear:1, gaussian:0.3, ]

eta vector: [8.73449399e-01 4.86835170e-01 8.02580970e-03 3.14506912e-03
 1.76909964e-03 

In [9]:
# Derive the kernel settings from result8, then print accuracy / precision /
# recall for each setting; the held-out half (y_test / ds_test) is passed in.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.4762946754194019
	Precision: 0.33207070707070707
	Recall: 0.581858407079646
Perfomances computed for dictionary settings 2:
	Accuracy: 0.45951859956236324
	Precision: 0.3313885647607935
	Recall: 0.6283185840707964


In [10]:
# Print, for each configuration, the chosen kernel parameters and a pair of
# numbers for each of CA, Accuracy, Precision, Recall and eta
# (presumably mean and variance across splits -- TODO confirm).
result8.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1, 1, 1], [1, 1, 1], [1, 1, 1]], 'gaussian': [[0.7, 0.3, 0.3], [0.7, 0.7, 0.3], [0.7, 0.7, 0.3]]}, 'CA': (0.016828865809953474, 3.6211565448090923e-06), 'Accuracy': (0.5979166666666668, 2.8935185185184976e-06), 'Precision': (0.4296982167352537, 4.094566298353525e-06), 'Recall': (0.6772151898734178, 0.00018693585429685475), 'eta': (array([0.8732318 , 0.48704121, 0.01048799, 0.00960516, 0.00448973,
       0.00192995]), array([2.47804446e-08, 2.14647611e-08, 3.32399562e-06, 2.12215710e-05,
       4.45912916e-06, 1.88974130e-06]))}
statistics of configuration 2
{'config': {'linear': [[1, 1, 1], [1, 1, 1], [1, 1, 1]], 'gaussian': [[1, 0.5, 0.5], [0.5, 1, 0.5], [1, 1, 0.5]]}, 'CA': (0.01694848336289625, 3.7827089056325856e-06), 'Accuracy': (0.5916666666666667, 2.0254629629629487e-05), 'Precision': (0.42322435710185213, 5.022808684578233e-06), 'Recall': (0.6624472573839663, 0.0004361836600259939), 'eta': (array([0.85778759, 0.50712758, 0.0