In [19]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [20]:
# Read the three cleaned feature tables and the outcome table from ./data,
# in the order clinical, genetic, vampire, outputs.
d_clinical, d_genetic, d_vampire, outputs = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in (
        'dataset_clinical_cleaned.csv',
        'dataset_genetic_cleaned_noOHE.csv',
        'dataset_vampire_cleaned.csv',
        'outputs_cleaned.csv',
    )
)

## Dementia

In [11]:
# Target vector: the "dement_fail" column of the outcome table.
y = outputs["dement_fail"].values
# Feature matrices as plain arrays: clinical, genetic, vampire.
C, G, V = d_clinical.values, d_genetic.values, d_vampire.values

In [12]:
# COMPUTATIONAL COMPLEXITY: reduce the number of samples. A single stratified
# 50/50 shuffle split is drawn; tr_idx is used for model selection and ts_idx
# is kept aside as the test half.
# random_state is fixed so that the split -- and every metric computed from it
# below -- is reproducible after a kernel restart.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42).split(C, y)
)

In [13]:
# Apply the same train/test row indices to the target and to each feature
# matrix, so the rows of every array stay aligned.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [14]:
# Group the per-source arrays into lists. ds_list (training half), ds_test
# (held-out half) and ds_names are all written in the same order:
# clinical, genetic, vampire.
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]
ds_names = ['clinic', 'genetic', 'vampire']

## Basic approach

In [None]:
# Kernel families and the two parameter grids to compare. Each dict in
# kernel_type is one configuration (the run logs in this notebook report
# "config 1 of 2" / "config 2 of 2").
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
# Baseline sampler: only n_splits and test_size are set, no preprocessing flags.
sampler = ms.mySampler(n_splits=3, test_size=.35)

In [None]:
# Run the baseline sampler on the training half for both kernel configurations.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Aggregate the baseline result with votingOverCA, then evaluate the returned
# settings on the held-out half (y_test / ds_test).
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Display the dictionary returned by result1.votingOverCA in the previous cell.
w_dict

In [None]:
# Show the per-configuration statistics collected for the baseline run.
result1.performancesFeatures()

## Kernel normalization

In [None]:
# Same kernel families and parameter grids as the baseline run; the only
# change is that the sampler is built with normalize_kernels=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.35, normalize_kernels=True)

In [None]:
# Run the kernel-normalizing sampler on the training half.
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Aggregate the kernel-normalization result with votingOverCA, then evaluate
# the returned settings on the held-out half.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the kernel-normalization run.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with normalizing=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.35, normalizing=True)

In [None]:
# Run the data-normalizing sampler on the training half.
# exclusion_list holds one index list per dataset (clinic, genetic, vampire):
# five clinical columns, every genetic column except the last three, and no
# vampire columns. Presumably these columns are exempt from normalization --
# confirm in mySampler.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [0, 3, 8, 13, 14],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the normalized-data result with votingOverCA, then evaluate the
# returned settings on the held-out half.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the normalized-data run.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with centering=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.35, centering=True)

In [None]:
# Run the data-centering sampler on the training half.
# exclusion_list holds one index list per dataset (clinic, genetic, vampire):
# eleven clinical columns, every genetic column except the last three, and no
# vampire columns. Presumably these columns are exempt from centering --
# confirm in mySampler.
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the centering result with votingOverCA, then evaluate the returned
# settings on the held-out half.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the centering run.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with both centering=True and normalizing=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalizing=True,
)

In [None]:
# Run the centering + normalizing sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from the preprocessing; confirm in mySampler.
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the centering + normalization result with votingOverCA, then
# evaluate the returned settings on the held-out half.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the centering + normalization run.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with normalizing=True and normalize_kernels=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the normalizing + kernel-normalizing sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from the preprocessing; confirm in mySampler.
# NOTE(review): the "Normalized data" run (normalizing only) passes
# [0, 3, 8, 13, 14] as the clinical list, whereas this run -- which also sets
# normalizing without centering -- passes the list used by the centering runs.
# Confirm which clinical exclusion list is intended here.
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the normalization + kernel-normalization result with votingOverCA,
# then evaluate the returned settings on the held-out half.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the normalization + kernel-normalization run.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [15]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with centering=True and normalize_kernels=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalize_kernels=True,
)

In [16]:
# Run the centering + kernel-normalizing sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from centering; confirm in mySampler.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[2, 0.7], [7, 0.7], [2, 0.4]]
	Result of 1:
CA: 0.029232025954077354
Accuracy: 0.4479166666666667
Precision: 0.3257328990228013
Recall: 0.6329113924050633
[polynomial:2, gaussian:0.7, polynomial:7, gaussian:0.7, polynomial:2, gaussian:0.4, ]

eta vector: [ 4.95050826e+00  7.97506276e-01 -4.56085903e+03  6.50013517e+01
 -2.86936394e+02  4.43132202e+00]


	Completed in 1.9 minutes
	Working on config 2 of 2: {'polynomial': [3, 5], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[3, 1], [5, 1], [3, 0.5]]
	Result of 1:
CA: 0.029232025954077354
Accuracy: 0.4479166666666667
Precision: 0.3257328990228013
Recall: 0.6329113924050633
[polynomial:2, gaussian:0.7, polynomial:7, gaussian:0.7, polynomial:2, gaussian:0.4, ]

eta vector: [ 4.95050826e+00  7.97506276e-01 -4.56085903e+03  6.50013

In [17]:
# Aggregate the centering + kernel-normalization result with votingOverCA,
# then evaluate the returned settings on the held-out half; accuracy,
# precision and recall are printed per dictionary setting.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5149525893508388
	Precision: 0.34978843441466856
	Recall: 0.5486725663716814
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5142231947483589
	Precision: 0.3488700564971751
	Recall: 0.5464601769911505


In [18]:
# Show the per-configuration statistics (selected config, CA, accuracy,
# precision, recall, eta) collected for the centering + kernel-normalization run.
result11.performancesFeatures()

statistics of configuration 1
{'config': {'polynomial': [[2, 7, 2], [2, 2, 2], [2, 7, 2]], 'gaussian': [[0.7, 0.7, 0.4], [0.7, 0.4, 0.4], [0.7, 0.7, 0.4]]}, 'CA': (0.032741580038960216, 7.59764378790269e-06), 'Accuracy': (0.4444444444444444, 0.00012827932098765453), 'Precision': (0.32014842189797693, 3.7009705052000186e-05), 'Recall': (0.6118143459915611, 0.00024924780572913995), 'eta': (array([ 8.63941749e+00,  6.96170018e-01, -1.33880429e+04,  1.17462242e+02,
       -1.72378494e+02,  2.67406324e+00]), array([1.09231228e+01, 1.46238552e-02, 1.57654613e+08, 5.64349524e+03,
       7.02765308e+03, 1.64138533e+00]))}
statistics of configuration 2
{'config': {'polynomial': [[3, 5, 3], [3, 3, 3], [3, 5, 3]], 'gaussian': [[1, 1, 0.5], [0.5, 0.5, 0.5], [1, 1, 0.5]]}, 'CA': (0.03291615008559988, 7.353313903663933e-06), 'Accuracy': (0.5715277777777779, 0.003536844135802472), 'Precision': (0.34003927092670844, 3.3024811478428684e-05), 'Recall': (0.3206751054852321, 0.03622104719685237), 'eta': (

## Centering, Normalization, Kernel Normalization

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with centering, normalizing and normalize_kernels all True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the centering + normalizing + kernel-normalizing sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from the preprocessing; confirm in mySampler.
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the centering + normalization + kernel-normalization result with
# votingOverCA, then evaluate the returned settings on the held-out half.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the centering + normalization + kernel-normalization run.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [None]:
# Same kernel families and parameter grids as the baseline run; here the
# sampler is built with lamb=0.5 (the L2 penalty named in the section
# heading, presumably) and centering=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
# NOTE(review): the section heading says "K-Normalization", but
# normalize_kernels is explicitly False here -- confirm which one is intended.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [None]:
# Run the L2-penalty sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from centering; confirm in mySampler.
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Aggregate the L2-penalty result with votingOverCA, then evaluate the
# returned settings on the held-out half.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the per-configuration statistics collected for the L2-penalty run.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [21]:
# NOTE(review): the target switches from "dement_fail" to "cvd_fail" here,
# although this section still sits under the "## Dementia" heading. This cell
# and the ones after it rebind y, C, G, V (and then y_, ds_list, ds_test, ...),
# so re-running any earlier section afterwards would silently use the CVD
# target. Confirm the switch is intended and consider a dedicated heading.
y = outputs["cvd_fail"].values
# Feature matrices as plain arrays: clinical, genetic, vampire.
C, G, V = d_clinical.values, d_genetic.values, d_vampire.values

In [22]:
# Draw a single stratified 50/50 shuffle split on the new target: tr_idx is
# used for model selection, ts_idx is kept aside as the test half.
# random_state is fixed so that the split -- and every metric computed from it
# below -- is reproducible after a kernel restart.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42).split(C, y)
)

In [23]:
# Apply the same train/test row indices to the target and to each feature
# matrix, so the rows of every array stay aligned.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [24]:
# Rebuild the dataset lists from the new split. ds_list (training half),
# ds_test (held-out half) and ds_names are all written in the same order:
# clinical, genetic, vampire.
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]
ds_names = ['clinic', 'genetic', 'vampire']

In [25]:
# Two kernel-parameter grids, each tried as a separate configuration. The
# sampler is built with sparsity=0.7 (the "sparsity in eta" of the section
# heading, presumably), centering=True and normalize_kernels=True.
kernel_names = ['polynomial', 'gaussian']
kernel_type = [
    {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]},
    {'polynomial': [3, 5], 'gaussian': [0.5, 1]},
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    sparsity=0.7,
    centering=True,
    normalize_kernels=True,
)

In [26]:
# Run the sparsity-constrained sampler on the training half.
# exclusion_list: one index list per dataset (clinic, genetic, vampire) --
# presumably columns exempt from centering; confirm in mySampler.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'polynomial': [2, 7], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[7, 0.4], [2, 0.7], [2, 0.4]]
	Result of 1:
CA: 0.027883276075167097
Accuracy: 0.5729166666666666
Precision: 0.4063745019920319
Recall: 0.6455696202531646
[polynomial:7, gaussian:0.4, polynomial:2, gaussian:0.7, polynomial:2, gaussian:0.4, ]

eta vector: [ 4.07502442e-01  9.12415607e-01 -0.00000000e+00  3.79406161e-02
 -0.00000000e+00  1.72551904e-04]


	Completed in 2.0 minutes
	Working on config 2 of 2: {'polynomial': [3, 5], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[5, 0.5], [3, 1], [3, 0.5]]
	Result of 1:
CA: 0.027883276075167097
Accuracy: 0.5729166666666666
Precision: 0.4063745019920319
Recall: 0.6455696202531646
[polynomial:7, gaussian:0.4, polynomial:2, gaussian:0.7, polynomial:2, gaussian:0.4, ]

eta vector: [ 4.07502442e-01  9.12415607e-01 -0.00000000e+00  3.794

In [27]:
# Aggregate the sparsity result with votingOverCA, then evaluate the returned
# settings on the held-out half; accuracy, precision and recall are printed
# per dictionary setting.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.4959883296863603
	Precision: 0.3378561736770692
	Recall: 0.5508849557522124
Perfomances computed for dictionary settings 2:
	Accuracy: 0.48431801604668123
	Precision: 0.33674775928297057
	Recall: 0.581858407079646


In [28]:
# Show the per-configuration statistics (selected config, CA, accuracy,
# precision, recall, eta) collected for the sparsity run.
result8.performancesFeatures()

statistics of configuration 1
{'config': {'polynomial': [[7, 2, 2], [2, 2, 2], [7, 2, 2]], 'gaussian': [[0.4, 0.7, 0.4], [0.7, 0.4, 0.7], [0.7, 0.7, 0.4]]}, 'CA': (0.023395792866311795, 1.5526576131103264e-05), 'Accuracy': (0.5597222222222222, 0.0006143904320987652), 'Precision': (0.3985755626079341, 0.00018781587063379165), 'Recall': (0.6540084388185654, 0.0009969912229165574), 'eta': (array([2.55659854e-01, 9.56365491e-01, 0.00000000e+00, 2.56604961e-02,
       0.00000000e+00, 2.91276755e-04]), array([1.80632938e-02, 1.18881701e-03, 0.00000000e+00, 9.23404932e-05,
       0.00000000e+00, 8.90130184e-08]))}
statistics of configuration 2
{'config': {'polynomial': [[5, 3, 3], [3, 5, 5], [3, 3, 5]], 'gaussian': [[0.5, 1, 0.5], [1, 0.5, 1], [0.5, 1, 0.5]]}, 'CA': (0.02359234820539224, 1.3712051551672204e-05), 'Accuracy': (0.5750000000000001, 0.0005758101851851857), 'Precision': (0.39299379674284945, 0.0004632167625114249), 'Recall': (0.5358649789029536, 0.00940910466627499), 'eta': (array(