In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Read the cleaned clinical feature table and the outcome table from ./data
# (clinical first, outcomes second). The genetic and vampire tables stay disabled.
d_clinical, outputs = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('dataset_clinical_cleaned.csv', 'outputs_cleaned.csv')
)
#d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
#d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))

## Heart Attack

In [3]:
# Pull the raw arrays out of the frames: C holds the clinical features,
# y the "cvd_fail" outcome column used as the target.
C, y = d_clinical.values, outputs["cvd_fail"].values
#G = d_genetic.values
#V = d_vampire.values

In [4]:
# Hold out a stratified 25% test set; only the remaining 75% is handed to the
# (expensive) kernel-alignment sampler, which also keeps its runtime down.
# random_state is fixed so the partition - and therefore every metric reported
# below - is reproducible after Restart Kernel -> Run All.
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42).split(C, y))

In [5]:
# Index labels and clinical features with the split indices:
# names ending in "_" are the training part, "*_test" the held-out part.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
#G_, G_test = G[tr_idx], G[ts_idx]
#V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Parallel lists with one entry per data source; only the clinical source is active.
ds_names = ['clinic']    # disabled: 'genetic', 'vampire'
ds_list = [C_]           # disabled: G_, V_
ds_test = [C_test]       # disabled: G_test, V_test

## Basic approach

In [None]:
# Basic run: sampler with default pre-processing options.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; each row lists the candidate parameter values
# per kernel, in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
)

In [None]:
# Run the sampler on the training data with 3 validation folds and progress output.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Derive the selected settings (w_dict / w_list) from the sampled results, then
# report accuracy / precision / recall for each of them; the call receives both
# the training and the held-out data.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Display the settings dictionary returned by votingOverCA for the basic run.
w_dict

In [None]:
# Per-configuration summary of the basic run (config, CA, accuracy, precision,
# recall, eta); each metric is presumably a (mean, variance) pair over the splits - TODO confirm.
result1.performancesFeatures()

## Kernel normalization

In [None]:
# Same grids as the basic run, with the sampler's normalize_kernels option switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    normalize_kernels=True,
)

In [None]:
# Run the kernel-normalised sampler on the training data (3 validation folds).
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Derive the selected settings from result3 and report their
# accuracy / precision / recall.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the kernel-normalisation run.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same grids, with the sampler's data-normalisation option (normalizing) switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    normalizing=True,
)

In [None]:
# Run the sampler on the training data.
# exclusion_list holds one list of column indices per dataset in ds_list -
# presumably the clinical columns left out of the normalisation; confirm in mySampler.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)

In [None]:
# Derive the selected settings from result4 and report their
# accuracy / precision / recall.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the normalised-data run.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same grids, with the sampler's centering option switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
# NOTE(review): exclusion_list is handed to the constructor here, whereas the
# other experiments in this notebook pass it to sampler.sample(); confirm that
# mySampler.__init__ accepts this keyword.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    centering=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)

In [None]:
# Run the centred sampler on the training data.
# The clinical exclusion list is passed to sample() explicitly, matching how the
# executed centred / normalised experiments in this notebook call it; without it
# this run would not exclude those columns at sample() time.
result5 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True,
                         exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
# Derive the selected settings from result5 and report their
# accuracy / precision / recall.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the centering run.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Same grids, with both centering and data normalisation switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
# NOTE(review): test_size is .35 here while most other experiments use .25 -
# confirm this is intentional, otherwise the runs are not directly comparable.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalizing=True,
)

In [None]:
# Run the sampler on the training data; exclusion_list carries one list of
# clinical column indices (one entry per dataset in ds_list).
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)

In [None]:
# Derive the selected settings from result6 and report their
# accuracy / precision / recall.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the centering + normalisation run.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Same grids, with data normalisation and kernel normalisation switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
# NOTE(review): test_size is .35 here while most other experiments use .25 -
# confirm this is intentional, otherwise the runs are not directly comparable.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the sampler on the training data.
# The exclusion list used to carry three entries (clinical, genetic, vampire) and
# referenced `G`, which is never defined because the genetic dataset is disabled,
# so this cell raised NameError. ds_list contains only the clinical dataset, hence
# a single entry. When re-enabling the other sources, append
# list(range(G.shape[1]-3)) for genetic and [] for vampire.
result10 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True,
                          exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
# Derive the selected settings from result10 and report their
# accuracy / precision / recall.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the normalisation + kernel-normalisation run.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [47]:
# Same grids, with centering and kernel normalisation switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    centering=True,
    normalize_kernels=True,
)

In [48]:
# Run the sampler on the training data; the log below shows 3 outer splits,
# each validating both configurations over 3 folds.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.05, 0.3, 3, 0.4]
	Result of 1:
CA: 0.025440055151371346
Accuracy: 0.48249027237354086
Precision: 0.31954887218045114
Recall: 0.5
[linear:1, laplacian:0.05, sigmoid:0.3, polynomial:3, gaussian:0.4, ]

eta vector: [-2.11235008e-02  3.24299106e+01  1.18238087e+02 -2.39809534e+02
  1.84319813e+01]


	Completed in 1.6166666666666667 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.4, 3, 0.5]
	Result of 1:
CA: 0.025440055151371346
Accuracy: 0.48249027237354086
Precision: 0.31954887218045114
Recall: 0.5
[linear:1, laplacian:0.05, sigmoid:0.3, polynomial:3, gaussian:0.4, ]

eta vector: [-2

In [49]:
# Derive the selected settings from result11 and print accuracy / precision /
# recall for each dictionary setting.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5845481049562682
	Precision: 0.4178272980501393
	Recall: 0.6637168141592921
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5801749271137027
	Precision: 0.4157608695652174
	Recall: 0.6769911504424779


In [50]:
# Per-configuration statistics: selected parameters per split plus CA, accuracy,
# precision, recall and eta (each shown as a pair, presumably mean and variance - TODO confirm).
result11.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.05], [0.05], [0.05]], 'sigmoid': [[0.3], [0.3], [0.3]], 'polynomial': [[3], [3], [3]], 'gaussian': [[0.4], [0.4], [0.4]]}, 'CA': (0.024844941315866783, 2.319506644167194e-07), 'Accuracy': (0.4805447470817121, 0.0002800950809247683), 'Precision': (0.3144115268250307, 0.00016021854304055158), 'Recall': (0.48235294117647065, 0.00020761245674740469), 'eta': (array([-1.38940048e-01,  3.34559322e+01,  1.28849980e+02, -2.58866193e+02,
        1.98196219e+01]), array([3.81786990e-02, 5.36988317e-01, 5.63341170e+01, 1.87656457e+02,
       9.63478733e-01]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.1]], 'sigmoid': [[0.4], [0.4], [0.4]], 'polynomial': [[3], [3], [3]], 'gaussian': [[0.5], [0.5], [0.5]]}, 'CA': (0.024924048981483066, 2.0577698563438816e-07), 'Accuracy': (0.4773022049286641, 0.00013205683995552088), 'Precision': (0.311780271689062, 7.946486

## Centering, Normalization, Kernel Normalization

In [None]:
# Same grids, with centering, data normalisation and kernel normalisation all switched on.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the sampler on the training data; exclusion_list carries one list of
# clinical column indices (one entry per dataset in ds_list).
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)

In [None]:
# Derive the selected settings from result7 and report their
# accuracy / precision / recall.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Per-configuration summary statistics for the centering + normalisation + kernel-normalisation run.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [63]:
# Same grids, with centering and the sampler's lamb option set to 0.5
# (the L2 penalty named in the section heading, presumably).
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
# NOTE(review): the section heading announces K-Normalization, but
# normalize_kernels is False here (and the recorded outputs were produced with
# this setting) - confirm which of the two is intended.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [64]:
# Run the penalised sampler on the training data; exclusion_list carries one
# list of clinical column indices (one entry per dataset in ds_list).
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.05, 0.6, 3, 0.7]
	Result of 1:
CA: 0.031380328915002614
Accuracy: 0.5603112840466926
Precision: 0.384297520661157
Recall: 0.5470588235294118
[linear:1, laplacian:0.05, sigmoid:0.6, polynomial:3, gaussian:0.7, ]

eta vector: [-0.23646578  1.03285042  0.57320741  0.07395261  0.13470301]


	Completed in 1.2 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.5, 3, 1]
	Result of 1:
CA: 0.031380328915002614
Accuracy: 0.5603112840466926
Precision: 0.384297520661157
Recall: 0.5470588235294118
[linear:1, laplacian:0.05, sigmoid:0.6, polynomial:3, gaussian:0.7, ]

eta vector: [-0.23646578  1.0

In [65]:
# Derive the selected settings from result9 and print accuracy / precision /
# recall for each dictionary setting.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.565597667638484
	Precision: 0.40217391304347827
	Recall: 0.6548672566371682
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5685131195335277
	Precision: 0.40437158469945356
	Recall: 0.6548672566371682


In [66]:
# Per-configuration statistics for the lamb = 0.5 run (selected parameters per
# split plus CA, accuracy, precision, recall and eta).
result9.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.05], [0.1], [0.05]], 'sigmoid': [[0.6], [0.6], [0.6]], 'polynomial': [[3], [3], [3]], 'gaussian': [[0.7], [0.4], [0.4]]}, 'CA': (0.02832242019930528, 4.6848107377786426e-06), 'Accuracy': (0.47016861219195855, 0.0040685282475468555), 'Precision': (0.3008291263839077, 0.0034845828381239465), 'Recall': (0.4470588235294118, 0.005051903114186854), 'eta': (array([-0.39777115,  0.83859541,  0.85122253, -0.02192073,  0.4007772 ]), array([0.01318234, 0.02660224, 0.03886366, 0.00486335, 0.03702654]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.15], [0.1]], 'sigmoid': [[0.5], [0.5], [0.5]], 'polynomial': [[3], [4], [4]], 'gaussian': [[1], [0.5], [0.5]]}, 'CA': (0.028401939738017874, 4.7164334774159335e-06), 'Accuracy': (0.4974059662775616, 0.0018681416658675968), 'Precision': (0.3457229131832307, 0.0006845687590586332), 'Recall': (0.5686274509803922, 0.0003767781

## Sparsity in eta, Centering, K-Normalization

In [51]:
# Repeats the array extraction from the top of the notebook:
# C holds the clinical features, y the "cvd_fail" outcome column.
C, y = d_clinical.values, outputs["cvd_fail"].values
#G = d_genetic.values
#V = d_vampire.values

In [52]:
# Re-draw the stratified 75/25 train/test partition for this section.
# random_state is fixed so the partition is reproducible; without it this second
# split silently differed from run to run, so the metrics of this section were
# computed on an arbitrary partition. Use the same seed for every split in the
# notebook if experiments are to be compared on identical data.
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42).split(C, y))

In [53]:
# Rebuild the training ("_" suffix) and held-out ("_test") arrays from the
# split indices drawn in this section.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
#G_, G_test = G[tr_idx], G[ts_idx]
#V_, V_test = V[tr_idx], V[ts_idx]

In [54]:
# Parallel lists with one entry per data source; only the clinical source is active.
ds_names = ['clinic']    # disabled: 'genetic', 'vampire'
ds_list = [C_]           # disabled: G_, V_
ds_test = [C_test]       # disabled: G_test, V_test

In [59]:
# Same grids, with centering, kernel normalisation and the sampler's sparsity option set to 0.7.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
# Two candidate configurations; values are listed per kernel in kernel_names order.
kernel_type = [
    dict(zip(kernel_names, grid))
    for grid in (
        ([1], [0.1, 0.05], [0.3, 0.6], [2, 3], [0.4, 0.7]),
        ([1], [0.1, 0.15], [0.4, 0.5], [3, 4], [0.5, 1]),
    )
]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    sparsity=0.7,
    centering=True,
    normalize_kernels=True,
)

In [60]:
# Run the sparsity-constrained sampler on the training data; exclusion_list
# carries one list of clinical column indices (one entry per dataset in ds_list).
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[[5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19]],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.6, 2, 0.7]
	Result of 1:
CA: 0.022156462385752862
Accuracy: 0.5817120622568094
Precision: 0.4187725631768953
Recall: 0.6823529411764706
[linear:1, laplacian:0.1, sigmoid:0.6, polynomial:2, gaussian:0.7, ]

eta vector: [0.86117539 0.08365906 0.15121173 0.04179688 0.4761997 ]


	Completed in 1.2666666666666666 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.15, 0.5, 3, 1]
	Result of 1:
CA: 0.022156462385752862
Accuracy: 0.5817120622568094
Precision: 0.4187725631768953
Recall: 0.6823529411764706
[linear:1, laplacian:0.1, sigmoid:0.6, polynomial:2, gaussian:0.7, ]

eta vector: [0.8611

In [61]:
# Derive the selected settings from result8 and print accuracy / precision /
# recall for each dictionary setting.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5670553935860059
	Precision: 0.4032697547683924
	Recall: 0.6548672566371682
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5728862973760933
	Precision: 0.40771349862258954
	Recall: 0.6548672566371682


In [62]:
# Per-configuration statistics for the sparsity = 0.7 run (selected parameters
# per split plus CA, accuracy, precision, recall and eta).
result8.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.1]], 'sigmoid': [[0.6], [0.6], [0.6]], 'polynomial': [[2], [3], [3]], 'gaussian': [[0.7], [0.7], [0.7]]}, 'CA': (0.021934171566082187, 6.889184400779272e-07), 'Accuracy': (0.5797665369649806, 0.00046177837665975133), 'Precision': (0.4168367079997907, 0.000404147999786995), 'Recall': (0.6784313725490195, 0.001138023836985776), 'eta': (array([0.85892317, 0.08389301, 0.15154791, 0.05598131, 0.47854101]), array([3.76405729e-06, 8.36165405e-08, 8.79361219e-08, 1.00614252e-04,
       6.22865315e-06]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.15], [0.15], [0.15]], 'sigmoid': [[0.5], [0.5], [0.5]], 'polynomial': [[3], [4], [4]], 'gaussian': [[1], [1], [1]]}, 'CA': (0.022122093193568573, 7.453412489096185e-07), 'Accuracy': (0.5881971465629053, 0.0002531790371121775), 'Precision': (0.4241113128209902, 0.0002033928498971627), 'Recall': (0.68431372549019