In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the three cleaned feature tables and the outcome table from the local
# data/ folder, in the same order as the target names on the left.
d_clinical, d_genetic, d_vampire, outputs = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in (
        'dataset_clinical_cleaned.csv',
        'dataset_genetic_cleaned_noOHE.csv',
        'dataset_vampire_cleaned.csv',
        'outputs_cleaned.csv',
    )
)

## Heart Attack

In [3]:
# Target vector: the "cvd_fail" column of the outputs table.
y = outputs["cvd_fail"].values
# One raw array per data source (clinical, genetic, vampire), taken straight
# from the corresponding DataFrame.
C, G, V = (frame.values for frame in (d_clinical, d_genetic, d_vampire))

In [4]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
# A single stratified 50/50 split: tr_idx selects the rows used by the
# experiments below, ts_idx the held-out rows.
# random_state is fixed so that a kernel restart reproduces the same split;
# without it every run would work on a different subset.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42).split(C, y)
)

In [5]:
# Apply the same index arrays to the labels and to every feature array so
# that rows stay aligned across data sources.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Parallel lists: position i of each one refers to the same data source.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]

## Basic approach

In [None]:
# Baseline experiment: sampler with no extra preprocessing options.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
# Two configurations; each maps a kernel name to a list of values
# (presumably the candidate kernel parameters - confirm in mySampler).
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(n_splits=3, test_size=.35)

In [None]:
# Run the sampler over every kernel configuration on the training subset.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Display the global_best_ attribute of the baseline sampling result.
result1.global_best_

In [None]:
# votingOverCA returns a pair, unpacked here as (w_dict, w_list); w_list is
# then passed to testConfigurations together with the training and held-out data.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Display the list form of the votingOverCA result.
w_list

In [None]:
# Display the dict form of the votingOverCA result.
w_dict

In [None]:
# Show the performance report of the baseline run.
result1.performancesFeatures()

## Kernel normalization

In [None]:
# Same kernel grid as the baseline, with normalize_kernels switched on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    normalize_kernels=True,
)

In [None]:
# Run the kernel-normalizing sampler on the training subset.
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the kernel-normalization run.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same kernel grid as the baseline, with the sampler's normalizing option on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    normalizing=True,
)

In [None]:
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably the columns the sampler leaves un-normalized - confirm in mySampler).
    exclusion_list=[
        [0, 3, 8, 13, 14],            # clinical columns
        list(range(G.shape[1] - 3)),  # every genetic column except the last three
        [],                           # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the normalized-data run.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same kernel grid as the baseline, with the sampler's centering option on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
)

In [None]:
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably the columns the sampler leaves un-centered - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the centering run.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Same kernel grid as the baseline, with both centering and normalizing on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalizing=True,
)

In [None]:
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably columns excluded from the preprocessing - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the centering + normalization run.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Same kernel grid as the baseline, with normalizing and normalize_kernels on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order.
    # NOTE(review): this normalizing-only run uses the clinical index list of
    # the centering sections, not the [0, 3, 8, 13, 14] list of the
    # "Normalized data" section - confirm this is intended.
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the normalization + kernel-normalization run.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [7]:
# Same kernel grid as the baseline, with centering and normalize_kernels on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalize_kernels=True,
)

In [8]:
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably columns excluded from the preprocessing - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 0.7], [0.6, 0.3], [0.6, 0.7]]
	Result of 1:
CA: 0.04052015086582846
Accuracy: 0.5645833333333333
Precision: 0.3876651982378855
Recall: 0.5569620253164557
[laplacian:0.2, gaussian:0.7, laplacian:0.6, ]
[gaussian:0.3, laplacian:0.6, gaussian:0.7, ]

eta vector: [ 0.96303617 -0.20260912  0.07318886 -0.15542038  0.02965136 -0.03346007]


	Completed in 5.183333333333334 minutes
	Working on config 2 of 2: {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.4, 1], [0.4, 0.5], [0.9, 1]]
	Result of 1:
CA: 0.04052015086582846
Accuracy: 0.5645833333333333
Precision: 0.3876651982378855
Recall: 0.5569620253164557
[laplacian:0.2, gaussian:0.7, laplacian:0.6, ]
[gaussian:0.3, laplacian:0.6, gaussian:0.7, ]

eta vector: [ 0.96303617 -0.20260912  0.07318886 -0.

KeyboardInterrupt: 

In [None]:
# NOTE(review): the recorded run of the sample() cell for result11 ended in a
# KeyboardInterrupt, so result11 exists only once that cell has run to completion.
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the centering + kernel-normalization run.
result11.performancesFeatures()

## Centering, Normalization, Kernel Normalization

In [None]:
# Same kernel grid as the baseline, with centering, normalizing and
# normalize_kernels all on.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably columns excluded from the preprocessing - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the centering + normalization + kernel-normalization run.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [None]:
# Same kernel grid as the baseline, with lamb=0.5 (presumably the L2 penalty
# weight named in the section title - confirm in mySampler), centering and
# kernel normalization.
kernel_names = ['laplacian', 'gaussian']
kernel_type = [{'laplacian':[0.2, 0.6], 'gaussian':[0.3, 0.7]},
               {'laplacian':[0.4, 0.9], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
# normalize_kernels was False here, contradicting this section's
# "K-Normalization" title; every other section's flags match its title,
# so it is enabled to run the experiment the title describes.
sampler = ms.mySampler(n_splits=3, test_size=.35, lamb = 0.5, centering = True, normalize_kernels = True)

In [None]:
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably columns excluded from the preprocessing - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)

In [None]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

In [None]:
# Show the performance report of the L2-penalty run.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [11]:
# Rebuild the full-size target vector and feature arrays from the loaded
# tables before re-splitting for this section.
y = outputs["cvd_fail"].values
C, G, V = (table.values for table in (d_clinical, d_genetic, d_vampire))

In [12]:
# A single stratified 65/35 split of the full data for this section.
# random_state is fixed so that a kernel restart reproduces the same split;
# without it every run would work on a different subset.
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.35, random_state=42).split(C, y)
)

In [13]:
# Rebind the training / held-out names to the 65/35 split; the same index
# arrays are applied to the labels and to every feature array so rows stay aligned.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [14]:
# Parallel lists rebuilt from the new split: position i of each one refers
# to the same data source.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]

In [15]:
# Same kernel grid as the baseline, with sparsity=0.5, centering and
# normalize_kernels on.
# NOTE(review): the output recorded for the sample() cell of this section
# lists a single {'linear', 'polynomial', 'gaussian'} configuration, so it was
# produced with a different kernel_type than the one defined here; re-run the
# section to refresh the stored results.
estimator = ca.centeredKernelAlignment
kernel_names = ['laplacian', 'gaussian']
kernel_type = [
    {'laplacian': [0.2, 0.6], 'gaussian': [0.3, 0.7]},
    {'laplacian': [0.4, 0.9], 'gaussian': [0.5, 1]},
]
sampler = ms.mySampler(
    n_splits=3,
    test_size=.35,
    sparsity=0.5,
    centering=True,
    normalize_kernels=True,
)

In [16]:
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    # One list of column indices per dataset, in ds_list order
    # (presumably columns excluded from the preprocessing - confirm in mySampler).
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],  # clinical columns
        list(range(G.shape[1] - 3)),               # every genetic column except the last three
        [],                                        # no vampire column listed
    ],
)


1 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 2, 0.1], [0.5, 2, 0.3], [0.5, 2, 0.3]]
	Result of 1:
CA: -0.015263825037119911
Accuracy: 0.4182692307692308
Precision: 0.23024054982817868
Recall: 0.32524271844660196
[linear:0.5, polynomial:2, gaussian:0.1, ]
[linear:0.5, polynomial:2, gaussian:0.3, ]
[linear:0.5, polynomial:2, gaussian:0.3, ]

eta vector: [-9.93611462e-01 -4.82139236e-02 -1.01655801e-01 -8.20215759e-03
  0.00000000e+00 -3.20479718e-03 -4.05470649e-04  0.00000000e+00
 -2.59317489e-04]


	Completed in 4.916666666666667 minutes

2 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 2, 0.1], [0.5, 2, 0.3], [0.5, 2, 0.3]]
	Result of 2:
CA: -0.0204784139735876
Accuracy: 0.41346153846153844
Precision: 0.2

In [17]:
# Unpack the votingOverCA result and pass its list form to testConfigurations
# together with the training and held-out data.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
)

Perfomances computed for 1 dictionary settings:
	Accuracy: 0.515625
	Precision: 0.33181818181818185
	Recall: 0.4605678233438486


In [18]:
# Show the performance report of the sparsity run. The recorded output lists,
# per configuration, the selected parameters plus a pair of values for CA,
# Accuracy, Precision, Recall and eta (presumably mean and variance across
# splits - confirm in mySampler).
result8.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[2, 2, 2], [2, 2, 2], [3, 2, 3]], 'gaussian': [[0.1, 0.3, 0.3], [0.1, 0.3, 0.3], [0.3, 0.3, 0.3]]}, 'CA': (-0.016506016815073216, 8.259000339410878e-06), 'Accuracy': (0.40651709401709396, 0.00017863339177441728), 'Precision': (0.2299864418919956, 0.0001257140632285835), 'Recall': (0.33980582524271846, 0.0004241681591101891), 'eta': (array([-9.81445535e-01, -5.54371569e-02, -1.61337851e-01, -8.50486084e-03,
        0.00000000e+00, -3.33130518e-03, -3.28808515e-03,  0.00000000e+00,
       -8.38342767e-04]), array([2.96335052e-04, 1.05432693e-04, 7.15606271e-03, 9.99715356e-07,
       0.00000000e+00, 1.68389282e-07, 7.28314377e-06, 0.00000000e+00,
       2.54659079e-07]))}
