In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
d_clinical = pd.read_csv(os.path.join('data', 'dataset_clinical_cleaned.csv'))
d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(os.path.join('data', 'outputs_cleaned.csv'))

## Heart Attack

In [3]:
y = outputs["cvd_fail"].values
C = d_clinical.values
G = d_genetic.values
V = d_vampire.values

In [4]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.75).split(C, y))

In [5]:
y_ = y[tr_idx]
y_test = y[ts_idx]
C_ = C[tr_idx]
C_test = C[ts_idx]
G_ = G[tr_idx]
G_test = G[ts_idx]
V_ = V[tr_idx]
V_test = V[ts_idx]

In [6]:
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]
ds_names = ['clinic', 'genetic', 'vampire']

## Basic approach

In [7]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3, 7], 'gaussian':[0.1, 0.5, 0.7]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25)

In [8]:
result1 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 7], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 2, 0.1], [0.5, 7, 0.7], [0.5, 3, 0.5]]
[-1. -1. -1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.
 -1. -1.  1. -1. -1.  1. -1.  1.  1. -1. -1. -1. -1.  1. -1.  1.  1. -1.
 -1. -1. -1. -1. -1. -1.  1.  1.  1. -1.  1.  1. -1.  1.  1. -1.  1. -1.
  1.  1. -1. -1. -1.  1. -1. -1.  1.  1.  1. -1. -1. -1.  1. -1. -1. -1.
 -1. -1.  1.  1. -1.  1.  1. -1. -1.  1.  1.  1. -1.  1. -1. -1.  1. -1.
  1. -1. -1. -1.  1. -1.  1. -1. -1. -1. -1. -1. -1. -1.  1.  1. -1. -1.
 -1.  1.  1. -1. -1.  1.  1. -1.  1. -1. -1.  1

In [None]:
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [9]:
result1.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[2, 7, 3], [3, 3, 2], [3, 3, 7]], 'gaussian': [[0.1, 0.7, 0.5], [0.1, 0.7, 0.7], [0.7, 0.7, 0.5]]}, 'CA': (0.05347224759449957, 0.00022333802800931696), 'Accuracy': (0.5193798449612403, 3.0046271257737023e-05), 'Precision': (0.3476759783702985, 2.099510807941181e-05), 'Recall': (0.5146198830409356, 0.0008891624773434571), 'eta': (array([ 1.45173885e-09, -5.42500655e-13,  7.59503964e-04, -5.01796366e-10,
        5.17960939e-09,  7.06728132e-01,  7.72140360e-15, -1.98397990e-20,
       -7.07483580e-01]), array([2.27841898e-18, 5.87454892e-25, 1.16772346e-06, 1.13711837e-19,
       1.96225507e-17, 2.94785364e-07, 8.51208113e-29, 7.87235245e-40,
       2.91951117e-07]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[4, 8, 8], [5, 8, 8], [4, 8, 5]], 'gaussian': [[1, 0.7, 1], [1, 1, 0.7], [0.7, 1, 1]

## Kernel normalization

In [10]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3], 'gaussian':[0.1, 0.3, 0.6]}#,
               #{'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, normalize_kernels = True)

In [11]:
result3 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

1 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
Fold no. 2
		Perfomances computed for 200
Fold no. 3
		Perfomances computed for 200
Validation complete, config selected:[[0.5, 3, 0.1], [0.5, 2, 0.1], [0.5, 2, 0.1]]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1. -1. -1.  1. -1. -1. -1.  1.  1.  1.
  1. -1. -1. -1.  1. -1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1.  1.
 -1. -1.  1.  1.  1. -1. -1.  1.  1.  1.  1. -1.  1. -1. -1.  1. -1. -1.
 -1.  1.  1. -1.  1.  1.  1. -1. -1. -1. -1.  1.  1. -1.  1. -1.  1. -1.
 -1.  1.  1. -1.  1.  1.  1. -1.  1.  1. -1.  1. -1. -1.  1. -1.  1. -1.
  1.  1.  1. -1. -1. -1.  1.  1.  1. -1. -1.  1.  1.  1. -1.  1. -1.  1.
  1.  1.  1.  1. -1.  1.  1.  1. -1.  1. -1. -1.  1.  1. -1.  1. -1. -1.
 -1. -1. -1.  1. -1.  1. -1. -1.  1. -1. -1.  1.  1. -1.  1.  1. -1. -1.
  1.  1.  1.  1. -1. -1. -1. -1. -1. -1.  1.  1. -1. -1.  1. -1.  1.  1.
  1.  1. -1.  1. 

In [None]:
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [12]:
result3.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[3, 2, 2], [2, 2, 2], [2, 2, 2]], 'gaussian': [[0.1, 0.1, 0.1], [0.1, 0.1, 0.3], [0.1, 0.1, 0.3]]}, 'CA': (0.06037883264195667, 2.8547030167898026e-06), 'Accuracy': (0.6007751937984496, 0.000818760891773331), 'Precision': (0.4341233854617406, 0.000628604063575547), 'Recall': (0.6608187134502924, 6.839711364180387e-05), 'eta': (array([ 9.52830255e-01, -2.87387829e-01,  5.29669331e-04, -1.95770557e-03,
        2.18146431e-03, -1.43973633e-04, -1.16467489e-03,  1.25486754e-03,
        5.92722047e-04]), array([6.71437581e-04, 8.83404531e-03, 9.71795126e-08, 4.34811027e-08,
       6.69366236e-07, 1.07831137e-06, 3.64513758e-07, 2.84086344e-07,
       2.54616781e-06]))}


## Normalized data

In [7]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3, 7], 'gaussian':[0.1, 0.5, 0.7]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, normalizing = True)

In [8]:
result4 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[0, 3, 8, 13, 14], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 7], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 7, 0.7], [0.5, 7, 0.1], [0.5, 2, 0.1]]
[ 1. -1.  1.  1. -1.  1. -1.  1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1.
  1.  1.  1.  1.  1.  1. -1.  1. -1.  1.  1.  1. -1. -1.  1.  1. -1.  1.
  1.  1. -1. -1.  1.  1.  1.  1. -1.  1. -1.  1.  1.  1.  1.  1. -1.  1.
  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1. -1. -1.  1.  1.  1. -1.
 -1.  1. -1.  1. -1. -1. -1. -1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1.
 -1.  1. -1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1.  1.  1. -1.  1.
  1.  1. -1.  1.  1. -1. -1. -1. -1. -1.  1. -1

In [None]:
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [9]:
result4.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[7, 7, 2], [7, 7, 2], [3, 7, 2]], 'gaussian': [[0.7, 0.1, 0.1], [0.7, 0.7, 0.1], [0.1, 0.7, 0.1]]}, 'CA': (0.04854373025301791, 5.394982363551252e-06), 'Accuracy': (0.43604651162790703, 0.0002028123309897236), 'Precision': (0.32046624017380865, 5.866143364355213e-05), 'Recall': (0.6257309941520468, 0.0002735884545672172), 'eta': (array([ 4.07819565e-02, -3.53554344e-01,  1.14955533e-01,  9.47491795e-03,
       -4.90974602e-01,  2.62905754e-03, -6.53550732e-04,  5.28828939e-02,
       -1.22725737e-04]), array([2.79520070e-03, 1.87084455e-01, 2.46091706e-02, 4.03677068e-05,
       1.11119127e-01, 1.26643227e-05, 4.44963560e-05, 2.90464682e-01,
       1.43209338e-06]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[8, 4, 4], [8, 8, 4], [8, 8, 4]], 'gaussian': [[1, 0.7, 0.7], [1, 1, 0.7], [1, 1, 0.

## Origin Data Centering

In [10]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3, 7], 'gaussian':[0.1, 0.5, 0.7]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True)

In [11]:
result5 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 7], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 7, 0.7], [0.5, 7, 0.7], [0.5, 2, 0.1]]
[-1. -1.  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1

In [None]:
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [12]:
result5.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[7, 7, 2], [2, 2, 7], [7, 7, 7]], 'gaussian': [[0.7, 0.7, 0.1], [0.7, 0.1, 0.7], [0.5, 0.7, 0.1]]}, 'CA': (0.04074637818795215, 1.0355353971238924e-06), 'Accuracy': (0.45348837209302323, 0.024495222642870015), 'Precision': (0.4451213818860878, 0.024557405750971404), 'Recall': (0.672514619883041, 0.2032078246298006), 'eta': (array([ 7.53591333e-06, -3.12281815e-05,  6.60584303e-06,  1.37457585e-02,
       -9.22596379e-01,  3.52055143e-04, -2.84578551e-03,  2.19216135e-01,
       -8.13730628e-04]), array([4.21621518e-11, 6.82089412e-10, 4.69650350e-11, 3.60670455e-05,
       1.19247815e-02, 6.13116890e-09, 1.28169234e-05, 8.85882947e-02,
       4.09351761e-07]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[8, 8, 4], [4, 4, 8], [4, 8, 4]], 'gaussian': [[1, 1, 0.7], [1, 0.7, 1], [0.7, 1, 0.7]]}, 

## Origin Data  Centering and Normalization

In [13]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3, 7], 'gaussian':[0.1, 0.5, 0.7]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True, normalizing = True)

In [14]:
result6 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 7], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 3, 0.5], [0.5, 2, 0.1], [0.5, 7, 0.7]]
[-1.  1.  1. -1.  1. -1. -1. -1.  1.  1.  1. -1.  1. -1. -1. -1. -1. -1.
 -1. -1. -1.  1.  1. -1.  1.  1. -1. -1.  1. -1. -1.  1.  1.  1.  1. -1.
  1.  1. -1.  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1.  1. -1.
 -1. -1. -1.  1. -1. -1.  1. -1. -1. -1.  1.  1. -1.  1.  1.  1. -1.  1.
 -1. -1.  1.  1. -1. -1.  1. -1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1.
 -1.  1.  1. -1. -1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1. -1.  1.  1.
 -1. -1.  1.  1.  1.  1.  1.  1. -1. -1.  1.  1

In [None]:
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [15]:
result6.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[3, 2, 7], [3, 2, 7], [7, 7, 2]], 'gaussian': [[0.5, 0.1, 0.7], [0.1, 0.1, 0.5], [0.7, 0.7, 0.1]]}, 'CA': (0.041224817571126164, 1.7347072975688588e-06), 'Accuracy': (0.501937984496124, 0.0027342106844540606), 'Precision': (0.33095238095238094, 0.00027588813303098967), 'Recall': (0.52046783625731, 0.04603125748093429), 'eta': (array([ 3.95519938e-06, -2.07423568e-05,  5.46908588e-06,  1.04207118e-02,
       -9.96357971e-01,  4.18824766e-04, -6.00538950e-04,  4.86273052e-02,
       -1.17654466e-04]), array([1.79884238e-11, 2.12296551e-10, 1.61545456e-11, 4.47726040e-05,
       2.62919860e-05, 2.81763999e-09, 7.19167498e-07, 4.72522294e-03,
       2.76503747e-08]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[4, 4, 8], [4, 4, 4], [8, 8, 4]], 'gaussian': [[0.7, 0.7, 1], [0.7, 0.7, 0.7], [0.7, 1,

## Normalization, Kernel Normalization

In [7]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3,4], 'gaussian':[0.1, 0.3, 0.6]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, normalizing = True, normalize_kernels = True)

In [8]:
result10 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3, 4], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 4, 0.1], [0.5, 4, 0.6], [0.5, 2, 0.1]]
[-1. -1. -1. -1.  1.  1. -1. -1. -1. -1. -1.  1.  1. -1.  1. -1.  1.  1.
 -1.  1.  1. -1. -1.  1.  1.  1. -1.  1. -1.  1.  1.  1.  1.  1. -1.  1.
  1.  1.  1.  1. -1.  1.  1.  1.  1.  1. -1. -1.  1.  1.  1.  1. -1.  1.
  1. -1.  1.  1.  1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1. -1.  1.
  1.  1. -1.  1. -1.  1. -1.  1.  1.  1. -1. -1.  1. -1. -1.  1.  1.  1.
  1.  1.  1.  1. -1.  1. -1. -1. -1.  1.  1.  1.  1. -1.  1. -1.  1. -1.
  1.  1.  1.  1. -1. -1.  1.  1. -1.  1.  1.  1

In [None]:
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [9]:
result10.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[4, 4, 2], [4, 4, 4], [4, 4, 2]], 'gaussian': [[0.1, 0.6, 0.1], [0.1, 0.6, 0.6], [0.1, 0.6, 0.1]]}, 'CA': (0.030292943788144098, 9.35385402361012e-07), 'Accuracy': (0.4728682170542635, 0.00023285860224746114), 'Precision': (0.349208744337909, 0.00014335053622458428), 'Recall': (0.6842105263157895, 0.0008207653637016518), 'eta': (array([-1.00787955e-03, -7.36039883e-01,  6.75926452e-01, -1.14592255e-04,
       -3.08011309e-02,  4.57180108e-04,  3.72730486e-06,  1.39074241e-02,
       -8.89341951e-04]), array([2.54013775e-07, 1.07852237e-06, 2.58622298e-06, 2.30621010e-10,
       2.64009892e-06, 1.37638933e-09, 9.01910039e-12, 2.17086260e-04,
       9.20225282e-07]))}


## Centering, Kernel Normalization

In [10]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3,4], 'gaussian':[0.1, 0.3, 0.6]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True, normalize_kernels = True)

In [11]:
result11 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3, 4], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 4, 0.6], [0.5, 4, 0.6], [0.5, 2, 0.6]]
[ 1.  1.  1.  1.  1.  1. -1.  1. -1. -1.  1.  1.  1.  1.  1. -1. -1.  1.
  1.  1. -1.  1.  1.  1. -1. -1. -1.  1.  1.  1.  1.  1.  1. -1. -1. -1.
  1.  1.  1.  1.  1.  1.  1. -1.  1. -1. -1.  1. -1.  1.  1. -1.  1.  1.
 -1. -1.  1.  1. -1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1.  1. -1. -1.
 -1. -1.  1. -1. -1.  1.  1.  1. -1.  1.  1.  1.  1. -1. -1. -1.  1. -1.
  1.  1. -1. -1.  1.  1.  1.  1. -1.  1.  1. -1.  1.  1.  1.  1.  1. -1.
  1. -1.  1. -1. -1. -1.  1.  1.  1.  1. -1. -1

In [None]:
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [12]:
result11.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[4, 4, 2], [2, 4, 4], [2, 4, 2]], 'gaussian': [[0.6, 0.6, 0.6], [0.6, 0.6, 0.6], [0.3, 0.6, 0.1]]}, 'CA': (0.0325347914740802, 1.4868733698900853e-06), 'Accuracy': (0.4573643410852713, 0.00023285860224746114), 'Precision': (0.3457685749885238, 4.695818727332925e-05), 'Recall': (0.7192982456140351, 0.008823227659792751), 'eta': (array([ 4.55148470e-05, -1.09993908e-03,  9.11450287e-05, -2.77100193e-03,
       -9.99618892e-01,  1.36476319e-02, -9.53899011e-05,  1.29904981e-02,
       -5.37817885e-04]), array([9.07569483e-10, 6.01174122e-07, 1.82197477e-09, 5.92845045e-08,
       1.49853044e-07, 1.93664976e-07, 1.61105256e-08, 3.96024706e-04,
       8.15959858e-07]))}


## Centering, Normalization, Kernel Normalization

In [13]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3,4], 'gaussian':[0.1, 0.3, 0.6]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True, normalizing = True, normalize_kernels = True)

In [14]:
result7 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 4], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 4, 0.6], [0.5, 4, 0.6], [0.5, 2, 0.6]]
[ 1.  1.  1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1.  1.  1. -1.  1.
 -1.  1. -1.  1. -1. -1.  1.  1. -1. -1.  1.  1. -1.  1. -1. -1.  1. -1.
 -1. -1.  1.  1. -1. -1. -1. -1.  1. -1. -1.  1.  1.  1. -1. -1.  1. -1.
  1. -1.  1. -1. -1. -1. -1. -1. -1.  1.  1. -1.  1. -1. -1. -1. -1. -1.
  1. -1. -1.  1.  1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1. -1.  1.
  1. -1. -1. -1.  1.  1.  1.  1. -1.  1. -1.  1. -1.  1. -1. -1.  1.  1.
 -1. -1.  1. -1. -1.  1. -1.  1. -1. -1. -1. -1

In [None]:
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [15]:
result7.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[4, 4, 2], [2, 4, 2], [4, 4, 4]], 'gaussian': [[0.6, 0.6, 0.6], [0.1, 0.6, 0.1], [0.3, 0.6, 0.6]]}, 'CA': (0.031749787630639126, 1.6768062646255818e-09), 'Accuracy': (0.5755813953488372, 0.0008788534342888049), 'Precision': (0.35388916768227113, 0.0003066151475751967), 'Recall': (0.3508771929824561, 0.017236072637734686), 'eta': (array([ 4.22786626e-05, -2.93218244e-03,  1.26123139e-03, -2.81786049e-03,
       -9.98308914e-01,  1.37305416e-02, -2.79454902e-04, -2.60606745e-02,
        3.18511772e-03]), array([4.42114461e-10, 1.15491314e-05, 2.84463124e-06, 2.14086392e-08,
       4.79895558e-06, 6.12035471e-08, 1.94044024e-08, 2.44479422e-03,
       1.91819837e-05]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[8, 8, 4], [8, 8, 8], [8, 8, 8]], 'gaussian': [[0.7, 1, 1], [0.7, 1, 1], [0.7, 1, 0.

## L2 Penalty, Centering, K-Normalization

In [19]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3,4], 'gaussian':[0.1, 0.3, 0.6]},
               {'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, lamb = 0.5, centering = True, normalize_kernels = False)

In [20]:
result9 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'polynomial': [2, 3, 4], 'gaussian': [0.1, 0.3, 0.6]}
Fold no. 1
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 2
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Fold no. 3
		Perfomances computed for 200
		Perfomances computed for 400
		Perfomances computed for 600
Validation complete, config selected:[[0.5, 2, 0.3], [0.5, 2, 0.1], [0.5, 2, 0.6]]
[ 1.  1.  1.  1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1

In [None]:
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [21]:
result9.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[2, 2, 2], [4, 3, 2], [4, 4, 2]], 'gaussian': [[0.3, 0.1, 0.6], [0.3, 0.1, 0.3], [0.6, 0.6, 0.3]]}, 'CA': (0.018046251668370913, 7.590203868436717e-05), 'Accuracy': (0.4573643410852713, 0.009494621717444862), 'Precision': (0.3447173489278752, 0.00041513704121686135), 'Recall': (0.6725146198830408, 0.04849355357203925), 'eta': (array([-0.04001888,  0.17132485,  0.03849481, -0.03779838, -0.00163098,
       -0.35693965,  0.03670692,  0.00049703, -0.06214645]), array([4.98704030e-04, 1.11365675e-02, 8.23325610e-04, 1.48776844e-01,
       1.21136617e-05, 6.70221479e-01, 2.87110178e-04, 4.46981932e-08,
       1.76113008e-03]))}
statistics of configuration 1
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'polynomial': [[8, 8, 4], [5, 8, 4], [5, 8, 4]], 'gaussian': [[0.7, 1, 0.7], [1, 1, 0.7], [1, 1, 0.7]]}, 'CA': (0.031022351372646865, 4.113776066

## Sparsity in eta, Centering, K-Normalization

In [3]:
y = outputs["cvd_fail"].values
C = d_clinical.values
G = d_genetic.values
V = d_vampire.values

In [4]:
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.95).split(C, y))

In [5]:
y_ = y[tr_idx]
y_test = y[ts_idx]
C_ = C[tr_idx]
C_test = C[ts_idx]
G_ = G[tr_idx]
G_test = G[ts_idx]
V_ = V[tr_idx]
V_test = V[ts_idx]

In [6]:
ds_list = [C_, G_, V_]
ds_test = [C_test, G_test, V_test]
ds_names = ['clinic', 'genetic', 'vampire']

In [7]:
kernel_names = ['linear', 'polynomial', 'gaussian']
kernel_type = [{'linear':[0.5], 'polynomial':[2, 3], 'gaussian':[0.1, 0.3]}#,
               #{'linear':[0.2], 'polynomial':[4, 5, 8], 'gaussian':[0.7, 1]}
              ]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.3, centering = True, normalize_kernels = True)

In [8]:
result8 = sampler.sample(kernel_type, estimator, ds_list, y, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

1 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 3, 0.3], [0.5, 2, 0.3], [0.5, 3, 0.1]]
[ 1.  1. -1.  1. -1. -1.  1.  1.  1. -1. -1. -1.  1. -1. -1. -1. -1.  1.
  1. -1. -1. -1.  1.  1.  1.  1. -1.  1.  1.  1.  1. -1.  1.  1.  1.]
	Result of 0:
CA: 0.08084771872871385
Accuracy: 0.45714285714285713
Recall: 0.5454545454545454
[linear:0.5, polynomial:3, gaussian:0.3, ]
[linear:0.5, polynomial:2, gaussian:0.3, ]
[linear:0.5, polynomial:3, gaussian:0.1, ]

eta vector: [ 0.         -0.31571982  0.08626693 -0.29187424  0.          0.85099165
  0.          0.28896653  0.        ]
2 split out of 3 ...
	Working on config 1 of 1: {'linear': [0.5], 'polynomial': [2, 3], 'gaussian': [0.1, 0.3]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 3, 0.1], [0.5, 2, 0.1], [0.5, 2, 0.3]]
[-1. -1.  1.  1.  1.  1. -1.  1.  1.  1. -1.  1.  1. 

In [None]:
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [10]:
result8.performancesFeatures()

statistics of configuration 0
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'polynomial': [[3, 2, 3], [3, 2, 2], [2, 2, 2]], 'gaussian': [[0.3, 0.3, 0.1], [0.1, 0.1, 0.3], [0.1, 0.3, 0.1]]}, 'CA': (0.07599035135991884, 3.102964386541546e-05), 'Accuracy': (0.45714285714285713, 0.0), 'Precision': (0.3121212121212121, 7.346189164370987e-05), 'Recall': (0.606060606060606, 0.001836547291092747), 'eta': (array([-0.0141041 , -0.10523994,  0.19132926, -0.20350799,  0.        ,
        0.9082223 , -0.00343633,  0.09632218,  0.00904469]), array([1.48627665e-04, 2.21508899e-02, 1.29623640e-02, 2.08272388e-02,
       0.00000000e+00, 1.63812388e-03, 1.71412622e-05, 1.85559238e-02,
       1.63612844e-04]))}
