In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import train_test_split, ShuffleSplit

In [2]:
# Read the three cleaned feature tables and the outputs table from the local
# `data` directory, in the same order as the names on the left-hand side.
d_clinical, d_genetic, d_vampire, outputs = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in (
        'dataset_clinical_cleaned.csv',
        'dataset_genetic_cleaned_noOHE.csv',
        'dataset_vampire_cleaned.csv',
        'outputs_cleaned.csv',
    )
)

## Dementia

In [3]:
# Keep only the rows whose "cvd_fail" flag equals 1 and use "dement_time_age"
# of those rows as the regression target.
# NOTE(review): this section is titled "Dementia" but the filter column is
# "cvd_fail" - confirm that this is the intended event flag.
y_class = outputs["cvd_fail"].values
meaningful_idxs = np.where(y_class == 1)
y = outputs["dement_time_age"].values[meaningful_idxs]

# Apply the same row selection to each feature table (clinical, genetic, vampire).
C, G, V = (
    frame.values[meaningful_idxs]
    for frame in (d_clinical, d_genetic, d_vampire)
)

In [4]:
# COMPUTATIONAL COMPLEXITY: reduce #samples by holding out 25% of the rows.
# All four arrays are passed to one train_test_split call, so they are split
# with the same row permutation and stay aligned with each other.
# random_state is pinned so that "Restart & Run All" reproduces the same split
# (and therefore the same downstream results); change the seed to resample.
C_, C_test, G_, G_test, V_, V_test, y_, y_test = train_test_split(
    C, G, V, y, test_size=0.25, random_state=42
)

In [5]:
# Training and test views are listed in the same order as their names:
# clinical, genetic, vampire.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list, ds_test = [C_, G_, V_], [C_test, G_test, V_test]

## Kernel Configuration

In [6]:
# Alignment routine handed to the sampler and to the test helper below.
estimator = ca.centeredKernelAlignment

# Kernel families used throughout, and the two candidate parameter grids
# (one dict per configuration, keyed by kernel family).
kernel_names = ['linear', 'gaussian']
kernel_type = [
    dict(linear=[0.5], gaussian=[0.1, 0.5, 0.7]),
    dict(linear=[0.2], gaussian=[0.7, 1]),
]

## Basic approach

In [7]:
# Baseline sampler: regression problem, no optional preprocessing flags set.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
)

In [8]:
# Run the baseline sampler over both kernel configurations on the training views.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 0.1], [0.5, 0.7], [0.5, 0.7]]
	Result of 1:
CA: 0.0029245529614775083
Average error: 0.07632706320999941
Error variance: 7.335527807167394e-05
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.7, ]
[linear:0.5, gaussian:0.7, ]

eta vector: [ 1.33127168e-08  1.11617380e-11  1.29712873e-08  1.11608584e-11
 -6.88421888e-16  1.11608582e-11]


	Completed in 0.16666666666666666 minutes
	Working on config 2 of 2: {'linear': [0.2], 'gaussian': [0.7, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 1], [0.2, 0.7], [0.2, 0.7]]
	Result of 1:
CA: 0.0029245529614775083
Average error: 0.07632706320999941
Error variance: 7.335527807167394e-05
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.7, ]
[linear:0.5, gaussian:0.7, ]

eta vector: [ 1.33127168e-08  1.11617380e-11  1.29712873e-08  1.1160

In [9]:
# Let the baseline result vote for kernel parameters, then evaluate the voted
# settings with the held-out test views and targets.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

Perfomances computed for dictionary settings 1:
	Average error: 77.77034766814161
	Error variance: 87.8952860091704
	Pred: [ 2.87497868e-05  1.51674617e-05  2.76936983e-05 -7.88295398e-06
 -9.98191215e-06  2.46661137e-05 -1.69206536e-05  7.87506547e-07
  2.15262153e-05  1.68265883e-05  6.94542713e-06  2.41868196e-05
  7.04039890e-06 -2.94487439e-05 -1.11582292e-05  1.52255039e-05
 -1.27115230e-05 -2.74763013e-05  2.12415631e-05  1.70898013e-05
 -2.06638311e-05 -4.40891759e-06 -9.29982775e-06  3.86376673e-06
  2.50136454e-05  6.21921360e-06  1.74016588e-05  5.39192775e-06
  1.30824681e-05  1.42102791e-05 -2.88941088e-05 -7.08055032e-06
 -9.96424839e-06 -1.38944652e-05 -3.57519679e-06  3.89789185e-05
 -1.71589667e-06  1.43807284e-05  7.36075622e-06  1.75001019e-05
  1.15140805e-05  3.57943603e-05  8.62188152e-06 -6.30657263e-06
  5.59852291e-05 -1.53035222e-05 -3.08591196e-06 -7.97440490e-06
  2.28591515e-05 -9.25792540e-06  7.81602050e-06 -5.72950439e-06
 -2.67420586e-05  9.30431824e-06

In [10]:
# Display the voted parameters: one entry per kernel configuration, keyed by
# dataset name and then by kernel family (see the output below).
w_dict

[{'clinic': {'gaussian': 0.1, 'linear': 0.5},
  'genetic': {'gaussian': 0.1, 'linear': 0.5},
  'vampire': {'gaussian': 0.1, 'linear': 0.5}},
 {'clinic': {'gaussian': 1, 'linear': 0.2},
  'genetic': {'gaussian': 0.7, 'linear': 0.2},
  'vampire': {'gaussian': 0.7, 'linear': 0.2}}]

In [11]:
# Report the statistics collected for each kernel configuration: the selected
# parameters ('config') plus 'CA', 'meanErr', 'varErr' and 'eta' entries.
# Each of those entries is a pair - presumably (mean, variance) across the
# sampler's splits; TODO confirm in mySampler.
result1.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'gaussian': [[0.1, 0.7, 0.7], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]}, 'CA': (0.0025768259078823874, 1.3247666119732952e-07), 'meanErr': (0.0762170042449045, 1.0736234758292169e-08), 'varErr': (8.881886735266678e-05, 2.243095227367233e-10), 'eta': (array([ 1.20689531e-08,  1.11711060e-11,  1.34820836e-08,  1.14127906e-11,
       -4.38225585e-16,  1.11701120e-11]), array([1.95433257e-18, 2.78283083e-24, 2.87222659e-18, 3.14242780e-24,
       3.53911416e-32, 2.77688281e-24]))}
statistics of configuration 2
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'gaussian': [[1, 0.7, 0.7], [1, 0.7, 1], [1, 0.7, 0.7]]}, 'CA': (0.0025768259064839496, 1.3247666141773496e-07), 'meanErr': (0.0762170042449045, 1.0736234758293188e-08), 'varErr': (8.881886735844021e-05, 2.2430952274795392e-10), 'eta': (array([ 1.20689531e-08,  1.11701544e-11,  1.34820811e-08,  1.11701583e-11,
   

## Kernel normalization

In [None]:
# Same settings as the basic approach, with normalize_kernels switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    normalize_kernels=True,
)

In [None]:
# Run the kernel-normalizing sampler over both kernel configurations.
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Vote for kernel parameters from result3, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result3 (kernel normalization run).
w_dict

In [None]:
# Report the per-configuration statistics of the kernel normalization run.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same settings as the basic approach, with the normalizing flag switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    normalizing=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset (three lists
# for the three views) - presumably columns the sampler leaves out of the
# preprocessing; TODO confirm in mySampler.sample.
# NOTE(review): the clinical list here differs from the one used by every
# other run in this notebook that passes exclusion_list - confirm it is intended.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [0, 3, 8, 13, 14],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result4, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result4 (normalized data run).
w_dict

In [None]:
# Report the per-configuration statistics of the normalized data run.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same settings as the basic approach, with the centering flag switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    centering=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result5, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result5 (centering run).
w_dict

In [None]:
# Report the per-configuration statistics of the centering run.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Same settings as the basic approach, with both centering and normalizing on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    centering=True,
    normalizing=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result6, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result6 (centering + normalization run).
w_dict

In [None]:
# Report the per-configuration statistics of the centering + normalization run.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Same settings as the basic approach, with normalizing and normalize_kernels on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
# NOTE(review): this normalizing-only run passes the clinical list used by the
# centering runs, not the [0, 3, 8, 13, 14] list of the "Normalized data"
# section - confirm which one is intended.
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result10, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result10 (normalization + kernel normalization run).
w_dict

In [None]:
# Report the per-configuration statistics of the normalization + kernel normalization run.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [None]:
# Same settings as the basic approach, with centering and normalize_kernels on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    centering=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result11, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result11 (centering + kernel normalization run).
w_dict

In [None]:
# Report the per-configuration statistics of the centering + kernel normalization run.
result11.performancesFeatures()

## Centering, Normalization, Kernel Normalization

In [None]:
# Same settings as the basic approach, with centering, normalizing and
# normalize_kernels all switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result7, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result7 (centering + normalization + kernel normalization run).
w_dict

In [None]:
# Report the per-configuration statistics of the centering + normalization + kernel normalization run.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [12]:
# Sampler with lamb=0.5 (presumably the L2 penalty weight named in the section
# heading - TODO confirm in mySampler) and centering on.
# NOTE(review): the section heading mentions K-Normalization, yet
# normalize_kernels is False here - confirm which of the two is intended.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [13]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 0.1], [0.5, 0.5], [0.5, 0.1]]
	Result of 1:
CA: 0.004060544812182261
Average error: 0.5690943318836446
Error variance: 4.674303182618229
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.5, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.62083034e-05 -5.25920177e-06 -4.73265845e-05  7.42071270e-05
  5.55225495e-06 -3.26728200e-05]


	Completed in 0.1 minutes
	Working on config 2 of 2: {'linear': [0.2], 'gaussian': [0.7, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 0.7], [0.2, 0.7], [0.2, 1]]
	Result of 1:
CA: 0.004060544812182261
Average error: 0.5690943318836446
Error variance: 4.674303182618229
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.5, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.62083034e-05 -5.25920177e-06 -4.73265845e-05  7.42071270e-05
  5.55225495e-06 -3

In [14]:
# Vote for kernel parameters from result9, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

KeyboardInterrupt: 

In [None]:
# Display the parameters voted from result9 (L2 penalty run).
w_dict

In [None]:
# Report the per-configuration statistics of the L2 penalty run.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [None]:
# Rebuild the full cohort from the loaded tables (rows whose "cvd_fail" flag
# equals 1, target "dement_time_age") so that this section can draw a new split.
# NOTE(review): the filter column is "cvd_fail" while the target is a dementia
# variable - confirm that this is the intended event flag.
y_class = outputs["cvd_fail"].values
meaningful_idxs = np.where(y_class == 1)
y = outputs["dement_time_age"].values[meaningful_idxs]

# Apply the same row selection to each feature table (clinical, genetic, vampire).
C, G, V = (
    frame.values[meaningful_idxs]
    for frame in (d_clinical, d_genetic, d_vampire)
)

In [None]:
# Draw a single shuffled 50/50 split and keep its train/test row indices.
# random_state is pinned so that "Restart & Run All" reproduces the same split
# (and therefore the same downstream results); change the seed to resample.
tr_idx, ts_idx = next(
    ShuffleSplit(n_splits=1, test_size=0.5, random_state=42).split(C, y)
)

In [None]:
# Slice the target and every feature view with the same train/test indices so
# that rows stay aligned across the views.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [None]:
# Rebuild the view lists from the new split, in the same order as their names:
# clinical, genetic, vampire.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list, ds_test = [C_, G_, V_], [C_test, G_test, V_test]

In [None]:
# Sampler with sparsity=0.7 (presumably the sparsity setting on eta named in
# the section heading - TODO confirm in mySampler), centering and
# normalize_kernels on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    Ptype="regression",
    sparsity=0.7,
    centering=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list carries one list of column indices per dataset - presumably
# columns the sampler leaves out of the preprocessing; TODO confirm in
# mySampler.sample.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        # every genetic column index except the last three
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Vote for kernel parameters from result8, then evaluate them with the
# held-out test views and targets.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_,
    y_test,
    w_list,
    ds_list,
    ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the parameters voted from result8 (sparsity run).
w_dict

In [None]:
# Report the per-configuration statistics of the sparsity run.
result8.performancesFeatures()