In [1]:
# Standard library
import os
# The ABC aliases in `collections` were removed in Python 3.10; `Sequence`
# must be imported from `collections.abc`.
from collections.abc import Sequence

# Third-party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit

# Project-local modules
import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms


In [2]:
# Read the four cleaned CSV tables from the local `data` directory:
# clinical, genetic and vampire feature tables plus the outputs table.
d_clinical, d_genetic, d_vampire, outputs = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in (
        'dataset_clinical_cleaned.csv',
        'dataset_genetic_cleaned_noOHE.csv',
        'dataset_vampire_cleaned.csv',
        'outputs_cleaned.csv',
    )
)

## Dementia

In [3]:
# Keep only the rows whose `cvd_fail` flag equals 1, then slice the target
# (`dement_time_age`) and the three feature tables with the same row indices.
# NOTE(review): this section is titled "Dementia" but the row filter uses the
# `cvd_fail` column -- confirm this is the intended event indicator.
y_class = outputs["cvd_fail"].values
meaningful_idxs = np.where(y_class == 1)
y = outputs["dement_time_age"].values[meaningful_idxs]
C, G, V = (
    table.values[meaningful_idxs]
    for table in (d_clinical, d_genetic, d_vampire)
)

In [4]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
# Hold out 25% of the rows as the final test set. One shuffled split is applied
# consistently to the three feature matrices and to the target.
# A fixed seed makes the split -- and therefore every metric computed further
# down -- reproducible under Restart Kernel -> Run All.
SPLIT_SEED = 42
C_, C_test, G_, G_test, V_, V_test, y_, y_test = train_test_split(
    C, G, V, y, test_size=0.25, random_state=SPLIT_SEED
)

In [5]:
# Bundle the training and held-out matrices per data source. The three lists
# are written in the same order: clinical, genetic, vampire.
ds_names = ['clinic', 'genetic', 'vampire']
ds_test = [C_test, G_test, V_test]
ds_list = [C_, G_, V_]

## Kernel Configuration

In [6]:
# Alignment estimator handed to the sampler and to `testConfigurations`.
estimator = ca.centeredKernelAlignment

# Kernel families in use, and the two configurations to evaluate. Each
# configuration maps a kernel name to the list of parameter values to try.
kernel_names = ['linear', 'gaussian']
kernel_type = [
    {'linear': [0.5], 'gaussian': [0.1, 0.5, 0.7]},
    {'linear': [0.2], 'gaussian': [0.7, 1]},
]

## Basic approach

In [7]:
# Baseline sampler: only the split settings and the problem type are passed,
# every other option is left at the `mySampler` default.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
)

In [8]:
# Run the baseline sampler on the training matrices with verbose logging.
# NOTE(review): in the saved output the "Result of 1" block printed for
# config 2 repeats the CA, error values and kernel settings printed for
# config 1 -- check how `mySampler` reports per-configuration results.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 0.1], [0.5, 0.1], [0.5, 0.1]]
	Result of 1:
CA: 0.002937355678745542
Average error: 77.68558196470589
Error variance: 69.68576090051883
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.36921731e-08  1.18270754e-11  1.40725662e-08  1.20817132e-11
 -5.44377795e-16  1.18274490e-11]


	Completed in 0.1 minutes
	Working on config 2 of 2: {'linear': [0.2], 'gaussian': [0.7, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 1], [0.2, 0.7], [0.2, 1]]
	Result of 1:
CA: 0.002937355678745542
Average error: 77.68558196470589
Error variance: 69.68576090051883
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.36921731e-08  1.18270754e-11  1.40725662e-08  1.20817132e-11
 -5.44377795e-16  1.182

In [9]:
# Get the voted kernel settings from `result1` (as a dictionary and as a list),
# then score them on the held-out test matrices.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

Perfomances computed for dictionary settings 1:
	Average error: 77.4938248362832
	Error variance: 78.33555750876612
	Pred: [-3.58733233e-07 -8.63141382e-06 -7.25558611e-06 -1.93870718e-06
 -6.62089308e-07  5.04044483e-06 -2.88840972e-07  1.32097943e-05
 -2.04922991e-05  1.99113115e-05  2.08698794e-06  2.12036271e-05
  2.29947385e-07 -1.13737399e-05  2.13809350e-06 -2.23138858e-05
  1.88586936e-06 -2.96948131e-05 -6.66076560e-06  3.57550306e-05
  3.46508096e-05 -2.10623597e-05 -1.85784082e-05 -2.90296787e-05
  3.45871330e-06 -1.88094430e-06  4.53778226e-06 -2.01454131e-05
 -1.15799803e-06  1.01554977e-05 -2.15923253e-05  1.80483806e-05
 -3.02306526e-06  1.17131510e-05 -4.54012809e-06 -5.67022149e-06
 -6.30109328e-06  9.95907377e-06  3.83879992e-06 -5.78030514e-06
 -7.85745331e-06 -1.00280059e-06  1.77837594e-05  1.09786170e-05
  3.87385806e-06 -2.37919626e-06  1.03523308e-05  1.47554107e-05
  2.69557184e-05  1.98271026e-05 -1.86472437e-05  9.62557771e-06
 -7.14898951e-06  4.00811519e-05

In [10]:
# Display the dictionary returned by `result1.votingOverCA` in the previous cell.
w_dict

[{'clinic': {'linear': 0.5, 'gaussian': 0.1},
  'genetic': {'linear': 0.5, 'gaussian': 0.1},
  'vampire': {'linear': 0.5, 'gaussian': 0.1}},
 {'clinic': {'linear': 0.2, 'gaussian': 1},
  'genetic': {'linear': 0.2, 'gaussian': 1},
  'vampire': {'linear': 0.2, 'gaussian': 1}}]

In [11]:
# Print the per-configuration statistics stored in `result1`.
result1.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]], 'gaussian': [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1], [0.1, 0.1, 0.1]]}, 'CA': (0.0028861260041469342, 1.5303247432988233e-09), 'meanErr': (78.64897261372549, 0.5475079075199448), 'varErr': (69.63488935683966, 1.2907365102651418), 'eta': (array([ 1.30980607e-08,  1.38973525e-11,  1.70439792e-08,  1.43497242e-11,
       -1.83963233e-16,  1.38994001e-11]), array([1.77741931e-19, 1.35538707e-23, 2.03132693e-17, 1.50733252e-23,
       2.77132856e-31, 1.35670494e-23]))}
statistics of configuration 2
{'config': {'linear': [[0.2, 0.2, 0.2], [0.2, 0.2, 0.2], [0.2, 0.2, 0.2]], 'gaussian': [[1, 0.7, 1], [1, 1, 1], [1, 1, 1]]}, 'CA': (0.0028861260022480643, 1.530324790820636e-09), 'meanErr': (78.64897261372549, 0.5475079075199369), 'varErr': (69.63488935684141, 1.290736510271162), 'eta': (array([ 1.30980607e-08,  1.38993967e-11,  1.70439763e-08,  1.38994279e-11,
       -1.83963203e-16,  1.38994279e-1

## Kernel normalization

In [None]:
# Same split settings as the basic approach, with `normalize_kernels` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    normalize_kernels=True,
)

In [None]:
# Run the kernel-normalising sampler on the training matrices.
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Get the voted kernel settings from `result3`, then score them on the
# held-out test matrices.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result3.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result3`.
result3.performancesFeatures()

## Normalized data

In [None]:
# Same split settings as the basic approach, with `normalizing` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    normalizing=True,
)

In [None]:
# Run the normalising sampler. `exclusion_list` holds three index lists,
# apparently one per entry of `ds_list`: fixed indices for the first, every
# column index of `G` except the last three for the second, none for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`. This is also the only cell that uses
# [0, 3, 8, 13, 14] as the first list; the other sections use
# [5, 6, 7, 9, ...] -- confirm the difference is intentional.
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [0, 3, 8, 13, 14],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result4`, then score them on the
# held-out test matrices.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result4.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result4`.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Same split settings as the basic approach, with `centering` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    centering=True,
)

In [None]:
# Run the centering sampler. `exclusion_list` holds three index lists,
# apparently one per entry of `ds_list`: fixed indices for the first, every
# column index of `G` except the last three for the second, none for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result5`, then score them on the
# held-out test matrices.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result5.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result5`.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Same split settings as the basic approach, with both `centering` and
# `normalizing` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    centering=True,
    normalizing=True,
)

In [None]:
# Run the centering + normalising sampler. `exclusion_list` holds three index
# lists, apparently one per entry of `ds_list`: fixed indices for the first,
# every column index of `G` except the last three for the second, none for
# the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result6`, then score them on the
# held-out test matrices.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result6.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result6`.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Same split settings as the basic approach, with both `normalizing` and
# `normalize_kernels` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the normalising + kernel-normalising sampler. `exclusion_list` holds
# three index lists, apparently one per entry of `ds_list`: fixed indices for
# the first, every column index of `G` except the last three for the second,
# none for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result10`, then score them on the
# held-out test matrices.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result10.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result10`.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [None]:
# Same split settings as the basic approach, with both `centering` and
# `normalize_kernels` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    centering=True,
    normalize_kernels=True,
)

In [None]:
# Run the centering + kernel-normalising sampler. `exclusion_list` holds three
# index lists, apparently one per entry of `ds_list`: fixed indices for the
# first, every column index of `G` except the last three for the second, none
# for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result11`, then score them on the
# held-out test matrices.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result11.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result11`.
result11.performancesFeatures()

## Centering, Normalization, Kernel Normalization

In [None]:
# Same split settings as the basic approach, with `centering`, `normalizing`
# and `normalize_kernels` all enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# Run the sampler with all three preprocessing options on. `exclusion_list`
# holds three index lists, apparently one per entry of `ds_list`: fixed
# indices for the first, every column index of `G` except the last three for
# the second, none for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result7`, then score them on the
# held-out test matrices.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result7.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result7`.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [12]:
# Sampler with `lamb` set to 0.5 and `centering` enabled.
# NOTE(review): the section header reads "L2 Penalty, Centering,
# K-Normalization", yet `normalize_kernels` is passed as False here (the
# sparsity section passes True). Confirm which of the two is intended.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [13]:
# Run the `lamb`-penalised sampler. `exclusion_list` holds three index lists,
# apparently one per entry of `ds_list`: fixed indices for the first, every
# column index of `G` except the last three for the second, none for the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`. As in the baseline run, the saved log prints the
# same "Result of 1" values for both configurations.
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [0.5], 'gaussian': [0.1, 0.5, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.5, 0.1], [0.5, 0.5], [0.5, 0.1]]
	Result of 1:
CA: 0.004060544812182261
Average error: 0.5690943318836446
Error variance: 4.674303182618229
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.5, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.62083034e-05 -5.25920177e-06 -4.73265845e-05  7.42071270e-05
  5.55225495e-06 -3.26728200e-05]


	Completed in 0.1 minutes
	Working on config 2 of 2: {'linear': [0.2], 'gaussian': [0.7, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[[0.2, 0.7], [0.2, 0.7], [0.2, 1]]
	Result of 1:
CA: 0.004060544812182261
Average error: 0.5690943318836446
Error variance: 4.674303182618229
[linear:0.5, gaussian:0.1, ]
[linear:0.5, gaussian:0.5, ]
[linear:0.5, gaussian:0.1, ]

eta vector: [ 1.62083034e-05 -5.25920177e-06 -4.73265845e-05  7.42071270e-05
  5.55225495e-06 -3

In [14]:
# Get the voted kernel settings from `result9`, then score them on the
# held-out test matrices.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so the
# notebook holds no test metrics for `result9`; re-run before relying on it.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

KeyboardInterrupt: 

In [None]:
# Display the dictionary returned by `result9.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result9`.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [None]:
# Rebuild the filtered target and feature matrices from the loaded tables:
# keep the rows whose `cvd_fail` flag equals 1 and slice everything with the
# same row indices. This repeats the filtering done at the top of the notebook
# and rebinds `y`, `C`, `G` and `V`.
y_class = outputs["cvd_fail"].values
meaningful_idxs = np.where(y_class == 1)
y = outputs["dement_time_age"].values[meaningful_idxs]
C, G, V = (
    table.values[meaningful_idxs]
    for table in (d_clinical, d_genetic, d_vampire)
)

In [None]:
# Draw a single shuffled 50/50 split and keep its train/test row indices.
# A fixed seed makes the split -- and the results of this section --
# reproducible under Restart Kernel -> Run All.
SPARSITY_SPLIT_SEED = 42
tr_idx, ts_idx = next(
    ShuffleSplit(
        n_splits=1, test_size=0.5, random_state=SPARSITY_SPLIT_SEED
    ).split(C, y)
)

In [None]:
# Slice the target and each feature matrix with the train / test row indices.
# This rebinds the `*_` and `*_test` names created by the earlier 75/25 split.
y_, y_test = y[tr_idx], y[ts_idx]
C_, C_test = C[tr_idx], C[ts_idx]
G_, G_test = G[tr_idx], G[ts_idx]
V_, V_test = V[tr_idx], V[ts_idx]

In [None]:
# Rebuild the per-source lists from the freshly split matrices. The three
# lists are written in the same order: clinical, genetic, vampire.
ds_names = ['clinic', 'genetic', 'vampire']
ds_test = [C_test, G_test, V_test]
ds_list = [C_, G_, V_]

In [None]:
# Sampler with `sparsity` set to 0.7 and both `centering` and
# `normalize_kernels` enabled.
sampler = ms.mySampler(
    n_splits=3,
    test_size=.25,
    Ptype="regression",
    sparsity=0.7,
    centering=True,
    normalize_kernels=True,
)

In [None]:
# Run the sparsity-constrained sampler. `exclusion_list` holds three index
# lists, apparently one per entry of `ds_list`: fixed indices for the first,
# every column index of `G` except the last three for the second, none for
# the third.
# NOTE(review): presumably these are columns exempt from preprocessing --
# confirm in `mySampler`.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[
        [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
        list(range(G.shape[1] - 3)),
        [],
    ],
)

In [None]:
# Get the voted kernel settings from `result8`, then score them on the
# held-out test matrices.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator,
    y_, y_test,
    w_list,
    ds_list, ds_test,
    kernel_names,
    Ptype='regression',
)

In [None]:
# Display the dictionary returned by `result8.votingOverCA` in the previous cell.
w_dict

In [None]:
# Report the per-configuration statistics stored in `result8`.
result8.performancesFeatures()