In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [2]:
# Load the cleaned clinical features and the outcome table from the data folder.
# The genetic and vampire datasets are disabled; uncomment their lines to re-enable.
data_dir = 'data'
clinical_csv = os.path.join(data_dir, 'dataset_clinical_cleaned.csv')
outputs_csv = os.path.join(data_dir, 'outputs_cleaned.csv')

d_clinical = pd.read_csv(clinical_csv)
#d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
#d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(outputs_csv)

## Dementia

In [3]:
# Feature matrix (clinical data) and target vector ("dement_fail" outcome column)
# as plain arrays.
C = d_clinical.values
y = outputs["dement_fail"].values
# Disabled datasets, kept for reference:
#G = d_genetic.values
#V = d_vampire.values

In [4]:
# COMPUTATIONAL COMPLEXITY: reduce the number of samples by keeping a stratified
# half of the data for the experiments and holding the other half out for testing.
# A fixed random_state makes the split, and every result derived from it,
# reproducible across kernel restarts.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
tr_idx, ts_idx = next(splitter.split(C, y))

In [5]:
# Apply the train/test index arrays to the features and to the target.
C_, C_test = C[tr_idx], C[ts_idx]
y_, y_test = y[tr_idx], y[ts_idx]
# Disabled datasets, kept for reference:
#G_, G_test = G[tr_idx], G[ts_idx]
#V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Parallel lists: names, training matrices and test matrices share one order.
# Only the clinical dataset is active; add 'genetic'/'vampire', G_/V_ and
# G_test/V_test to the respective lists to re-enable the others.
ds_names = ['clinic']
ds_list = [C_]
ds_test = [C_test]

## Basic approach

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Baseline sampler: none of the optional preprocessing flags are set.
sampler = ms.mySampler(n_splits=3, test_size=0.35)

In [None]:
# Run the sampler on the training data for every kernel configuration,
# with 3 validation folds and progress printing enabled.
result1 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Display the settings dictionary returned by votingOverCA in the previous cell.
w_dict

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result1.performancesFeatures()

## Kernel normalization

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Same as the baseline sampler, with kernel normalisation switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    normalize_kernels=True,
)

In [None]:
# Run the sampler on the training data for every kernel configuration,
# with 3 validation folds and progress printing enabled.
result3 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result3.performancesFeatures()

## Normalized data

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with the data-normalisation flag switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    normalizing=True,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
exclusion_by_dataset = {
    'clinic': lambda: [0, 3, 8, 13, 14],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result4 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result4.performancesFeatures()

## Origin Data Centering

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with the data-centering flag switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    centering=True,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
exclusion_by_dataset = {
    'clinic': lambda: [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result5 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result5.performancesFeatures()

## Origin Data Centering and Normalization

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with both the data-centering and data-normalisation flags switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    centering=True,
    normalizing=True,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
exclusion_by_dataset = {
    'clinic': lambda: [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result6 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with data normalisation and kernel normalisation switched on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
# NOTE(review): this sampler normalises without centering, yet the clinical
# indices are the ones used by the centering sections rather than the
# [0, 3, 8, 13, 14] list of the "Normalized data" section - confirm which is intended.
exclusion_by_dataset = {
    'clinic': lambda: [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result10 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result10.performancesFeatures()

## Centering, Kernel Normalization

In [28]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel (this section uses its own value grid).
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.3], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.1], gaussian=[0.5, 0.2]),
]
estimator = ca.centeredKernelAlignment
# Sampler with data centering and kernel normalisation switched on; note the
# smaller test_size compared with the other dementia sections.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    centering=True,
    normalize_kernels=True,
)

In [29]:
# Run the sampler on the training data for every kernel configuration,
# with 3 validation folds and progress printing enabled.
result11 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.2, 0.3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.2, 0.7]
	Result of 1:
CA: 0.028482049832347676
Accuracy: 0.6333333333333333
Precision: 0.4560810810810811
Recall: 0.5672268907563025
[linear:1, laplacian:0.2, gaussian:0.7, ]

eta vector: [ 0.35209412  7.83237376 -1.53704859]


	Completed in 0.16666666666666666 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.2, 0.1], 'gaussian': [0.5, 0.2]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.5]
	Result of 1:
CA: 0.028482049832347676
Accuracy: 0.6333333333333333
Precision: 0.4560810810810811
Recall: 0.5672268907563025
[linear:1, laplacian:0.2, gaussian:0.7, ]

eta vector: [ 0.35209412  7.83237376 -1.53704859]

CA: 0.028264457792389893
Accuracy: 0.6319444444444444
Precision: 0.45484949832775917
Recall: 0.5714285714285714
[linear:1, laplacian:0.1, gaussia

In [32]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5568513119533528
	Precision: 0.39285714285714285
	Recall: 0.6327433628318584
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5641399416909622
	Precision: 0.40054495912806537
	Recall: 0.6504424778761062


In [33]:
# Show the statistics the sampler collected for each kernel configuration.
result11.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.2], [0.2], [0.2]], 'gaussian': [[0.7], [0.4], [0.4]]}, 'CA': (0.02804830083489017, 5.774339046218112e-06), 'Accuracy': (0.6634259259259259, 0.0005688443072702334), 'Precision': (0.4926858676858677, 0.0008414498785551642), 'Recall': (0.6134453781512605, 0.0013064049149071402), 'eta': (array([ 0.82005389,  8.46130174, -3.64588233]), array([0.10964617, 0.20479944, 2.30007244]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.1]], 'gaussian': [[0.5], [0.2], [0.2]]}, 'CA': (0.027718420780598895, 6.005578591165746e-06), 'Accuracy': (0.662962962962963, 0.0005739883401920441), 'Precision': (0.49205722946865876, 0.0008215536018537393), 'Recall': (0.6274509803921569, 0.0017575657714066042), 'eta': (array([ 0.93053802, 15.65816823, -6.26525293]), array([0.0795446 , 0.05427653, 8.53479445]))}


## Centering, Normalization, Kernel Normalization

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with centering, data normalisation and kernel normalisation all on.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    centering=True,
    normalizing=True,
    normalize_kernels=True,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
exclusion_by_dataset = {
    'clinic': lambda: [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result7 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [None]:
# Kernels under study and, for each configuration, the candidate parameter
# values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'gaussian']
kernel_type = [
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.4, 0.7]),
    dict(linear=[1], laplacian=[0.2, 0.7], gaussian=[0.5, 1]),
]
estimator = ca.centeredKernelAlignment
# Sampler with lamb=0.5 (presumably the L2 penalty weight named in the section
# heading - confirm in mySampler) and data centering.
# NOTE(review): the section heading mentions K-Normalization, but
# normalize_kernels is explicitly False here - confirm which is intended.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.35,
    lamb=0.5,
    centering=True,
    normalize_kernels=False,
)

In [None]:
# exclusion_list presumably holds one list of column indices per dataset, in the
# same order as ds_list (TODO confirm against mySampler.sample).
# G only exists while the genetic dataset is enabled, so each entry is built
# lazily and only for the datasets named in ds_names; referencing G
# unconditionally raises NameError with the clinical-only setup.
exclusion_by_dataset = {
    'clinic': lambda: [5, 6, 7, 9, 10, 13, 15, 16, 17, 18, 19],
    'genetic': lambda: list(range(G.shape[1] - 3)),
    'vampire': lambda: [],
}
result9 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
    exclusion_list=[exclusion_by_dataset[name]() for name in ds_names],
)

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result9.performancesFeatures()

## Sparsity in eta, Centering, K-Normalization

In [3]:
# From here on the target is the "cvd_fail" outcome column (the earlier
# sections used "dement_fail"); the clinical feature matrix is the same.
C = d_clinical.values
y = outputs["cvd_fail"].values
# Disabled datasets, kept for reference:
#G = d_genetic.values
#V = d_vampire.values

In [4]:
# Stratified 75/25 train/test split. A fixed random_state makes the split, and
# every result derived from it, reproducible across kernel restarts.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
tr_idx, ts_idx = next(splitter.split(C, y))

In [5]:
# Apply the train/test index arrays to the features and to the target.
C_, C_test = C[tr_idx], C[ts_idx]
y_, y_test = y[tr_idx], y[ts_idx]
# Disabled datasets, kept for reference:
#G_, G_test = G[tr_idx], G[ts_idx]
#V_, V_test = V[tr_idx], V[ts_idx]

In [6]:
# Parallel lists: names, training matrices and test matrices share one order.
# Only the clinical dataset is active; add 'genetic'/'vampire', G_/V_ and
# G_test/V_test to the respective lists to re-enable the others.
ds_names = ['clinic']
ds_list = [C_]
ds_test = [C_test]

In [7]:
# Five kernels are considered in this section; each configuration lists the
# candidate parameter values offered for every kernel.
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [
    dict(
        linear=[1],
        laplacian=[0.7, 1],
        sigmoid=[0.3, 0.6],
        polynomial=[2, 3],
        gaussian=[0.4, 0.7],
    ),
    dict(
        linear=[1],
        laplacian=[1, 1.5],
        sigmoid=[0.4, 0.5],
        polynomial=[3, 4],
        gaussian=[0.5, 1],
    ),
]
estimator = ca.centeredKernelAlignment
# Sampler with sparsity=0.7, data centering and kernel normalisation.
sampler = ms.mySampler(
    n_splits=3,
    test_size=0.25,
    sparsity=0.7,
    centering=True,
    normalize_kernels=True,
)

In [8]:
# Run the sampler on the training data for every kernel configuration,
# with 3 validation folds and progress printing enabled.
result8 = sampler.sample(
    kernel_type,
    estimator,
    ds_list,
    y_,
    valid_fold=3,
    verbose=True,
)


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.7, 1], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1


  x = z/n
  delta = np.linalg.norm(eta_new - eta)
  k_eta += eta_i * Ki


KeyboardInterrupt: 

In [None]:
# Collect the settings returned by votingOverCA, then evaluate them with
# testConfigurations using both the training split and the held-out test split.
# NOTE(review): the recorded run of the sampling cell ended in KeyboardInterrupt,
# so result8 only exists once that cell has been run to completion.
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(
    estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names
)

In [None]:
# Show the statistics the sampler collected for each kernel configuration.
result8.performancesFeatures()