In [5]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit

In [6]:
d_clinical = pd.read_csv(os.path.join('data', 'dataset_clinical_cleaned.csv'))
#d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
#d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(os.path.join('data', 'outputs_cleaned.csv'))

## Dementia

In [7]:
y = outputs["dement_fail"].values
C = d_clinical.values
#G = d_genetic.values
#V = d_vampire.values

In [8]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25).split(C, y))

In [9]:
y_ = y[tr_idx]
y_test = y[ts_idx]
C_ = C[tr_idx]
C_test = C[ts_idx]
#G_ = G[tr_idx]
#G_test = G[ts_idx]
#V_ = V[tr_idx]
#V_test = V[ts_idx]

In [10]:
ds_list = [C_]#, G_, V_]
ds_test = [C_test]#, G_test, V_test]
ds_names = ['clinic']#, 'genetic', 'vampire']

## Basic approach

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25)

In [None]:
result1 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

In [None]:
w_dict, w_list = result1.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
w_dict

In [None]:
result1.performancesFeatures()

## Kernel normalization

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, normalize_kernels = True)

In [None]:
result3 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True)

In [None]:
w_dict, w_list = result3.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result3.performancesFeatures()

## Normalized data

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, normalizing = True)

In [None]:
result4 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
w_dict, w_list = result4.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result4.performancesFeatures()

## Origin Data Centering

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True)

In [None]:
result5 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
w_dict, w_list = result5.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result5.performancesFeatures()

## Origin Data  Centering and Normalization

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.35, centering = True, normalizing = True)

In [None]:
result6 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
w_dict, w_list = result6.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result6.performancesFeatures()

## Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.35, normalizing = True, normalize_kernels = True)

In [None]:
result10 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []])

In [None]:
w_dict, w_list = result10.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result10.performancesFeatures()

## Centering, Kernel Normalization

In [11]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True, normalize_kernels = True)

In [12]:
result11 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.05, 0.3, 2, 0.4]
	Result of 1:
CA: 0.020526871996112587
Accuracy: 0.603112840466926
Precision: 0.24444444444444444
Recall: 0.6179775280898876
[linear:1, laplacian:0.05, sigmoid:0.3, polynomial:2, gaussian:0.4, ]

eta vector: [   0.80655534   -4.6520079    82.07401995 -248.96752957   14.41998378]


	Completed in 1.3833333333333333 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.4, 3, 0.5]
	Result of 1:
CA: 0.020526871996112587
Accuracy: 0.603112840466926
Precision: 0.24444444444444444
Recall: 0.6179775280898876
[linear:1, laplacian:0.05, sigmoid:0.3, polynomial:2, gaussian:0.4, ]



In [13]:
w_dict, w_list = result11.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5816326530612245
	Precision: 0.26536312849162014
	Recall: 0.7983193277310925
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5787172011661808
	Precision: 0.2638888888888889
	Recall: 0.7983193277310925


In [14]:
result11.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.05], [0.1], [0.1]], 'sigmoid': [[0.3], [0.3], [0.3]], 'polynomial': [[2], [3], [2]], 'gaussian': [[0.4], [0.7], [0.4]]}, 'CA': (0.022435988668297695, 4.617503021215098e-06), 'Accuracy': (0.553177691309987, 0.0030188025388558307), 'Precision': (0.20519645600841016, 0.0016895392229201729), 'Recall': (0.5393258426966292, 0.0066489921306232365), 'eta': (array([   0.772506  ,   -3.07441749,   75.07376345, -204.12207525,
         11.55490709]), array([8.68090010e-02, 1.49165739e+00, 3.27595043e+02, 7.75752622e+03,
       2.84089278e+01]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.15]], 'sigmoid': [[0.4], [0.4], [0.4]], 'polynomial': [[3], [4], [3]], 'gaussian': [[0.5], [0.5], [0.5]]}, 'CA': (0.022440717694377596, 4.5152305003927206e-06), 'Accuracy': (0.49351491569390404, 0.0007830886496725497), 'Precision': (0.14644529397525677, 0.000470322462438603

## Centering, Normalization, Kernel Normalization

In [None]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, centering = True, normalizing = True, normalize_kernels = True)

In [None]:
result7 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])

In [None]:
w_dict, w_list = result7.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

In [None]:
result7.performancesFeatures()

## L2 Penalty, Centering, K-Normalization

In [15]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, lamb = 0.5, centering = True, normalize_kernels = False)

In [16]:
result9 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.05, 0.6, 2, 0.4]
	Result of 1:
CA: 0.020146067415361686
Accuracy: 0.5622568093385214
Precision: 0.23228346456692914
Recall: 0.6629213483146067
[linear:1, laplacian:0.05, sigmoid:0.6, polynomial:2, gaussian:0.4, ]

eta vector: [ 0.06867573 -0.18130627 -0.08287536  0.01468922  0.00727067]


	Completed in 1.2333333333333334 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.5, 3, 0.5]
	Result of 1:
CA: 0.020146067415361686
Accuracy: 0.5622568093385214
Precision: 0.23228346456692914
Recall: 0.6629213483146067
[linear:1, laplacian:0.05, sigmoid:0.6, polynomial:2, gaussian:0.4, ]

eta vect

In [17]:
w_dict, w_list = result9.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5830903790087464
	Precision: 0.2661064425770308
	Recall: 0.7983193277310925
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5787172011661808
	Precision: 0.2638888888888889
	Recall: 0.7983193277310925


In [18]:
result9.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.05], [0.05], [0.1]], 'sigmoid': [[0.6], [0.6], [0.6]], 'polynomial': [[2], [2], [2]], 'gaussian': [[0.4], [0.4], [0.4]]}, 'CA': (0.021662144730490104, 1.40674338670479e-06), 'Accuracy': (0.5051880674448768, 0.002123002955718061), 'Precision': (0.1596290353411706, 0.003274425519593475), 'Recall': (0.4419475655430712, 0.029148957062099332), 'eta': (array([ 0.0046752 , -0.19413869,  0.03290555,  0.01100262,  0.11336085]), array([3.49974386e-03, 1.42627074e-03, 1.05327475e-02, 3.27297933e-05,
       9.17188738e-03]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.1]], 'sigmoid': [[0.5], [0.5], [0.5]], 'polynomial': [[3], [3], [3]], 'gaussian': [[0.5], [0.5], [0.5]]}, 'CA': (0.021663067952537324, 1.4218458211312524e-06), 'Accuracy': (0.5265888456549935, 1.3458021906295062e-05), 'Precision': (0.18407376264339007, 2.3562364034556703e-05), 'Recall': (0.505

## Sparsity in eta, Centering, K-Normalization

In [19]:
y = outputs["cvd_fail"].values
C = d_clinical.values
#G = d_genetic.values
#V = d_vampire.values

In [20]:
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25).split(C, y))

In [21]:
y_ = y[tr_idx]
y_test = y[ts_idx]
C_ = C[tr_idx]
C_test = C[ts_idx]
#G_ = G[tr_idx]
#G_test = G[ts_idx]
#V_ = V[tr_idx]
#V_test = V[ts_idx]

In [22]:
ds_list = [C_]#, G_, V_]
ds_test = [C_test]#, G_test, V_test]
ds_names = ['clinic']#, 'genetic', 'vampire']

In [23]:
kernel_names = ['linear', 'laplacian', 'sigmoid', 'polynomial', 'gaussian']
kernel_type = [{'linear':[1], 'laplacian':[0.1, 0.05], 'sigmoid':[0.3, 0.6], 'polynomial':[2, 3], 'gaussian':[0.4, 0.7]},
               {'linear':[1], 'laplacian':[0.1, 0.15], 'sigmoid':[0.4, 0.5], 'polynomial':[3, 4], 'gaussian':[0.5, 1]}]
estimator = ca.centeredKernelAlignment
sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.7, centering = True, normalize_kernels = True)

In [24]:
result8 = sampler.sample(kernel_type, estimator, ds_list, y_, valid_fold = 3, verbose=True, exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19]])


1 split out of 3 ...
	Working on config 1 of 2: {'linear': [1], 'laplacian': [0.1, 0.05], 'sigmoid': [0.3, 0.6], 'polynomial': [2, 3], 'gaussian': [0.4, 0.7]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.1, 0.3, 2, 0.7]
	Result of 1:
CA: 0.019086145586624234
Accuracy: 0.5486381322957199
Precision: 0.38345864661654133
Recall: 0.6
[linear:1, laplacian:0.1, sigmoid:0.3, polynomial:2, gaussian:0.7, ]

eta vector: [0.86432267 0.08634358 0.07340475 0.04222153 0.4881805 ]


	Completed in 1.0666666666666667 minutes
	Working on config 2 of 2: {'linear': [1], 'laplacian': [0.1, 0.15], 'sigmoid': [0.4, 0.5], 'polynomial': [3, 4], 'gaussian': [0.5, 1]}
Fold no. 1
Fold no. 2
Fold no. 3
Validation complete, config selected:[1, 0.15, 0.4, 3, 1]
	Result of 1:
CA: 0.019086145586624234
Accuracy: 0.5486381322957199
Precision: 0.38345864661654133
Recall: 0.6
[linear:1, laplacian:0.1, sigmoid:0.3, polynomial:2, gaussian:0.7, ]

eta vector: [0.86432267 0.08634358 0.07340475 0

In [25]:
w_dict, w_list = result8.votingOverCA(ds_names, kernel_names)
ut.testConfigurations(estimator, y_, y_test, w_list, ds_list, ds_test, kernel_names)

Perfomances computed for dictionary settings 1:
	Accuracy: 0.5787172011661808
	Precision: 0.41509433962264153
	Recall: 0.6814159292035398
Perfomances computed for dictionary settings 2:
	Accuracy: 0.5874635568513119
	Precision: 0.42318059299191374
	Recall: 0.6946902654867256


In [26]:
result8.performancesFeatures()

statistics of configuration 1
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.1], [0.1], [0.1]], 'sigmoid': [[0.3], [0.3], [0.3]], 'polynomial': [[2], [2], [2]], 'gaussian': [[0.7], [0.7], [0.7]]}, 'CA': (0.016551223026846185, 3.2700463449848135e-06), 'Accuracy': (0.5784695201037614, 0.00044495584927688273), 'Precision': (0.413691822392853, 0.00045814759472217705), 'Recall': (0.6607843137254902, 0.0019915417147251084), 'eta': (array([0.86514776, 0.08693535, 0.07342288, 0.04222459, 0.48660629]), array([3.97161425e-07, 1.75217383e-07, 4.50889558e-08, 4.24220564e-09,
       1.45085316e-06]))}
statistics of configuration 2
{'config': {'linear': [[1], [1], [1]], 'laplacian': [[0.15], [0.15], [0.15]], 'sigmoid': [[0.4], [0.4], [0.4]], 'polynomial': [[3], [3], [3]], 'gaussian': [[1], [1], [1]]}, 'CA': (0.016791617817765576, 3.409352626302252e-06), 'Accuracy': (0.5797665369649806, 0.00043149782737058755), 'Precision': (0.4146095375405452, 0.00046871640422921075), 'Recall': (0.660784313