In [2]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold

from threading import Thread, Lock

## Dataset Configuration

In [3]:
def _read_data_csv(file_name):
    """Load one CSV file located in the local 'data' directory."""
    return pd.read_csv(os.path.join('data', file_name))


# Three cleaned feature tables (clinical, genetic, vampire) and the outcome table.
d_clinical = _read_data_csv('dataset_clinical_cleaned.csv')
d_genetic = _read_data_csv('dataset_genetic_cleaned_noOHE.csv')
d_vampire = _read_data_csv('dataset_vampire_cleaned.csv')
outputs = _read_data_csv('outputs_cleaned.csv')

In [4]:
# Raw array views of the three feature tables (C: clinical, G: genetic, V: vampire).
C, G, V = (frame.values for frame in (d_clinical, d_genetic, d_vampire))

In [5]:
# Label vectors taken from the outcome table: y_d <- "dement_fail", y_c <- "cvd_fail".
y_d, y_c = (outputs[column].values for column in ("dement_fail", "cvd_fail"))

In [6]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
# Single stratified hold-out split (75% / 25%), stratified on the dementia labels.
# The seed is fixed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42
tr_idx, ts_idx = next(
    StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=SPLIT_SEED).split(C, y_d)
)

In [7]:
# Slice every feature matrix with the same index arrays so rows stay aligned
# across the three data sources.
C_, G_, V_ = (X[tr_idx] for X in (C, G, V))
C_test, G_test, V_test = (X[ts_idx] for X in (C, G, V))

In [8]:
# Split both label vectors with the index arrays used for the feature matrices.
y_d_, y_c_ = (labels[tr_idx] for labels in (y_d, y_c))
y_d_test, y_c_test = (labels[ts_idx] for labels in (y_d, y_c))

In [9]:
# Parallel lists: position i of each list refers to the same data source.
ds_names = ['clinic', 'genetic', 'vampire']
ds_list, ds_test = [C_, G_, V_], [C_test, G_test, V_test]

## Kernel Definition

In [10]:
# Configuration 0: laplacian + gaussian kernels.
# Each dictionary maps a kernel name to its list of candidate parameter values.
kernel_names_0 = ['laplacian', 'gaussian']
kernel_type_0 = [
    dict(zip(kernel_names_0, grids))
    for grids in (
        ([0.2, 0.6], [0.3, 0.7]),
        ([0.4, 0.9], [0.5, 1]),
    )
]

In [11]:
# Configuration 1: linear + gaussian kernels.
# Each dictionary maps a kernel name to its list of candidate parameter values.
kernel_names_1 = ['linear', 'gaussian']
kernel_type_1 = [
    dict(zip(kernel_names_1, grids))
    for grids in (
        ([1], [0.4, 0.7]),
        ([1], [0.5, 1]),
    )
]

In [12]:
# Configuration 2: polynomial + gaussian kernels.
# Each dictionary maps a kernel name to its list of candidate parameter values.
kernel_names_2 = ['polynomial', 'gaussian']
kernel_type_2 = [
    dict(zip(kernel_names_2, grids))
    for grids in (
        ([2, 7], [0.4, 0.7]),
        ([3, 5], [0.5, 1]),
    )
]

In [13]:
# Configuration 3: sigmoid + gaussian kernels.
# Each dictionary maps a kernel name to its list of candidate parameter values.
kernel_names_3 = ['sigmoid', 'gaussian']
kernel_type_3 = [
    dict(zip(kernel_names_3, grids))
    for grids in (
        ([0.2, 0.6], [0.3, 0.7]),
        ([0.4, 0.9], [0.5, 1]),
    )
]

## Global parameters

In [14]:
# One lock per outcome.
# NOTE(review): neither lock is acquired anywhere in the cells visible here.
lock_dementia, lock_cardio = Lock(), Lock()

# Kernel configurations collected in parallel lists: kernel_names[i] goes with
# kernel_types[i].
kernel_names = [
    kernel_names_0,
    kernel_names_1,
    kernel_names_2,
    kernel_names_3,
]
kernel_types = [
    kernel_type_0,
    kernel_type_1,
    kernel_type_2,
    kernel_type_3,
]

## Other shared parameters initialization

In [15]:
# Alignment estimator handed to the sampler and to the test routine.
estimator = ca.centeredKernelAlignment

# Handles of the background search threads; they are joined in the last cell.
threads = []

# Fold count passed to selectParam as n_splits.
valid_fold = 3

# One entry per dataset, in the order clinic / genetic / vampire. Each entry is a
# list of column indices forwarded to the sampler as `exclusion_list`.
# NOTE(review): presumably the columns skipped by preprocessing — confirm in mySampler.
exclusion_list = [[5,6,7,9,10,13,15,16,17,18,19], list(range(G.shape[1]-3)), []]

# Candidate penalty values for the L1 (sparsity) and L2 (lamb) searches.
# The last entry used to repeat 0.1, which evaluated the same value twice; 0.9
# continues the 0.2 step of the grid.
# NOTE(review): confirm 0.9 is the intended upper value.
l1_params = [0.1, 0.3, 0.5, 0.7, 0.9]
l2_params = [0.1, 0.3, 0.5, 0.7, 0.9]

## Thread

In [33]:
def executeKernels(sampler, estimator, penalty_type, parameter, ds_list, ds_test, y_d_, y_d_test, y_c_, y_c_test, valid_fold, exclusion_list, verbose, approach):
    """Run every kernel configuration on both outcomes and collect one score each.

    The configurations come from the module-level ``kernel_names`` and
    ``kernel_types`` lists, and dataset names from the module-level ``ds_names``.
    For each configuration the sampler is run first on the dementia labels and
    then on the cardio labels. After each run the voted weights are passed to
    ``ut.testConfigurations`` and the first element of the ``'CA'`` entry returned
    by ``performancesFeatures`` is stored.

    Parameters
    ----------
    sampler : object exposing ``sample(...)`` (built with ``ms.mySampler`` by the caller)
    estimator : callable forwarded to ``sampler.sample`` and ``ut.testConfigurations``
    penalty_type, parameter : forwarded unchanged to ``ut.testConfigurations``
    ds_list, ds_test : lists of train / test feature matrices, one per dataset
    y_d_, y_d_test : dementia labels (train / test)
    y_c_, y_c_test : cardio labels (train / test)
    valid_fold : forwarded to ``sampler.sample`` as ``valid_fold``
    exclusion_list : forwarded to ``sampler.sample`` as ``exclusion_list``
    verbose : forwarded to every helper call
    approach : unused here; kept for interface compatibility

    Returns
    -------
    numpy.ndarray
        1-D array of length ``2 * len(kernel_names)``. Index ``i`` holds the
        dementia value of configuration ``i``; index ``len(kernel_names) + i``
        holds the cardio value of the same configuration.
    """
    n_configs = len(kernel_names)
    results = np.empty(n_configs * 2)

    # (train labels, test labels, offset of this outcome's slice in `results`).
    # The offset is derived from the number of configurations rather than being
    # hard-coded, so adding or removing a configuration keeps the layout valid.
    outcomes = (
        (y_d_, y_d_test, 0),          # DEMENTIA
        (y_c_, y_c_test, n_configs),  # CARDIO
    )

    for idx, (k_names, k_type) in enumerate(zip(kernel_names, kernel_types)):
        for y_train, y_test, offset in outcomes:
            result = sampler.sample(k_type, estimator, ds_list, y_train, valid_fold=valid_fold, verbose=verbose, exclusion_list=exclusion_list)
            # Only the weight list is needed; the dictionary form is discarded.
            _, w_list = result.votingOverCA(ds_names, k_names)
            ut.testConfigurations(estimator, penalty_type, parameter, y_train, y_test, w_list, ds_list, ds_test, k_names, 'classification', verbose=verbose)
            outcome_dict = result.performancesFeatures(verbose=verbose)
            results[idx + offset] = outcome_dict['CA'][0]

    return results
    

In [None]:
def selectParam(params, penalty_type, train_set_list, train_label_c, train_label_d, estimator, approach, n_splits=3, centering=False, normalizing=False, normalize_kernels=False, exclusion_list=None, verbose=False):
    """Cross-validate a list of penalty values and report one mean score per value.

    For every stratified fold and every candidate in ``params`` a fresh
    ``ms.mySampler`` is built and ``executeKernels`` is run on the fold's
    train / validation parts. Scores are averaged over folds and then over all
    kernel-configuration / outcome slots, giving one number per candidate.

    Parameters
    ----------
    params : sequence of candidate penalty values
    penalty_type : ``'l1'`` (value passed as ``sparsity``) or ``'l2'`` (passed as ``lamb``)
    train_set_list : list of feature matrices sharing the same row order
    train_label_c : cardio labels aligned with the rows of the matrices
    train_label_d : dementia labels aligned with the rows of the matrices
    estimator : forwarded to ``executeKernels``
    approach : free-text label printed with the results
    n_splits : number of stratified folds, also forwarded as ``valid_fold``
    centering, normalizing, normalize_kernels : forwarded to ``ms.mySampler``
    exclusion_list : forwarded to ``executeKernels``
    verbose : forwarded to ``executeKernels``

    Returns
    -------
    numpy.ndarray
        Mean score per entry of ``params``, in the same order. The same values
        are printed.

    Raises
    ------
    ValueError
        If ``penalty_type`` is neither ``'l1'`` nor ``'l2'``.
    """
    # Name of the mySampler keyword that receives the candidate value.
    sampler_keyword = {'l1': 'sparsity', 'l2': 'lamb'}.get(penalty_type)
    if sampler_keyword is None:
        raise ValueError('Penalty type not set properly')

    skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
    n_params = len(params)
    n_kernels = len(kernel_names)*2 # *2 outcomes (dementia, cardio)

    results = np.zeros((n_params, n_kernels))

    # StratifiedKFold needs a single label vector; the dementia labels are used,
    # matching the notebook's hold-out split, which is also stratified on them.
    for tr_idx, val_idx in skf.split(train_set_list[0], train_label_d):
        tr_set_list = [X[tr_idx] for X in train_set_list]
        val_set_list = [X[val_idx] for X in train_set_list]
        tr_label_c = train_label_c[tr_idx]
        tr_label_d = train_label_d[tr_idx]
        val_label_c = train_label_c[val_idx]
        val_label_d = train_label_d[val_idx]

        for idx, param in enumerate(params):
            sampler = ms.mySampler(n_splits=3, test_size=.25, centering=centering, normalizing=normalizing, normalize_kernels=normalize_kernels, **{sampler_keyword: param})
            results[idx] += executeKernels(sampler, estimator, penalty_type, param, tr_set_list, val_set_list, tr_label_d, val_label_d, tr_label_c, val_label_c, n_splits, exclusion_list, verbose, approach)

    # Average over folds (get_n_splits is a method and has to be called).
    results /= skf.get_n_splits()
    # One score per candidate: mean over the kernel-configuration / outcome axis.
    avg_results = results.mean(axis=1)

    # Build the whole report first and print it once. The penalty type is part of
    # the header because the L1 and L2 runs share the same `approach` label.
    report_lines = ['{} [{}]'.format(approach, penalty_type)]
    report_lines.extend(
        '  {} = {}: {:.4f}'.format(sampler_keyword, param, score)
        for param, score in zip(params, avg_results)
    )
    print("\n".join(report_lines))

    return avg_results


## L2 Penalty, Original-Data Centering and Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, lamb = 0.5, centering = True, normalizing = True)

In [None]:
# Background L2 search: data centering and normalization, no kernel normalization.
t = Thread(
    target=selectParam,
    args=(l2_params, 'l2', ds_list, y_c_, y_d_, estimator, 'Centering - Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=True,
        normalize_kernels=False,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L1 Penalty, Original-Data Centering and Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.7, centering = True, normalizing = True)

In [None]:
# Background L1 search: data centering and normalization, no kernel normalization.
t = Thread(
    target=selectParam,
    args=(l1_params, 'l1', ds_list, y_c_, y_d_, estimator, 'Centering - Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=True,
        normalize_kernels=False,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L2 Penalty, Normalization, Kernel Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, lamb = 0.5, normalizing = True, normalize_kernels = True)

In [None]:
# Background L2 search: data normalization and kernel normalization, no centering.
t = Thread(
    target=selectParam,
    args=(l2_params, 'l2', ds_list, y_c_, y_d_, estimator, 'Normalizing - K Normalizing', valid_fold),
    kwargs=dict(
        centering=False,
        normalizing=True,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L1 Penalty, Normalization, Kernel Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.7, normalizing = True, normalize_kernels = True)

In [None]:
# Background L1 search: data normalization and kernel normalization, no centering.
t = Thread(
    target=selectParam,
    args=(l1_params, 'l1', ds_list, y_c_, y_d_, estimator, 'Normalizing - K Normalizing', valid_fold),
    kwargs=dict(
        centering=False,
        normalizing=True,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L2 Penalty, Centering, Normalization, Kernel Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, lamb = 0.5, centering = True, normalizing = True, normalize_kernels = True)

In [None]:
# Background L2 search: centering, data normalization and kernel normalization.
t = Thread(
    target=selectParam,
    args=(l2_params, 'l2', ds_list, y_c_, y_d_, estimator, 'Centering - Normalizing - K Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=True,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L1 Penalty, Centering, Normalization, Kernel Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.7, centering = True, normalizing = True, normalize_kernels = True)

In [None]:
# Background L1 search: centering, data normalization and kernel normalization.
t = Thread(
    target=selectParam,
    args=(l1_params, 'l1', ds_list, y_c_, y_d_, estimator, 'Centering - Normalizing - K Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=True,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L2 Penalty, Centering, K-Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=0.25, lamb = 0.5, centering = True, normalize_kernels = True)

In [None]:
# Background L2 search: centering and kernel normalization, no data normalization.
t = Thread(
    target=selectParam,
    args=(l2_params, 'l2', ds_list, y_c_, y_d_, estimator, 'Centering - K Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=False,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## L1 Penalty, Centering, K-Normalization

In [None]:
#sampler = ms.mySampler(n_splits=3, test_size=.25, sparsity = 0.7, centering = True, normalize_kernels = True)

In [None]:
# Background L1 search: centering and kernel normalization, no data normalization.
t = Thread(
    target=selectParam,
    args=(l1_params, 'l1', ds_list, y_c_, y_d_, estimator, 'Centering - K Normalizing', valid_fold),
    kwargs=dict(
        centering=True,
        normalizing=False,
        normalize_kernels=True,
        exclusion_list=exclusion_list,
        verbose=False,
    ),
)
t.start()
threads.append(t)

## Waiting

In [None]:
# Block until every background search registered in `threads` has finished,
# then report completion.
for worker in threads:
    worker.join()

print("Operations completed")