In [4]:
import pandas as pd
import os
import numpy as np

import Utils as ut
import CortesAlignmentFile as ca
import mySampler as ms
 
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import normalize

## Dataset Configuration

In [5]:
d_clinical = pd.read_csv(os.path.join('data', 'dataset_clinical_cleaned.csv'))
#d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
#d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(os.path.join('data', 'outputs_cleaned.csv'))

In [6]:
C = d_clinical.values
#G = d_genetic.values
#V = d_vampire.values

In [7]:
y_d = outputs["dement_fail"].values
y_c = outputs["cvd_fail"].values

In [8]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25).split(C, y_d))

In [9]:
C_ = C[tr_idx]
C_test = C[ts_idx]
#G_ = G[tr_idx]
#G_test = G[ts_idx]
#V_ = V[tr_idx]
#V_test = V[ts_idx]

In [10]:
y_d_ = y_d[tr_idx]
y_d_test = y_d[ts_idx]

y_c_ = y_c[tr_idx]
y_c_test = y_c[ts_idx]

In [11]:
ds_list = [C_]#, G_, V_]
ds_test = [C_test]#, G_test, V_test]
ds_names = ['clinic']#, 'genetic', 'vampire']

## Utilities

In [12]:
def centering_normalizing(train, test, exclusion_list = None):
    
    if exclusion_list is not None:
        scale, train = ut.centering_normalizing(train, exclusion_list)
    else:
        scale, train = ut.centering_normalizing(train)

    new_Xts = test-scale

    if exclusion_list is not None:
        new_Xts[:, exclusion_list] = test[:, exclusion_list]

    return train, new_Xts, scale


def normalizing(train, test):
    return normalize(train), normalize(test)

def make_regression(X, y, sparsity):

    scores = np.zeros(len(sparsity))
    
    for idx, sp in enumerate(sparsity):
        model = LogisticRegression(penalty = 'l1', C = sp)
        scores[idx] = np.mean(cross_validate(model,  X, y, return_train_score=False, cv=3)['test_score'])
        
    return sparsity[np.argmax(scores)]


def learn(C_, C_test, y, y_test, sparsity, centering = False, norm = False):
    
    if centering == True:
        C_, C_test, _ = centering_normalizing(C_, C_test)#, [5,6,7,9,10,13,15,16,17,18,19])

    if norm == True:
         C_, C_test = normalizing(C_, C_test)

    best_alpha = make_regression(C_, y, sparsity)
    model = LogisticRegression(penalty = 'l1', C = best_alpha)
    model.fit(C_, y)
    y_pred = model.predict(C_test)
    coef = model.coef_
    
    accuracy = ut.balanced_accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print("\tAccuracy: {}".format(accuracy))
    print("\tPrecision: {}".format(precision))
    print("\tRecall: {}".format(recall))
    print("Coef: {}".format(coef))
    print("BestLambda: {}".format(best_alpha))

In [13]:
sparsity = list(np.arange(0.1, 2, 0.2))

## Origin Data Centering

### Dementia

In [18]:
learn(C_, C_test, y_d_, y_d_test, sparsity, centering = True)

	Accuracy: 0.6715945637514265
	Precision: 0.27575757575757576
	Recall: 0.7647058823529411
Coef: [[ 0.          0.          0.         -0.34944809  0.47924946 -4.33491868
   0.          0.          1.68668473  0.          0.          0.
   0.          0.         -0.1756032   0.          0.          0.
   0.          0.          0.        ]]
BestLambda: 0.5000000000000001


### Cardio

In [19]:
learn(C_, C_test, y_c_, y_c_test, sparsity, centering = True)

	Accuracy: 0.6411265609438443
	Precision: 0.4723127035830619
	Recall: 0.6359649122807017
Coef: [[ 0.24541592  0.          2.20349362 -0.75647233  0.23339294  0.52580679
   0.          0.          0.71058628  2.84110754 -1.29468123  2.3778646
  -3.18673149  8.22266713  0.         -0.09714606  0.          0.50942805
   0.81592509  0.          1.18512638]]
BestLambda: 1.9000000000000004


## Origin Data  Centering and Normalization

### Dementia

In [20]:
learn(C_, C_test, y_d_, y_d_test, sparsity, centering = True, norm = True)

	Accuracy: 0.5048760244838676
	Precision: 0.3333333333333333
	Recall: 0.01680672268907563
Coef: [[ 0.          0.          0.         -0.34940619  0.47929952 -4.3346685
   0.          0.          1.68661418  0.          0.          0.
   0.          0.         -0.17567118  0.          0.          0.
   0.          0.          0.        ]]
BestLambda: 0.5000000000000001


### Cardio

In [21]:
learn(C_, C_test, y_c_, y_c_test, sparsity, centering = True, norm = True)

	Accuracy: 0.5846452922699763
	Precision: 0.5754716981132075
	Recall: 0.2675438596491228
Coef: [[ 0.24295029  0.          2.20323597 -0.75412797  0.23335976  0.52585566
   0.          0.          0.71051908  2.84089014 -1.29482284  2.3777739
  -3.18657635  8.22248962  0.         -0.10014134  0.          0.51144475
   0.82023147  0.          1.18526461]]
BestLambda: 1.9000000000000004
