In [1]:
import pandas as pd
import os
import numpy as np

import Utils as ut
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize

from sklearn.metrics import precision_score, recall_score, accuracy_score

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, cross_validate

In [2]:
d_clinical = pd.read_csv(os.path.join('data', 'dataset_clinical_cleaned.csv'))
#d_genetic = pd.read_csv(os.path.join('data', 'dataset_genetic_cleaned_noOHE.csv'))
#d_vampire = pd.read_csv(os.path.join('data', 'dataset_vampire_cleaned.csv'))
outputs = pd.read_csv(os.path.join('data', 'outputs_cleaned.csv'))

In [3]:
y_class = outputs["dement_fail"].values
meaningful_idxs = np.where(y_class==1)
y = outputs["dement_time_age"].values[meaningful_idxs]
C = d_clinical.values[meaningful_idxs]
#G = d_genetic.values[meaningful_idxs]
#V = d_vampire.values[meaningful_idxs]

## Dementia

In [4]:
y = outputs["dement_fail"].values
C = d_clinical.values
#G = d_genetic.values
#V = d_vampire.values

In [5]:
# COMPUTATIONAL COMPLEXITY: Reduce #samples
tr_idx, ts_idx = next(StratifiedShuffleSplit(n_splits=1, test_size=0.25).split(C, y))

In [6]:
y_ = y[tr_idx]
y_test = y[ts_idx]
C_ = C[tr_idx]
C_test = C[ts_idx]
#G_ = G[tr_idx]
#G_test = G[ts_idx]
#V_ = V[tr_idx]
#V_test = V[ts_idx]

In [7]:
ds_list = [C_]#, G_, V_]
ds_test = [C_test]#, G_test, V_test]
ds_names = ['clinic']#, 'genetic', 'vampire']

In [8]:
def centering_normalizing(train, test, exclusion_list = None):
    
    if exclusion_list is not None:
        scale, train = ut.centering_normalizing(train, exclusion_list)
    else:
        scale, train = ut.centering_normalizing(train)

    new_Xts = test-scale

    if exclusion_list is not None:
        new_Xts[:, exclusion_list] = test[:, exclusion_list]

    return train, new_Xts, scale

def normalizing(train, test):
    return normalize(train), normalize(test)

In [9]:
def make_regression(X, y, sparsity):

    scores = np.zeros(len(sparsity))
    
    for idx, sp in enumerate(sparsity):
        model = LogisticRegression(penalty = 'l1', C = sp)
        scores[idx] = np.mean(cross_validate(model,  X, y, return_train_score=False, cv=3)['test_score'])
        
    return scores[np.argmax(scores)]

In [10]:
def learn(C_, C_test, sparsity, centering = False, norm = False):
    if centering == True:
        C_, C_test, _ = centering_normalizing(C_, C_test)#, [5,6,7,9,10,13,15,16,17,18,19])

    if norm == True:
         C_, C_test = normalizing(C_, C_test)

    best_alpha = make_regression(C_, y_, sparsity)
    model = LogisticRegression(penalty = 'l1', C = best_alpha)
    model.fit(C_, y_)
    y_pred = model.predict(C_test)
    coef = model.coef_
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print("\tAccuracy: {}".format(accuracy))
    print("\tPrecision: {}".format(precision))
    print("\tRecall: {}".format(recall))
    print("Coef: {}".format(coef))
    print("BestLambda: {}".format(best_alpha))


## Basic approach

In [11]:
sparsity = [0.01, 0.03, 0.05, 0.07, 0.1, 0.12]
learn(C_, C_test, sparsity)

	Accuracy: 0.8309037900874635
	Precision: 0.6363636363636364
	Recall: 0.058823529411764705
Coef: [[-0.01398715 -0.00408849  0.07218938 -0.01218718  0.00552691  0.
   0.28523127  0.50500396  0.07532933 -0.045247   -0.04712868  0.02252231
   0.         -0.08172616 -0.00475526  0.07420523  0.          0.044142
   0.09311077  0.04901335 -0.08055445]]
BestLambda: 0.8248175182481751


## Normalized data

In [12]:
parsity = [0.01, 0.03, 0.05, 0.07, 0.1, 0.12]
learn(C_, C_test, sparsity, norm = True)

	Accuracy: 0.8250728862973761
	Precision: 0.0
	Recall: 0.0
Coef: [[-1.80695001  0.          0.         -3.90260001  0.49196985  0.
   0.          0.         11.76514929  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]]
BestLambda: 0.8262773722627738


## Origin Data Centering

In [13]:
sparsity = [0.01, 0.03, 0.05, 0.07, 0.1, 0.12]
learn(C_, C_test, sparsity, centering = True)

	Accuracy: 0.5860058309037901
	Precision: 0.27146814404432135
	Recall: 0.8235294117647058
Coef: [[-0.01246406 -0.21571939  0.         -0.37966948  0.45482668 -4.2677673
   0.          0.          1.66446389  0.          0.          0.
   0.          0.         -0.02592585  0.          0.          0.
   0.          0.          0.        ]]
BestLambda: 0.8262773722627738


## Origin Data  Centering and Normalization

In [14]:
sparsity = [0.01, 0.03, 0.05, 0.07, 0.1, 0.12]
learn(C_, C_test, sparsity, centering = True, norm = True)

	Accuracy: 0.8279883381924198
	Precision: 0.6
	Recall: 0.025210084033613446
Coef: [[-0.03237277 -0.19602707  0.         -0.35980584  0.45490152 -4.26837403
   0.          0.          1.66447739  0.          0.          0.
   0.          0.         -0.0455439   0.          0.          0.
   0.          0.          0.        ]]
BestLambda: 0.8262773722627738
