# Análise Exploratória


## Descrição
Neste notebook consta a análise exploratória da base de dados utilizada no projeto *Correlação de dados de imagens de RM e dados genéticos em paciente com Esclerose Lateral Amiotrófica* para a disciplina *Ciência e Visualização de Dados em Saúde* da Universidade Estadual de Campinas, Unicamp.

## Bibliotecas

In [1]:
# Import libraries
## Basic
import numpy as np
import scipy as sp
import pandas as pd
import random

## Graph
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from seaborn_qqplot import pplot

## Machine Learning
import statsmodels.api as sm
#from statsmodels.formula.api import ols
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Metrics
from sklearn.metrics import r2_score
#from scipy.stats import shapiro
from sklearn import metrics

## Inportação de Dados

In [2]:
# Import data
## Path to file
pathDTI = "../data/raw/DTI_MultAtlas.xlsx"
pathT1 = "../data/raw/freesurfer_stats_REGIONS_T1_ANALYSIS.xlsx"

## Sheets names
### DTI
faPath = 'FA'
l1Path = 'l1'
l2Path = 'l2'
l3Path = 'l3'
vlPath = 'volumeLabels'

### T1 sheets
volumePath = 'Volume';
gyriSulciLHPath = 'Gyri+Sulci LH';
gyriSulciRHPath = 'Gyri+Sulci RH';

## Read each excel sheet
### DTI data
faDataRaw = pd.read_excel(pathDTI, sheet_name = faPath)
l1DataRaw = pd.read_excel(pathDTI, sheet_name = l1Path)
l2DataRaw = pd.read_excel(pathDTI, sheet_name = l2Path)
l3DataRaw = pd.read_excel(pathDTI, sheet_name = l3Path)
vlDataRaw = pd.read_excel(pathDTI, sheet_name = vlPath)

### T1 data
volumeDataRaw = pd.read_excel(pathT1, sheet_name = volumePath, skiprows=[1])
gyriSulciLHDataRaw = pd.read_excel(pathT1, sheet_name = gyriSulciLHPath, skiprows = [1]);
gyriSulciRHDataRaw = pd.read_excel(pathT1, sheet_name = gyriSulciRHPath, skiprows = [1]);

vlDataRaw.head()

Unnamed: 0,ID/Labls,SUPERIOR PARIETAL GYRUS left (gm),CINGULATE GYRUS left (gm),SUPERIOR FRONTAL GYRUS left (gm),MIDDLE FRONTAL GYRUS left (gm),INFERIOR FRONTAL GYRUS left (gm),PRECENTRAL GYRUS left (gm),POSTCENTRAL GYRUS left (gm),ANGULAR GYRUS left (gm),PRE-CUNEUS left (gm),...,SLF-tLeft,SLFF-tRight,ICP-cerebellumLeft,ICP-cerebellumRight,CerebellumBranch-ALeft,CerebellumBranch-ARight,CerebellumBranch-BLeft,CerebellumBranch-BRight,CSF,Unused
0,c9o_02,12362,7084,16584,11484,13216,17806,8200,3140,1820,...,3888,2056,470,472,2828,2916,264,370,319946,94.0
1,c9o_03,7348,4466,10238,6452,7454,9370,4710,2136,1066,...,2822,1392,260,296,2082,1766,422,420,230842,
2,c9o_04,9602,7164,14782,11700,7882,13954,5848,2262,810,...,4228,1754,216,326,2382,2712,398,474,262602,40.0
3,c9o_05,9284,7274,13514,8108,9264,11830,6120,4064,1672,...,3732,1818,256,316,2292,2118,374,320,226770,4.0
4,c9o_06,8294,6598,10790,9498,7964,10112,5232,2158,1196,...,2306,1776,262,224,2006,1812,240,286,212792,2.0


## Sumário dos Dados

In [3]:
# Data Summary 2
## Data types
print("Dataset: FA")
print(faDataRaw.dtypes)

print("\n")

print("Dataset: L1")
print(l1DataRaw.dtypes)

print("\n")

print("Dataset: L2")
print(l2DataRaw.dtypes)

print("\n")

print("Dataset: L3")
print(l3DataRaw.dtypes)

print("\n")

print("Dataset: Volume Labels")
print(vlDataRaw.dtypes)

print("Dataset: Volume")
print(volumeDataRaw.dtypes)

print("Dataset: gyriSulciLH")
print(gyriSulciLHDataRaw.dtypes)

print("Dataset: gyriSulciRH")
print(gyriSulciRHDataRaw.dtypes)

Dataset: FA
ID/Labls                               object
SUPERIOR PARIETAL GYRUS left  (gm)    float64
CINGULATE GYRUS left  (gm)            float64
SUPERIOR FRONTAL GYRUS left (gm)      float64
MIDDLE FRONTAL GYRUS left (gm)        float64
                                       ...   
CerebellumBranch-ARight               float64
CerebellumBranch-BLeft                float64
CerebellumBranch-BRight               float64
CSF                                   float64
Unused                                float64
Length: 170, dtype: object


Dataset: L1
ID/Labls                               object
SUPERIOR PARIETAL GYRUS left  (gm)    float64
CINGULATE GYRUS left  (gm)            float64
SUPERIOR FRONTAL GYRUS left (gm)      float64
MIDDLE FRONTAL GYRUS left (gm)        float64
                                       ...   
CerebellumBranch-ARight               float64
CerebellumBranch-BLeft                float64
CerebellumBranch-BRight               float64
CSF                        

## Dimensões e Dados Faltantes

In [4]:
def dataShape(dataset):
    rows = dataset.shape[0]
    columns = dataset.shape[1]
    missing = dataset.isnull().sum().sum()
    
    return rows, columns, missing

In [5]:
faRows, faColumns, faMissing = dataShape(faDataRaw)
l1Rows, l1Columns, l1Missing = dataShape(l1DataRaw)
l2Rows, l2Columns, l2Missing = dataShape(l2DataRaw)
l3Rows, l3Columns, l3Missing = dataShape(l3DataRaw)
vlRows, vlColumns, vlMissing = dataShape(vlDataRaw)
volRows, volColumns, volMissing = dataShape(volumeDataRaw)
gslhRows, gslhColumns, gslhMissing = dataShape(gyriSulciLHDataRaw)
gsrhRows, gsrhColumns, gsrhMissing = dataShape(gyriSulciRHDataRaw)

In [6]:
# Summary
dtiDataSummary = pd.DataFrame({'Data' : ['FA', 'L1',  'L2', 'L3', 'volumeLabels'],
                              'Rows' : [faRows, l1Rows, l2Rows, l3Rows, vlRows], 
                              'Columns' : [faColumns, l1Columns, l2Columns, l3Columns, vlColumns],
                              'Missing' : [faMissing, l1Missing, l2Missing, l3Missing, vlMissing]});

t1DataSummary = pd.DataFrame({'Data' : ['volume', 'GS_LH',  'GS_RH'],
                              'Rows' : [volRows, gslhRows, gsrhRows], 
                              'Columns' : [volColumns, gslhColumns, gsrhColumns],
                              'Missing' : [volMissing, gslhMissing, gsrhMissing]});


print("Table 1: DTI Data Summary")
print(dtiDataSummary)
print('\n')
print("Table 2: T1 Data Summary")
print(t1DataSummary)

Table 1: DTI Data Summary
           Data  Rows  Columns  Missing
0            FA    87      170      322
1            L1    87      170      322
2            L2    87      170      322
3            L3    87      170      322
4  volumeLabels    87      170      322


Table 2: T1 Data Summary
     Data  Rows  Columns  Missing
0  volume    91       28        0
1   GS_LH    91       77        0
2   GS_RH    91       77        0


In [7]:
def getMissing(dataset):
    missingColumns = []
    
    print('Column\tMissing Values')
    for i in range(0, dataset.shape[1]):
        missing = dataset.iloc[:,i].isnull().sum()
        if(missing > 0):
            missingColumns.append(dataset.columns[i])
            print('%s\t%d'%(dataset.columns[i], missing))
            
    return missingColumns

In [8]:
print('Dataset: FA\n')
faMissingColumns = getMissing(faDataRaw)
print('\n=======================\n')
print('Dataset: L1\n')
l1MissingColumns = getMissing(l1DataRaw)
print('\n=======================\n')
print('Dataset: L2\n')
l2MissingColumns = getMissing(l2DataRaw)
print('\n=======================\n')
print('Dataset: L3\n')
l3MissingColumns = getMissing(l3DataRaw)
print('\n=======================\n')
print('Dataset: Volume Labels\n')
vlMissingColumns = getMissing(vlDataRaw)

Dataset: FA

Column	Missing Values
SuperiorFronto-occipitalFasciculusLeft	6
UncinateFasciculusLeft	1
Red Nucleus left	21
SuperiorFronto-occipitalFasciculusRight	1
Red Nucleus right	17
GLOBUS PALLIDUS right	1
Mammillary body right	75
Mammillary body left	21
Hypothalamus E left	76
Hypothalamus E right	86
LVL_occipitalRight	12
Unused	5


Dataset: L1

Column	Missing Values
SuperiorFronto-occipitalFasciculusLeft	6
UncinateFasciculusLeft	1
Red Nucleus left	21
SuperiorFronto-occipitalFasciculusRight	1
Red Nucleus right	17
GLOBUS PALLIDUS right	1
Mammillary body right	75
Mammillary body left	21
Hypothalamus E left	76
Hypothalamus E right	86
LVL_occipitalRight	12
Unused	5


Dataset: L2

Column	Missing Values
SuperiorFronto-occipitalFasciculusLeft	6
UncinateFasciculusLeft	1
Red Nucleus left	21
SuperiorFronto-occipitalFasciculusRight	1
Red Nucleus right	17
GLOBUS PALLIDUS right	1
Mammillary body right	75
Mammillary body left	21
Hypothalamus E left	76
Hypothalamus E right	86
LVL_occipitalRight	12


## Limpeza

1. Backup dos dados
2. Excluir colunas irrelevantes ("Unnamed 1" e "Unnamed 2")
3. Renomear primeira coluna de "Unnamed 0" para "subject"
4. Criação da coluna "als", discriminando pacientes com e sem ELA
5. Criaçào da coluna "group", discriminando pacientes do grupo de controle e diferentes tipos 
de ELA (ELAs, C9orf72 e VAPB)
6. União dos 3 dataset em um único dataset

In [9]:
def dataClean(dataset, missingColumns, suffix, dataType):
    # Backup Data
    data = dataset.copy(); 
    
    if(dataType == 'dti'):
        # Drop all columns with missing values
        data.drop(missingColumns, axis = 1, inplace = True);
    
        # Change first column name
        data.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);
    
        # Add sufix to each feature name
        for i in range(1, data.shape[1]):
            colName = data.columns[i] + '_' + suffix
            data.rename(columns = {data.columns[i] : colName}, inplace = True)
    
    if(dataType == 't1'):
        ## Drop unnamed columns
        data.drop(["Unnamed: 1", 'Unnamed: 2'], axis = 1, inplace = True);
        ## Rename first column
        data.rename(columns = {'Unnamed: 0' : 'subject'}, inplace = True);
    
    # Create two new features    
    ## New column: ALS
    ### Map Values
    #### Legend
    #### 0 = control
    #### 1 =  ALS confirmed
    data['als'] = 1
    data.loc[data['subject'].str.startswith('ctl'), 'als'] = 0
   
    ## New column: Group
    ### Map Values
    #### Legend
    #### 0 = control
    #### 1 = sporadic ALS
    #### 2 = c9o ALS
    #### 3 = vapb ALS
    data['group'] = 0
    data.loc[data['subject'].str.startswith('sals'), 'group'] = 1
    data.loc[data['subject'].str.startswith('c9o'), 'group'] = 2
    data.loc[data['subject'].str.startswith('vap'), 'group'] = 3
    
    return data

In [10]:
faData = dataClean(faDataRaw, faMissingColumns, 'fa', 'dti')
l1Data = dataClean(l1DataRaw, l1MissingColumns, 'l1', 'dti')
l2Data = dataClean(l2DataRaw, l2MissingColumns, 'l2', 'dti')
l3Data = dataClean(l3DataRaw, l3MissingColumns, 'l3', 'dti')
vlData = dataClean(vlDataRaw, vlMissingColumns, 'vl', 'dti')
volData = dataClean(volumeDataRaw, [], 'vol', 't1')
gslhData = dataClean(gyriSulciLHDataRaw, [], 'gslh', 't1')
gsrhData = dataClean(gyriSulciRHDataRaw, [], 'gsrh', 't1')

In [11]:
newMetricsMD = l1Data.copy()
newMetricsRD = l1Data.copy()

for i in range(1, newMetricsMD.shape[1]):
    if(newMetricsMD.columns[i] != 'als' and newMetricsMD.columns[i] != 'group'):
        colName = newMetricsMD.columns[i][0:-3] + '_md'
        newMetricsMD.rename(columns = {newMetricsMD.columns[i] : colName}, inplace = True)
        
        colName = newMetricsRD.columns[i][0:-3] + '_rd'     
        newMetricsRD.rename(columns = {newMetricsRD.columns[i] : colName}, inplace = True)


for i in range(0, l1Data.shape[0]):
    for j in range(1, l1Data.shape[1]):
        newMetricsMD.iloc[i, j] = (l1Data.iloc[i, j] + 
                                   l2Data.iloc[i, j] + 
                                   l3Data.iloc[i, j])/3
        
        newMetricsRD.iloc[i, j] = (l2Data.iloc[i, j] + 
                                   l3Data.iloc[i, j])/2

In [12]:
# Merge DTI all data into one dataframe
dtiData = faData.copy()
dtiData = dtiData.merge(l1Data, how = 'inner', on = ['subject', 'als', 'group'])
dtiData = dtiData.merge(newMetricsMD, how = 'inner', on = ['subject', 'als', 'group'])
dtiData = dtiData.merge(newMetricsRD, how = 'inner', on = ['subject', 'als', 'group'])
dtiData = dtiData.merge(vlData, how = 'inner', on = ['subject', 'als', 'group'])

# Merge T1 all data into one dataframe
t1Data = volData.copy()
t1Data = t1Data.merge(gslhData, how = 'inner', on = ['subject', 'als', 'group'])
t1Data = t1Data.merge(gsrhData, how = 'inner', on = ['subject', 'als', 'group'])

In [13]:
t1Copy = t1Data.copy()

found = False
for i in range(0, t1Copy.shape[0]):
    for j in range(0, dtiData.shape[0]):
        if(t1Copy['subject'][i] == dtiData['subject'][j]):
            found = True
    
    if(not(found)):
        t1Copy.drop(i, inplace = True)
        found = False
    found = False
t1Copy.reset_index(inplace = True)

allData = t1Copy.copy()
allData = allData.merge(dtiData, how = 'inner', on = ['subject', 'als', 'group'])
allData

Unnamed: 0,index,subject,#eTIV,Brain-Stem,CC_Anterior,CC_Central,CC_Mid_Anterior,CC_Mid_Posterior,CC_Posterior,Left-Accumbens-area,...,III and IV ventricle_vl,SLF-tLeft_vl,SLFF-tRight_vl,ICP-cerebellumLeft_vl,ICP-cerebellumRight_vl,CerebellumBranch-ALeft_vl,CerebellumBranch-ARight_vl,CerebellumBranch-BLeft_vl,CerebellumBranch-BRight_vl,CSF_vl
0,0,c9o_02,1.913195e+06,28663.3,1537.6,621.4,617.7,653.4,1164.1,538.5,...,6892,3888,2056,470,472,2828,2916,264,370,319946
1,1,c9o_03,1.156390e+06,17449.3,790.4,593.5,562.1,461.4,953.7,394.1,...,4226,2822,1392,260,296,2082,1766,422,420,230842
2,2,c9o_04,1.841581e+06,22053.6,1559.7,509.1,506.2,438.4,1264.5,455.6,...,8470,4228,1754,216,326,2382,2712,398,474,262602
3,3,c9o_05,1.406566e+06,18810.9,857.5,521.3,644.0,532.3,1006.9,467.0,...,5952,3732,1818,256,316,2292,2118,374,320,226770
4,4,c9o_06,1.191194e+06,20039.0,808.5,578.2,507.3,566.1,1086.1,400.6,...,5298,2306,1776,262,224,2006,1812,240,286,212792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,86,vap_23,1.322953e+06,17890.6,919.7,647.8,687.5,635.3,1119.6,470.8,...,5228,2658,1338,246,284,1746,1530,212,266,262232
83,87,vap_24,1.199364e+06,17381.8,846.1,482.6,604.1,509.2,1000.0,528.7,...,7624,2678,1156,252,350,1954,2218,280,308,229296
84,88,vap_25,1.468567e+06,17870.6,1043.6,503.1,805.1,672.8,1281.2,533.0,...,7222,3382,2042,280,390,2568,2404,276,276,243586
85,89,vap_26,1.548146e+06,19757.0,845.5,509.0,633.7,665.3,1135.4,513.3,...,6672,3108,2246,236,374,2550,2632,370,370,255380


## Treinamento do Modelo

In [14]:
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import feature_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [15]:
# Define Helper functions

# Compute confusion matrix, sensitivity and specificity
def results(model, x, y):
    yPred = model.predict(x) # Model predictions
    CM = confusion_matrix(y, yPred) # Compute confusion Matrix
    Sens = CM[1,1]/(y == 1).sum() # Calculate Sensitivity
    Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
    
    print('matriz de confusão = \n', CM)
    print('Sensibilidade = ', Sens)
    print('Especificidade = ', Spec)
    print('acc = ', np.sum(y == yPred)/yPred.size)
    
# Show result for the different datasets
def showResults(model, name, x, y, xTrain, xVal, xTest, yTrain, yVal, yTest):
    title = 'Modelo: ' + name
    print(title)

    print('Conjunto de Treino\n')
    results(model, xTrain, yTrain)
    
    print('\nConjunto de Validação\n')
    results(model, xVal, yVal)
    
    print('\nConjunto de Teste\n')
    results(model, xTest, yTest)
    
    #Calculate Cross Validation
    k = 100
    print('Cross validation (%d-Fold):'%(k))
    cv = ShuffleSplit(n_splits = k, test_size = 0.33, random_state = 5)
    scores = cross_val_score(model, x, y, cv=cv)
    print('Score = ', scores.mean())
    print('Std = ', scores.std())
    
def prepareData(dataset, target, zeroVariance = 0, pca = 0):
    x = dataset.drop(['subject', 'als', 'group'], axis=1)
    y = dataset[target]

    print('Initial Dimensions: ', x.shape)
    if(zeroVariance > 0):
        sel = feature_selection.VarianceThreshold(threshold = zeroVariance)
        x = sel.fit_transform(x)
        print('Post zeroVar: ', x.shape)
    if(pca > 0):
        x = PCA(pca).fit_transform(x)
        print('Post pca: ', x.shape)

    xTrain, xTmp, yTrain, yTmp = train_test_split(x, y, test_size = 0.6, random_state = 5)
    xVal, xTest, yVal, yTest = train_test_split(xTmp, yTmp, test_size = 0.5, random_state = 5)

    xTrain = sm.add_constant(xTrain)
    xVal = sm.add_constant(xVal)
    xTest = sm.add_constant(xTest)

    return x, y, xTrain, yTrain, xVal, yVal, xTest, yTest    

### Classificador ELA/Saudavel
#### Dados: Apenas DTI

In [16]:
filteredData = dtiData

x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'als', 0.1, 0.99)

clfAlsLR = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', max_iter = 5000))
clfAlsSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfAlsRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfAlsLR.fit(xTrain, yTrain);
clfAlsSVM.fit(xTrain, yTrain);
clfAlsRF.fit(xTrain, yTrain);

Initial Dimensions:  (87, 785)
Post zeroVar:  (87, 157)
Post pca:  (87, 18)


In [17]:
showResults(clfAlsLR, 'Logistic Regression', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Logistic Regression
Conjunto de Treino

matriz de confusão = 
 [[ 5  1]
 [ 1 27]]
Sensibilidade =  0.9642857142857143
Especificidade =  0.8333333333333334
acc =  0.9411764705882353

Conjunto de Validação

matriz de confusão = 
 [[ 1  6]
 [ 1 18]]
Sensibilidade =  0.9473684210526315
Especificidade =  0.14285714285714285
acc =  0.7307692307692307

Conjunto de Teste

matriz de confusão = 
 [[ 3  4]
 [ 3 17]]
Sensibilidade =  0.85
Especificidade =  0.42857142857142855
acc =  0.7407407407407407
Cross validation (100-Fold):
Score =  0.7417241379310344
Std =  0.07075186497291966


Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[ 3  3]
 [ 0 28]]
Sensibilidade =  1.0
Especificidade =  0.5
acc =  0.9117647058823529

Conjunto de Validação

matriz de confusão = 
 [[ 0  7]
 [ 0 19]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.7307692307692307

Conjunto de Teste

matriz de confusão = 
 [[ 0  7]
 [ 0 20]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.

#### Dados: Apenas T1

In [18]:
filteredData = t1Data

x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'als', 0.1)

clfAlsLR = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', max_iter = 5000))
clfAlsSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfAlsRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfAlsLR.fit(xTrain, yTrain);
clfAlsSVM.fit(xTrain, yTrain);
clfAlsRF.fit(xTrain, yTrain);

Initial Dimensions:  (91, 173)
Post zeroVar:  (91, 29)


In [19]:
showResults(clfAlsLR, 'Logistic Regression', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Logistic Regression
Conjunto de Treino

matriz de confusão = 
 [[ 4  2]
 [ 0 30]]
Sensibilidade =  1.0
Especificidade =  0.6666666666666666
acc =  0.9444444444444444

Conjunto de Validação

matriz de confusão = 
 [[ 2  9]
 [ 1 15]]
Sensibilidade =  0.9375
Especificidade =  0.18181818181818182
acc =  0.6296296296296297

Conjunto de Teste

matriz de confusão = 
 [[ 1  3]
 [ 4 20]]
Sensibilidade =  0.8333333333333334
Especificidade =  0.25
acc =  0.75
Cross validation (100-Fold):
Score =  0.7135483870967744
Std =  0.06488282883496582


Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[ 0  6]
 [ 0 30]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.8333333333333334

Conjunto de Validação

matriz de confusão = 
 [[ 0 11]
 [ 0 16]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.5925925925925926

Conjunto de Teste

matriz de confusão = 
 [[ 0  4]
 [ 0 24]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.8571428571428571
Cross validation (100-Fol

#### Dados: DTI e T1

In [20]:
filteredData = allData

x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'als', 0.1)

clfAlsLR = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', max_iter = 5000))
clfAlsSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfAlsRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfAlsLR.fit(xTrain, yTrain);
clfAlsSVM.fit(xTrain, yTrain);
clfAlsRF.fit(xTrain, yTrain);

Initial Dimensions:  (87, 959)
Post zeroVar:  (87, 187)


In [21]:
showResults(clfAlsLR, 'Logistic Regression', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfAlsRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Logistic Regression
Conjunto de Treino

matriz de confusão = 
 [[ 6  0]
 [ 0 28]]
Sensibilidade =  1.0
Especificidade =  1.0
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[ 3  4]
 [ 0 19]]
Sensibilidade =  1.0
Especificidade =  0.42857142857142855
acc =  0.8461538461538461

Conjunto de Teste

matriz de confusão = 
 [[ 2  5]
 [ 1 19]]
Sensibilidade =  0.95
Especificidade =  0.2857142857142857
acc =  0.7777777777777778
Cross validation (100-Fold):
Score =  0.840344827586207
Std =  0.0638918155194279


Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[ 4  2]
 [ 0 28]]
Sensibilidade =  1.0
Especificidade =  0.6666666666666666
acc =  0.9411764705882353

Conjunto de Validação

matriz de confusão = 
 [[ 0  7]
 [ 0 19]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.7307692307692307

Conjunto de Teste

matriz de confusão = 
 [[ 0  7]
 [ 0 20]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.7407407407407407
Cross validation (100-Fold):
Sc

### Classificador: Esporadico/C9o/VAPB
#### Dados:  Apenas DTI

In [22]:
filteredData = dtiData[dtiData['als'] == 1]
x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'group', 0.1)

clfGroupSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfGroupRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfGroupSVM.fit(xTrain, yTrain);
clfGroupRF.fit(xTrain, yTrain);

Initial Dimensions:  (67, 785)
Post zeroVar:  (67, 157)


In [23]:
showResults(clfGroupSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfGroupRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  5  0]
 [ 0  0 10]]
Sensibilidade =  0.45454545454545453
Especificidade =  inf
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[5 0 2]
 [2 0 2]
 [7 0 2]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.35

Conjunto de Teste

matriz de confusão = 
 [[9 0 2]
 [1 1 1]
 [4 0 3]]
Sensibilidade =  0.09090909090909091
Especificidade =  inf
acc =  0.6190476190476191
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.497391304347826
Std =  0.08482607581396312


Modelo: Random Forests
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  5  0]
 [ 0  0 10]]
Sensibilidade =  0.45454545454545453
Especificidade =  inf
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[5 1 1]
 [2 0 2]
 [5 0 4]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.45

Conjunto de Teste

matriz de confusão = 
 [[8 1 2]
 [1 1 1]
 [3 1 3]]
Sensibilidade =  0.09090909090909091
Especificidade =  inf
acc =  0.5714285714285714
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.508695652173913
Std =  0.08531050813195036


#### Dados:  Apenas T1

In [24]:
filteredData = t1Data[t1Data['als'] == 1]
x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'group', 0.1)

clfGroupSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfGroupRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfGroupSVM.fit(xTrain, yTrain);
clfGroupRF.fit(xTrain, yTrain);

Initial Dimensions:  (70, 173)
Post zeroVar:  (70, 28)


In [25]:
showResults(clfGroupSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfGroupRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  7  0]
 [ 1  0  9]]
Sensibilidade =  0.6363636363636364
Especificidade =  inf
acc =  0.9642857142857143

Conjunto de Validação

matriz de confusão = 
 [[7 0 0]
 [3 0 1]
 [8 2 0]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.3333333333333333

Conjunto de Teste

matriz de confusão = 
 [[7 4 1]
 [0 3 0]
 [4 0 2]]
Sensibilidade =  0.25
Especificidade =  inf
acc =  0.5714285714285714
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.4875
Std =  0.09973924336321519


Modelo: Random Forests
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  7  0]
 [ 0  0 10]]
Sensibilidade =  0.6363636363636364
Especificidade =  inf
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[5 0 2]
 [1 0 3]
 [6 2 2]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.3333333333333333

Conjunto de Teste

matriz de confusão = 
 [[7 3 2]
 [1 2 0]
 [5 0 1]]
Sensibilidade =  0.16666666666666666
Especificidade =  inf
acc =  0.47619047619047616
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.46791666666666665
Std =  0.10169654615570776


#### Dados:  DTI e T1

In [26]:
filteredData = allData[allData['als'] == 1]
x, y, xTrain, yTrain, xVal, yVal, xTest, yTest = prepareData(filteredData, 'group', 0.1)

clfGroupSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfGroupRF = make_pipeline(StandardScaler(), RandomForestClassifier())

clfGroupSVM.fit(xTrain, yTrain);
clfGroupRF.fit(xTrain, yTrain);

Initial Dimensions:  (67, 959)
Post zeroVar:  (67, 186)


In [27]:
showResults(clfGroupSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)
print('\n======================================================\n')
showResults(clfGroupRF, 'Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  5  0]
 [ 0  0 10]]
Sensibilidade =  0.45454545454545453
Especificidade =  inf
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[6 1 0]
 [2 0 2]
 [7 0 2]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.4

Conjunto de Teste

matriz de confusão = 
 [[8 1 2]
 [1 2 0]
 [4 0 3]]
Sensibilidade =  0.18181818181818182
Especificidade =  inf
acc =  0.6190476190476191
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.5182608695652173
Std =  0.08445980912409098


Modelo: Random Forests
Conjunto de Treino

matriz de confusão = 
 [[11  0  0]
 [ 0  5  0]
 [ 0  0 10]]
Sensibilidade =  0.45454545454545453
Especificidade =  inf
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[5 1 1]
 [2 0 2]
 [2 0 7]]
Sensibilidade =  0.0
Especificidade =  inf
acc =  0.6

Conjunto de Teste

matriz de confusão = 
 [[9 2 0]
 [1 2 0]
 [1 0 6]]
Sensibilidade =  0.18181818181818182
Especificidade =  inf
acc =  0.8095238095238095
Cross validation (100-Fold):


  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity
  Spec = CM[0,0]/(y == 0).sum() # Calculate Specificity


Score =  0.6895652173913044
Std =  0.09990922155629518


## Exportar Dados

In [28]:
# Interim
faData.to_csv('../data/interim/faData.csv', index=False)
l1Data.to_csv('../data/interim/l1Data.csv', index=False)
l2Data.to_csv('../data/interim/l2Data.csv', index=False)
l3Data.to_csv('../data/interim/l3Data.csv', index=False)
vlData.to_csv('../data/interim/vlData.csv', index=False)

# Processed
allData.to_csv('../data/processed/processedData.csv', index=False)