# Análise Exploratória


## Descrição
Neste notebook consta a análise exploratória da base de dados utilizada no projeto *Correlação de dados de imagens de RM e dados genéticos em paciente com Esclerose Lateral Amiotrófica* para a disciplina *Ciência e Visualização de Dados em Saúde* da Universidade Estadual de Campinas, Unicamp.

## Bibliotecas

In [107]:
# Import libraries
## Basic
import numpy as np
import scipy as sp
import pandas as pd
import random

## Graph
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from seaborn_qqplot import pplot

## Machine Learning
import statsmodels.api as sm
#from statsmodels.formula.api import ols
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Metrics
from sklearn.metrics import r2_score
#from scipy.stats import shapiro
from sklearn import metrics

## Inportação de Dados

In [108]:
# Import data
## Path to file
path = "../data/raw/DTI_MultAtlas.xlsx"

## Sheets names
faPath = 'FA'
l1Path = 'l1'
l2Path = 'l2'
l3Path = 'l3'
volumeLabelsPath = 'volumeLabels'

## Read each excel sheet
faDataRaw = pd.read_excel(path, sheet_name = faPath)
l1DataRaw = pd.read_excel(path, sheet_name = l1Path)
l2DataRaw = pd.read_excel(path, sheet_name = l2Path)
l3DataRaw = pd.read_excel(path, sheet_name = l3Path)
volumeLabelsDataRaw = pd.read_excel(path, sheet_name = volumeLabelsPath)
volumeLabelsDataRaw.head()

Unnamed: 0,ID/Labls,SUPERIOR PARIETAL GYRUS left (gm),CINGULATE GYRUS left (gm),SUPERIOR FRONTAL GYRUS left (gm),MIDDLE FRONTAL GYRUS left (gm),INFERIOR FRONTAL GYRUS left (gm),PRECENTRAL GYRUS left (gm),POSTCENTRAL GYRUS left (gm),ANGULAR GYRUS left (gm),PRE-CUNEUS left (gm),...,SLF-tLeft,SLFF-tRight,ICP-cerebellumLeft,ICP-cerebellumRight,CerebellumBranch-ALeft,CerebellumBranch-ARight,CerebellumBranch-BLeft,CerebellumBranch-BRight,CSF,Unused
0,c9o_02,12362,7084,16584,11484,13216,17806,8200,3140,1820,...,3888,2056,470,472,2828,2916,264,370,319946,94.0
1,c9o_03,7348,4466,10238,6452,7454,9370,4710,2136,1066,...,2822,1392,260,296,2082,1766,422,420,230842,
2,c9o_04,9602,7164,14782,11700,7882,13954,5848,2262,810,...,4228,1754,216,326,2382,2712,398,474,262602,40.0
3,c9o_05,9284,7274,13514,8108,9264,11830,6120,4064,1672,...,3732,1818,256,316,2292,2118,374,320,226770,4.0
4,c9o_06,8294,6598,10790,9498,7964,10112,5232,2158,1196,...,2306,1776,262,224,2006,1812,240,286,212792,2.0


## Sumário dos Dados

In [109]:
# Data Summary 2
## Data types
print("Dataset: FA")
print(faDataRaw.dtypes)

print("\n")

print("Dataset: L1")
print(l1DataRaw.dtypes)

print("\n")

print("Dataset: L2")
print(l2DataRaw.dtypes)

print("\n")

print("Dataset: L3")
print(l3DataRaw.dtypes)

print("\n")

print("Dataset: Volume Labels")
print(volumeLabelsDataRaw.dtypes)

Dataset: FA
ID/Labls                               object
SUPERIOR PARIETAL GYRUS left  (gm)    float64
CINGULATE GYRUS left  (gm)            float64
SUPERIOR FRONTAL GYRUS left (gm)      float64
MIDDLE FRONTAL GYRUS left (gm)        float64
                                       ...   
CerebellumBranch-ARight               float64
CerebellumBranch-BLeft                float64
CerebellumBranch-BRight               float64
CSF                                   float64
Unused                                float64
Length: 170, dtype: object


Dataset: L1
ID/Labls                               object
SUPERIOR PARIETAL GYRUS left  (gm)    float64
CINGULATE GYRUS left  (gm)            float64
SUPERIOR FRONTAL GYRUS left (gm)      float64
MIDDLE FRONTAL GYRUS left (gm)        float64
                                       ...   
CerebellumBranch-ARight               float64
CerebellumBranch-BLeft                float64
CerebellumBranch-BRight               float64
CSF                        

In [110]:
# Data Summary 2
## Summarry Statistics
print("Summary: FA")
print(faDataRaw.describe().transpose())

print("\n")

print("Summary: L1")
print(l1DataRaw.describe().transpose())

print("\n")

print("Summary: L2")
print(l2DataRaw.describe().transpose())

print("\n")

print("Summary: L3")
print(l3DataRaw.describe().transpose())

print("\n")

print("Summary: L3")
print(volumeLabelsDataRaw.describe().transpose())

Summary: FA
                                    count      mean       std       min  \
SUPERIOR PARIETAL GYRUS left  (gm)   87.0  0.444349  0.020059  0.391171   
CINGULATE GYRUS left  (gm)           87.0  0.341726  0.008848  0.316205   
SUPERIOR FRONTAL GYRUS left (gm)     87.0  0.387573  0.016413  0.341545   
MIDDLE FRONTAL GYRUS left (gm)       87.0  0.379601  0.014524  0.342091   
INFERIOR FRONTAL GYRUS left (gm)     87.0  0.420632  0.020671  0.344214   
...                                   ...       ...       ...       ...   
CerebellumBranch-ARight              87.0  0.370548  0.029269  0.313544   
CerebellumBranch-BLeft               87.0  0.381883  0.033387  0.306729   
CerebellumBranch-BRight              87.0  0.384852  0.035752  0.310387   
CSF                                  87.0  0.150976  0.012091  0.122123   
Unused                               82.0  0.224499  0.059865  0.093848   

                                         25%       50%       75%       max  
SUPERIOR P

## Dimensões e Dados Faltantes

In [111]:
# Data shape and missing values

## FA
faRows = faDataRaw.shape[0];
faColumns = faDataRaw.shape[1];
faMissing = faDataRaw.isnull().sum().sum();

## L1
l1Rows = l1DataRaw.shape[0];
l1Columns = l1DataRaw.shape[1];
l1Missing = l1DataRaw.isnull().sum().sum();

## L2
l2Rows = l2DataRaw.shape[0];
l2Columns = l2DataRaw.shape[1];
l2Missing = l2DataRaw.isnull().sum().sum();

## L3
l3Rows = l3DataRaw.shape[0];
l3Columns = l3DataRaw.shape[1];
l3Missing = l3DataRaw.isnull().sum().sum();

## L3
volumeLabelsRows = volumeLabelsDataRaw.shape[0];
volumeLabelsColumns = volumeLabelsDataRaw.shape[1];
volumeLabelsMissing = volumeLabelsDataRaw.isnull().sum().sum();

In [112]:
# Summary
dataSummary = pd.DataFrame({'Data' : ['FA', 'L1',  'L2', 'L3', 'volumeLabels'],
                              'Rows' : [faRows, l1Rows, l2Rows, l3Rows, volumeLabelsRows], 
                              'Columns' : [faColumns, l1Columns, l2Columns, l3Columns, volumeLabelsColumns],
                              'Missing' : [faMissing, l1Missing, l2Missing, l3Missing, volumeLabelsMissing]});
print("Table 1: Data Summary")
print(dataSummary)

Table 1: Data Summary
           Data  Rows  Columns  Missing
0            FA    87      170      322
1            L1    87      170      322
2            L2    87      170      322
3            L3    87      170      322
4  volumeLabels    87      170      322


In [113]:
faMissingColumns = []
l1MissingColumns = []
l2MissingColumns = []
l3MissingColumns = []
volumeLabelsMissingColumns = []

for i in range(0, faDataRaw.shape[1]):
    missing = faDataRaw.iloc[:,i].isnull().sum()
    if(missing > 0):
        faMissingColumns.append(faDataRaw.columns[i])

for i in range(0, l1DataRaw.shape[1]):
    missing = l1DataRaw.iloc[:,i].isnull().sum()
    if(missing > 0):
        l1MissingColumns.append(l1DataRaw.columns[i])

for i in range(0, l2DataRaw.shape[1]):
    missing = l2DataRaw.iloc[:,i].isnull().sum()
    if(missing > 0):
        l2MissingColumns.append(l2DataRaw.columns[i])

for i in range(0, l3DataRaw.shape[1]):
    missing = l3DataRaw.iloc[:,i].isnull().sum()
    if(missing > 0):
        l3MissingColumns.append(l3DataRaw.columns[i])
        
for i in range(0, volumeLabelsDataRaw.shape[1]):
    missing = volumeLabelsDataRaw.iloc[:,i].isnull().sum()
    if(missing > 0):
        volumeLabelsMissingColumns.append(volumeLabelsDataRaw.columns[i])

## Limpeza

1. Backup dos dados
2. Excluir colunas irrelevantes ("Unnamed 1" e "Unnamed 2")
3. Renomear primeira coluna de "Unnamed 0" para "subject"
4. Criação da coluna "als", discriminando pacientes com e sem ELA
5. Criaçào da coluna "group", discriminando pacientes do grupo de controle e diferentes tipos 
de ELA (ELAs, C9orf72 e VAPB)
6. União dos 3 dataset em um único dataset

In [114]:
# Cleaning Data
## Copy dataframes and drop first row
faData = pd.DataFrame(faDataRaw);
l1Data = pd.DataFrame(l1DataRaw);
l2Data = pd.DataFrame(l2DataRaw);
l3Data = pd.DataFrame(l3DataRaw);
volumeLabelsData = pd.DataFrame(volumeLabelsDataRaw);

## Drop unnamed columns
faData.drop(faMissingColumns, axis = 1, inplace = True);
l1Data.drop(l1MissingColumns, axis = 1, inplace = True);
l2Data.drop(l2MissingColumns, axis = 1, inplace = True);
l3Data.drop(l3MissingColumns, axis = 1, inplace = True);
volumeLabelsData.drop(volumeLabelsMissingColumns, axis = 1, inplace = True);

## Rename first column
faData.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);
l1Data.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);
l2Data.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);
l3Data.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);
volumeLabelsData.rename(columns = {'ID/Labls' : 'subject'}, inplace = True);

for i in range(1, faData.shape[1]):
    faName = faDataRaw.columns[i] + '_fa'
    l1Name = faDataRaw.columns[i] + '_l1'
    l2Name = faDataRaw.columns[i] + '_l2'
    l3Name = faDataRaw.columns[i] + '_l3'
    volumeLabelsName = faDataRaw.columns[i] + '_volumeLabels'
    
    faData.rename(columns = {faDataRaw.columns[i] : faName}, inplace = True);
    l1Data.rename(columns = {l1DataRaw.columns[i] : l1Name}, inplace = True);
    l2Data.rename(columns = {l2DataRaw.columns[i] : l2Name}, inplace = True);
    l3Data.rename(columns = {l3DataRaw.columns[i] : l3Name}, inplace = True);
    volumeLabelsData.rename(columns = {volumeLabelsDataRaw.columns[i] : volumeLabelsName}, inplace = True);


## New column: ALS
### Map Values
#### Legend
#### 0 = control
#### 1 =  ALS confirmed
faData['als'] = 1
l1Data['als'] = 1
l2Data['als'] = 1
l3Data['als'] = 1
volumeLabelsData['als'] = 1
faData.loc[faData['subject'].str.startswith('ctl'), 'als'] = 0
l1Data.loc[l1Data['subject'].str.startswith('ctl'), 'als'] = 0
l2Data.loc[l2Data['subject'].str.startswith('ctl'), 'als'] = 0
l3Data.loc[l3Data['subject'].str.startswith('ctl'), 'als'] = 0
volumeLabelsData.loc[volumeLabelsData['subject'].str.startswith('ctl'), 'als'] = 0

## New column: Group
faData['group'] = 0
l1Data['group'] = 0
l2Data['group'] = 0
l3Data['group'] = 0
volumeLabelsData['group'] = 0


## Group: Map Values
### Legend
### 0 = control
### 1 = sporadic ALS
faData.loc[faData['subject'].str.startswith('sals'), 'group'] = 1
l1Data.loc[l1Data['subject'].str.startswith('sals'), 'group'] = 1
l2Data.loc[l2Data['subject'].str.startswith('sals'), 'group'] = 1
l3Data.loc[l3Data['subject'].str.startswith('sals'), 'group'] = 1
volumeLabelsData.loc[volumeLabelsData['subject'].str.startswith('sals'), 'group'] = 1

### 2 = c9o ALS
faData.loc[faData['subject'].str.startswith('c9o'), 'group'] = 2
l1Data.loc[l1Data['subject'].str.startswith('c9o'), 'group'] = 2
l2Data.loc[l2Data['subject'].str.startswith('c9o'), 'group'] = 2
l3Data.loc[l3Data['subject'].str.startswith('c9o'), 'group'] = 2
volumeLabelsData.loc[volumeLabelsData['subject'].str.startswith('c9o'), 'group'] = 2

### 3 = vapb ALS
faData.loc[faData['subject'].str.startswith('vap'), 'group'] = 3
l1Data.loc[l1Data['subject'].str.startswith('vap'), 'group'] = 3
l2Data.loc[l2Data['subject'].str.startswith('vap'), 'group'] = 3
l3Data.loc[l3Data['subject'].str.startswith('vap'), 'group'] = 3
volumeLabelsData.loc[volumeLabelsData['subject'].str.startswith('vap'), 'group'] = 3

In [115]:
# Merge all data into one dataframe
allData = pd.DataFrame(faData)
allData = allData.merge(l1Data, how = 'inner', on = ['subject', 'als', 'group'])
allData = allData.merge(l2Data, how = 'inner', on = ['subject', 'als', 'group'])
allData = allData.merge(l3Data, how = 'inner', on = ['subject', 'als', 'group'])
allData = allData.merge(volumeLabelsData, how = 'inner', on = ['subject', 'als', 'group'])
allData

Unnamed: 0,subject,SUPERIOR PARIETAL GYRUS left (gm)_fa,CINGULATE GYRUS left (gm)_fa,SUPERIOR FRONTAL GYRUS left (gm)_fa,MIDDLE FRONTAL GYRUS left (gm)_fa,INFERIOR FRONTAL GYRUS left (gm)_fa,PRECENTRAL GYRUS left (gm)_fa,POSTCENTRAL GYRUS left (gm)_fa,ANGULAR GYRUS left (gm)_fa,PRE-CUNEUS left (gm)_fa,...,III and IV ventricle,SLF-tLeft,SLFF-tRight,ICP-cerebellumLeft,ICP-cerebellumRight,CerebellumBranch-ALeft,CerebellumBranch-ARight,CerebellumBranch-BLeft,CerebellumBranch-BRight,CSF
0,c9o_02,0.436139,0.332152,0.378837,0.374929,0.412669,0.403631,0.387549,0.381594,0.337111,...,6892,3888,2056,470,472,2828,2916,264,370,319946
1,c9o_03,0.464681,0.343998,0.394399,0.365355,0.418513,0.428737,0.387495,0.415324,0.345184,...,4226,2822,1392,260,296,2082,1766,422,420,230842
2,c9o_04,0.417323,0.337455,0.358907,0.364955,0.402419,0.393287,0.382468,0.381495,0.325173,...,8470,4228,1754,216,326,2382,2712,398,474,262602
3,c9o_05,0.419740,0.342843,0.376812,0.370663,0.408628,0.434609,0.391832,0.397391,0.321499,...,5952,3732,1818,256,316,2292,2118,374,320,226770
4,c9o_06,0.434883,0.340932,0.374331,0.384086,0.424424,0.416647,0.386945,0.404603,0.342649,...,5298,2306,1776,262,224,2006,1812,240,286,212792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,vap_23,0.455010,0.349545,0.386440,0.387335,0.423370,0.452815,0.425148,0.425813,0.360319,...,5228,2658,1338,246,284,1746,1530,212,266,262232
83,vap_24,0.437136,0.346213,0.385347,0.381680,0.428368,0.447692,0.387844,0.401134,0.347650,...,7624,2678,1156,252,350,1954,2218,280,308,229296
84,vap_25,0.466444,0.351894,0.413071,0.402986,0.453248,0.473965,0.428378,0.430933,0.355090,...,7222,3382,2042,280,390,2568,2404,276,276,243586
85,vap_26,0.444580,0.345025,0.387747,0.379145,0.426062,0.449424,0.402729,0.402317,0.344495,...,6672,3108,2246,236,374,2550,2632,370,370,255380


## Treinamento do Modelo

In [220]:
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import feature_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [338]:
def results(model, x, y):
    yPred = model.predict(x)
    CM = confusion_matrix(y, yPred)
    Sens = CM[1,1]/(y == 1).sum()
    Espe = CM[0,0]/(y == 0).sum()
    
    print('matriz de confusão = \n', CM)
    print('Sensibilidade = ', Sens)
    print('Especificidade = ', Espe)
    print('acc = ', np.sum(y == yPred)/yPred.size)
    
    
def showResults(model, name, x, y, xTrain, xVal, xTest, yTrain, yVal, yTest):
    title = 'Modelo: ' + name
    print(title)

    print('Conjunto de Treino\n')
    results(model, xTrain, yTrain)
    
    print('\nConjunto de Validação\n')
    results(model, xVal, yVal)
    
    print('\nConjunto de Teste\n')
    results(model, xTest, yTest)
    
    print('Crossvalidation (100-Fold):')
    cv = ShuffleSplit(n_splits = 100, test_size = 0.5, random_state = 5)
    scores = cross_val_score(model, x, y, cv=cv)
    print('Score = ', scores.mean())
    print('Std = ', scores.std())


In [357]:
#filteredData = allData[allData['als'] == 1]
filteredData = allData

x = filteredData.drop(['subject', 'als', 'group'], axis=1)
y = filteredData['als']

print(x.shape)
sel = feature_selection.VarianceThreshold(threshold = 0.1)
x = sel.fit_transform(x)
x = PCA(0.99).fit_transform(x)
print(x.shape)

xTrain, xTmp, yTrain, yTmp = train_test_split(x, y, test_size = 0.5, random_state = 5)
xVal, xTest, yVal, yTest = train_test_split(xTmp, yTmp, test_size = 0.33, random_state = 5)

xTrain = sm.add_constant(xTrain)
xVal = sm.add_constant(xVal)
xTest = sm.add_constant(xTest)
#cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

(87, 785)
(87, 18)


In [358]:
clfLR = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2', max_iter = 5000))
clfSVM = make_pipeline(StandardScaler(), svm.SVC(gamma='auto', kernel='rbf'))
clfRF = make_pipeline(StandardScaler(), RandomForestClassifier())

In [359]:
clfLR.fit(xTrain, yTrain);
clfSVM.fit(xTrain, yTrain);
clfRF.fit(xTrain, yTrain);

In [360]:
showResults(clfLR, 'Logistic Regression', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Logistic Regression
Conjunto de Treino

matriz de confusão = 
 [[ 4  4]
 [ 1 34]]
Sensibilidade =  0.9714285714285714
Especificidade =  0.5
acc =  0.8837209302325582

Conjunto de Validação

matriz de confusão = 
 [[ 2  8]
 [ 1 18]]
Sensibilidade =  0.9473684210526315
Especificidade =  0.2
acc =  0.6896551724137931

Conjunto de Teste

matriz de confusão = 
 [[ 1  1]
 [ 2 11]]
Sensibilidade =  0.8461538461538461
Especificidade =  0.5
acc =  0.8
Crossvalidation (100-Fold):
Score =  0.7193181818181817
Std =  0.06407116072277477


In [361]:
showResults(clfSVM, 'Support Vector Machine', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Support Vector Machine
Conjunto de Treino

matriz de confusão = 
 [[ 1  7]
 [ 0 35]]
Sensibilidade =  1.0
Especificidade =  0.125
acc =  0.8372093023255814

Conjunto de Validação

matriz de confusão = 
 [[ 0 10]
 [ 0 19]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.6551724137931034

Conjunto de Teste

matriz de confusão = 
 [[ 0  2]
 [ 0 13]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.8666666666666667
Crossvalidation (100-Fold):
Score =  0.7618181818181817
Std =  0.04890114839193029


In [362]:
showResults(clfRF, 'Support Random Forests', x, y, xTrain, xVal, xTest, yTrain, yVal, yTest)

Modelo: Support Random Forests
Conjunto de Treino

matriz de confusão = 
 [[ 8  0]
 [ 0 35]]
Sensibilidade =  1.0
Especificidade =  1.0
acc =  1.0

Conjunto de Validação

matriz de confusão = 
 [[ 0 10]
 [ 0 19]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.6551724137931034

Conjunto de Teste

matriz de confusão = 
 [[ 0  2]
 [ 0 13]]
Sensibilidade =  1.0
Especificidade =  0.0
acc =  0.8666666666666667
Crossvalidation (100-Fold):
Score =  0.7506818181818181
Std =  0.04695863750540082


## Exportar Dados

In [123]:
# Interim
#volumeData.to_csv('../data/interim/volumeData.csv', index=False)
#gsLHData.to_csv('../data/interim/gsLHData.csv', index=False)
#gsRHData.to_csv('../data/interim/gsRHData.csv', index=False)

# Processed
#allData.to_csv('../data/processed/processedData.csv', index=False)