In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# import scikit_posthocs as sp
# import scipy.stats as stats
# from sklearn import preprocessing
# from sklearn.metrics import r2_score

# from statsmodels.stats.multicomp import MultiComparison
from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import metrics
# from sklearn.model_selection import GridSearchCV


# from scipy.stats import linregress
# from sklearn.decomposition import PCA

import matplotlib.cm as cm
import matplotlib.colors as mcolors

# from sklearn.decomposition import PCA, KernelPCA

# Leitura de DataSet
Estamos trabalhando com o DataFrame de pacientes que possuem, originalmente, pelo menos um valor dos seguintes exames:

- Albumina;
- Microalbuminuria;
- Rel.AlbuminaCreatininaUAUC;
- Proteinuria24hs;


Entre os exames, nenhum possui valores nulos: os valores ausentes foram preenchidos com a média do respectivo exame.

In [3]:
# Options shared by both CSV reads. `on_bad_lines='warn'` is the replacement for the
# removed `error_bad_lines=False` flag: malformed rows are skipped and a warning is
# emitted for each of them.
csv_options = dict(sep=',', encoding="ISO-8859-1", low_memory=False, on_bad_lines='warn')

# Imputed dataset (one row per patient/semester); the stray index column written by
# `to_csv` is dropped when present.
df = pd.read_csv('datasets/banco_por_semestre_imputado.csv', **csv_options).drop(
    columns=['Unnamed: 0'], errors='ignore')


# Encode the main categorical columns as integers.
# Each result is assigned back to the column instead of using a chained
# `df[col].replace(..., inplace=True)`, which does not update `df` under
# pandas Copy-on-Write.
df['Raça'] = df['Raça'].replace(['Branca','Parda','Preta','Indigena','Amarela'],[1,2,3,4,5])
# NOTE(review): only 'Ex' and 'Sim' are mapped for tabagismo/etilismo; any other
# category present in the data is left untouched -- confirm this is intended.
df['tabagismo'] = df['tabagismo'].replace(['Ex','Sim'],[0,1])
df['Codsexo'] = df['Codsexo'].replace(['Feminino','Masculino'],[1,2])

# Instead of the final stage, the stage recorded for each semester is analysed.
# The semester labels are replaced positionally, in order of first appearance in the
# column -- assumes there are exactly ten distinct labels and that they first appear
# in chronological order; TODO confirm against the data.
df["SEMESTRE"] = df["SEMESTRE"].replace(df["SEMESTRE"].unique(),
                                        [2010.0, 2011.1, 2011.2, 2012.1, 2012.2, 2013.1, 2013.2,
                                         2014.1, 2014.2, 2015.0])
df["Estagio_EQ"] = df["Estagio_EQ"].replace({'Estágio 1 - >= 90 ml':1,
                                             'Estágio 2 - 60-89 ml':2,
                                             'Estágio 3a - 45-59 ml':3,
                                             'Estágio 3b - 30-44 ml':4,
                                             'Estágio 4 - 15-29 ml':5,
                                             'Estágio 5 - < 15 ml':6})

df['etilismo'] = df['etilismo'].replace(['Ex','Sim'],[0,1])
df['sedentario'] = df['sedentario'].replace(['Não','Sim'],[0,1])
df['insulina'] = df['insulina'].replace(['Não','Sim'],[0,1])

# Non-imputed dataset, used only to find which rows originally had each exam.
df_alb = pd.read_csv('datasets/banco_por_semestre.csv', **csv_options).drop(
    columns=['Unnamed: 0'], errors='ignore')
# Index labels of the rows that originally have an Albumina value.
index_albumina = list(df_alb.dropna(subset=['Albumina']).index)

# Rows of the imputed frame whose exam was present in the non-imputed frame.
# The boolean masks are applied positionally, so both files must have the same
# number of rows in the same order (presumably true -- verify).
albumina_pacientes = df.iloc[df_alb['Albumina'].notna().values]
microalbuminuria_pacientes = df.iloc[df_alb['Microalbuminuria'].notna().values]
relAlbCrea_pacientes = df.iloc[df_alb['Rel.AlbuminaCreatininaUAUC'].notna().values]
proteinuria_pacientes = df.iloc[df_alb['Proteinuria24hs'].notna().values]


# Union of the four groups; rows selected by more than one exam are kept once.
df_pacientes_exames = pd.concat([albumina_pacientes,
                                 microalbuminuria_pacientes,
                                 relAlbCrea_pacientes,
                                 proteinuria_pacientes]).drop_duplicates()

# Formar Grupos 
Iremos utilizar o algoritmo K-means para formar grupos, após reduzir os exames a duas componentes com t-SNE.

### Gráfico

In [4]:
def grafico(finalDf,df):
    """Scatter-plot the two embedding components, one colour per `Estagio_EQ` label.

    Parameters
    ----------
    finalDf : pandas.DataFrame
        Must contain the columns 'Component 1', 'Component 2' and 'Estagio_EQ'.
        It is only read; the frame is not modified.
    df : pandas.DataFrame
        Not used for plotting. Kept so existing `grafico(finalDf, df)` calls keep
        working. It used to drive an in-place relabelling of `finalDf`'s index,
        which altered the caller's frame and could create duplicate labels when
        the function was called twice on the same frame.

    Returns
    -------
    None
        The plot is drawn on a new 12x12 matplotlib figure.
    """
    fig = plt.figure(figsize = (12,12))
    ax = fig.add_subplot(1,1,1)
    ax.set_xlabel('Component 1', fontsize = 15)
    ax.set_ylabel('Component 2', fontsize = 15)
    ax.set_title('2 component TSNE with Euclidean metric', fontsize = 20)

    # Fixed colours for labels 1-6; any other label gets a colour from the
    # fallback palette instead of raising KeyError.
    colors = {1:'green',2:'yellow',3:'orange',4:'red',5:'crimson',6:'blue'}
    extra_colors = ['purple','brown','pink','gray','olive','cyan']
    n_extra = 0

    targets = finalDf['Estagio_EQ'].unique()
    for target in targets:
        if target in colors:
            color = colors[target]
        else:
            color = extra_colors[n_extra % len(extra_colors)]
            n_extra += 1
        pontos = finalDf[finalDf['Estagio_EQ'] == target]
        ax.scatter(pontos['Component 1']
                   , pontos['Component 2']
                   , c = color
                   , s = 90)
    # One legend entry per scatter call, in the same order as `targets`.
    ax.legend(targets,title='Stages')
    ax.grid()

In [5]:
def grafico_tnse(finalDf):
    """Draw the 2-D embedding stored in `finalDf`, coloured by `Estagio_EQ`.

    `finalDf` must provide the columns 'Component 1', 'Component 2' and
    'Estagio_EQ'. Each distinct stage value is drawn as its own scatter series
    using a fixed colour table with entries for the values 1 to 6; any other
    value raises KeyError. Nothing is returned.
    """
    stage_colors = {1:'green',2:'yellow',3:'orange',4:'red',5:'c',6:'blue'}

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_xlabel('Component 1', fontsize=15)
    ax.set_ylabel('Component 2', fontsize=15)
    ax.set_title('2 component TSNE with Euclidean metric', fontsize=20)

    # Stages are plotted in order of first appearance in the column.
    stages = finalDf['Estagio_EQ'].unique()
    for stage in stages:
        is_stage = finalDf['Estagio_EQ'] == stage
        stage_color = stage_colors[stage]
        xs = finalDf.loc[is_stage, 'Component 1']
        ys = finalDf.loc[is_stage, 'Component 2']
        ax.scatter(xs, ys, c=stage_color, s=90)

    # Legend labels line up with the scatter series created above.
    ax.legend(stages, title='Stages')
    ax.grid()

In [6]:
# import chart_studio.plotly as py
import plotly as py
import plotly.graph_objs as go
def visualizacao_3d(df):
    """Embed `df` in three t-SNE components and show them in an interactive 3-D plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that contains the column 'Estagio_EQ', which is used only to
        colour the points. 'Albumina_Estagio' is also excluded from the features
        when it is present.

    Returns
    -------
    None
        The figure is rendered with plotly's offline `iplot`.

    Raises
    ------
    KeyError
        If `df` has no 'Estagio_EQ' column.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having been executed first.
    from sklearn.manifold import TSNE

    y = df['Estagio_EQ']
    # 'Albumina_Estagio' is optional: ignore it when the frame does not have it.
    X = df.drop(columns=['Estagio_EQ','Albumina_Estagio'], errors='ignore')

    # T-SNE; the fixed seed makes the embedding reproducible between runs.
    tsne = TSNE(n_components=3,metric='euclidean',random_state=42)
    Xt = tsne.fit_transform(X)

    df_Xt = pd.DataFrame(Xt,columns=['Componente 1', 'Componente 2', 'Componente 3'],index=X.index)
    df_Xt['Estagio_EQ'] =y.values

    # Single trace: both the marker fill and its outline are coloured by the stage value.
    trace1 = go.Scatter3d(
        x= df_Xt['Componente 1'],
        y= df_Xt['Componente 2'],
        z= df_Xt['Componente 3'],
        mode='markers',
        marker=dict(
            color = df_Xt['Estagio_EQ'],
            size=2,
            line=dict(
                color= df_Xt['Estagio_EQ'],
                width= 1
            ),
            opacity=1.0
         )
    )
    data = [trace1]
    layout = go.Layout(
        title= 'Clusters',
        scene = dict(
                xaxis = dict(title  = 'Componente 1'),
                yaxis = dict(title  = 'Componente 2'),
                zaxis = dict(title  = 'Componente 3')
            )
    )
    fig = go.Figure(data=data, layout=layout)
    fig.update_layout(legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    ), showlegend=True)
    py.offline.iplot(fig)

### TSNE e Kmeans

In [7]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import homogeneity_score,completeness_score, v_measure_score, adjusted_rand_score, adjusted_mutual_info_score
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
def tsne_tests(df,numk):
    """Reduce the features to two t-SNE components, cluster them with K-means and report scores.

    Steps: a 2-component t-SNE embedding of every column except 'Estagio_EQ' is
    computed on the whole frame; the embedding is split 70/30; K-means with `numk`
    clusters is fitted on the training part and used to label the test part; the
    clustering scores are printed; an elbow plot (k from 2 to 11) is shown for the
    training part; and the labelled test points are plotted with `grafico`.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame that contains the column 'Estagio_EQ' (the reference label).
    numk : int
        Number of K-means clusters.

    Returns
    -------
    df_estagio : pandas.DataFrame
        Indexed like the test split, with columns 'ypred' (cluster label shifted to
        start at 1) and 'ytest' (the 'Estagio_EQ' value of the same row).
    df_Xt : pandas.DataFrame
        The full embedding ('Component 1', 'Component 2') plus 'Estagio_EQ'.
    """
    X = df.drop(columns=['Estagio_EQ'])
    y = df['Estagio_EQ']

    # T-SNE; the fixed seed makes the embedding (and every score below) reproducible.
    tsne = TSNE(n_components=2,metric='euclidean',random_state=42)
    Xt = tsne.fit_transform(X)

    df_Xt = pd.DataFrame(Xt,columns=['Component 1', 'Component 2'],index=X.index)
    df_Xt['Estagio_EQ'] =y.values

    # Train/test split of the embedded points.
    xtrain, xtest, ytrain, ytest = train_test_split(df_Xt.drop('Estagio_EQ',axis=1),
                                                    y, test_size=0.30,
                                                    random_state=42)

    # K-means fitted on the training part; the test part receives the predicted cluster ids.
    kmeans = KMeans(init='k-means++',n_clusters = numk,random_state=42).fit(xtrain)
    ypred = kmeans.predict(xtest)

    # Clustering performance evaluation: internal scores use only the geometry,
    # the remaining ones compare the clusters with the reference stage.
    silhoutte_erro = silhouette_score(xtest, ypred, metric = 'euclidean')
    calinski_erro = calinski_harabasz_score(xtest,ypred)
    homogeneity =  homogeneity_score(ytest, ypred)
    completeness = completeness_score(ytest, ypred)
    v_measure =  v_measure_score(ytest, ypred)
    adjusted_rand = adjusted_rand_score(ytest, ypred)
    adjusted_mutual = adjusted_mutual_info_score(ytest, ypred)

    print("k={0}, Silhouette score={1}, Calinski harabasz score={2}\n".format(numk,silhoutte_erro, calinski_erro))
    print("completeness_score={0}, v_measure={1}, adjusted_rand={2}, adjusted_mutual ={3}".format(completeness,
                                                                       v_measure, adjusted_rand,adjusted_mutual))
    print('\n Homogeneity:{0}'.format(homogeneity))

    # Elbow plot over k = 2..11 on the training part.
    kmeans_el = KMeans(init='k-means++',random_state=42)
    visualizer = KElbowVisualizer(kmeans_el, k=np.arange(2,12,1))
    visualizer.fit(xtrain)
    visualizer.show()

    # Cluster ids start at 0; shift them so they start at 1 like the stage codes.
    # A plain shift keeps every cluster distinct for any `numk`, whereas the
    # previous fixed 0..5 -> 1..6 lookup left id 6 unchanged and so merged it
    # with the cluster that had been mapped to 6.
    stage_labels = ypred + 1

    finalDf = xtest.assign(Estagio_EQ=stage_labels)
    df_estagio = pd.DataFrame({'ypred': stage_labels, 'ytest': ytest.values},
                              index=xtest.index)
    grafico(finalDf,df_estagio)
    return df_estagio,df_Xt

# Testes com os exames mais importantes

### Teste1:

Todos+Hemoglobina+Ureia+Creatinina+Ácido Úrico

In [None]:
# Check whether any of the selected exam/feature columns has missing values.
exames =['PAS','PAD' ,
       'ColesterolTotal', 'GlicemiadeJejum', 'Triglicerides',
       'Potassio', 'ColesterolHDL', 'TSH',
       'HemoglobinaGlicada', 'TGP',
       'Albumina','Microalbuminuria','AcidoUrico', 'Ureia',
       'Rel.AlbuminaCreatininaUAUC','Proteinuria24hs',
       'Codsexo', 'Idade', 'Raça',
       'Estagio_EQ']
# Candidate columns currently left out of the selection: 'etilismo', 'sedentario',
# 'tabagismo', 'pesoi', 'pesof', 'Creatinina', 'Hemoglobina', 'Albumina_Estagio'.

# Copy only the selected columns from the frame built in the data-loading cell
# (`df_pacientes_exames`); a column missing from the dataset raises KeyError.
df_verificar_estagio = df_pacientes_exames[exames].copy()
df_verificar_estagio.isna().any()
