# PRÉ-PROCESSAMENTO

Realizando os imports necessários

In [57]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import zipfile
import os
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

Carregando dados

In [58]:
# Read the interim parquet file into `df` and echo it as the cell output
# for a quick visual check of the raw columns.
df = pd.read_parquet(path="../data/interim/sim_2006_2017.parquet")
df

Unnamed: 0,ASSISTMED,DTOBITO,ESC,ESTCIV,HORAOBITO,IDADE,LOCOCOR,NATURAL,OCUP,RACACOR,SEXO,SUICIDIO
0,,9022006,9.0,4.0,130.0,463.0,3.0,,999993.0,1.0,1,0
1,,26012006,2.0,2.0,1130.0,481.0,3.0,,214305.0,,1,0
2,1.0,19032006,2.0,3.0,1520.0,493.0,3.0,,514105.0,1.0,2,0
3,1.0,21112006,2.0,1.0,1000.0,489.0,3.0,77.0,214305.0,1.0,1,0
4,,16042006,9.0,3.0,2130.0,480.0,3.0,,,1.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3237588,,13042017,,,1840.0,465.0,3.0,835.0,,1.0,1,0
3237589,,19042017,2.0,1.0,937.0,455.0,3.0,829.0,999993.0,4.0,1,0
3237590,1.0,10012017,2.0,3.0,615.0,487.0,2.0,835.0,999993.0,1.0,1,0
3237591,1.0,10082017,2.0,2.0,500.0,486.0,3.0,835.0,999993.0,1.0,1,0


## TRATANDO VALORES NULOS E NORMALIZANDO

In [59]:
# Frequency-encode OCUP: missing values are first grouped under the
# placeholder -1, then every value is replaced by the number of rows sharing it.
ocup_filled = df['OCUP'].fillna(-1)
cnt_ocup = ocup_filled.value_counts()
df['OCUP'] = ocup_filled.map(cnt_ocup)

In [60]:
# Frequency-encode LOCOCOR. No placeholder is filled in beforehand and
# value_counts() skips NaN by default, so missing entries stay NaN after the mapping.
lococor = df['LOCOCOR']
cnt_loc = lococor.value_counts()
df['LOCOCOR'] = lococor.map(cnt_loc)

In [61]:
# Frequency-encode RACACOR: missing values become the placeholder -1 and each
# value is then replaced by its number of occurrences.
racacor_filled = df['RACACOR'].fillna(-1)
cnt_raca = racacor_filled.value_counts()
df['RACACOR'] = racacor_filled.map(cnt_raca)

In [62]:
# Frequency-encode ESTCIV: missing values are folded into the existing code 9
# (presumably the "unknown" category -- TODO confirm against the data dictionary),
# then each code is replaced by its number of occurrences.
estciv_filled = df['ESTCIV'].fillna(9)
cnt_est = estciv_filled.value_counts()
df['ESTCIV'] = estciv_filled.map(cnt_est)

In [63]:
# Collapse ASSISTMED to two values: code 9 and missing entries are both
# rewritten as 2, so only 1 and 2 remain for the one-hot step that follows.
df['ASSISTMED'] = df['ASSISTMED'].replace(9, 2).fillna(2)

In [64]:
# One-hot encode ASSISTMED and append the indicator columns to the frame.
one_hot = df['ASSISTMED'].pipe(pd.get_dummies, prefix='ASSISTMED')
df = pd.concat(objs=[df, one_hot], axis='columns')

In [65]:
# Give the ASSISTMED indicator columns readable names and discard the
# original column now that it is fully represented by them.
assist_names = {
    'ASSISTMED_1.0': 'TEVE_ASSIST',
    'ASSISTMED_2.0': 'NTEVE_ASSIST',
}
df = df.drop(columns=['ASSISTMED']).rename(columns=assist_names)

In [66]:
# Keep only rows whose SEXO is neither 0 nor 9, so that the one-hot step
# below yields exactly the SEXO_1 / SEXO_2 indicators.
sexo_invalid = (df['SEXO'] == 0) | (df['SEXO'] == 9)
df = df.loc[~sexo_invalid]

In [67]:
# One-hot encode SEXO and append the indicator columns to the frame.
one_hot = df['SEXO'].pipe(pd.get_dummies, prefix='SEXO')
df = pd.concat(objs=[df, one_hot], axis='columns')

In [68]:
# Give the SEXO indicator columns readable names and discard the original
# column now that it is fully represented by them.
sexo_names = {
    'SEXO_1': 'MASCULINO',
    'SEXO_2': 'FEMININO',
}
df = df.drop(columns=['SEXO']).rename(columns=sexo_names)

In [69]:
# The time-of-death and date-of-death columns are not used as features
# from here on, so remove both in a single step.
df = df.drop(columns=['HORAOBITO', 'DTOBITO'])

In [70]:
# Display the frame to check the result of the encoding steps above.
df

Unnamed: 0,ESC,ESTCIV,IDADE,LOCOCOR,NATURAL,OCUP,RACACOR,SUICIDIO,TEVE_ASSIST,NTEVE_ASSIST,MASCULINO,FEMININO
0,9.0,229326,463.0,480252.0,,828601,2354375,0,False,True,True,False
1,2.0,1170466,481.0,480252.0,,1680,130117,0,False,True,True,False
2,2.0,853982,493.0,480252.0,,769,2354375,0,True,False,False,True
3,2.0,679997,489.0,480252.0,77.0,1680,2354375,0,True,False,True,False
4,9.0,853982,480.0,480252.0,,760072,2354375,0,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
3237588,,271622,465.0,480252.0,835.0,760072,2354375,0,False,True,True,False
3237589,2.0,679997,455.0,480252.0,829.0,828601,525380,0,False,True,True,False
3237590,2.0,853982,487.0,178466.0,835.0,828601,2354375,0,True,False,True,False
3237591,2.0,1170466,486.0,480252.0,835.0,828601,2354375,0,True,False,True,False


In [71]:
# Rewrite ESC code 9 as 0. Genuinely missing (NaN) entries are left untouched
# here; they are filled by the mean imputer in the following cells.
# NOTE(review): 9 looks like an "unknown" code -- confirm that mapping it to 0
# (rather than treating it as missing) is intended.
df['ESC'] = df['ESC'].replace({9: 0})

In [72]:
# Mean imputer used for the ESC column: every NaN is replaced by the mean
# of the non-missing values seen during fit.
imputer = SimpleImputer(
    strategy='mean',
    missing_values=np.nan,
)

In [73]:
# Learn the ESC mean and fill the missing entries with it in one pass;
# `imputer` stays fitted afterwards, exactly as with separate fit/transform calls.
df['ESC'] = imputer.fit_transform(df[['ESC']])

In [74]:
# Inspect the ESC distribution after imputation; the single non-integer
# value in the output is the mean that was written into the missing entries.
df['ESC'].value_counts()

ESC
2.000000    722242
3.000000    623298
2.245797    539334
0.000000    473893
4.000000    390211
1.000000    314079
5.000000    173723
Name: count, dtype: int64

In [75]:
# Discard rows without a LOCOCOR value and remove the NATURAL column,
# then report how many nulls remain in each column.
df = df.dropna(subset=['LOCOCOR']).drop(columns=['NATURAL'])
df.isna().sum()

ESC               0
ESTCIV            0
IDADE           619
LOCOCOR           0
OCUP              0
RACACOR           0
SUICIDIO          0
TEVE_ASSIST       0
NTEVE_ASSIST      0
MASCULINO         0
FEMININO          0
dtype: int64

In [76]:
# Largest value currently held in OCUP (after the frequency encoding above
# this is the row count of the most common occupation value).
df['OCUP'].max()

828601

In [81]:
# Mark missing ages with the sentinel 999 and store the column as integers,
# so each code can be read digit by digit when it is decoded.
df['IDADE'] = df['IDADE'].fillna(999.0).astype(int)

In [78]:
def arruma_idade(row):
    """Decode the prefixed ``IDADE`` code of one row into an age value.

    The code is read as a string of digits:

    * first digit ``4`` and at least three digits: the remaining digits are
      returned as the age (``463`` -> ``63``);
    * first digit ``5`` and at least three digits: ``1`` followed by the
      remaining digits is returned (``505`` -> ``105``);
    * anything else, including the ``999`` sentinel: ``999`` is returned.

    NOTE(review): codes starting with any other digit are reported as missing
    (999). If those prefixes encode ages below one year, such rows will be
    mean-imputed downstream -- confirm that this is intended.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'IDADE'`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    int
        The decoded age, or ``999`` when the code cannot be decoded.
    """
    missing = 999
    try:
        # Go through int() first so that float-typed codes such as 463.0 are
        # read as '463' rather than '463.0'; NaN / None cannot be converted
        # and are reported as missing instead of raising.
        codigo = str(int(row['IDADE']))
    except (TypeError, ValueError, OverflowError):
        return missing

    # A prefix digit plus at least two value digits is required.
    if len(codigo) <= 2:
        return missing
    if codigo[0] == '4':
        return int(codigo[1:])
    if codigo[0] == '5':
        return int('1' + codigo[1:])
    return missing

In [82]:
# Decode every IDADE code with arruma_idade. Only the IDADE column is handed
# to apply(): the function reads nothing else from the row, so this avoids
# building a mixed-dtype Series out of all columns for each row.
df['IDADE'] = df[['IDADE']].apply(arruma_idade, axis=1)

In [83]:
# Replace the 999 sentinel in IDADE with the mean of the remaining ages.
# The imputer is fitted and applied in one call and stays fitted afterwards.
idade999 = SimpleImputer(strategy='mean', missing_values=999)
df['IDADE'] = idade999.fit_transform(df[['IDADE']])

In [91]:
# Display the frame to check the imputed ESC and decoded IDADE columns
# before the features are standardised.
df

Unnamed: 0,ESC,ESTCIV,IDADE,LOCOCOR,OCUP,RACACOR,SUICIDIO,TEVE_ASSIST,NTEVE_ASSIST,MASCULINO,FEMININO
0,0.000000,229326,63.000000,480252.0,828601,2354375,0,False,True,True,False
1,2.000000,1170466,81.000000,480252.0,1680,130117,0,False,True,True,False
2,2.000000,853982,93.000000,480252.0,769,2354375,0,True,False,False,True
3,2.000000,679997,89.000000,480252.0,1680,2354375,0,True,False,True,False
4,0.000000,853982,80.000000,480252.0,760072,2354375,0,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
3237588,2.245797,271622,65.000000,480252.0,760072,2354375,0,False,True,True,False
3237589,2.000000,679997,55.000000,480252.0,828601,525380,0,False,True,True,False
3237590,2.000000,853982,87.000000,178466.0,828601,2354375,0,True,False,True,False
3237591,2.000000,1170466,86.000000,480252.0,828601,2354375,0,True,False,True,False


In [92]:
# Standardise the numeric feature columns (zero mean, unit variance);
# the target and the boolean indicator columns are left as they are.
columns = ['ESC', 'ESTCIV', 'IDADE' , 'LOCOCOR', 'OCUP', 'RACACOR']
std = StandardScaler()
std.fit(df[columns])
df[columns] = std.transform(df[columns])

In [93]:
# Display the frame to check the standardised feature columns.
df

Unnamed: 0,ESC,ESTCIV,IDADE,LOCOCOR,OCUP,RACACOR,SUICIDIO,TEVE_ASSIST,NTEVE_ASSIST,MASCULINO,FEMININO
0,-1.678515e+00,-1.857843,-1.852877e-01,-1.522570,0.960065,0.608540,0,False,True,True,False
1,-1.837094e-01,1.049612,7.700565e-01,-1.522570,-1.431338,-1.895188,0,False,True,True,False
2,-1.837094e-01,0.071901,1.406953e+00,-1.522570,-1.433972,0.608540,0,True,False,False,True
3,-1.837094e-01,-0.465589,1.194654e+00,-1.522570,-1.431338,0.608540,0,True,False,True,False
4,-1.678515e+00,0.071901,7.169818e-01,-1.522570,0.761883,0.608540,0,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...
3237588,-1.659568e-15,-1.727178,-7.913830e-02,-1.522570,0.761883,0.608540,0,False,True,True,False
3237589,-1.837094e-01,-0.465589,-6.098851e-01,-1.522570,0.960065,-1.450262,0,False,True,True,False
3237590,-1.837094e-01,0.071901,1.088505e+00,-1.854218,0.960065,0.608540,0,True,False,True,False
3237591,-1.837094e-01,1.049612,1.035430e+00,-1.522570,0.960065,0.608540,0,True,False,True,False


In [95]:
# Persist the fully pre-processed frame to the processed-data folder.
df.to_parquet(path="../data/processed/sim_2006_2017.parquet")