## Imports e Dataframes

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw CSV; the path is relative to the notebook's working directory.
caminho_csv = 'SINASC_RO_2019.csv'
df = pd.read_csv(caminho_csv)

## Identificando e Tratando dados ausentes

#### Informações do Dataframe

In [16]:
# Columns kept for the missing-data walkthrough, in display order.
colunas = [
    'ORIGEM',
    'LOCNASC',
    'IDADEMAE',
    'QTDFILVIVO',
    'QTDFILMORT',
    'GESTACAO',
    'CONSULTAS',
    'GRAVIDEZ',
    'IDADEPAI',
    'PESO',
    'RACACOR',
]

In [17]:
# Keep only the selected columns (this rebinds df) and preview the first three rows.
df = df.loc[:, colunas]
df.head(n=3)

Unnamed: 0,ORIGEM,LOCNASC,IDADEMAE,QTDFILVIVO,QTDFILMORT,GESTACAO,CONSULTAS,GRAVIDEZ,IDADEPAI,PESO,RACACOR
0,1,1,19,0.0,0.0,37 a 41 semanas,4,Única,26.0,3685.0,Branca
1,1,1,29,1.0,0.0,37 a 41 semanas,4,Única,24.0,3055.0,Branca
2,1,1,37,2.0,0.0,37 a 41 semanas,4,Única,32.0,3460.0,Branca


In [18]:
# Print a summary of the frame: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27028 entries, 0 to 27027
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ORIGEM      27028 non-null  int64  
 1   LOCNASC     27028 non-null  int64  
 2   IDADEMAE    27028 non-null  int64  
 3   QTDFILVIVO  25455 non-null  float64
 4   QTDFILMORT  24930 non-null  float64
 5   GESTACAO    25796 non-null  object 
 6   CONSULTAS   27028 non-null  int64  
 7   GRAVIDEZ    26949 non-null  object 
 8   IDADEPAI    7607 non-null   float64
 9   PESO        27028 non-null  float64
 10  RACACOR     26381 non-null  object 
dtypes: float64(4), int64(4), object(3)
memory usage: 2.3+ MB


#### Alterando tipo de dados `astype`

In [19]:
# astype demonstration: cast PESO to 64-bit float ('float64' is the dtype that `float` maps to).
# df.info() already reports PESO as float64, so the values themselves do not change.
df['PESO'] = df['PESO'].astype('float64')

#### Dados ausentes

In [22]:
# Count the missing RACACOR entries using isna().
faltantes_isna = df["RACACOR"].isna().sum()
print(f'Dados ausente: {faltantes_isna}')

Dados ausente: 647


In [23]:
# Same count using isnull(), which gives the same result as isna() above.
faltantes_isnull = df["RACACOR"].isnull().sum()
print(f'Dados ausente: {faltantes_isnull}')

Dados ausente: 647


#### Tratando dados ausentes

In [46]:
# Replace missing values with 0. The result is only displayed: nothing is
# assigned back, so df itself still contains the missing entries.
df['RACACOR'].fillna(value=0)

0        Branca
1        Branca
2        Branca
3         Parda
4         Parda
          ...  
27023    Branca
27024    Branca
27025     Parda
27026     Parda
27027     Parda
Name: RACACOR, Length: 27028, dtype: object

In [47]:
# Fill missing IDADEPAI values with a central value of the column.
# Despite the `med_` prefix this is the mean; the median would be the alternative.
# The filled Series is displayed only; df is not modified.
med_idadepai = df['IDADEPAI'].mean()
df['IDADEPAI'].fillna(value=med_idadepai)

0        26.000000
1        24.000000
2        32.000000
3        24.000000
4        27.000000
           ...    
27023    35.000000
27024    31.092415
27025    30.000000
27026    24.000000
27027    31.092415
Name: IDADEPAI, Length: 27028, dtype: float64

In [33]:
# Forward fill: each missing value takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
# The filled Series is displayed only; df is not modified.
df['GESTACAO'].ffill()

  df['GESTACAO'].fillna(method='ffill')


0        37 a 41 semanas
1        37 a 41 semanas
2        37 a 41 semanas
3        37 a 41 semanas
4        37 a 41 semanas
              ...       
27023    32 a 36 semanas
27024    37 a 41 semanas
27025    37 a 41 semanas
27026    32 a 36 semanas
27027    37 a 41 semanas
Name: GESTACAO, Length: 27028, dtype: object

In [35]:
# Backward fill: each missing value takes the next valid value that follows it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
# The filled Series is displayed only; df is not modified.
df['GESTACAO'].bfill()

  df['GESTACAO'].fillna(method='bfill')


0        37 a 41 semanas
1        37 a 41 semanas
2        37 a 41 semanas
3        37 a 41 semanas
4        37 a 41 semanas
              ...       
27023    32 a 36 semanas
27024    37 a 41 semanas
27025    37 a 41 semanas
27026    32 a 36 semanas
27027    37 a 41 semanas
Name: GESTACAO, Length: 27028, dtype: object

#### Dropando dados ausentes

In [39]:
# Drop every row that has at least one missing value (df itself is untouched).
# head(0) keeps zero rows, so only the column header is displayed.
# NOTE(review): confirm the empty preview is intended rather than head().
df.dropna(how='any').head(n=0)

Unnamed: 0,ORIGEM,LOCNASC,IDADEMAE,QTDFILVIVO,QTDFILMORT,GESTACAO,CONSULTAS,GRAVIDEZ,IDADEPAI,PESO,RACACOR


#### Valores duplicados

In [43]:
# Show which rows are duplicates: every repeat of a row after its first occurrence.
df.loc[df.duplicated(keep='first')]

Unnamed: 0,ORIGEM,LOCNASC,IDADEMAE,QTDFILVIVO,QTDFILMORT,GESTACAO,CONSULTAS,GRAVIDEZ,IDADEPAI,PESO,RACACOR
558,1,1,30,0.0,0.0,37 a 41 semanas,4,Única,,2690.0,Parda
893,1,1,20,0.0,0.0,37 a 41 semanas,4,Única,,2795.0,Parda
1043,1,1,18,0.0,0.0,37 a 41 semanas,4,Única,,3650.0,Parda
1056,1,1,19,0.0,0.0,37 a 41 semanas,3,Única,,3100.0,Parda
1070,1,1,26,1.0,0.0,37 a 41 semanas,4,Única,,3320.0,Parda
...,...,...,...,...,...,...,...,...,...,...,...
26898,1,1,19,0.0,0.0,37 a 41 semanas,3,Única,,3390.0,Parda
26991,1,1,29,0.0,0.0,37 a 41 semanas,4,Única,,3315.0,Branca
27003,1,1,28,1.0,1.0,37 a 41 semanas,4,Única,,2660.0,Parda
27007,1,1,24,0.0,0.0,37 a 41 semanas,4,Única,,3470.0,Branca


In [45]:
# Keep only the first row for each distinct ORIGEM value (displayed, not stored).
df.drop_duplicates(subset=['ORIGEM'], keep='first')

Unnamed: 0,ORIGEM,LOCNASC,IDADEMAE,QTDFILVIVO,QTDFILMORT,GESTACAO,CONSULTAS,GRAVIDEZ,IDADEPAI,PESO,RACACOR
0,1,1,19,0.0,0.0,37 a 41 semanas,4,Única,26.0,3685.0,Branca


## Renomeando índices e colunas

#### Renomeando colunas

In [56]:
# Build a 9x4 demo frame of standard-normal draws scaled by 100.
# A locally seeded generator makes the table identical on every run; the
# unseeded global RNG used before produced different values each time.
# Note: this rebinds `df`, replacing the frame loaded from the CSV.
rng = np.random.default_rng(42)
df = pd.DataFrame(
    rng.standard_normal((9, 4)) * 100,
    index=["A", "B", "C", "D", "E", "F", "G", "H", "I"],
    columns=["coluna1", "coluna2", "coluna3", "coluna4"],
)
df

Unnamed: 0,coluna1,coluna2,coluna3,coluna4
A,93.972885,119.160819,9.060763,67.292429
B,33.587088,-46.176505,101.099331,44.159778
C,-88.145561,12.81827,-328.775485,-43.700892
D,-123.242125,13.83974,-131.893684,-40.191985
E,38.324078,77.38505,-89.208362,-190.915984
F,258.924843,137.643663,66.580674,44.252702
G,93.232464,175.065228,1.829196,124.840768
H,53.078393,-15.825751,3.866467,25.456052
I,-96.337232,51.101973,-30.158204,-187.143475


In [57]:
# Rename all four columns at once by assigning a new list of labels (in place).
novos_nomes = ['segunda', 'terça', 'quarta', 'quinta']
df.columns = novos_nomes

In [58]:
# Display the frame to confirm the new column labels.
df

Unnamed: 0,segunda,terça,quarta,quinta
A,93.972885,119.160819,9.060763,67.292429
B,33.587088,-46.176505,101.099331,44.159778
C,-88.145561,12.81827,-328.775485,-43.700892
D,-123.242125,13.83974,-131.893684,-40.191985
E,38.324078,77.38505,-89.208362,-190.915984
F,258.924843,137.643663,66.580674,44.252702
G,93.232464,175.065228,1.829196,124.840768
H,53.078393,-15.825751,3.866467,25.456052
I,-96.337232,51.101973,-30.158204,-187.143475


#### Renomeando Índices

In [59]:
# Add a label column ('letraA' ... 'letraI', one per row) to use as an index below.
df['coluna5'] = ['letra' + letra for letra in 'ABCDEFGHI']

In [60]:
# Display the frame with the newly added coluna5.
df

Unnamed: 0,segunda,terça,quarta,quinta,coluna5
A,93.972885,119.160819,9.060763,67.292429,letraA
B,33.587088,-46.176505,101.099331,44.159778,letraB
C,-88.145561,12.81827,-328.775485,-43.700892,letraC
D,-123.242125,13.83974,-131.893684,-40.191985,letraD
E,38.324078,77.38505,-89.208362,-190.915984,letraE
F,258.924843,137.643663,66.580674,44.252702,letraF
G,93.232464,175.065228,1.829196,124.840768,letraG
H,53.078393,-15.825751,3.866467,25.456052,letraH
I,-96.337232,51.101973,-30.158204,-187.143475,letraI


In [61]:
# Use coluna5 as the row index. A new frame is returned and displayed;
# df keeps its original A..I index because the result is not assigned.
df.set_index(keys='coluna5')

Unnamed: 0_level_0,segunda,terça,quarta,quinta
coluna5,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
letraA,93.972885,119.160819,9.060763,67.292429
letraB,33.587088,-46.176505,101.099331,44.159778
letraC,-88.145561,12.81827,-328.775485,-43.700892
letraD,-123.242125,13.83974,-131.893684,-40.191985
letraE,38.324078,77.38505,-89.208362,-190.915984
letraF,258.924843,137.643663,66.580674,44.252702
letraG,93.232464,175.065228,1.829196,124.840768
letraH,53.078393,-15.825751,3.866467,25.456052
letraI,-96.337232,51.101973,-30.158204,-187.143475


In [64]:
# Replace the index with 0..n-1 and keep the old labels as an 'index' column.
df.reset_index(drop=False)

Unnamed: 0,index,segunda,terça,quarta,quinta,coluna5
0,A,93.972885,119.160819,9.060763,67.292429,letraA
1,B,33.587088,-46.176505,101.099331,44.159778,letraB
2,C,-88.145561,12.81827,-328.775485,-43.700892,letraC
3,D,-123.242125,13.83974,-131.893684,-40.191985,letraD
4,E,38.324078,77.38505,-89.208362,-190.915984,letraE
5,F,258.924843,137.643663,66.580674,44.252702,letraF
6,G,93.232464,175.065228,1.829196,124.840768,letraG
7,H,53.078393,-15.825751,3.866467,25.456052,letraH
8,I,-96.337232,51.101973,-30.158204,-187.143475,letraI


In [65]:
# Replace the index with 0..n-1 and discard the old labels instead of keeping them as a column.
df.reset_index(level=None, drop=True)

Unnamed: 0,segunda,terça,quarta,quinta,coluna5
0,93.972885,119.160819,9.060763,67.292429,letraA
1,33.587088,-46.176505,101.099331,44.159778,letraB
2,-88.145561,12.81827,-328.775485,-43.700892,letraC
3,-123.242125,13.83974,-131.893684,-40.191985,letraD
4,38.324078,77.38505,-89.208362,-190.915984,letraE
5,258.924843,137.643663,66.580674,44.252702,letraF
6,93.232464,175.065228,1.829196,124.840768,letraG
7,53.078393,-15.825751,3.866467,25.456052,letraH
8,-96.337232,51.101973,-30.158204,-187.143475,letraI


## Categorização e Dummies

In [66]:
# Small example table: one row per person with their IMC value.
nomes = [
    'Fernando', 'Maria', 'Felipe', 'Pedro',
    'Bianca', 'Beatriz', 'Patricia', 'Lucia',
]
valores_imc = [27, 26, 25, 16, 16.7, 17.5, 18.6, 24]
df_imc = pd.DataFrame({'nome': nomes, 'imc': valores_imc})
df_imc

Unnamed: 0,nome,imc
0,Fernando,27.0
1,Maria,26.0
2,Felipe,25.0
3,Pedro,16.0
4,Bianca,16.7
5,Beatriz,17.5
6,Patricia,18.6
7,Lucia,24.0


#### Categorização

In [70]:
# Bucket each IMC value into one of four labels. The intervals are contiguous
# and closed on the right, so every non-missing value receives exactly one label:
#   imc <= 16.9 -> muito_abaixo, (16.9, 18.4] -> abaixo,
#   (18.4, 24.9] -> normal,      imc > 24.9   -> acima
imc = df_imc['imc']
faixas = {
    'muito_abaixo': imc.le(16.9),
    'abaixo': imc.gt(16.9) & imc.le(18.4),
    'normal': imc.gt(18.4) & imc.le(24.9),
    'acima': imc.gt(24.9),
}
for rotulo, mascara in faixas.items():
    df_imc.loc[mascara, 'imc_cat'] = rotulo

In [71]:
# Display df_imc with the new imc_cat column.
df_imc

Unnamed: 0,nome,imc,imc_cat
0,Fernando,27.0,acima
1,Maria,26.0,acima
2,Felipe,25.0,acima
3,Pedro,16.0,muito_abaixo
4,Bianca,16.7,muito_abaixo
5,Beatriz,17.5,abaixo
6,Patricia,18.6,normal
7,Lucia,24.0,normal


#### Dummies

``pd.get_dummies()`` é usado para transformar uma variável categórica em variáveis flag (0 ou 1), uma coluna para cada categoria.

In [98]:
# One indicator column per distinct imc_cat label.
categorias = df_imc['imc_cat']
dummies = pd.get_dummies(categorias)

In [99]:
# Convert the False/True indicators into 0/1 integers.
dummies = dummies.astype(dtype=int)

In [100]:
# Append the indicator columns to the right of df_imc, aligning rows on the index.
# Note: this rebinds `df` to the combined frame.
df = pd.concat([df_imc, dummies], axis='columns')

In [101]:
# Display the combined frame: original columns followed by the 0/1 indicator columns.
df

Unnamed: 0,nome,imc,imc_cat,abaixo,acima,muito_abaixo,normal
0,Fernando,27.0,acima,0,1,0,0
1,Maria,26.0,acima,0,1,0,0
2,Felipe,25.0,acima,0,1,0,0
3,Pedro,16.0,muito_abaixo,0,0,1,0
4,Bianca,16.7,muito_abaixo,0,0,1,0
5,Beatriz,17.5,abaixo,1,0,0,0
6,Patricia,18.6,normal,0,0,0,1
7,Lucia,24.0,normal,0,0,0,1
