## Ajustes

In [35]:
import pandas as pd
import numpy as np

In [36]:
# Cria uma lista com o path dos datasets tratados (concessões de auxílios acidentarios)
lista_acid = []
for i in range(2017, 2022):
    u = 'Acidentario/datasets_tratados/' +str(i) + '.csv'    
    lista_acid.append(u)
lista_acid

['Acidentario/datasets_tratados/2017.csv',
 'Acidentario/datasets_tratados/2018.csv',
 'Acidentario/datasets_tratados/2019.csv',
 'Acidentario/datasets_tratados/2020.csv',
 'Acidentario/datasets_tratados/2021.csv']

In [37]:
# Cria uma lista com o path dos datasets tratados (concessões de auxílios previdenciários)
lista_prev = []
for i in range(2017, 2022):
    u = 'Previdenciario/datasets_tratados/' +str(i) + '.csv'    
    lista_prev.append(u)
lista_prev

['Previdenciario/datasets_tratados/2017.csv',
 'Previdenciario/datasets_tratados/2018.csv',
 'Previdenciario/datasets_tratados/2019.csv',
 'Previdenciario/datasets_tratados/2020.csv',
 'Previdenciario/datasets_tratados/2021.csv']

In [38]:
# Carrega os datasets
# Auxílios Acidentários
df2017a = pd.read_csv(lista_acid[0], sep=';', on_bad_lines='skip')
df2018a = pd.read_csv(lista_acid[1], sep=';', on_bad_lines='skip')
df2019a = pd.read_csv(lista_acid[2], sep=';', on_bad_lines='skip')
df2020a = pd.read_csv(lista_acid[3], sep=';', on_bad_lines='skip')
df2021a = pd.read_csv(lista_acid[4], sep=';', on_bad_lines='skip')

# Auxílios Previdenciários
df2017p = pd.read_csv(lista_prev[0], sep=';', on_bad_lines='skip')
df2018p = pd.read_csv(lista_prev[1], sep=';', on_bad_lines='skip')
df2019p = pd.read_csv(lista_prev[2], sep=';', on_bad_lines='skip')
df2020p = pd.read_csv(lista_prev[3], sep=';', on_bad_lines='skip')
df2021p = pd.read_csv(lista_prev[4], sep=';', on_bad_lines='skip')

In [39]:
# Remove linhas finais e outras
df2017a.drop(df2017a.index[2067:], inplace = True)
df2018a.drop(df2018a.index[2067:], inplace = True)
df2019a.drop(df2019a.index[2067:], inplace = True)
df2020a.drop(df2020a.index[2067:], inplace = True)
df2021a.drop(df2021a.index[754:], inplace = True)

df2017p.drop(df2017p.index[2067:], inplace = True)
df2018p.drop(df2018p.index[2067:], inplace = True)
df2019p.drop(df2019p.index[2067:], inplace = True)
df2020p.drop(df2020p.index[2067:], inplace = True)
df2021p.drop(df2021p.index[1707:], inplace = True)


words = ['Total', 'Capítulo', 'Capitulo']
pattern2 = '|'.join(words)

df2017a = df2017a[~df2017a['CID10 CATEGORIA'].str.contains(pattern2)]
df2018a = df2018a[~df2018a['CID10 CATEGORIA'].str.contains(pattern2)]
df2019a = df2019a[~df2019a['CID10 CATEGORIA'].str.contains(pattern2)]
df2020a = df2020a[~df2020a['CID10 CATEGORIA'].str.contains(pattern2)]
df2021a = df2021a[~df2021a['CID10 CATEGORIA'].str.contains(pattern2)]

df2017p = df2017p[~df2017p['CID10 CATEGORIA'].str.contains(pattern2)]
df2018p = df2018p[~df2018p['CID10 CATEGORIA'].str.contains(pattern2)]
df2019p = df2019p[~df2019p['CID10 CATEGORIA'].str.contains(pattern2)]
df2020p = df2020p[~df2020p['CID10 CATEGORIA'].str.contains(pattern2)]
df2021p = df2021p[~df2021p['CID10 CATEGORIA'].str.contains(pattern2)]

In [40]:
# Divide e reordena colunas, criando uma variável só para os códigos da CID
def aplicaSplit(df, ano):
    df[['Cod', 'Descricao']] = df['CID10 CATEGORIA'].str.split(':', expand=True)
    df.drop(['CID10 CATEGORIA'], axis=1, inplace=True)
    #df = df['Descricao', 'Total_' + str(ano)]
    df.sort_index(axis=1, inplace=True)    

In [41]:
aplicaSplit(df2017a, 2017)
aplicaSplit(df2018a, 2018)
aplicaSplit(df2019a, 2019)
aplicaSplit(df2020a, 2020)
aplicaSplit(df2021a, 2021)

aplicaSplit(df2017p, 2017)
aplicaSplit(df2018p, 2018)
aplicaSplit(df2019p, 2019)
aplicaSplit(df2020p, 2020)
aplicaSplit(df2021p, 2021)

In [42]:
# Dimensões
print('Auxílio-doença acidentário:')
print(df2017a.shape)
print(df2018a.shape)
print(df2019a.shape)
print(df2020a.shape)
print(df2021a.shape)
print()
print('Auxílio-doença previdenciário:')
print(df2017p.shape)
print(df2018p.shape)
print(df2019p.shape)
print(df2020p.shape)
print(df2021p.shape)

Auxílio-doença acidentário:
(2042, 3)
(2042, 3)
(2042, 3)
(2042, 3)
(730, 3)

Auxílio-doença previdenciário:
(2042, 3)
(2042, 3)
(2042, 3)
(2042, 3)
(1682, 3)


In [43]:
# Verifica o dataframe
df2017a.head()

Unnamed: 0,Cod,Descricao,Total_2017
2,A00,Colera,0
3,A01,Febres Tifoide e Paratifoide,0
4,A02,Outras Infeccoes por Salmonella,0
5,A03,Shiguelose,0
6,A04,Outras Infeccoes Intestinais Bacterianas,1


## Análises

In [64]:
from functools import reduce
data_frames = [df2017a, df2018a, df2019a, df2020a, df2021a]

df_merged = reduce(lambda left,right: pd.merge(left,right,on=['Cod'],
        how='left', sort=True), data_frames)

  df_merged = reduce(lambda left,right: pd.merge(left,right,on=['Cod'],


In [66]:
df_merged.drop(['Descricao_x', 'Descricao_y', 'Descricao'], axis=1, inplace=True)

In [67]:
df_merged

Unnamed: 0,Cod,Total_2017,Total_2018,Total_2019,Total_2020,Total_2021
0,A00,0,0,0,2,2.0
1,A01,0,0,0,0,
2,A02,0,1,0,0,1.0
3,A03,0,0,0,0,
4,A04,1,0,1,0,
...,...,...,...,...,...,...
2037,Z95,0,2,0,0,
2038,Z96,1,0,2,0,
2039,Z97,0,0,1,0,
2040,Z98,55,72,97,36,31.0


In [68]:
df_merged.isna().sum()

Cod              0
Total_2017       0
Total_2018       0
Total_2019       0
Total_2020       0
Total_2021    1317
dtype: int64

In [71]:
df_merged[df_merged['Cod'].str.startswith(('Z99', 'F42'))]

Unnamed: 0,Cod,Total_2017,Total_2018,Total_2019,Total_2020,Total_2021
447,F42,21,30,24,11,29.0
2041,Z99,0,0,0,0,


In [81]:
df2021a[df2021a.index == 0]

Unnamed: 0,Cod,Descricao,Total_2021


In [86]:
df2021a[df2021a['Cod'].str.contains('F42')]

Unnamed: 0,Cod,Descricao,Total_2021
126,F42,Transtorno Obsessivo-Compulsivo,29.0
