In [1]:
import os
from typing import List
from pathlib import Path
import pandas as pd
import seaborn as sns
# Project directory layout. Every location is derived from the current working
# directory, so the kernel must be started from the project root for these
# paths to resolve. All values are pathlib.Path objects (not str), which is
# what allows the "/" joins below and in the functions that use them.
ROOT_DIR: Path = Path.cwd()
DATA_DIR: Path = ROOT_DIR / "data"
SIH_DIR: Path = DATA_DIR / "sih"              # parquet inputs read by load_files()
IBGE_DATA: Path = DATA_DIR / "tabelas_ibge"   # municipios.csv / estados.csv
CID_DATA: Path = DATA_DIR / "cid_10"          # cid_10.csv
OUTPUT: Path = DATA_DIR / "output"            # destination of the CSVs written by transform()
# Columns kept from the raw SIH frame; transform() discards everything else.
# Grouped here by how the rest of the notebook uses them.
colunas_selecionadas : List[str] = [
    # grouping key of the fact table; renamed ID_PACIENTE in the patient table
    'N_AIH',
    # combined into TIME_KEY
    'ANO_CMPT', 'MES_CMPT',
    # truncated into CID_10_KEY
    'DIAG_PRINC',
    # renamed COD_IBGE in the fact table
    'MUNIC_RES',
    # moved to the patient table
    'NASC', 'SEXO',
    # summed per group in the fact table
    'QT_DIARIAS', 'VAL_TOT',
]
def create_folder(path:str)->None:
    """Create the directory ``path``, including missing parents, if it is absent.

    Calling it on a directory that already exists is a no-op.

    Args:
        path: Directory to create (a ``str`` or any ``os.PathLike``).

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window between the check and the creation in which the call can fail.
    os.makedirs(path, exist_ok=True)
def get_subdirectories(parent_directory:str)->List[str]:
    """List the entries directly inside ``parent_directory`` as joined paths.

    Despite the name, no filtering is applied: every entry returned by
    ``os.listdir`` (files as well as directories) is included.

    Args:
        parent_directory: Directory whose immediate children are listed.

    Returns:
        The child paths (``parent_directory`` joined with each entry name),
        sorted by entry name.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and anything built from it, e.g. a concatenation) reproducible.
    return [
        os.path.join(parent_directory, entry)
        for entry in sorted(os.listdir(parent_directory))
    ]
def load_files()->pd.DataFrame:
    """Read every entry under ``SIH_DIR`` as parquet and stack the results.

    Returns:
        A single frame holding the rows of all entries; the original row
        indexes are kept as-is (no re-indexing is done).
    """
    frames : List[pd.DataFrame] = [
        pd.read_parquet(entry) for entry in get_subdirectories(SIH_DIR)
    ]
    return pd.concat(frames)
def create_time_table(df:pd.DataFrame)->pd.DataFrame:
    """Build the time table: one row per distinct (year, month) pair.

    Args:
        df: Frame containing the columns ``ANO_CMPT`` and ``MES_CMPT``.

    Returns:
        A frame with only those two columns, de-duplicated and sorted by
        year and then month. Row labels of the first occurrences are kept.
    """
    period_columns : List[str] = ['ANO_CMPT','MES_CMPT']
    return (
        df[period_columns]
        .drop_duplicates(subset=period_columns)
        .sort_values(by=period_columns)
    )
def create_time_table_key(df:pd.DataFrame)->pd.DataFrame:
    """Return ``df`` with a ``TIME_KEY`` column of the form ``"<ANO_CMPT>|<MES_CMPT>"``.

    The input frame is left untouched; a new frame is returned. This avoids
    pandas' chained-assignment warning when ``df`` is a slice of another
    frame and keeps the function safe to re-run.

    Args:
        df: Frame with ``ANO_CMPT`` and ``MES_CMPT`` columns. Both must hold
            strings, since they are joined with ``+``.

    Returns:
        A copy of ``df`` with ``TIME_KEY`` appended (or replaced if present).
    """
    return df.assign(TIME_KEY=df['ANO_CMPT'] + '|' + df['MES_CMPT'])
def create_paciente_table(df:pd.DataFrame)->pd.DataFrame:
    """Build the patient table from the columns ``N_AIH``, ``NASC`` and ``SEXO``.

    Args:
        df: Frame containing at least those three columns.

    Returns:
        The distinct (``N_AIH``, ``NASC``, ``SEXO``) rows, with ``N_AIH``
        renamed to ``ID_PACIENTE``.
    """
    patient_columns = ['N_AIH','NASC','SEXO']
    return (
        df[patient_columns]
        .drop_duplicates()
        .rename(columns={'N_AIH':'ID_PACIENTE'})
    )
def convert_date_to_datetime(df:pd.DataFrame)->pd.DataFrame:
    """Reformat the ``NASC`` column from ``YYYYMMDD`` to ``DD/MM/YYYY`` text.

    Despite the function name, the resulting column holds formatted strings,
    not datetimes: values are parsed only to validate and re-render them.

    The input frame is left untouched; a new frame is returned. This avoids
    pandas' chained-assignment warning when ``df`` is a slice of another frame.

    Args:
        df: Frame with a ``NASC`` column in ``%Y%m%d`` format.

    Returns:
        A copy of ``df`` whose ``NASC`` column is formatted as ``%d/%m/%Y``.

    Raises:
        ValueError: If a ``NASC`` value does not match ``%Y%m%d``
            (raised by ``pd.to_datetime``).
    """
    parsed_dates = pd.to_datetime(df['NASC'], format='%Y%m%d')
    return df.assign(NASC=parsed_dates.dt.strftime('%d/%m/%Y'))
def create_municipios_table()->pd.DataFrame:
    """Load the municipalities table from ``IBGE_DATA/municipios.csv``.

    Returns:
        The comma-separated file as a frame, with column ``COD`` renamed to
        ``COD_IBGE`` and ``COD UF`` renamed to ``COD_UF``.
    """
    csv_path = IBGE_DATA/'municipios.csv'
    column_names = {'COD':'COD_IBGE', 'COD UF':'COD_UF'}
    municipios : pd.DataFrame = pd.read_csv(csv_path, sep=',')
    return municipios.rename(columns=column_names)
def create_estados_table()->pd.DataFrame:
    """Load the states table from ``IBGE_DATA/estados.csv``.

    Returns:
        The comma-separated file as a frame, with column ``COD`` renamed to
        ``COD_UF``.
    """
    csv_path = IBGE_DATA/'estados.csv'
    estados : pd.DataFrame = pd.read_csv(csv_path, sep=',')
    return estados.rename(columns={'COD':'COD_UF'})
def create_cid_10_key(df:pd.DataFrame)->pd.DataFrame:
    """Derive the CID-10 key from the main diagnosis code.

    ``DIAG_PRINC`` is renamed to ``CID_10_KEY`` and each value is cut down to
    its first three characters.

    Args:
        df: Frame with a string column ``DIAG_PRINC``. It is not modified.

    Returns:
        A new frame in which ``CID_10_KEY`` replaces ``DIAG_PRINC``. Missing
        diagnosis values stay missing.
    """
    df_cid_10 = df.rename(columns={'DIAG_PRINC':'CID_10_KEY'})
    # Vectorised string slice: no Python-level call per row, and missing
    # values pass through instead of breaking the slice.
    return df_cid_10.assign(CID_10_KEY=df_cid_10['CID_10_KEY'].str[:3])
def create_cid_table()->pd.DataFrame:
    """Load the CID-10 category table from ``CID_DATA/cid_10.csv``.

    The file is read as ``;``-separated, ISO-8859-1 encoded text.

    Returns:
        A frame with the columns ``CID_10_KEY`` (the file's ``CAT`` column)
        and ``DESCRICAO``.
    """
    df_cid : pd.DataFrame= pd.read_csv(filepath_or_buffer=CID_DATA/'cid_10.csv',sep=';',encoding='iso-8859-1')
    select_columns : List[str] = ['CAT','DESCRICAO']
    # DataFrame.rename returns a new frame; its result has to be returned,
    # otherwise the column keeps the name CAT and cannot be matched against
    # the CID_10_KEY column produced by create_cid_10_key.
    return df_cid[select_columns].rename(columns={'CAT':'CID_10_KEY'})


In [3]:
def transform()->None:
    """Run the whole pipeline: load SIH data, build the tables, write the CSVs.

    Steps:
      1. Load the SIH parquet data and keep only ``colunas_selecionadas``.
      2. Build the dimension tables: time, patient, municipalities, states
         and CID-10 categories.
      3. Build the fact table: add ``TIME_KEY`` and ``CID_10_KEY``, drop the
         columns that moved to the dimensions, rename ``MUNIC_RES`` to
         ``COD_IBGE`` and ``VAL_TOT`` to ``VALOR_INTERNACAO``, then sum the
         remaining columns per (``N_AIH``, ``CID_10_KEY``, ``COD_IBGE``,
         ``TIME_KEY``).
      4. Write every table to ``OUTPUT`` as ``;``-separated CSV, creating the
         directory first if needed.

    NOTE(review): the final ``.sum()`` runs over ``QT_DIARIAS`` and
    ``VALOR_INTERNACAO``. ``ANO_CMPT``/``MES_CMPT`` are joined with ``'|'``,
    so they are strings; if the two measure columns are strings as well, the
    sum concatenates text instead of adding numbers - confirm their dtypes.

    NOTE(review): the CSVs are written with the frame index as first column
    (``to_csv`` default) - confirm that downstream consumers expect it.
    """
    # .copy() gives an independent frame, so adding columns later does not
    # write into a slice of the loaded data (SettingWithCopyWarning).
    df_sih : pd.DataFrame = load_files()[colunas_selecionadas].copy()

    # Dimension tables.
    df_time_table : pd.DataFrame = create_time_table(df=df_sih)
    df_time_table = create_time_table_key(df_time_table)
    df_paciente_table : pd.DataFrame = create_paciente_table(df_sih)
    df_paciente_table = convert_date_to_datetime(df=df_paciente_table)
    df_municipios_table : pd.DataFrame = create_municipios_table()
    df_estados_table : pd.DataFrame = create_estados_table()
    df_cid_table : pd.DataFrame = create_cid_table()

    # Fact table. TIME_KEY is built by the same helper as in the time table
    # so both sides of the relationship always use the same key format.
    df_sih = create_time_table_key(df_sih)
    df_sih = create_cid_10_key(df=df_sih)
    df_sih = df_sih.drop(columns=['NASC', 'SEXO', 'ANO_CMPT', 'MES_CMPT'])
    df_sih = df_sih.rename(columns={'MUNIC_RES':'COD_IBGE','VAL_TOT':'VALOR_INTERNACAO'})
    df_sih = df_sih.groupby(by=['N_AIH','CID_10_KEY','COD_IBGE','TIME_KEY'],
               as_index=False).sum()

    # to_csv does not create missing directories; without this the first
    # write fails on a fresh checkout where data/output does not exist yet.
    create_folder(OUTPUT)
    df_time_table.to_csv(path_or_buf=OUTPUT/'time_table.csv',sep=';')
    df_paciente_table.to_csv(path_or_buf=OUTPUT/'paciente_table.csv',sep=';')
    df_municipios_table.to_csv(path_or_buf=OUTPUT/'municipios_table.csv',sep=';')
    df_estados_table.to_csv(path_or_buf=OUTPUT/'estados_table.csv',sep=';')
    df_cid_table.to_csv(path_or_buf=OUTPUT/'cid_table.csv',sep=';')
    df_sih.to_csv(path_or_buf=OUTPUT/'internacoes.csv',sep=';')
    
# Execute the pipeline: reads the inputs under data/ and writes the CSV tables
# to data/output. Re-running this cell overwrites the previously written files.
transform()
