In [2]:
# código para criar a leitura dos dados armazenados em csv
import configparser
import itertools
import os
from datetime import datetime
import pandas as pd

In [20]:
# Carregar o arquivo de configuração
config = configparser.ConfigParser()
config.read("config.ini")

transformacao_dir = config["Paths"]["transformacao_dir"]

In [45]:

def process_csv(file_path):
    """Função para processar o arquivo CSV e retornar um DataFrame"""
    # Ler o arquivo CSV, extraindo as 10 primeiras linhas (para pegar a linha 4)
    temp_df = pd.read_csv(file_path, nrows=10, header=None, encoding="ISO-8859-1")

    # Extrair o mês da linha 4 (index 3 porque indexação começa em 0)
    linha = (
        temp_df.iloc[3].dropna().values[0]
    )  # Supondo que o mês esteja na primeira coluna não vazia

    # Extrair o mês da linha
    mes = linha.split(" ")[1].split("/")[0]
    ano = linha.split(" ")[1].split("/")[1]

    # Ler o arquivo inteiro e remover as 7 primeiras e 4 últimas linhas
    df = pd.read_csv(
        file_path, skiprows=7, encoding="ISO-8859-1", sep=";"
    )  # Ignorar as 7 primeiras linhas
    df = df[:-2]  # Remover as 2 últimas linhas

    # Renomear e preencher a última coluna para mes
    df.rename(columns={df.columns[-1]: "Mes"}, inplace=True)
    df["Mes"] = mes

    # Adicionar a coluna de ano
    df["Ano"] = ano

    return df


def concat_csv_files(files):
    """Função para concatenar os arquivos CSV em grupos de 100"""
    # Inicializar um DataFrame vazio
    df = pd.DataFrame()

    # Colunas esperadas no DataFrame
    colunas =['Uf',
            'Ibge',
            'Municipio',
            'Asma',
            'Desnutrição',
            'Diabetes',
            'DPOC',
            'Hipertensão arterial',
            'Obesidade',
            'Pré-natal',
            'Puericultura',
            'Puerpério (até 42 dias)',
            'Saúde sexual e reprodutiva',
            'Tabagismo',
            'Usuário de álcool',
            'Usuário de outras drogas',
            'Saúde mental',
            'Reabilitação',
            'D.Transmissíveis - Dengue',
            'Doenças transmissíveis - DST',
            'D.Transmissíveis - Hanseníase',
            'D.Transmissíveis - Tuberculose',
            'Rast. câncer de mama',
            'Rast. câncer do colo do útero',
            'Rast. risco cardiovascular',
            'Mes',
            'Ano']
    # Iterar sobre os arquivos
    for i, file in enumerate(files):
        # Processar o arquivo
        temp_df = process_csv(file)
        # Pular arquivos que não possuem as colunas esperadas
        if temp_df.columns.tolist() != colunas:
            print(file, "não possui as colunas esperadas")
            print("Colunas DF", temp_df.columns.tolist())
            print()
            continue
        # Concatenar com o DataFrame principal
        df = pd.concat([df, temp_df])

        # Salvar a cada 100 arquivos
        if i % 100 == 0 and i > 0:
            df.to_csv(f"partial_{i}.csv", index=False)
            df = pd.DataFrame()
    try:
        # Salvar o restante
        df.to_csv(f"partial_{i}.csv", index=False)
    # Catch UnboundLocalError
    except UnboundLocalError:
        print("Menos de 100 arquivos foram processados")
        df.to_csv(f"partial.csv", index=False)


# Função para listar os arquivos CSV
def list_files(directory="/home/daniel/Downloads"):
    """Função para listar os arquivos CSV em um diretório"""
    import os

    # Listar os arquivos com o diretório
    files = [
        os.path.join(directory, file)
        for file in os.listdir(directory)
        if file.endswith(".csv")
    ]

    return files


def concat_final_csv(name, dir="."):    
    """Função para concatenar os arquivos CSV"""
    # Inicializar um DataFrame vazio
    df = pd.DataFrame()
    for file in list_files(dir):
        print(file)
        df_temp = pd.read_csv(file)
        print(df_temp.head())
        df = pd.concat([df, df_temp])
    # Remove duplicados
    print(df.shape)
    df.drop_duplicates(inplace=True)
    print(df.shape)
    # Salvar o arquivo final
    df.to_csv(f"{name}.csv", index=False)
    return df


def ausentes(df):
    """Função para contar os valores ausentes em um DataFrame"""

    now = datetime.now()
    ano_atual = now.year + 1
    mes_atual = now.month + 1

    anos = list(range(2021, ano_atual))
    mes = list(range(1, mes_atual))

    combinacoes = list(itertools.product(df['Uf'].unique(), df['Mes'].unique(), anos))
    combinacoes_24 = list(itertools.product(df['Uf'].unique(), mes, [2024]))

    combinacoes = combinacoes + combinacoes_24

    # Filtrar as combinações que já existem no DataFrame
    combinacoes_existentes = set(zip(df['Uf'], df['Mes'], df['Ano']))

    # Identificar as combinações ausentes
    combinacoes_ausentes = [comb for comb in combinacoes if comb not in combinacoes_existentes]
    return combinacoes_ausentes


def existentes(df):
    """Função para contar os valores existentes em um DataFrame"""

    # Filtrar as combinações que já existem no DataFrame
    combinacoes_existentes = set(zip(df['Uf'], df['Mes'], df['Ano']))

    return combinacoes_existentes


def atualiza_controle(combinacoes_existentes):
    """Função para atualizar o controle de combinações"""
    with open('controle/producao.txt', 'w') as f:
        for comb in combinacoes_existentes:
            if comb[1] < 10:
                f.write(f'{comb[0]}_0{comb[1]}/{comb[2]}\n')
            else:
                f.write(f'{comb[0]}_{comb[1]}/{comb[2]}\n')


def remove_temp_files(transformacao_dir):
    """Função para remover os arquivos temporários"""
    for file in os.listdir("."):
        if file.startswith("partial_"):
            os.remove(file)
    for file in os.listdir(transformacao_dir):
        if file.endswith(".csv"):
            os.remove(os.path.join(transformacao_dir, file))



if __name__ == "__main__":
    print("Iniciando a transformação dos dados")
    # Listar os arquivos
    files = list_files(transformacao_dir)
    print("Concatenando os arquivos")
    # Concatenar os arquivos
    concat_csv_files(files)
    print("Concatenando o arquivo final")
    # Concatenar o arquivo final
    df = concat_final_csv('procedimentos')
    print("Removendo os arquivos temporários")
    if len(df) > 1000:
        # remove_temp_files(transformacao_dir)
        print("Transformação concluída")
    else:
        print("Erro na transformação")

Iniciando a transformação dos dados
Concatenando os arquivos
Concatenando o arquivo final
./partial_100.csv
   Uf      Ibge            Municipio   Asma  Desnutrição  Diabetes  DPOC  \
0  PI  220900.0  RIO GRANDE DO PIAUÍ    1.0          2.0       5.0   1.0   
1  PR  410520.0           CERRO AZUL    2.0          2.0     145.0   2.0   
2  RJ  330190.0             ITABORAÍ   86.0         15.0     970.0  28.0   
3  RS  431410.0          PASSO FUNDO  161.0         52.0     594.0  51.0   
4  GO  520630.0       CRISTIANÓPOLIS    5.0          0.0      68.0   1.0   

   Hipertensão arterial  Obesidade  Pré-natal  ...  Reabilitação  \
0                25.000        2.0       32.0  ...         122.0   
1               280.000       39.0       49.0  ...         200.0   
2                 2.964      171.0      586.0  ...         358.0   
3                 1.211      269.0      853.0  ...           0.0   
4               168.000       15.0       11.0  ...           1.0   

   D.Transmissíveis - Deng

In [43]:
df = pd.read_csv('partial_100.csv')
print(len(df))
df2 = pd.read_csv('partial_134.csv')
print(len(df2))

df = pd.concat([df, df2])
df

477562
136542


Unnamed: 0,Uf,Ibge,Municipio,Asma,Desnutrição,Diabetes,DPOC,Hipertensão arterial,Obesidade,Pré-natal,...,Reabilitação,D.Transmissíveis - Dengue,Doenças transmissíveis - DST,D.Transmissíveis - Hanseníase,D.Transmissíveis - Tuberculose,Rast. câncer de mama,Rast. câncer do colo do útero,Rast. risco cardiovascular,Mes,Ano
0,PI,220900.0,RIO GRANDE DO PIAUÍ,1.0,2.0,5.0,1.0,25.000,2.0,32.0,...,122.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,MAI,2021.0
1,PR,410520.0,CERRO AZUL,2.0,2.0,145.0,2.0,280.000,39.0,49.0,...,200.0,0.0,0.0,0.0,0.0,19.0,20.0,15.0,MAI,2021.0
2,RJ,330190.0,ITABORAÍ,86.0,15.0,970.0,28.0,2.964,171.0,586.0,...,358.0,0.0,11.0,0.0,3.0,125.0,135.0,139.0,MAI,2021.0
3,RS,431410.0,PASSO FUNDO,161.0,52.0,594.0,51.0,1.211,269.0,853.0,...,0.0,1.0,52.0,0.0,31.0,6.0,36.0,0.0,MAI,2021.0
4,GO,520630.0,CRISTIANÓPOLIS,5.0,0.0,68.0,1.0,168.000,15.0,11.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,MAI,2021.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136537,MG,312310.0,DORES DE GUANHÃES,2.0,3.0,34.0,2.0,83.000,5.0,38.0,...,2.0,0.0,1.0,0.0,0.0,1.0,9.0,0.0,AGO,2019.0
136538,PR,412402.0,SANTA TEREZA DO OESTE,21.0,1.0,38.0,5.0,51.000,1.0,103.0,...,0.0,1.0,8.0,1.0,0.0,0.0,0.0,0.0,AGO,2019.0
136539,SP,350480.0,BÁLSAMO,11.0,2.0,48.0,15.0,59.000,54.0,55.0,...,0.0,10.0,3.0,1.0,0.0,0.0,90.0,0.0,AGO,2019.0
136540,BA,292670.0,RIO DE CONTAS,9.0,19.0,82.0,4.0,391.000,18.0,97.0,...,357.0,1.0,2.0,2.0,1.0,0.0,6.0,2.0,AGO,2019.0


In [29]:
print("Iniciando a transformação dos dados")
# Listar os arquivos
files = list_files(transformacao_dir)
print("Concatenando os arquivos")
# Concatenar os arquivos
concat_csv_files(files)

print("Concatenando o arquivo final")
# Concatenar o arquivo final
df = concat_final_csv('procedimentos')
"""
print("Gravando os arquivos existentes")
atualiza_controle(existentes(df))
print("Removendo os arquivos temporários")
remove_temp_files(transformacao_dir)
print("Transformação concluída")"""


Iniciando a transformação dos dados
Concatenando os arquivos
Concatenando o arquivo final


'\nprint("Gravando os arquivos existentes")\natualiza_controle(existentes(df))\nprint("Removendo os arquivos temporários")\nremove_temp_files(transformacao_dir)\nprint("Transformação concluída")'

In [42]:
import chardet

def detectar_encoding(arquivo):
    with open(arquivo, 'rb') as f:
        resultado = chardet.detect(f.read())
    return resultado['encoding']

# Exemplo de uso
arquivo = '/home/daniel/projetos/saude-basica/etl/data/transformacao/producao_profissionais_individual_SET-2024.csv'
encoding = detectar_encoding(arquivo)
print(f"O encoding do arquivo é: {encoding}")

O encoding do arquivo é: ISO-8859-1


In [64]:
arquivo
copia = '/home/daniel/projetos/saude-basica/etl/data/transformacao/producao_profissionais_individual_SET_c-2024.csv'

In [68]:
with open(arquivo, 'r', encoding=encoding) as file:
    # Pula as primeiras 7 linhas
    for _ in range(7):
        next(file)
    # Lê o cabeçalho
    lines = file.readlines()

# Detectar cabeçalho para encontrar colunas numéricas
header = lines[0].strip().split(';')

# Suponha que as primeiras três colunas são "Uf", "Municipio" e "Mes"
fixed_columns = ["Uf", "Municipio", "Mes"]
numeric_columns = [col for col in header if col not in fixed_columns]

# Limpar cada linha removendo pontos nas colunas numéricas
with open(copia, 'w', encoding=encoding) as temp_file:
    # Escrever cabeçalho
    temp_file.write(lines[0])
    # Limpar linhas
    for line in lines[1:]:
        # Separar a linha em colunas usando ';' como delimitador
        values = line.strip().split(';')
        # Remover pontos nas colunas numéricas
        cleaned_values = [
            value.replace('.', '') if header[i] in numeric_columns else value
            for i, value in enumerate(values)
        ]
        # Escrever linha limpa com ';' como separador
        temp_file.write(';'.join(cleaned_values) + '\n')

IndexError: list index out of range

In [80]:
# Define as colunas fixas e faz a leitura
fixed_columns = ["Uf", "Municipio", "Mes"]

# Usa read_csv com conversão para remover pontos nas colunas numéricas
df = pd.read_csv(
    arquivo,
    encoding=encoding,
    sep=';',
    skiprows=7,
    converters={
        col: lambda x: int(x.replace('.', '')) if pd.notnull(x) else None
        for col in pd.read_csv(arquivo, encoding=encoding, sep=';', nrows=1).columns
        if col not in fixed_columns
    }
)
# Remove as duas últimas linhas e a última coluna
df = df.iloc[:-2, :-1]

# Converte as colunas numéricas para o tipo int, agora que as últimas linhas foram removidas
for col in df.columns:
    if col not in fixed_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(int)
# Verifica o resultado
print(df.dtypes)
print(df.head())

Uf                                object
Ibge                               int64
Municipio                         object
Agente comunitário de saúde        int64
Agente de combate a endemias       int64
Agente de saúde                    int64
Assistente Social                  int64
Cirurgião dentista                 int64
Educador social                    int64
Enfermeiro                         int64
Farmacêutico                       int64
Fisioterapeuta                     int64
Fonoaudiólogo                      int64
Médico                             int64
Médico veterinário                 int64
Nutricionista                      int64
Outros prof. de nível médio        int64
Outros prof. de nível superior     int64
Profissional de educação físic     int64
Psicólogo                          int64
Sanitarista                        int64
Técnico e auxiliar de enfermag     int64
Técnico e auxiliar de saúde bu     int64
Terapeuta ocupacional              int64
Naturólogo      

In [82]:
import pandas as pd

df = pd.read_csv(arquivo, encoding=encoding, sep=';', skiprows=7, dtype=str)
# apagar as últimas duas linhas
df = df[:-2]
# remove a ultima coluna
df = df.iloc[:, :-1]
df.head()

Unnamed: 0,Uf,Ibge,Municipio,Agente comunitário de saúde,Agente de combate a endemias,Agente de saúde,Assistente Social,Cirurgião dentista,Educador social,Enfermeiro,...,Psicólogo,Sanitarista,Técnico e auxiliar de enfermag,Técnico e auxiliar de saúde bu,Terapeuta ocupacional,Naturólogo,Musicoterapeuta,Arteterapeuta,Recepcionista,Terapeuto Holístico
0,RJ,330550,SAQUAREMA,0,0,0,16,0,0,5.676,...,51,0,0,0,0,0,0,0,0,0
1,SP,350730,BORACÉIA,0,0,0,0,0,0,78.0,...,5,0,0,0,0,0,0,0,0,0
2,GO,520235,ARENÓPOLIS,0,0,0,0,0,0,126.0,...,80,0,0,0,63,0,0,0,0,0
3,AL,270830,SÃO JOSÉ DA LAJE,0,0,0,0,0,0,1.8,...,129,0,0,0,0,0,0,0,0,0
4,SP,350750,BOTUCATU,0,0,0,186,0,0,12.092,...,377,0,0,0,0,0,0,0,0,0


In [84]:
# Substituir pontos por vazio e converter para int
for col in df.columns:
    if col not in fixed_columns:
        df[col] = df[col].str.replace('.', '').astype(int)

# Verificar o resultado
print(df.dtypes)
print(df.head())

Uf                                object
Ibge                               int64
Municipio                         object
Agente comunitário de saúde        int64
Agente de combate a endemias       int64
Agente de saúde                    int64
Assistente Social                  int64
Cirurgião dentista                 int64
Educador social                    int64
Enfermeiro                         int64
Farmacêutico                       int64
Fisioterapeuta                     int64
Fonoaudiólogo                      int64
Médico                             int64
Médico veterinário                 int64
Nutricionista                      int64
Outros prof. de nível médio        int64
Outros prof. de nível superior     int64
Profissional de educação físic     int64
Psicólogo                          int64
Sanitarista                        int64
Técnico e auxiliar de enfermag     int64
Técnico e auxiliar de saúde bu     int64
Terapeuta ocupacional              int64
Naturólogo      

  df[col] = df[col].str.replace('.', '').astype(int)


In [85]:
df[['Municipio', 'Enfermeiro', 'Médico']]

Unnamed: 0,Municipio,Enfermeiro,Médico
0,SAQUAREMA,5676,8358
1,BORACÉIA,78,2752
2,ARENÓPOLIS,126,539
3,SÃO JOSÉ DA LAJE,1800,3299
4,BOTUCATU,12092,22488
...,...,...,...
5534,SERRANÓPOLIS DO IGUAÇU,645,1936
5535,MORRO AGUDO,759,6846
5536,FORMOSA DO SUL,1708,1387
5537,SANTA ISABEL DO IVAÍ,306,2966


In [71]:
df[['Municipio', 'Enfermeiro', 'Médico']]

Unnamed: 0,Municipio,Enfermeiro,Médico
0,SAQUAREMA,5.676,8.358
1,BORACÉIA,78.000,2.752
2,ARENÓPOLIS,126.000,539.000
3,SÃO JOSÉ DA LAJE,1.800,3.299
4,BOTUCATU,12.092,22.488
...,...,...,...
5536,FORMOSA DO SUL,1.708,1.387
5537,SANTA ISABEL DO IVAÍ,306.000,2.966
5538,CASTILHO,453.000,3.093
5539,,,


In [51]:
# Criar lista com colunas numéricas
colunas_numericas = df.select_dtypes(include=['number']).columns.tolist()
colunas_numericas = [col for col in colunas_numericas if col not in ["Uf", "Municipio", "Mes"]]

# Criar lista com colunas não numéricas
colunas_nao_numericas = df.select_dtypes(exclude=['number']).columns.tolist()
colunas_nao_numericas = [col for col in colunas_nao_numericas if col not in ["Uf", "Municipio", "Mes"]]
colunas_nao_numericas

[]

In [52]:
# Criar lista com colunas numéricas
colunas_numericas = df.select_dtypes(include=['number']).columns.tolist()
colunas_numericas = [col for col in colunas_numericas if col not in ["Uf", "Municipio", "Mes"]]

# Criar lista com colunas não numéricas
colunas_nao_numericas = df.select_dtypes(exclude=['number']).columns.tolist()
colunas_nao_numericas = [col for col in colunas_nao_numericas if col not in ["Uf", "Municipio", "Mes"]]

for col in colunas_numericas:
    df[col] = (
        pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)
    )
for col in colunas_nao_numericas:
    df[col] = df[col].str.replace('.', '').astype(int)

df

Unnamed: 0,Uf,Ibge,Municipio,Agente comunitário de saúde,Agente de combate a endemias,Agente de saúde,Assistente Social,Cirurgião dentista,Educador social,Enfermeiro,...,Psicólogo,Sanitarista,Técnico e auxiliar de enfermag,Técnico e auxiliar de saúde bu,Terapeuta ocupacional,Naturólogo,Musicoterapeuta,Arteterapeuta,Recepcionista,Terapeuto Holístico
0,RJ,330550,SAQUAREMA,0,0,0,16,0,0,5,...,51,0,0,0,0,0,0,0,0,0
1,SP,350730,BORACÉIA,0,0,0,0,0,0,78,...,5,0,0,0,0,0,0,0,0,0
2,GO,520235,ARENÓPOLIS,0,0,0,0,0,0,126,...,80,0,0,0,63,0,0,0,0,0
3,AL,270830,SÃO JOSÉ DA LAJE,0,0,0,0,0,0,1,...,129,0,0,0,0,0,0,0,0,0
4,SP,350750,BOTUCATU,0,0,0,186,0,0,12,...,377,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5534,PR,412635,SERRANÓPOLIS DO IGUAÇU,0,0,0,0,0,0,645,...,64,0,0,0,0,0,0,0,0,0
5535,SP,353190,MORRO AGUDO,0,0,0,0,0,0,759,...,264,0,0,0,3,0,0,0,0,0
5536,SC,420543,FORMOSA DO SUL,0,0,0,0,0,0,1,...,113,0,0,0,0,0,0,0,0,0
5537,PR,412370,SANTA ISABEL DO IVAÍ,0,0,0,0,0,0,306,...,201,0,0,0,0,0,0,0,0,0


In [76]:
df

Unnamed: 0,Uf,Ibge,Municipio,Agente comunitário de saúde,Agente de combate a endemias,Agente de saúde,Assistente Social,Cirurgião dentista,Educador social,Enfermeiro,...,Sanitarista,Técnico e auxiliar de enfermag,Técnico e auxiliar de saúde bu,Terapeuta ocupacional,Naturólogo,Musicoterapeuta,Arteterapeuta,Recepcionista,Terapeuto Holístico,Unnamed: 29
0,RJ,330550.0,SAQUAREMA,0.0,0.0,0.0,16.0,0.0,0.0,5.676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,SP,350730.0,BORACÉIA,0.0,0.0,0.0,0.0,0.0,0.0,78.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,GO,520235.0,ARENÓPOLIS,0.0,0.0,0.0,0.0,0.0,0.0,126.000,...,0.0,0.0,0.0,63.0,0.0,0.0,0.0,0.0,0.0,
3,AL,270830.0,SÃO JOSÉ DA LAJE,0.0,0.0,0.0,0.0,0.0,0.0,1.800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,SP,350750.0,BOTUCATU,0.0,0.0,0.0,186.0,0.0,0.0,12.092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5534,PR,412635.0,SERRANÓPOLIS DO IGUAÇU,0.0,0.0,0.0,0.0,0.0,0.0,645.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5535,SP,353190.0,MORRO AGUDO,0.0,0.0,0.0,0.0,0.0,0.0,759.000,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,
5536,SC,420543.0,FORMOSA DO SUL,0.0,0.0,0.0,0.0,0.0,0.0,1.708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
5537,PR,412370.0,SANTA ISABEL DO IVAÍ,0.0,0.0,0.0,0.0,0.0,0.0,306.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [20]:
# converter coluna atendimento individual para inteiro
df['Atendimento Individual'] = df['Atendimento Individual'].str.replace('.', '').astype(int)
df[df['Municipio'] == 'SÃO PAULO']

  df['Atendimento Individual'] = df['Atendimento Individual'].str.replace('.', '').astype(int)


Unnamed: 0,Uf,Ibge,Municipio,Atendimento Individual,Atendimento Odontológico,Procedimento,Visita Domiciliar,Unnamed: 7
540,SP,355030.0,SÃO PAULO,1617442,258.401,2.261.704,2.131.099,
