In [1]:
# Código do Procedimento Ambulatorial - PA_PROC_ID
# CID9 Principal (APAC ou BPA-I) - PA_CIDPRI
# Idade do paciente em anos - PA_IDADE
# Sexo do paciente - PA_SEXO
# Raça Cor do paciente- PA_RACACOR
# Valor Aprovado do procedimento - PA_VALAPR
# Indicador de Alta (APAC) - PA_ALTA
# Indicador de Óbito - PA_OBITO

### Import the Libraries

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from pyspark.sql import functions as F
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

### Initialize SparkSession

In [3]:
spark = SparkSession.builder \
    .appName("Brazil States CSVs to DataFrame") \
    .getOrCreate()

### Define the Paths to Your CSV Files

In [4]:
file_paths = {
    'Acre': ["csv/PAAC2301.csv"], 
    'Alagoas': ["csv/PAAL2301.csv"], 
    'Amapá': ["csv/PAAP2301.csv"], 
    'Amazonas': ["csv/PAAM2301.csv"],
    'Bahia': ["csv/PABA2301.csv"],
    'Ceará': ["csv/PACE2301.csv"],
    'Distrito Federal': ["csv/PADF2301.csv"],
    'Espírito Santo': ["csv/PASC2301.csv"], 
    'Goiás': ["csv/PAGO2301.csv"], 
    'Maranhão': ["csv/PAMA2301.csv"],
    'Mato Grosso': ["csv/PAMT2301.csv"], 
    'Mato Grosso do Sul': ["csv/PAMS2301.csv"], 
    'Pará': ["csv/PAPA2301.csv"], 
    'Paraíba': ["csv/PAPB2301.csv"], 
    'Paraná': ["csv/PAPR2301.csv"], 
    'Pernambuco': ["csv/PAPE2301.csv"],
    'Piauí': ["csv/PAPI2301.csv"], 
    'Rio Grande do Norte': ["csv/PARN2301.csv"], 
    'Rio Grande do Sul': ["csv/PARS2301.csv"], 
    'Rondônia': ["csv/PARO2301.csv"], 
    'Roraima': ["csv/PARR2301.csv"], 
    'Santa Catarina': ["csv/PASC2301.csv"],
    'Sergipe': ["csv/PASE2301.csv"],  
    'Tocantins': ["csv/PATO2301.csv"],
    "São Paulo": ["csv/PASP2301a.csv", "csv/PASP2301b.csv", "csv/PASP2301c.csv"],
    "Minas Gerais": ["csv/PAMG2301a.csv", "csv/PAMG2301b.csv"],
    "Rio de Janeiro": ["csv/PARJ2301a.csv", "csv/PARJ2301b.csv"]
}

### Read the CSV Files and Union Them

In [5]:
columns_to_keep = ['PA_PROC_ID', 
                   'PA_CIDPRI',
                   'PA_IDADE', 
                   'PA_SEXO',
                   'PA_RACACOR',
                   'PA_VALAPR',
                   'PA_ALTA',
                   'PA_OBITO']

### Filter by the N180

In [6]:
# Process and merge the CSV files
all_states_df = None

for state, paths in file_paths.items():
    state_df = None
    for path in paths:
        # Read each CSV file
        df = spark.read.csv(path, header=True, inferSchema=True)
        # Select the necessary columns
        df = df.select(columns_to_keep)
        # Filter the rows where PA_CIDPRI equals 'N180'
        df = df.filter(df.PA_CIDPRI == 'N180')
        # Add a new column for the state
        df = df.withColumn("State", lit(state))
        # Union the dataframes for the same state
        if state_df is None:
            state_df = df
        else:
            state_df = state_df.unionByName(df, allowMissingColumns=True)
    # Print the progress indicator
    print(f"State '{state}' done.")
    # Union the dataframes for all states
    if all_states_df is None:
        all_states_df = state_df
    else:
        all_states_df = all_states_df.unionByName(state_df, allowMissingColumns=True)

State 'Acre' done.
State 'Alagoas' done.
State 'Amapá' done.
State 'Amazonas' done.
State 'Bahia' done.
State 'Ceará' done.
State 'Distrito Federal' done.
State 'Espírito Santo' done.
State 'Goiás' done.
State 'Maranhão' done.
State 'Mato Grosso' done.
State 'Mato Grosso do Sul' done.
State 'Pará' done.
State 'Paraíba' done.
State 'Paraná' done.
State 'Pernambuco' done.
State 'Piauí' done.
State 'Rio Grande do Norte' done.
State 'Rio Grande do Sul' done.
State 'Rondônia' done.
State 'Roraima' done.
State 'Santa Catarina' done.
State 'Sergipe' done.
State 'Tocantins' done.
State 'São Paulo' done.
State 'Minas Gerais' done.
State 'Rio de Janeiro' done.


In [7]:
# Show the first 5 rows of the results
all_states_df.show(5)

+----------+---------+--------+-------+----------+---------+-------+--------+-----+
|PA_PROC_ID|PA_CIDPRI|PA_IDADE|PA_SEXO|PA_RACACOR|PA_VALAPR|PA_ALTA|PA_OBITO|State|
+----------+---------+--------+-------+----------+---------+-------+--------+-----+
| 604260016|     N180|      82|      F|         3|     50.4|      0|       0| Acre|
| 604260016|     N180|      58|      M|         3|     50.4|      0|       0| Acre|
| 604260016|     N180|      70|      M|         3|    10.08|      0|       0| Acre|
| 604260016|     N180|      60|      F|         3|     50.4|      0|       0| Acre|
| 604260016|     N180|      46|      F|         3|     50.4|      0|       0| Acre|
+----------+---------+--------+-------+----------+---------+-------+--------+-----+
only showing top 5 rows



### Convert the Spark DataFrame to a Pandas DataFrame

In [8]:
all_states_pd_df = all_states_df.toPandas()

# Show the first few rows of the Pandas DataFrame
all_states_pd_df.head()

Unnamed: 0,PA_PROC_ID,PA_CIDPRI,PA_IDADE,PA_SEXO,PA_RACACOR,PA_VALAPR,PA_ALTA,PA_OBITO,State
0,604260016,N180,82,F,3,50.4,0,0,Acre
1,604260016,N180,58,M,3,50.4,0,0,Acre
2,604260016,N180,70,M,3,10.08,0,0,Acre
3,604260016,N180,60,F,3,50.4,0,0,Acre
4,604260016,N180,46,F,3,50.4,0,0,Acre


In [9]:
# Define the mapping of codes to labels
racacor_mapping = {
    1: 'Branca',
    2: 'Preta',
    3: 'Parda',
    4: 'Amarela',
    5: 'Indígena',
    99: 'Sem informação'
}

# Replace the codes in 'PA_RACACOR' with the descriptive labels
all_states_pd_df['PA_RACACOR'] = all_states_pd_df['PA_RACACOR'].replace(racacor_mapping)

all_states_pd_df.head()

Unnamed: 0,PA_PROC_ID,PA_CIDPRI,PA_IDADE,PA_SEXO,PA_RACACOR,PA_VALAPR,PA_ALTA,PA_OBITO,State
0,604260016,N180,82,F,Parda,50.4,0,0,Acre
1,604260016,N180,58,M,Parda,50.4,0,0,Acre
2,604260016,N180,70,M,Parda,10.08,0,0,Acre
3,604260016,N180,60,F,Parda,50.4,0,0,Acre
4,604260016,N180,46,F,Parda,50.4,0,0,Acre


In [10]:
# Replace values in 'PA_SEXO'
all_states_pd_df['PA_SEXO'] = all_states_pd_df['PA_SEXO'].replace({'F': 'Mulher', 'M': 'Homem'})

# Replace values in 'PA_ALTA' and 'PA_OBITO'
replacement_map = {1: 'Não', 0: 'Sim'}
replacemente_map_ob = {1: 'Vivo', 0: 'Morto'}
all_states_pd_df['PA_ALTA'] = all_states_pd_df['PA_ALTA'].replace(replacement_map)
all_states_pd_df['PA_OBITO'] = all_states_pd_df['PA_OBITO'].replace(replacemente_map_ob)

In [11]:
# Renaming columns in the DataFrame
all_states_pd_df = all_states_pd_df.rename(columns={
    'PA_PROC_ID': 'Código do Procedimento Ambulatorial',
    'PA_CIDPRI': 'CID9 Principal (APAC ou BPA-I)',
    'PA_IDADE': 'Idade do paciente em anos',
    'PA_SEXO': 'Sexo do paciente',
    'PA_RACACOR': 'Raça Cor do paciente',
    'PA_VALAPR': 'Valor Aprovado do procedimento',
    'PA_ALTA': 'Indicador de Alta (APAC)',
    'PA_OBITO': 'Indicador de Óbito'
})

In [12]:
spark.stop()

### Creating Dataframes for the maps

### Frequency of Sexo do Paciente

In [13]:
# Creating a pivot table for PA_SEXO to count frequencies of options per state
df_state_sexo_freq = all_states_pd_df.pivot_table(index='State', columns='Sexo do paciente', aggfunc='size', fill_value=0)

In [14]:
df_state_sexo_freq.head()

Sexo do paciente,Homem,Mulher
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Acre,162,112
Alagoas,3375,2493
Amapá,132,123
Amazonas,1984,1663
Bahia,13426,9690


In [15]:
# Remove the top level of the multi-index columns if it only contains one value 'Sexo do paciente'
if isinstance(df_state_sexo_freq.columns, pd.MultiIndex):
    # Assuming that the top level of the multi-index is not necessary and can be removed
    df_state_sexo_freq.columns = df_state_sexo_freq.columns.droplevel(0)

# Alternatively, if you want to explicitly set new column names:
df_state_sexo_freq.columns = ['Homem', 'Mulher']

In [16]:
# Reset the index to turn 'Estado' into a column
df_state_sexo_freq.reset_index(inplace=True)

In [17]:
df_state_sexo_freq.head()

Unnamed: 0,State,Homem,Mulher
0,Acre,162,112
1,Alagoas,3375,2493
2,Amapá,132,123
3,Amazonas,1984,1663
4,Bahia,13426,9690


### Frequency of Raça Cor do Paciente

In [18]:
# Creating a pivot table for PA_SEXO to count frequencies of options per state
df_state_raca_freq = all_states_pd_df.pivot_table(index='State', columns='Raça Cor do paciente', aggfunc='size', fill_value=0)

In [19]:
# Remove the top level of the multi-index columns if it only contains one value 'Sexo do paciente'
if isinstance(df_state_raca_freq.columns, pd.MultiIndex):
    # Assuming that the top level of the multi-index is not necessary and can be removed
    df_state_raca_freq.columns = df_state_raca_freq.columns.droplevel(0)

# Alternatively, if you want to explicitly set new column names:
df_state_raca_freq.columns = ['Amarela', 'Branca', 'Indígena', 'Parda', 'Preta', 'Sem informação']

In [20]:
# Reset the index to turn 'Estado' into a column
df_state_raca_freq.reset_index(inplace=True)

In [21]:
df_state_raca_freq.head()

Unnamed: 0,State,Amarela,Branca,Indígena,Parda,Preta,Sem informação
0,Acre,24,31,0,172,2,45
1,Alagoas,742,650,8,3852,281,335
2,Amapá,7,27,0,94,16,111
3,Amazonas,53,66,18,989,30,2491
4,Bahia,1763,1577,10,4746,1708,13312


### Avarage Idade

In [22]:
# Calculating the average age for each state
average_age_per_state = all_states_pd_df.groupby('State')['Idade do paciente em anos'].mean().reset_index()

# Renaming columns for clarity
average_age_per_state.columns = ['State', 'Idade média em anos']

average_age_per_state.head()

Unnamed: 0,State,Idade média em anos
0,Acre,52.332117
1,Alagoas,53.276244
2,Amapá,53.984314
3,Amazonas,52.185906
4,Bahia,55.053902


### Sum of Valor Aprovado

In [23]:
# Grouping by 'State' and calculating the sum for the specified columns
valor_aprovad_state_df = all_states_pd_df.groupby('State')[['Valor Aprovado do procedimento']].sum().reset_index()

# Displaying the first few rows of the resulting summary DataFrame
valor_aprovad_state_df.head()

Unnamed: 0,State,Valor Aprovado do procedimento
0,Acre,1406.16
1,Alagoas,105667.31
2,Amapá,2669.92
3,Amazonas,498534.36
4,Bahia,361532.25


### Frequency of Indicador de Alta

In [24]:
df_state_alta_freq = all_states_pd_df.pivot_table(index='State', columns='Indicador de Alta (APAC)', aggfunc='size', fill_value=0)
# Remove the top level of the multi-index columns if it only contains one value 'Sexo do paciente'
if isinstance(df_state_alta_freq.columns, pd.MultiIndex):
    # Assuming that the top level of the multi-index is not necessary and can be removed
    df_state_alta_freq.columns = df_state_alta_freq.columns.droplevel(0)

# Alternatively, if you want to explicitly set new column names:
df_state_alta_freq.columns = ['Sim', 'Não']
# Reset the index to turn 'Estado' into a column
df_state_alta_freq.reset_index(inplace=True)

df_state_alta_freq['Total'] = df_state_alta_freq['Sim'] + df_state_alta_freq['Não']

df_state_alta_freq.head()

Unnamed: 0,State,Sim,Não,Total
0,Acre,0,274,274
1,Alagoas,47,5821,5868
2,Amapá,6,249,255
3,Amazonas,68,3579,3647
4,Bahia,249,22867,23116


### Frequency of Indicador de Óbito

In [26]:
df_state_ob = all_states_pd_df.pivot_table(index='State', columns='Indicador de Óbito', aggfunc='size', fill_value=0)
# Remove the top level of the multi-index columns if it only contains one value 'Sexo do paciente'
if isinstance(df_state_ob.columns, pd.MultiIndex):
    # Assuming that the top level of the multi-index is not necessary and can be removed
    df_state_ob.columns = df_state_ob.columns.droplevel(0)

# Alternatively, if you want to explicitly set new column names:
df_state_ob.columns = ['Vivos', 'Mortos']
# Reset the index to turn 'Estado' into a column
df_state_ob.reset_index(inplace=True)

df_state_ob['Total'] = df_state_ob['Mortos'] + df_state_ob['Vivos']

df_state_ob.head()

Unnamed: 0,State,Vivos,Mortos,Total
0,Acre,274,0,274
1,Alagoas,5868,0,5868
2,Amapá,255,0,255
3,Amazonas,3647,0,3647
4,Bahia,23116,0,23116


### Rename the States to match BI

In [27]:
# Save the Pandas DataFrame to a CSV file
# Renaming 'State' to 'Estado' and saving to CSV for df_state_sexo_freq
all_states_pd_df = all_states_pd_df.rename(columns={'State': 'Estado'})


In [28]:
all_states_pd_df.head()

Unnamed: 0,Código do Procedimento Ambulatorial,CID9 Principal (APAC ou BPA-I),Idade do paciente em anos,Sexo do paciente,Raça Cor do paciente,Valor Aprovado do procedimento,Indicador de Alta (APAC),Indicador de Óbito,Estado
0,604260016,N180,82,Mulher,Parda,50.4,Sim,Morto,Acre
1,604260016,N180,58,Homem,Parda,50.4,Sim,Morto,Acre
2,604260016,N180,70,Homem,Parda,10.08,Sim,Morto,Acre
3,604260016,N180,60,Mulher,Parda,50.4,Sim,Morto,Acre
4,604260016,N180,46,Mulher,Parda,50.4,Sim,Morto,Acre


In [29]:
# Renaming 'State' to 'Estado' and saving to CSV for df_state_sexo_freq
df_state_sexo_freq = df_state_sexo_freq.rename(columns={'State': 'Estado'})

# Renaming 'State' to 'Estado' and saving to CSV for df_state_raca_freq
df_state_raca_freq = df_state_raca_freq.rename(columns={'State': 'Estado'})

# Renaming 'State' to 'Estado' and saving to CSV for average_age_per_state
average_age_per_state = average_age_per_state.rename(columns={'State': 'Estado'})

# Convert 'Valor Aprovado do procedimento' to string with comma as decimal separator
average_age_per_state['Idade média em anos'] = average_age_per_state['Idade média em anos'].apply(lambda x: f"{x:.2f}".replace('.', ','))

# Renaming 'State' to 'Estado' and saving to CSV for average_age_per_state
df_state_alta_freq = df_state_alta_freq.rename(columns={'State': 'Estado'})
df_state_ob = df_state_ob.rename(columns={'State': 'Estado'})

# Renaming 'State' to 'Estado' and saving to CSV for valor_aprovad_state_df
valor_aprovad_state_df = valor_aprovad_state_df.rename(columns={'State': 'Estado'})
# Convert 'Valor Aprovado do procedimento' to string with comma as decimal separator
valor_aprovad_state_df['Valor Aprovado do procedimento'] = valor_aprovad_state_df['Valor Aprovado do procedimento'].apply(lambda x: f"{x:.2f}".replace('.', ','))

In [30]:
from unidecode import unidecode
# Function to remove accents
def remove_accents(input_str):
    return unidecode(input_str)

# List of your DataFrames
dataframes = [df_state_sexo_freq, 
              df_state_raca_freq, 
              average_age_per_state, 
              valor_aprovad_state_df, 
              all_states_pd_df,
              df_state_ob,
              df_state_alta_freq
             ]

# Apply the function to the 'Estado' column of each DataFrame
for df in dataframes:
    df['Estado'] = df['Estado'].apply(remove_accents)

### Save as a CSV

In [31]:
all_states_pd_df.to_csv('csv/all_states_data.csv', index=False)
df_state_sexo_freq.to_csv('csv/state_sexo_freq.csv')
df_state_raca_freq.to_csv('csv/raca_cor_freq.csv', index=False)
df_state_ob.to_csv('csv/ob_freq.csv', index=False)
df_state_alta_freq.to_csv('csv/alta_freq.csv', index=False)
average_age_per_state.to_csv('csv/average_idade.csv', index=False)
valor_aprovad_state_df.to_csv('csv/sum_valor.csv', index=False)