### Import the libraries and load the data

In [1]:
import pandas as pd

# Read the three monthly exports; going by the file names these are
# 02/2023 (df1), 03/2023 (df2) and 01/2023 (df3).
df1, df2, df3 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'tuber_states_022023_df.csv',
        'tuber_states_032023_df.csv',
        'tuber_states_012023_df.csv',
    )
)

# The frames can only be stacked cleanly if all three expose the same column
# names (order is irrelevant, hence the set comparison).
columns_match = set(df1.columns) == set(df2.columns) == set(df3.columns)

# Show the result of the check as the cell output
columns_match

True

### Combine the monthly data

In [2]:
# Tag every monthly frame with the month it covers
for monthly_frame, month_name in ((df1, 'February'), (df2, 'March'), (df3, 'January')):
    monthly_frame['Month'] = month_name

# Stack the three frames on top of each other with a fresh 0..n-1 index
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

# Keep the preview and the dimensions around for verification
combined_df_head = combined_df.head()
combined_df_shape = combined_df.shape

# Show (rows, columns) of the combined frame as the cell output
combined_df_shape

(299, 30)

In [3]:
# Persist the combined table. The index is a plain 0..n-1 counter (the frames
# were concatenated with ignore_index=True), so it is left out of the file;
# otherwise it would come back as an 'Unnamed: 0' column when the CSV is re-read.
combined_df.to_csv('tuberculose_010203.csv', index=False)

In [4]:
# Preview the first five rows of the combined table
combined_df.head(n=5)

Unnamed: 0,Code,Description,Acre,Alagoas,Amapá,Amazonas,Bahia,Ceará,Distrito Federal,Espírito Santo,Goiás,Maranhão,Mato Grosso,Mato Grosso do Sul,Minas Gerais,Paraná,Paraíba,Pará,Pernambuco,Piauí,Rio Grande do Norte,Rio Grande do Sul,Rio de Janeiro,Rondônia,Roraima,Santa Catarina,Sergipe,São Paulo,Tocantins,Month
0,201010402,BIOPSIA DE PLEURA (POR ASPIRACAO/AGULHA / PLEU...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,February
1,202010120,DOSAGEM DE ACIDO URICO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,February
2,202010201,DOSAGEM DE BILIRRUBINA TOTAL E FRACOES,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,February
3,202010228,DOSAGEM DE CALCIO IONIZAVEL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,February
4,202010260,DOSAGEM DE CLORETO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,February


### Create the dataframe for region instead of states

In [23]:
# Define a dictionary to map each state to its respective region
state_to_region = {
    'Acre': 'North', 'Alagoas': 'Northeast', 'Amapá': 'North', 'Amazonas': 'North',
    'Bahia': 'Northeast', 'Ceará': 'Northeast', 'Distrito Federal': 'Central-West',
    'Espírito Santo': 'Southeast', 'Goiás': 'Central-West', 'Maranhão': 'Northeast',
    'Mato Grosso': 'Central-West', 'Mato Grosso do Sul': 'Central-West', 'Minas Gerais': 'Southeast',
    'Pará': 'North', 'Paraíba': 'Northeast', 'Paraná': 'South', 'Pernambuco': 'Northeast',
    'Piauí': 'Northeast', 'Rio Grande do Norte': 'Northeast', 'Rio Grande do Sul': 'South',
    'Rio de Janeiro': 'Southeast', 'Rondônia': 'North', 'Roraima': 'North', 'Santa Catarina': 'South',
    'Sergipe': 'Northeast', 'São Paulo': 'Southeast', 'Tocantins': 'North'
}

# Fail early, with a readable message, if an expected state column is absent
missing_states = [state for state in state_to_region if state not in combined_df.columns]
if missing_states:
    raise KeyError(f"State columns missing from combined_df: {missing_states}")

# Collect the state column names belonging to each region. Dict insertion
# order keeps the regions in order of first appearance in the mapping.
states_by_region = {}
for state, region in state_to_region.items():
    states_by_region.setdefault(region, []).append(state)

# Start from the identifying columns, copied so the new frame is independent
# of combined_df
data_by_region = combined_df[['Code', 'Description', 'Month']].copy()

# One row-wise sum per region. Unlike accumulating with `+=`, sum(axis=1)
# skips missing values, so a single NaN in one state does not turn the whole
# regional total into NaN.
for region, states in states_by_region.items():
    data_by_region[region] = combined_df[states].sum(axis=1)

# Display the first few rows of the new dataframe to verify the transformation
data_by_region.head()

Unnamed: 0,Code,Description,Month,North,Northeast,Central-West,Southeast,South
0,201010402,BIOPSIA DE PLEURA (POR ASPIRACAO/AGULHA / PLEU...,February,0.0,0.0,0.0,4.0,0.0
1,202010120,DOSAGEM DE ACIDO URICO,February,0.0,0.0,0.0,1.0,0.0
2,202010201,DOSAGEM DE BILIRRUBINA TOTAL E FRACOES,February,0.0,0.0,0.0,4.0,0.0
3,202010228,DOSAGEM DE CALCIO IONIZAVEL,February,0.0,0.0,0.0,1.0,0.0
4,202010260,DOSAGEM DE CLORETO,February,0.0,0.0,0.0,1.0,0.0


In [8]:
# Destination file for the region-level table
output_file_path = 'tuberculose_by_region.csv'

# Write the region-level table to disk, leaving the row index out of the file
data_by_region.to_csv(path_or_buf=output_file_path, index=False)

# Echo the destination path as the cell output
output_file_path

'tuberculose_by_region.csv'

### Table with full information

In [34]:
# Mapping of states to their corresponding regions, grouped by region.
# It holds the same 27 state -> region pairs as the mapping defined in the
# region-aggregation cell; it is repeated here so this cell is self-contained.
state_to_region = {
    'Acre': 'North', 'Amapá': 'North', 'Amazonas': 'North', 'Pará': 'North',
    'Rondônia': 'North', 'Roraima': 'North', 'Tocantins': 'North',
    'Alagoas': 'Northeast', 'Bahia': 'Northeast', 'Ceará': 'Northeast', 'Maranhão': 'Northeast',
    'Paraíba': 'Northeast', 'Pernambuco': 'Northeast', 'Piauí': 'Northeast',
    'Rio Grande do Norte': 'Northeast', 'Sergipe': 'Northeast',
    'Goiás': 'Central-West', 'Mato Grosso': 'Central-West', 'Mato Grosso do Sul': 'Central-West',
    'Distrito Federal': 'Central-West',
    'Espírito Santo': 'Southeast', 'Minas Gerais': 'Southeast', 'Rio de Janeiro': 'Southeast', 'São Paulo': 'Southeast',
    'Paraná': 'South', 'Rio Grande do Sul': 'South', 'Santa Catarina': 'South'
}

# The procedure identifiers are not needed in the long table
data_dropped = combined_df.drop(columns=['Code', 'Description'])

# Reshape to long format (one row per Month/State pair), keep only rows with
# at least one procedure, then attach the region of each state.
# `.assign` builds a new frame instead of writing a column into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
data_adjusted_melted = (
    data_dropped
    .melt(id_vars=['Month'], var_name='State', value_name='Procedures')
    .loc[lambda frame: frame['Procedures'] > 0]
    .assign(Region=lambda frame: frame['State'].map(state_to_region))
)

# Select and order the columns for the final table. The explicit copy makes
# it an independent frame that later cells can modify safely.
final_adjusted_data = data_adjusted_melted[['State', 'Region', 'Month', 'Procedures']].copy()

# Display the first few rows of the newly adjusted table
final_adjusted_data.head()

Unnamed: 0,State,Region,Month,Procedures
376,Alagoas,Northeast,February,1.0
388,Alagoas,Northeast,February,1.0
468,Alagoas,Northeast,March,2.0
481,Alagoas,Northeast,March,2.0
502,Alagoas,Northeast,January,3.0


In [35]:
# Save the long-format table. The index is only the leftover row position
# from the melt/filter step, so it is not written to the spreadsheet.
final_adjusted_data.to_excel('tuberculose_state_region_month_freq.xlsx', index=False)

### Final adjustments to match the map in BI

In [37]:
import unidecode

# Remove the accents from the 'State' column.
# `assign` returns a new frame, so nothing is written into a slice of the
# melted table (avoids SettingWithCopyWarning) and re-running the cell is safe.
final_adjusted_data = final_adjusted_data.assign(
    State=final_adjusted_data['State'].map(unidecode.unidecode)
)

# Save the modified table; this overwrites the earlier export of the same name
modified_file_path = 'tuberculose_state_region_month_freq.xlsx'
final_adjusted_data.to_excel(modified_file_path, index=False)

# Show the output path as the cell output
modified_file_path

'tuberculose_state_region_month_freq.xlsx'