# Tuberculose A15 Janeiro 2023 EDA

### Reading the XLS files and converting them to CSV

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable


# Folder that holds the .xls exports (one file per state)
folder_path = 'tuberxls'

# One DataFrame per .xls file, appended in sorted filename order
dataframes = []

# os.listdir() returns entries in arbitrary order. Sorting makes the list
# order reproducible, which matters because the frames are later paired
# with state_names purely by position.
# NOTE(review): confirm that the sorted filename order matches the order
# of the entries in state_names.
for filename in sorted(os.listdir(folder_path)):
    # Only files with the .xls extension are loaded
    if not filename.endswith('.xls'):
        continue

    file_path = os.path.join(folder_path, filename)

    # Read without a header row so that every sheet row arrives as data
    df = pd.read_excel(file_path, header=None, engine='xlrd')

    # Name the two columns; this raises if the sheet does not have exactly two
    df.columns = ["Procedimentos realizados", "Frequência"]

    # Drop the first three rows (index labels 0, 1 and 2), then renumber
    # the remaining rows starting from 0
    df = df.drop(index=[0, 1, 2]).reset_index(drop=True)

    dataframes.append(df)

# Display the loaded frames
dataframes

### Checking one dataframe

In [None]:
# Display the DataFrame at position 2 of the list as a spot check
dataframes[2]

In [None]:
# Column labels of the first loaded DataFrame
dataframes[0].columns

In [None]:
# Print each row of the DataFrame at position 5 together with its index label
for row_label, row_values in dataframes[5].iterrows():
    print(row_label, row_values)

### Creating a dictionary with all dataframes

In [None]:
# The 26 Brazilian states plus the Federal District (27 entries)
state_names = [
    "Acre", "Alagoas", "Amazonas", "Amapá", "Bahia",
    "Ceará", "Distrito Federal", "Espírito Santo", "Goiás", "Maranhão",
    "Minas Gerais", "Mato Grosso do Sul", "Mato Grosso", "Pará", "Paraíba",
    "Pernambuco", "Piauí", "Paraná", "Rio de Janeiro", "Rio Grande do Norte",
    "Rondônia", "Roraima", "Rio Grande do Sul", "Santa Catarina", "São Paulo",
    "Sergipe", "Tocantins",
]

# Every state starts with an empty mapping ...
df_states = {name: {} for name in state_names}

# ... and is then overwritten with the dict form of the DataFrame found at
# the same position in `dataframes`. zip() stops at the shorter sequence,
# so states without a matching frame keep their empty mapping.
df_states.update(
    (name, frame.to_dict()) for name, frame in zip(state_names, dataframes)
)

# Spot check one state
df_states['Amazonas']

### Checking the dictionary keys (states) and the keys within one state

In [None]:
# Keys of the dictionary: one entry per name in state_names
df_states.keys()

In [None]:
# Keys stored for a single state (the column names produced by DataFrame.to_dict())
df_states['Amazonas'].keys()

### Saving each state dataframe as csv 

In [None]:
# to_csv() does not create missing directories, so make sure the output
# folder exists before writing the first file.
os.makedirs("csv", exist_ok=True)

for key, state_data in df_states.items():
    # Rebuild the state's table from its {column: {row: value}} mapping
    df = pd.DataFrame(state_data)

    # File name carries the state's name
    filename = f"Tuber_{key}_012023.csv"

    # Write without the index column
    df.to_csv(f"csv/{filename}", index=False)

    print(f"Saved {key} data to {filename}")

### Creating a merged dataframe

In [None]:
# Build {state: {procedure label: frequency}} from the per-state column mappings
merged_df = {}
for state in state_names:
    procedures = df_states[state]["Procedimentos realizados"]
    frequencies = df_states[state]["Frequência"]
    # Both mappings come from the same DataFrame.to_dict() call, so their
    # values line up row by row; a repeated label keeps its last frequency.
    merged_df[state] = dict(zip(procedures.values(), frequencies.values()))

In [None]:
# Convert the nested dict into a DataFrame: the outer keys (states) become
# the columns and the inner keys (procedure labels) become the row index
merged_df = pd.DataFrame(merged_df)

### Replacing NaN values with 0

In [None]:
# Replace every NaN in the merged table with 0
merged_df = merged_df.replace(np.nan, 0)
# Preview the first rows
merged_df.head()

### Removing the rows with only zeros

In [None]:
# Total frequency of each procedure across all state columns
row_sums = merged_df.sum(axis=1)

# Keep only the rows whose total is strictly positive
filtered_df = merged_df[row_sums.gt(0)]

In [None]:
# Display the table after removing the all-zero rows
filtered_df

### Dropping the Total row

In [None]:
# Remove the row labelled "Total" so that only individual procedures remain
filtered_df_drop_Total = filtered_df.drop(index="Total")

In [None]:
# Summary statistics for each column of the table without the "Total" row
filtered_df_drop_Total.describe()

### Dividing the first column into Code and Description of the procedure

In [None]:
# Move the procedure labels out of the index into a regular 'index' column
filtered_df_drop_Total = filtered_df_drop_Total.reset_index()

# Take the label column out of the frame and split it into the leading
# digits and the text that follows the whitespace after them
split_labels = filtered_df_drop_Total.pop('index').str.extract(r'(\d+)\s+(.*)')

# Columns that remain at this point (everything except the label column)
remaining_columns = list(filtered_df_drop_Total.columns)

filtered_df_drop_Total['Code'] = split_labels[0]
filtered_df_drop_Total['Description'] = split_labels[1]

# 'Code' and 'Description' first, the other columns in their existing order
cols = ['Code', 'Description'] + remaining_columns
filtered_df_drop_Total = filtered_df_drop_Total[cols]

filtered_df_drop_Total.head()

### Creating a total column

In [None]:
# Sum every column from position 2 onwards (those after 'Code' and
# 'Description') into a new 'Total' column
filtered_df_drop_Total = filtered_df_drop_Total.assign(
    Total=filtered_df_drop_Total.iloc[:, 2:].sum(axis=1)
)

filtered_df_drop_Total.head()

### Sorting by the Total column

In [None]:
# Order the procedures from highest to lowest total, then discard the
# helper 'Total' column, which was only needed for the ordering
filtered_df_drop_Total = (
    filtered_df_drop_Total
    .sort_values('Total', ascending=False)
    .drop(columns='Total')
)
filtered_df_drop_Total.head()

### Saving as a csv file

In [None]:
# Write the sorted procedure-by-state table to CSV without the index column
filtered_df_drop_Total.to_csv('csv/tuber_states_012023_df.csv', index=False)

### Transposing the table for export to Excel (BI)

In [None]:
# Keep only the columns other than 'Code' and 'Description'
df_filtered = filtered_df_drop_Total.drop(['Code', 'Description'], axis=1)

# Flip the table so that the former columns become the rows
df_transposed = df_filtered.transpose()

# Label the row axis and give the columns positional names Procedure_1..Procedure_n
df_transposed.index.name = 'State'
df_transposed.columns = [
    f'Procedure_{position}'
    for position in range(1, df_transposed.shape[1] + 1)
]

# Add up all procedure frequencies per row and expose the result as a
# two-column table: 'State' and 'Total Frequency'
df_total_frequency = df_transposed.sum(axis=1).reset_index(name='Total Frequency')

df_total_frequency.head()

### Saving as an xlsx file

In [None]:
# to_excel() does not create missing directories, so make sure the output
# folder exists before writing.
os.makedirs('excel', exist_ok=True)

# Write the per-state totals to an xlsx workbook without the index column
df_total_frequency.to_excel('excel/tuber_states_012023_df.xlsx', index=False)

### Total of procedures for each state

In [None]:
# For every state, read the value stored in the "Total" row of filtered_df
total_procedures = {
    state: filtered_df.loc['Total', state] for state in state_names
}

# Two-column table: one row per state with its total
total_procedures_df = pd.DataFrame(
    list(total_procedures.items()),
    columns=['State', 'Total Procedures'],
)

total_procedures_df.head()

In [None]:
# Write the per-state totals to CSV without the index column
total_procedures_df.to_csv('csv/tuberculose_012023_total_state.csv', index=False)

### Chart of total procedures for each state

In [None]:
# Bar chart with one bar per state, labelled with its total
bar_labels = total_procedures_df['State']
bar_heights = total_procedures_df['Total Procedures']

plt.figure(figsize=(18, 10))
plt.bar(bar_labels, bar_heights, color='skyblue')

plt.title('Total Procedures by State')
plt.xlabel('State')
plt.ylabel('Total Procedures')

# Write each value 50 units above the top of its bar, centred on the bar
for position, height in enumerate(bar_heights):
    plt.text(position, height + 50, str(height), ha='center')

# Rotate the state names so that they do not overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

### Brazil's map for Tuberculose A15 based on the data from 01/2023

In [None]:
# Brazil's map data wrangling

# Read the shapefile with the state boundaries
uf_br = gpd.read_file('map/gadm36_BRA_1.shp')

# Keep the name and geometry columns and rename 'NAME_1' to 'State' so it
# can serve as the merge key. Doing this in one chained expression avoids
# calling rename(inplace=True) on a column subset of uf_br, which pandas
# flags with SettingWithCopyWarning.
uf_br_geo = uf_br[['NAME_1', 'geometry']].rename(columns={'NAME_1': 'State'})

# Attach the per-state procedure totals to the geometries. merge() defaults
# to an inner join, so a state whose name has no exact match in the other
# table is dropped.
# NOTE(review): confirm that the spellings in NAME_1 match state_names.
merged_df = uf_br_geo.merge(total_procedures_df, on='State')

In [None]:
# Preview the first rows of the merged geometry/totals table
merged_df.head()

In [None]:
# Draw one choropleth map per data column, i.e. every column other than
# the state name and the geometry
plot_columns = [
    name for name in merged_df.columns if name not in ['State', 'geometry']
]

for column in plot_columns:
    # Colour scale spanning the column's smallest to largest value
    value_range = Normalize(
        vmin=merged_df[column].min(),
        vmax=merged_df[column].max(),
    )

    # Stand-alone mappable used only to draw the colorbar
    sm = ScalarMappable(norm=value_range, cmap='Blues')
    sm.set_array([])  # placeholder data for the mappable

    # Map with the states shaded by the column's values
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    merged_df.plot(column=column, cmap='Blues', linewidth=0.8, ax=ax, edgecolor='0.8')
    ax.set_title(f'{column} for Tuberculose A15 in 01/2023')
    ax.set_axis_off()

    # Write each state's name next to the centroid of its geometry
    for state_label, shape in zip(merged_df['State'], merged_df['geometry']):
        anchor = shape.centroid
        ax.annotate(text=state_label, xy=(anchor.x, anchor.y), xytext=(3, 3),
                    textcoords="offset points", color='black', fontsize=8)

    # Colorbar labelled with the column name
    cbar = fig.colorbar(sm, ax=ax)
    cbar.set_label(column)

    # Save the figure as an image, then display it
    plt.savefig(f'{column}_012023_map.png')
    plt.show()