# 0. Imports

In [2]:
# Basic imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import shutil
from geopy.geocoders import Nominatim

import math

import datasus_dbc # Used to decompress the DBC file from DATASUS
import dbfread # Used to read the decompressed DBF

from IPython.core.display import HTML
from IPython.display import Image

# Module-level Nominatim geocoding client. RT_APAC.lat_long looks up the global
# name `geolocator` when it runs, so this cell must be executed before calling it.
geolocator = Nominatim(user_agent="my_geocoder_app")

In [3]:
# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

# 1. Class RT_APAC

In [4]:
class RT_APAC:
    """Namespace of helpers for preparing DATASUS .dbc files for analysis.

    All helpers are static: call them as ``RT_APAC.organize_files(...)`` etc.
    They rely on names provided by the notebook's import cell (``os``,
    ``shutil``, ``pd``, ``datasus_dbc``, ``dbfread``) and on the module-level
    ``geolocator`` client, so that cell must run before any helper is used.
    """

    @staticmethod
    def organize_files(original_path: str, state: str, start_year: int = 8, end_year: int = 25, file_structure: str = 'AR', type: str = '.dbc'):
        """Move monthly DATASUS files into one sub-directory per year.

        Files are expected directly inside ``original_path`` and named
        ``<file_structure><STATE><YY><MM><type>`` (for example ``AQPR0801.dbc``).

        Parameters
        ----------
        original_path : str
            Directory that currently holds the files; the year folders are
            created inside it.
        state : str
            Two-letter state abbreviation (e.g. ``'sp'``); upper-cased before use.
        start_year : int
            First two-digit year to handle (inclusive).
        end_year : int
            Two-digit year at which to stop (exclusive, as in ``range``).
        file_structure : str
            File-name prefix that precedes the state abbreviation.
        type : str
            File extension including the leading dot. The name shadows the
            builtin but is kept for backward compatibility with keyword callers.

        Returns
        -------
        None
        """
        print('')
        print(original_path)
        print('')
        prefix = file_structure + state.upper()

        for year in range(start_year, end_year):
            year_tag = f"{year:02d}"
            year_dir = os.path.join(original_path, year_tag)

            # Create the year folder only when it is missing
            if os.path.exists(year_dir):
                print(f"Directory already exists: {year_tag}")
            else:
                os.makedirs(year_dir)
                print(f"Created directory: {year_tag}")

            # Move every monthly file of this year that is present
            for month in range(1, 13):
                source = os.path.join(original_path, f"{prefix}{year_tag}{month:02d}{type}")
                if os.path.exists(source):
                    shutil.move(source, year_dir)

    @staticmethod
    def concat_all_data(path_files, state):
        """Read every .dbc file below ``path_files/state`` into one DataFrame.

        Each sub-directory of ``path_files/state`` is scanned for ``.dbc``
        files. Every file is decompressed to a scratch DBF, read with
        ``latin1`` encoding, and all of its columns are converted to ``str``.
        The combined table is written to ``<path_files>/<state>/<state>.csv``.

        Parameters
        ----------
        path_files : str
            Base directory.
        state : str
            Name of the sub-directory of ``path_files`` holding the per-year
            folders; also used as the stem of the CSV file name.

        Returns
        -------
        pandas.DataFrame
            The combined data (empty when no .dbc file was found).
        """
        import tempfile  # local import: only needed for the scratch DBF file

        path_files = os.path.join(path_files, state)
        frames = []

        # The decompressed DBF lives in a scratch directory that is removed
        # afterwards, instead of leaving "output.dbf" in the working directory.
        with tempfile.TemporaryDirectory() as scratch_dir:
            dbf_path = os.path.join(scratch_dir, "output.dbf")

            for path in sorted(os.listdir(path_files)):
                print("Processing path:", path, end=' from -> ')
                path_destiny = os.path.join(path_files, path)
                print(path_destiny)

                # Plain files (e.g. the CSV written by a previous run) are skipped
                if not os.path.isdir(path_destiny):
                    print(f"Error: {path_destiny} is not a directory or does not exist.")
                    continue

                for file_name in sorted(os.listdir(path_destiny)):
                    # Only DBC archives can be decompressed below
                    if not file_name.lower().endswith('.dbc'):
                        continue

                    datasus_dbc.decompress(os.path.join(path_destiny, file_name), dbf_path)
                    dbf = dbfread.DBF(dbf_path, encoding='latin1')

                    # Every column is stored as text
                    frames.append(pd.DataFrame(iter(dbf)).astype(str))

        # Concatenate once at the end; growing a frame inside the loop is quadratic
        full_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        csv_destiny = os.path.join(path_files, f"{state}.csv")
        full_data.to_csv(csv_destiny, index=False)
        return full_data

    @staticmethod
    def lat_long(dataset, state_country=", Paraná, BR"):
        """Geocode each distinct city in ``dataset['cidade']``.

        Parameters
        ----------
        dataset : mapping or pandas.DataFrame
            Must provide a ``'cidade'`` entry that iterates over city names.
        state_country : str
            Text appended verbatim to every city name to build the query, so
            it must carry its own separator (default ``", Paraná, BR"``).
            Earlier versions ignored this argument and always used the default.

        Returns
        -------
        dict
            Maps city name to ``(latitude, longitude)``, or ``(None, None)``
            when the geocoder returned no match.
        """
        lat_long_dict = {}

        # dict.fromkeys de-duplicates while keeping first-seen order, so each
        # city costs a single geocoding request.
        for city_name in dict.fromkeys(dataset['cidade']):
            location = geolocator.geocode(city_name + state_country)
            if location:
                lat_long_dict[city_name] = (location.latitude, location.longitude)
            else:
                print(f"Could not find coordinates for {city_name}")
                lat_long_dict[city_name] = (None, None)
        return lat_long_dict

# 2. Running functions

In [10]:
# Sort the AQPR<YY><MM>.dbc files into one sub-folder per two-digit year (08..24).
# NOTE(review): the path is absolute and machine-specific; consider a configurable base directory.
# organize_files has no return statement, so `organize` is None and the bare
# expression below renders nothing; the lines shown are printed by the function itself.
organize = RT_APAC.organize_files('C:\\Users\\175 MX\\Documents\\Gustavo\\datasus\\data_rt_states\\states\\pr\\qt\\dbcs', 'pr', 8, 25, file_structure='AQ', type='.dbc')
organize


C:\Users\175 MX\Documents\Gustavo\datasus\data_rt_states\states\pr\qt\dbcs

Created directory: 08
Created directory: 09
Created directory: 10
Created directory: 11
Created directory: 12
Created directory: 13
Created directory: 14
Created directory: 15
Created directory: 16
Created directory: 17
Created directory: 18
Created directory: 19
Created directory: 20
Created directory: 21
Created directory: 22
Created directory: 23
Created directory: 24


In [14]:
# The second argument is joined onto the first to locate the per-year folders
# (states\pr\qt\dbcs); the combined table is written to dbcs.csv inside that folder.
data = RT_APAC.concat_all_data("states\\pr\\qt", "dbcs")

Processing path: 08 from -> states\pr\qt\dbcs\08
Processing path: 09 from -> states\pr\qt\dbcs\09
Processing path: 10 from -> states\pr\qt\dbcs\10
Processing path: 11 from -> states\pr\qt\dbcs\11
Processing path: 12 from -> states\pr\qt\dbcs\12
Processing path: 13 from -> states\pr\qt\dbcs\13
Processing path: 14 from -> states\pr\qt\dbcs\14
Processing path: 15 from -> states\pr\qt\dbcs\15
Processing path: 16 from -> states\pr\qt\dbcs\16
Processing path: 17 from -> states\pr\qt\dbcs\17
Processing path: 18 from -> states\pr\qt\dbcs\18
Processing path: 19 from -> states\pr\qt\dbcs\19
Processing path: 20 from -> states\pr\qt\dbcs\20
Processing path: 21 from -> states\pr\qt\dbcs\21
Processing path: 22 from -> states\pr\qt\dbcs\22
Processing path: 23 from -> states\pr\qt\dbcs\23
Processing path: 24 from -> states\pr\qt\dbcs\24
