## Importing required libs: 

In [2]:
import pandas as pd
import glob
import os
from pandas_ods_reader import read_ods
import warnings
import concurrent.futures
import psutil
import csv
import re

### Ignoring errors: 

In [3]:
warnings.filterwarnings("ignore")

# Pre-processing

# Part 1 -  
# Having a first look at the files:

## We currently have 5 different data sources:  

### A - (Sales notifications of all compounded drugs that have restricted sales in Brazil, like antidepressants, antibiotics, etc.) 

Example: "project_data\raw_anvisa_files\manipulated_meds\EDA_Manipulados_201401.csv"   
(93 files in this dir ranging from 2014JAN ~ 2021NOV)   

### B - (Sales notifications of all industrialized drugs that have restricted sales in Brazil, like antidepressants, antibiotics, etc.)

Example: "project_data\raw_anvisa_files\industrialized_meds\EDA_Industrializados_201401.csv"   
(93 files in this dir ranging from 2014JAN ~ 2021NOV)

### C - (Notifications of all obituaries in Brazil)

Example: "project_data\raw_sim_files\Mortalidade_Geral_2014.csv"  
(8 files ranging yearly from 2014 to 2021)

### D - (Brazilian cities' total area)
Example:"\project_data\ibge_demographic\br_city_area.ods"  
(A single file from 2023) 



### E - (Brazilian cities' population)
Example: "\project_data\ibge_demographic\br_city_population.ods"   
(A single file from 2021) 



# Part 2 -
# Now let's check column names in all paths:

## 2.1 Defining all "file" paths:

In [54]:
file_path_a = (
    r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria"
    r"\project_data\raw_anvisa_files\manipulated_meds\EDA_Manipulados_201401.csv"
)

file_path_b = (
    r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria"
    r"\project_data\raw_anvisa_files\industrialized_meds\EDA_Industrializados_201401.csv"
)

file_path_c = (
    r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria"
    r"\My_2ndEDA_Drug_Resistent_Bacteria\project_data\raw_sim_files\Mortalidade_Geral_2014.csv"
)

file_path_d = (
    r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria"
    r"\project_data\ibge_demographic\br_city_area.ods"
)


file_path_e = (
   r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria"
   r"\project_data\ibge_demographic\br_city_population.ods"
)

## 2.2 Defining function to read path and return column names:

In [5]:
def get_column_names(file_path):
    # Check the file extension to determine the type of reading
    if file_path.endswith(".csv"):
        # CSV file reading
        df = pd.read_csv(file_path, delimiter=';', encoding='cp1252', error_bad_lines=False)
        return df.columns.tolist()
    elif file_path.endswith(".ods"):
        # ODS file reading
        df = read_ods(file_path, 1)
        return df.columns.tolist()
    else:
        raise ValueError("Unsupported file format: use CSV or ODS")

### - A)

In [50]:
column_names_a = get_column_names(file_path_a)
print(column_names_a)

['ANO_VENDA', 'MES_VENDA', 'UF_VENDA', 'MUNICIPIO_VENDA', 'PRINCIPIO_ATIVO', 'QTD_ATIVO_POR_UNID_FARMACOTEC', 'UNIDADE_MEDIDA_PRINCIPIO_ATIVO', 'QTD_UNIDADE_FARMACOTECNICA', 'TIPO_UNIDADE_FARMACOTECNICA', 'CID10', 'SEXO', 'IDADE', 'UNIDADE_IDADE']


### - B)  

In [52]:
column_names_b = get_column_names(file_path_b)
print(column_names_b)

['ANO_VENDA', 'MES_VENDA', 'UF_VENDA', 'MUNICIPIO_VENDA', 'PRINCIPIO_ATIVO', 'DESCRICAO_APRESENTACAO', 'QTD_VENDIDA', 'UNIDADE_MEDIDA', 'CONSELHO_PRESCRITOR', 'UF_CONSELHO_PRESCRITOR', 'TIPO_RECEITUARIO', 'CID10', 'SEXO', 'IDADE', 'UNIDADE_IDADE']


### - C)  

In [53]:
column_names_c = get_column_names(file_path_c)
print(column_names_c)

['CONTADOR', 'ORIGEM', 'TIPOBITO', 'DTOBITO', 'HORAOBITO', 'NATURAL', 'CODMUNNATU', 'DTNASC', 'IDADE', 'SEXO', 'RACACOR', 'ESTCIV', 'ESC', 'ESC2010', 'SERIESCFAL', 'OCUP', 'CODMUNRES', 'LOCOCOR', 'CODESTAB', 'ESTABDESCR', 'CODMUNOCOR', 'IDADEMAE', 'ESCMAE', 'ESCMAE2010', 'SERIESCMAE', 'OCUPMAE', 'QTDFILVIVO', 'QTDFILMORT', 'GRAVIDEZ', 'SEMAGESTAC', 'GESTACAO', 'PARTO', 'OBITOPARTO', 'PESO', 'TPMORTEOCO', 'OBITOGRAV', 'OBITOPUERP', 'ASSISTMED', 'EXAME', 'CIRURGIA', 'NECROPSIA', 'LINHAA', 'LINHAB', 'LINHAC', 'LINHAD', 'LINHAII', 'CAUSABAS', 'CB_PRE', 'COMUNSVOIM', 'DTATESTADO', 'CIRCOBITO', 'ACIDTRAB', 'FONTE', 'NUMEROLOTE', 'TPPOS', 'DTINVESTIG', 'CAUSABAS_O', 'DTCADASTRO', 'ATESTANTE', 'STCODIFICA', 'CODIFICADO', 'VERSAOSIST', 'VERSAOSCB', 'FONTEINV', 'DTRECEBIM', 'ATESTADO', 'DTRECORIGA', 'CAUSAMAT', 'ESCMAEAGR1', 'ESCFALAGR1', 'STDOEPIDEM', 'STDONOVA', 'DIFDATA', 'NUDIASOBCO', 'NUDIASOBIN', 'DTCADINV', 'TPOBITOCOR', 'DTCONINV', 'FONTES', 'TPRESGINFO', 'TPNIVELINV', 'NUDIASINF', 'DTCA

### - D)  

In [54]:
column_names_d = get_column_names(file_path_d)
print(column_names_d)

['ID', 'CD_UF', 'NM_UF', 'NM_UF_SIGLA', 'CD_MUN', 'NM_MUN', 'AR_MUN_2022']


### - E)  

In [55]:
column_names_e = get_column_names(file_path_e)
print(column_names_e)

['UF', 'COD. UF', 'COD. MUNIC', 'NOME DO MUNICÍPIO', ' POPULAÇÃO ESTIMADA ']


# Part 3 - 
# Joining D and E files:

## 3.1 - Doing a inner join with D and E

D represents the local directory containing data for the total area of cities, while E represents the file containing the total population of cities. Since both files are relatively small, it makes sense to perform this operation with Pandas, conducting an inner join based on the city code, and subsequently modifying it for compatibility with an SQL-supported extension.

In [55]:
# 3.2 Reading the data
df_d = read_ods(file_path_d, 1)
df_e = read_ods(file_path_e, 1)

# 3.3 Merge the DataFrames based on the common column 'NM_MUN' in df_d and 'NOME DO MUNICÍPIO' in df_e
merged_df = df_d.merge(df_e[['NOME DO MUNICÍPIO', ' POPULAÇÃO ESTIMADA ']], left_on='NM_MUN', right_on='NOME DO MUNICÍPIO', how='inner')

# 3.4 Drop the redundant 'NOME DO MUNICÍPIO' column
merged_df.drop(columns=['NOME DO MUNICÍPIO'], inplace=True)

# 3.5 Rename the merged DataFrame to 'city_area_population'
city_area_population = merged_df

# Step 5: Save the DataFrame as a CSV file
# 3.6 Save the 'city_area_population' DataFrame as a CSV file in the same directory
output_file_path = r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\ibge_demographic\city_area_population.csv"
city_area_population.to_csv(output_file_path, index=False)

# Step 6: Display the first few rows of the DataFrame

# 3.7 Print the first few rows of the 'city_area_population' DataFrame
print(city_area_population.head())

  ID CD_UF     NM_UF NM_UF_SIGLA   CD_MUN                 NM_MUN  AR_MUN_2022  \
0  1    11  Rondônia          RO  1100015  Alta Floresta D'Oeste     7067.127   
1  2    11  Rondônia          RO  1100023              Ariquemes     4426.571   
2  3    11  Rondônia          RO  1100031                 Cabixi     1314.352   
3  4    11  Rondônia          RO  1100049                 Cacoal     3793.000   
4  5    11  Rondônia          RO  1100056             Cerejeiras     2783.300   

   POPULAÇÃO ESTIMADA   
0              22516.0  
1             111148.0  
2               5067.0  
3              86416.0  
4              16088.0  


# Part 4 -
# Finding max column lenghts to create trustworthy target table in SQL for each file and their respective dir: 

## 4.1 Defining a function that iterates through each file in a folder and counts the maximum number of characters in each column or characteristc.

In [2]:
# Function to find the maximum length values per column in a CSV file
def find_max_lengths(file_path):
    max_lengths = {}

    chunk_size = 30000  # Chunk size

    for chunk in pd.read_csv(file_path, sep=';', encoding='cp1252', chunksize=chunk_size):
        for col in chunk.columns:
            max_length = chunk[col].apply(lambda x: len(str(x))).max()
            if col in max_lengths:
                if max_length > max_lengths[col]:
                    max_lengths[col] = max_length
            else: 
                max_lengths[col] = max_length

    return max_lengths

# Function to process a folder with concurrent features and limit to 60% of the computational power
def process_folder_with_limited_concurrency(folder_path, cpu_percentage=0.6):
    max_lengths = {}

    # Calculate the maximum number of threads based on the CPU capacity percentage
    max_threads = int(psutil.cpu_count(logical=False) * cpu_percentage)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
        file_paths = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path) if filename.endswith(".csv")]
        results = list(executor.map(find_max_lengths, file_paths))

    for result in results:
        for col, max_length in result.items():
            if col in max_lengths:
                if max_length > max_lengths[col]:
                    max_lengths[col] = max_length
            else:
                max_lengths[col] = max_length

    return max_lengths

## 4.2 Defining "folders":

In [4]:
folder_path_a = r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\raw_anvisa_files\industrialized_meds"
folder_path_b = r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\raw_anvisa_files\manipulated_meds"
folder_path_c = r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\raw_sim_files"
file_path_demographics = r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\ibge_demographic\city_area_population.csv"


## 4.3 Using the function to retrieve values for A~D files 

# A: 

In [6]:
folder_path = folder_path_a
max_lengths = process_folder_with_limited_concurrency(folder_path)
print(max_lengths)

{'ANO_VENDA': 4, 'MES_VENDA': 2, 'UF_VENDA': 2, 'MUNICIPIO_VENDA': 32, 'PRINCIPIO_ATIVO': 602, 'DESCRICAO_APRESENTACAO': 144, 'QTD_VENDIDA': 7, 'UNIDADE_MEDIDA': 6, 'CONSELHO_PRESCRITOR': 4, 'UF_CONSELHO_PRESCRITOR': 2, 'TIPO_RECEITUARIO': 3, 'CID10': 4, 'SEXO': 3, 'IDADE': 5, 'UNIDADE_IDADE': 3}


# B: 

In [50]:
folder_path = folder_path_b
max_lengths = process_folder_with_limited_concurrency(folder_path)
print(max_lengths)

{'ANO_VENDA': 4, 'MES_VENDA': 2, 'UF_VENDA': 2, 'MUNICIPIO_VENDA': 27, 'PRINCIPIO_ATIVO': 45, 'QTD_ATIVO_POR_UNID_FARMACOTEC': 11, 'UNIDADE_MEDIDA_PRINCIPIO_ATIVO': 9, 'QTD_UNIDADE_FARMACOTECNICA': 12, 'TIPO_UNIDADE_FARMACOTECNICA': 10, 'CID10': 4, 'SEXO': 3, 'IDADE': 5, 'UNIDADE_IDADE': 3}


# C:

In [71]:
folder_path = folder_path_c
max_lengths = process_folder_with_limited_concurrency(folder_path)
print(max_lengths)

{'CONTADOR': 7, 'ORIGEM': 1, 'TIPOBITO': 1, 'DTOBITO': 8, 'HORAOBITO': 7, 'NATURAL': 5, 'CODMUNNATU': 8, 'DTNASC': 10, 'IDADE': 3, 'SEXO': 1, 'RACACOR': 3, 'ESTCIV': 3, 'ESC': 3, 'ESC2010': 3, 'SERIESCFAL': 3, 'OCUP': 8, 'CODMUNRES': 6, 'LOCOCOR': 3, 'CODESTAB': 9, 'ESTABDESCR': 40, 'CODMUNOCOR': 6, 'IDADEMAE': 4, 'ESCMAE': 3, 'ESCMAE2010': 3, 'SERIESCMAE': 3, 'OCUPMAE': 8, 'QTDFILVIVO': 4, 'QTDFILMORT': 4, 'GRAVIDEZ': 3, 'SEMAGESTAC': 4, 'GESTACAO': 3, 'PARTO': 3, 'OBITOPARTO': 3, 'PESO': 6, 'TPMORTEOCO': 3, 'OBITOGRAV': 3, 'OBITOPUERP': 3, 'ASSISTMED': 3, 'EXAME': 3, 'CIRURGIA': 3, 'NECROPSIA': 3, 'LINHAA': 20, 'LINHAB': 20, 'LINHAC': 20, 'LINHAD': 20, 'LINHAII': 30, 'CAUSABAS': 4, 'CB_PRE': 4, 'COMUNSVOIM': 8, 'DTATESTADO': 10, 'CIRCOBITO': 3, 'ACIDTRAB': 3, 'FONTE': 3, 'NUMEROLOTE': 10, 'TPPOS': 3, 'DTINVESTIG': 10, 'CAUSABAS_O': 4, 'DTCADASTRO': 10, 'ATESTANTE': 3, 'STCODIFICA': 3, 'CODIFICADO': 3, 'VERSAOSIST': 6, 'VERSAOSCB': 4, 'FONTEINV': 3, 'DTRECEBIM': 10, 'ATESTADO': 50, 'D

# D: 
### (In this case we only need to check values for the single "joined" file)

In [27]:
try:
    # Read the CSV file with UTF-8 encoding
    df = pd.read_csv(file_path_demographics, encoding='utf-8')

    # Calculate the length of each element in each cell
    column_lengths = df.applymap(lambda x: len(str(x)))

    # Display the maximum lengths for each column
    for col in column_lengths.columns:
        print(f"Maximum length for column '{col}': {column_lengths[col].max()}")

except Exception as e:
    print(f"Error reading the file: {e}")

Maximum length for column 'ID': 4
Maximum length for column 'CD_UF': 2
Maximum length for column 'NM_UF': 19
Maximum length for column 'NM_UF_SIGLA': 2
Maximum length for column 'CD_MUN': 7
Maximum length for column 'NM_MUN': 32
Maximum length for column 'AR_MUN_2022': 10
Maximum length for column ' POPULAÇÃO ESTIMADA ': 11


# Part 5 - 
# Counting number of rows in each file to verify data integrity after insertions

# 5.1 - Defining the function:

In [34]:
# Function to count rows in a CSV file
def count_rows_in_file(file_path, encoding='cp1258'):
    try:
        with open(file_path, 'r', newline='', encoding=encoding) as file:
            csv_reader = csv.reader(file)
            row_count = sum(1 for _ in csv_reader)
        return row_count
    except UnicodeDecodeError:
        print(f"UnicodeDecodeError: Could not read the file {file_path}.")
        return 0  # Return 0 for files that cannot be decoded

# Function to count rows in all CSV files in a folder and create a report
def create_report(folder_path, output_file_path):
    row_counts = {}

    # List all CSV files in the folder
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

    for filename in csv_files:
        file_path = os.path.join(folder_path, filename)
        row_count = count_rows_in_file(file_path)

        # Extract year and month from the filename
        match = re.search(r'_(\d{4})(\d{2})?\.csv', filename)  # Updated regex pattern
        if match:
            year = match.group(1)
            month = match.group(2)
            if month is None:
                month = ''  # Set month to empty string for files without month

            key = f"{year};{month}"

            if key in row_counts:
                row_counts[key] += row_count
            else:
                row_counts[key] = row_count

    # Convert row_counts to a DataFrame
    report_df = pd.DataFrame(list(row_counts.items()), columns=["year;month", "count"])

    # Split the "year;month" column into two columns: "year" and "month"
    report_df[["year", "month"]] = report_df["year;month"].str.split(";", expand=True)

    # Now you can rename the columns
    report_df = report_df[["year", "month", "count"]]

    # Save the report to the CSV file
    report_df.to_csv(output_file_path, index=False, sep=';', decimal=',', header=True)

    # Display the first few rows of the report
    print(report_df.head())



# 5.2 Calling the report function:
### We need to provide two arguments to generate the report that counts number of rows per file in each different dir: 

"folder_path" and "output_file_path": EX:

# 'A' folder: 

In [37]:
# Set the folder path to folder_path_a
folder_path = folder_path_a 

# Define the path for the output CSV file
output_file_path = (r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\validation_reports\validation_report_a.csv")

create_report(folder_path, output_file_path)



   year month    count
0  2014    01  4663124
1  2014    02  4461102
2  2014    03  4770799
3  2014    04  4959036
4  2014    05  5257119


# 'B' folder: 

In [39]:
# Set the folder path to folder_path_b:
folder_path = folder_path_b  
 
# Define the path for the output CSV file:
output_file_path = (r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\validation_reports\validation_report_b.csv")  

create_report(folder_path, output_file_path)



   year month   count
0  2014    01  272874
1  2014    02  277087
2  2014    03  272527
3  2014    04  283229
4  2014    05  295848


# 'C' folder: 

In [40]:
# Set the folder path to folder_path_c:
folder_path = folder_path_c  

# Define the path for the output CSV file:
output_file_path = (r"(...)\projects\My_2ndEDA_Drug_Resistent_Bacteria\project_data\validation_reports\validation_report_c.csv")

create_report(folder_path, output_file_path)

   year month    count
0  2014        1227040
1  2015        1264176
2  2016        1309775
3  2017        1312664
4  2018        1316720


# 'D' folder: 
"The 'd' directory does not contain either years or months, so we only need to count the number of rows."

In [43]:
try:
    df = pd.read_csv(file_path_demographics)
    line_count = len(df)
    print(f"Total lines in the file: {line_count}")
except FileNotFoundError:
    print("File not found.")
except pd.errors.EmptyDataError:
    print("The file is empty or does not contain data.")
except pd.errors.ParserError:
    print("Error parsing the CSV file.")

Total lines in the file: 6225
