# Pre-processing steps

# Part 1: A Python script that iterates over all the CSV files inside a folder and drops the unnecessary columns from each of them.



In [1]:
import pandas as pd
import glob
import os

In [None]:
# 1.1 - Obtaining all csv files in the folder:

folder_path = r"specific_folder"

# Files written by step 1.2 end with this suffix. They also match "*.csv",
# so they are filtered out here: otherwise a second run would try to process
# its own outputs (the drop raises KeyError because the columns are already
# gone, or "_modified_modified.csv" files get created).
modified_suffix = "_modified.csv"

csv_files = [
    path
    for path in glob.glob(folder_path + "/*.csv")
    if not path.endswith(modified_suffix)
]


# 1.2 - Iterate over each CSV file and execute the code.

# Columns to remove from every file (defined once, outside the loop).
columns_to_drop = ['DESCRICAO_APRESENTACAO', 'CONSELHO_PRESCRITOR', 'UF_CONSELHO_PRESCRITOR']

for file_path in csv_files:
    # Load the csv file into a Pandas DataFrame using "cp1252" encoding
    df = pd.read_csv(file_path, sep=";", encoding="cp1252")

    # Raises KeyError if any of the listed columns is missing from the file.
    df.drop(columns=columns_to_drop, inplace=True)

    # Save the modified DataFrame to a new CSV file next to the original:
    # "<name>.csv" -> "<name>_modified.csv".
    output_file_path = os.path.splitext(file_path)[0] + modified_suffix
    df.to_csv(output_file_path, index=False, sep=";", encoding="cp1252")





# Part 2: Finding the maximum column lengths to create a trustworthy target table in SQL

In [2]:
# Sample file used to measure column widths (presumably one of the
# "_modified" files written in Part 1 - confirm the path before running).
# Note: the raw-string prefix must touch the opening quote; `r "..."` is a
# SyntaxError.
file_path = r"EDA_Industrializados_202110_modified.csv"
df = pd.read_csv(file_path, sep=";", encoding="cp1252")

def max_length(column):
    """Return the length of the longest value in ``column`` once stringified.

    Every value is converted with ``str`` before it is measured, so
    non-string values are measured by their text form. In particular a
    missing value (``NaN``) is measured as the text ``"nan"`` and therefore
    counts as 3 characters.

    Args:
        column: Iterable of values, e.g. a pandas Series holding one column.

    Returns:
        The largest ``len(str(value))`` found, or 0 when the column is empty.
    """
    # default=0 keeps an empty column from raising ValueError.
    return max((len(str(value)) for value in column), default=0)

# Run max_length over every column; the result holds one entry per column.
column_lengths = df.apply(max_length, axis=0)

# Display the maximum length found for each column.
print(column_lengths)

ANO_VENDA             4
MES_VENDA             2
UF_VENDA              2
MUNICIPIO_VENDA      32
PRINCIPIO_ATIVO     331
QTD_VENDIDA           7
UNIDADE_MEDIDA        6
TIPO_RECEITUARIO      3
CID10                 4
SEXO                  3
IDADE                 5
UNIDADE_IDADE         3
dtype: int64


# Part 3: Creating a table for double-checking the data after inserting it into the database

In [None]:
# Collects one (year, month, row_count) tuple per successfully read file.
results = []

# 3.1 - Iterate over each CSV file and record its row count.
for file_path in csv_files:
    # Extract the year and month from the file name directly. The third
    # underscore-separated field must start with YYYYMM, e.g.
    # "EDA_Industrializados_202110_modified.csv" -> year 2021, month 10.
    file_name = os.path.basename(file_path)
    try:
        period = file_name.split("_")[2]
        year = int(period[:4])
        month = int(period[4:6])
    except (IndexError, ValueError) as e:
        # A file that does not follow the naming pattern is reported and
        # skipped, so it cannot abort the run and discard earlier results.
        print(f"Unexpected file name: {file_path} - {e}")
        continue

    try:
        # Load the csv file into a Pandas DataFrame
        df = pd.read_csv(file_path, sep=";", encoding="cp1252")
    except pd.errors.ParserError as e:
        print(f"Error reading file: {file_path} - {e}")
        continue

    # Get the number of rows in the DataFrame
    row_count = df.shape[0]

    # Append the results to the list
    results.append((year, month, row_count))

# Turn the collected (year, month, row count) tuples into a table.
columns = ['year', 'month', 'count']
new_df = pd.DataFrame(data=results, columns=columns)

# Write the table to disk as a ";"-separated, cp1252-encoded CSV without the index.
output_file_path = r"(...)\matching_table00.csv"
new_df.to_csv(path_or_buf=output_file_path, sep=";", encoding="cp1252", index=False)
