# **Extract (E)**

In [8]:
import glob
import pandas as pd
from datetime import datetime

In [9]:
# Text file that log_progress() appends one timestamped line to per call.
# Relative path, so it is resolved against the current working directory.
log_file = "log_file.txt"
# Output CSV path handed to load_data() by the ETL run cell.
# Also relative to the current working directory.
target_file = "transformed_data_filter_develop_inloco.csv"

In [10]:
def log_progress(message):
    """Append a timestamped message to the log file and echo it to stdout.

    Parameters
    ----------
    message : str
        Text to record. It is written as
        ``"<YYYY-MM-DD HH:MM:SS> - <message>"`` followed by a newline.

    Notes
    -----
    The destination is the module-level ``log_file`` path. The file is opened
    in append mode, so entries accumulate across runs.
    """
    # Build the entry once, before touching the file, so the log and the
    # console show exactly the same text and the echo below does not rely on
    # a name that was bound inside the ``with`` block.
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{timestamp} - {message}"

    # Explicit UTF-8: the platform default encoding may be unable to encode
    # accented characters in a message.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{entry}\n")
    print(entry)  # console echo for debugging

In [11]:
def extract_from_csv(file_to_process):
    """Load one semicolon-delimited, UTF-8 encoded CSV file.

    Parameters
    ----------
    file_to_process : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Lines that pandas cannot parse are dropped instead
        of raising (``on_bad_lines="skip"``).
    """
    read_options = {
        "sep": ";",
        "on_bad_lines": "skip",
        "encoding": "utf-8",
    }
    return pd.read_csv(file_to_process, **read_options)

In [12]:
def extract(csv_path="/content/*.csv"):
    """Combine the wanted columns of every CSV file matching ``csv_path``.

    Parameters
    ----------
    csv_path : str, optional
        Glob pattern of the CSV files to read. Defaults to
        ``"/content/*.csv"``, the location that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Rows of all usable files, restricted to the columns SYMBOL,
        DESCRIPTION, PRODUCT and REGION, with a fresh 0..n-1 index. When no
        usable file is found the frame is empty but still has those columns.
    """
    wanted_columns = ['SYMBOL', 'DESCRIPTION', 'PRODUCT', 'REGION']
    frames = []

    # glob yields matches in arbitrary order; sorting keeps the row order of
    # the result reproducible between runs.
    for csvfile in sorted(glob.glob(csv_path)):
        temp_df = extract_from_csv(csvfile)

        # A matching file is not necessarily a source file (for instance a
        # comma-separated CSV saved into the same folder parses as a single
        # column). Skip it with a warning instead of failing with KeyError.
        missing = [col for col in wanted_columns if col not in temp_df.columns]
        if missing:
            print(f"Aviso: '{csvfile}' ignorado; colunas ausentes: {missing}")
            continue

        # Keep only the wanted columns (extras are ignored)
        frames.append(temp_df[wanted_columns])

    if not frames:
        return pd.DataFrame(columns=wanted_columns)

    # Concatenate once at the end: avoids re-copying the accumulated rows on
    # every iteration and avoids concatenating onto an empty frame.
    return pd.concat(frames, ignore_index=True)

# **Transform (T)**

In [13]:
def transform(data):
    """Return a copy of ``data`` with the text values of SYMBOL upper-cased.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data``. If it has a ``SYMBOL`` column, every string in
        that column is upper-cased; values that are not strings (numbers,
        missing values) are kept unchanged. If the column is absent a warning
        is printed and the copy is returned as is.
    """
    # Work on a copy so the caller's frame stays intact and re-running the
    # notebook cell is idempotent.
    result = data.copy()

    if "SYMBOL" not in result.columns:
        print("Aviso: Coluna 'SYMBOL' não encontrada no DataFrame.")
        return result

    # ``.str.upper()`` would turn non-string entries of a mixed column (e.g. a
    # purely numeric ticker parsed as int) into NaN and raises on a fully
    # numeric column, so only actual strings are upper-cased here.
    result["SYMBOL"] = result["SYMBOL"].map(
        lambda value: value.upper() if isinstance(value, str) else value
    )
    return result

# **Loading and Logging (L)**

In [14]:
def load_data(target_file, transformed_data):
    """Write ``transformed_data`` to ``target_file`` as CSV.

    Parameters
    ----------
    target_file : str
        Path of the CSV file to create or overwrite.
    transformed_data : pandas.DataFrame
        Frame to save. The index is not written; the file is encoded as
        UTF-8 with a byte-order mark (``utf-8-sig``).
    """
    transformed_data.to_csv(
        path_or_buf=target_file,
        encoding="utf-8-sig",
        index=False,
    )

Run the ETL job (each phase is logged)

In [15]:
# ETL run: the job and each of its phases are bracketed by
# "Started"/"Ended" entries written through log_progress().
log_progress("ETL Job Started")

# Extract phase
log_progress("Extract phase Started")
extracted_data = extract()
log_progress("Extract phase Ended")

# Transform phase (the result is also printed for inspection)
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data", transformed_data, sep="\n")
log_progress("Transform phase Ended")

# Load phase: save the transformed rows to target_file
log_progress("Load phase Started")
load_data(target_file, transformed_data)
log_progress("Load phase Ended")

log_progress("ETL Job Ended")

2025-02-19 20:19:08 - ETL Job Started
2025-02-19 20:19:08 - Extract phase Started
2025-02-19 20:19:08 - Extract phase Ended
2025-02-19 20:19:08 - Transform phase Started
Transformed Data
     SYMBOL                   DESCRIPTION PRODUCT REGION
0      9840        SPDR GOLD SHARES - USD     ETF     US
1       AAA  ALTERNATIVE ACCESS FIRST PRI     ETF     US
2      AAAU   GOLDMAN SACHS PHYSICAL GOLD     ETF     US
3      AADR   ADVISORSHARES DORSEY WRIGHT     ETF     US
4      AAPB  GRANITESHARES 2X LONG AAPL D     ETF     US
...     ...                           ...     ...    ...
5296    ZWT   BMO COVERED CALL TECHNOLOGY     ETF     CA
5297    ZWU  BMO COVERED CALL UTILITIES E     ETF     CA
5298    ZXM  CI MORNINGSTAR INTL MOMENTUM     ETF     CA
5299  ZXM.B  CI MORNINGSTAR INTL MOMENT-B     ETF     CA
5300   ZZZD  BMO TACTICAL DIV ETF FD ETFS     ETF     CA

[5301 rows x 4 columns]
2025-02-19 20:19:08 - Transform phase Ended
2025-02-19 20:19:08 - Load phase Started
2025-02-19 20:19:08