# **Extract (E)**

In [75]:
import glob
import pandas as pd
from datetime import datetime

In [78]:
# Path of the text file that log_progress() appends timestamped messages to.
log_file = "log_file.txt"
# Path of the CSV file the load phase writes the transformed data to.
target_file = "transformed_data.csv"

In [79]:
def log_progress(message):
    """Append a timestamped *message* to the log file and echo it to stdout.

    The entry has the form ``YYYY-MM-DD HH:MM:SS - message`` and is written
    to the path held in the module-level ``log_file`` (opened in append mode).
    """
    with open(log_file, "a") as log_handle:
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        entry = f"{stamp} - {message}"
        log_handle.write(entry + "\n")
    # Echo the same entry to the console for debugging.
    print(entry)

In [80]:
def extract_from_csv(file_to_process):
    """Read one semicolon-separated, UTF-8 encoded CSV file into a DataFrame.

    Rows that pandas classifies as bad lines are skipped rather than raising
    (``on_bad_lines="skip"``).
    """
    read_options = {
        "sep": ";",
        "on_bad_lines": "skip",
        "encoding": "utf-8",
    }
    return pd.read_csv(file_to_process, **read_options)

In [81]:
def extract(csv_path="/content/*.csv"):
    """Load every CSV matching *csv_path* and stack the rows into one DataFrame.

    Each file is read with ``extract_from_csv`` and reduced to the columns
    SYMBOL, DESCRIPTION, PRODUCT and REGION; any other columns are dropped.

    Args:
        csv_path: Glob pattern of the CSV files to read. Defaults to the
            original hard-coded location, ``/content/*.csv``.

    Returns:
        A DataFrame with exactly the four columns above and a fresh
        0..n-1 index. If no file matches, the frame is empty but still
        carries those columns.

    Raises:
        KeyError: if a matched file lacks one of the required columns.
    """
    wanted_columns = ['SYMBOL', 'DESCRIPTION', 'PRODUCT', 'REGION']

    # NOTE(review): any CSV in the matched directory is treated as input. If
    # the pipeline's own output file is written there, a later run would pick
    # it up as well - confirm the working directory used by the caller.
    # Sorting keeps the row order independent of filesystem listing order.
    frames = [
        extract_from_csv(csvfile)[wanted_columns]
        for csvfile in sorted(glob.glob(csv_path))
    ]

    if not frames:
        return pd.DataFrame(columns=wanted_columns)

    # Concatenate once at the end: growing a DataFrame inside the loop
    # re-copies all previously accumulated rows on every iteration.
    return pd.concat(frames, ignore_index=True)

# **Transform (T)**

In [82]:
def transform(data):
    """Upper-case the text values of the SYMBOL column in place.

    Only ``str`` values are changed. Non-string entries (numbers, missing
    values) are kept as they are: the ``.str`` accessor would turn them into
    NaN in an object column and raise ``AttributeError`` on a numeric column.

    Args:
        data: DataFrame to transform. It is modified in place.

    Returns:
        The same DataFrame object that was passed in. If it has no SYMBOL
        column, a warning is printed and the frame is returned unchanged.
    """
    if "SYMBOL" not in data.columns:
        print("Aviso: Coluna 'SYMBOL' não encontrada no DataFrame.")
        return data

    data["SYMBOL"] = data["SYMBOL"].map(
        lambda value: value.upper() if isinstance(value, str) else value
    )
    return data

# **Loading and Logging (L)**

In [83]:
def load_data(target_file, transformed_data):
    """Write *transformed_data* to *target_file* as CSV.

    The row index is not written, and the file is encoded as UTF-8 with a
    byte-order mark (``utf-8-sig``).
    """
    write_options = {"index": False, "encoding": "utf-8-sig"}
    transformed_data.to_csv(target_file, **write_options)

LOGS

In [84]:
# Run the ETL pipeline end to end; every step is bracketed by log_progress()
# calls so the log file shows when each phase started and ended.
log_progress("ETL Job Started")

# Extract phase: gather the source CSV data into one DataFrame
log_progress("Extract phase Started")
extracted_data = extract()

# Extraction finished
log_progress("Extract phase Ended")

# Transform phase: apply transform() and print the result for inspection
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data")
print(transformed_data)

# Transformation finished
log_progress("Transform phase Ended")

# Load phase: write the transformed data to target_file
log_progress("Load phase Started")
load_data(target_file,transformed_data)

# Loading finished
log_progress("Load phase Ended")

# Whole ETL run finished
log_progress("ETL Job Ended")

2025-02-13 18:23:09 - ETL Job Started
2025-02-13 18:23:09 - Extract phase Started
2025-02-13 18:23:09 - Extract phase Ended
2025-02-13 18:23:09 - Transform phase Started
Transformed Data
      SYMBOL                   DESCRIPTION PRODUCT REGION
0       09K0  ARK SPACE EXPLORATION & INNO     ETF     US
1       09KA           THE 3D PRINTING ETF     ETF     US
2       0BYB   ISHARES GLOBAL CLEAN ENERGY     ETF     US
3       2840              SPDR GOLD SHARES     ETF     US
4      82840        SPDR GOLD SHARES - RMB     ETF     US
...      ...                           ...     ...    ...
67473    ZTE         ZTEST ELECTRONICS INC  Stocks     CA
67474    ZTS                    ZOETIS INC  Stocks     MX
67475   ZYUS       ZYUS LIFE SCIENCES CORP  Stocks     CA
67476  ZYZ.A         TSE STOCK TEST SYMBOL  Stocks     CA
67477  ZZE.H           ZIDANE CAPITAL CORP  Stocks     CA

[67478 rows x 4 columns]
2025-02-13 18:23:09 - Transform phase Ended
2025-02-13 18:23:09 - Load phase Started
2025-0