In [None]:
import os
from urllib.parse import quote_plus

import boto3
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from sqlalchemy import create_engine

## **Extracting from S3 Bucket (2023 Data)**

If you run this notebook more than once, the previously downloaded files are overwritten automatically.

In [None]:
# Download the 2023 source files from S3 into the local bronze layer.
# Existing local files with the same name are overwritten by download_file.
load_dotenv()  # make the values from the .env file available through os.getenv

s3 = boto3.client(
    's3',
    aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")
)

s3_path = 'data/2023/'
bucket_name = os.getenv("AWS_BUCKET_NAME")

# Fail early with a readable message instead of an opaque boto3 validation
# error raised from inside the download loop.
if not bucket_name:
    raise RuntimeError(
        "AWS_BUCKET_NAME is not set; define it in the environment or in the .env file"
    )

# Files in S3 Bucket
object_keys = [
    f'{s3_path}2023_customers.parquet',
    f'{s3_path}2023_employees.csv',
    f'{s3_path}2023_orders.parquet',
    f'{s3_path}2023_products.csv'
]

local_base_path = os.path.abspath(os.path.join('..', 'data', 'bronze_layer', '2023')) # Output (local) path

# download_file does not create missing parent directories, so make sure the
# target folder exists; exist_ok keeps the cell safe to re-run.
os.makedirs(local_base_path, exist_ok=True)

for key in object_keys:
    filename = os.path.basename(key) # Extract only the filename from the S3 key
    local_path = os.path.join(local_base_path, filename) # Build full local path

    s3.download_file(bucket_name, key, local_path)
    print(f"Downloaded: {key} -> {local_path}")

## **Extracting from MySQL Database (2024 Data)**

In [None]:
# Export every 2024 table from MySQL to a CSV file in the local bronze layer.
# Relies on load_dotenv() having been called by an earlier cell so that the
# MYSQL_* variables are visible through os.getenv.

# Connect to MySQL
user = os.getenv("MYSQL_USER")
password = os.getenv("MYSQL_PASSWORD")
host = 'localhost'
port = '3306'
db = os.getenv("MYSQL_DATABASE")

# An unset variable would otherwise be formatted into the URL as the literal
# text "None" and only fail later with a confusing authentication error.
# An empty password is still accepted (only None is treated as missing).
missing_vars = [
    name
    for name, value in (
        ("MYSQL_USER", user),
        ("MYSQL_PASSWORD", password),
        ("MYSQL_DATABASE", db),
    )
    if value is None
]
if missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_vars)}"
    )

# Percent-encode the credentials: characters such as '@', ':', '/' or '#' in a
# raw user name or password would corrupt the connection URL.
engine = create_engine(
    f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{db}'
)

tables = ['employees', 'customers', 'orders', 'products']

output_path = os.path.abspath(os.path.join('..', 'data', 'bronze_layer', '2024'))

# to_csv does not create missing directories; exist_ok keeps the cell re-runnable.
os.makedirs(output_path, exist_ok=True)

for table in tables:
    # Table names come from the fixed list above, never from user input, so
    # formatting them into the statement is safe here.
    query = f"SELECT * FROM {table}"
    df = pd.read_sql(query, engine)

    # os.path.join keeps the separator consistent with output_path on every OS.
    df.to_csv(os.path.join(output_path, f"{table}.csv"), index=False)
    print(f"Table {table} downloaded to: {output_path}")

Table: employees downloaded to: c:\Users\Usuario\Desktop\multiple_source_ETL\data\bronze_layer\2024
Table: customers downloaded to: c:\Users\Usuario\Desktop\multiple_source_ETL\data\bronze_layer\2024
Table: orders downloaded to: c:\Users\Usuario\Desktop\multiple_source_ETL\data\bronze_layer\2024
Table: products downloaded to: c:\Users\Usuario\Desktop\multiple_source_ETL\data\bronze_layer\2024
