In [2]:
import duckdb
import boto3
import pandas as pd
from io import BytesIO
from datetime import datetime

import os

# ----------------------------- MinIO Config --------------------------------
# Connection settings are taken from the environment so that credentials do
# not have to be stored in the notebook. The fallbacks are the values that
# were previously hard-coded, so an unconfigured setup behaves as before.
MINIO_ENDPOINT   = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password")
BUCKET_NAME      = "batch-bucket"
BRONZE_KEY       = "bronze/retail/raw_retail.parquet"      # object read as input
SILVER_KEY       = "silver/retail/retail_cleaned.parquet"  # object written as output

# ----------------------------- Lookup Tables -------------------------------
# Product names listed in ID order; IDs are numbered consecutively from 1.
# Append further names to the list to register additional product IDs.
PRODUCT_LOOKUP = dict(enumerate(
    [
        "Organic Apples",
        "Bananas",
        "Bread",
        "Milk",
        "Beer",
        "Juice",
        "Yogurt",
        "Chocolate",
        "Coffee",
        "Tea",
    ],
    start=1,
))

# Store names listed in ID order; IDs are numbered consecutively from 1.
# Append further names to the tuple to register additional stores.
STORE_LOOKUP = {
    store_id: store_name
    for store_id, store_name in enumerate(
        (
            "Vienna Center",
            "Linz Mall",
            "Salzburg City",
            "Graz Hauptplatz",
            "Innsbruck Center",
        ),
        start=1,
    )
}

# ----------------------------- MinIO Client --------------------------------
# S3 client directed at the MinIO endpoint from the config section.
s3 = boto3.client(
    "s3",
    aws_secret_access_key=MINIO_SECRET_KEY,
    aws_access_key_id=MINIO_ACCESS_KEY,
    endpoint_url=MINIO_ENDPOINT,
)

# ----------------------------- Load Bronze Retail --------------------------
# Download the bronze object completely, then parse the bytes as Parquet.
obj = s3.get_object(Key=BRONZE_KEY, Bucket=BUCKET_NAME)
bronze_stream = BytesIO(obj["Body"].read())
df_retail = pd.read_parquet(bronze_stream)

# ----------------------------- Transformations -----------------------------
# Steps 1 + 2: parse the "date" column into datetimes, then derive the
# year / month / day columns from the parsed values. `assign` evaluates the
# callables in order, so the later ones already see the parsed dates.
df_retail = df_retail.assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    year=lambda frame: frame["date"].dt.year,
    month=lambda frame: frame["date"].dt.month,
    day=lambda frame: frame["date"].dt.day,
)
# 3. Determine the season
def get_season(month):
    """Map a month number to its season label.

    Grouping: Dec-Feb = "Winter", Mar-May = "Spring", Jun-Aug = "Summer",
    Sep-Nov = "Autumn".

    Args:
        month: Month number 1-12. Floats with an integral value (e.g. 3.0)
            are accepted as well.

    Returns:
        The season name, or ``None`` when ``month`` is missing
        (None / NaN / NA) or is not a valid month number. Previously such
        values fell through to the final branch and were labelled "Autumn".
    """
    # Missing months (e.g. rows without a parsable date) must not be
    # assigned a season.
    if pd.isna(month):
        return None
    if month in (12, 1, 2):
        return "Winter"
    if month in (3, 4, 5):
        return "Spring"
    if month in (6, 7, 8):
        return "Summer"
    if month in (9, 10, 11):
        return "Autumn"
    # Anything else is not a calendar month.
    return None

# Steps 3-5 in one chain:
#   * season label derived from the month column,
#   * store / product names resolved through the lookup tables
#     (IDs missing from a table yield a missing name),
#   * rows lacking a store name, product name or revenue are discarded.
df_retail = (
    df_retail
    .assign(
        season=lambda frame: frame["month"].apply(get_season),
        store_name=lambda frame: frame["store_id"].map(STORE_LOOKUP),
        product_name=lambda frame: frame["product_id"].map(PRODUCT_LOOKUP),
    )
    .dropna(subset=["store_name", "product_name", "revenue"])
)

# ----------------------------- Save Silver ---------------------------------
# Serialise the cleaned frame to Parquet in memory and upload the resulting
# bytes to the silver key.
buffer = BytesIO()
df_retail.to_parquet(buffer, index=False, engine="pyarrow")

s3.put_object(Body=buffer.getvalue(), Bucket=BUCKET_NAME, Key=SILVER_KEY)

print(f"✅ Silver Retail gespeichert unter: s3://{BUCKET_NAME}/{SILVER_KEY}")


✅ Silver Retail gespeichert unter: s3://batch-bucket/silver/retail/retail_cleaned.parquet


In [1]:
import duckdb
import boto3
import pandas as pd
from io import BytesIO

import os

# ----------------------------- MinIO Config --------------------------------
# Connection settings are taken from the environment so that credentials do
# not have to be stored in the notebook. The fallbacks are the values that
# were previously hard-coded, so an unconfigured setup behaves as before.
MINIO_ENDPOINT   = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "password")
BUCKET_NAME      = "batch-bucket"
BRONZE_KEY       = "bronze/weather/raw_weather.parquet"      # object read as input
SILVER_KEY       = "silver/weather/weather_cleaned.parquet"  # object written as output

# ----------------------------- MinIO Client --------------------------------
# S3 client directed at the MinIO endpoint from the config section.
s3 = boto3.client(
    "s3",
    aws_secret_access_key=MINIO_SECRET_KEY,
    aws_access_key_id=MINIO_ACCESS_KEY,
    endpoint_url=MINIO_ENDPOINT,
)

# ----------------------------- Load Bronze Weather -------------------------
# Download the bronze object completely, then parse the bytes as Parquet.
obj = s3.get_object(Key=BRONZE_KEY, Bucket=BUCKET_NAME)
bronze_stream = BytesIO(obj["Body"].read())
df_weather = pd.read_parquet(bronze_stream)

# ----------------------------- Transformations -----------------------------
# Parse the "date" column into datetimes, then derive the year / month / day
# columns from the parsed values. `assign` evaluates the callables in order,
# so the later ones already see the parsed dates.
df_weather = df_weather.assign(
    date=lambda frame: pd.to_datetime(frame["date"]),
    year=lambda frame: frame["date"].dt.year,
    month=lambda frame: frame["date"].dt.month,
    day=lambda frame: frame["date"].dt.day,
)

# Season classification
def get_season(month):
    """Map a month number to its season label.

    Grouping: Dec-Feb = "Winter", Mar-May = "Spring", Jun-Aug = "Summer",
    Sep-Nov = "Autumn".

    Args:
        month: Month number 1-12. Floats with an integral value (e.g. 3.0)
            are accepted as well.

    Returns:
        The season name, or ``None`` when ``month`` is missing
        (None / NaN / NA) or is not a valid month number. Previously such
        values fell through to the final branch and were labelled "Autumn".
    """
    # Missing months (e.g. rows without a parsable date) must not be
    # assigned a season.
    if pd.isna(month):
        return None
    if month in (12, 1, 2):
        return "Winter"
    if month in (3, 4, 5):
        return "Spring"
    if month in (6, 7, 8):
        return "Summer"
    if month in (9, 10, 11):
        return "Autumn"
    # Anything else is not a calendar month.
    return None

# Label every row with the season derived from its month.
df_weather = df_weather.assign(
    season=lambda frame: frame["month"].apply(get_season)
)

# Temperature categories
def categorize_temperature(temp):
    """Bucket a temperature reading into a coarse category.

    Thresholds (upper bounds exclusive): below 0 -> "Freezing",
    below 10 -> "Cold", below 20 -> "Mild", otherwise "Warm".
    NOTE(review): the thresholds look like degrees Celsius - confirm
    against the bronze data.

    Args:
        temp: Numeric temperature value, or a missing value.

    Returns:
        The category name, or ``None`` when ``temp`` is missing
        (None / NaN / NA). Previously NaN failed every comparison and was
        labelled "Warm", and ``None`` raised ``TypeError``.
    """
    # A missing reading has no category.
    if pd.isna(temp):
        return None
    if temp < 0:
        return "Freezing"
    if temp < 10:
        return "Cold"
    if temp < 20:
        return "Mild"
    return "Warm"

# Label every row with the category of its temperature reading.
df_weather = df_weather.assign(
    temperature_category=lambda frame: frame["temperature"].apply(categorize_temperature)
)

# Wind categories (simplified Beaufort scale)
def categorize_wind(speed):
    """Bucket a wind speed reading into a coarse category.

    Thresholds (upper bounds exclusive): below 1 -> "Calm",
    below 5 -> "Light Breeze", below 11 -> "Breeze", below 19 -> "Windy",
    otherwise "Storm".
    NOTE(review): the unit of the thresholds is not stated here - confirm
    it matches the unit of the ``wind_speed`` column.

    Args:
        speed: Numeric wind speed, or a missing value.

    Returns:
        The category name, or ``None`` when ``speed`` is missing
        (None / NaN / NA). Previously NaN failed every comparison and was
        labelled "Storm", and ``None`` raised ``TypeError``.
    """
    # A missing reading has no category.
    if pd.isna(speed):
        return None
    if speed < 1:
        return "Calm"
    if speed < 5:
        return "Light Breeze"
    if speed < 11:
        return "Breeze"
    if speed < 19:
        return "Windy"
    return "Storm"

# Label every row with the category of its wind speed reading.
df_weather = df_weather.assign(
    wind_category=lambda frame: frame["wind_speed"].apply(categorize_wind)
)

# ----------------------------- Save Silver ---------------------------------
# Serialise the enriched frame to Parquet in memory and upload the resulting
# bytes to the silver key.
buffer = BytesIO()
df_weather.to_parquet(buffer, index=False, engine="pyarrow")

s3.put_object(Body=buffer.getvalue(), Bucket=BUCKET_NAME, Key=SILVER_KEY)

print(f"✅ Silver Weather gespeichert unter: s3://{BUCKET_NAME}/{SILVER_KEY}")


✅ Silver Weather gespeichert unter: s3://batch-bucket/silver/weather/weather_cleaned.parquet
