# Bronze Layer

In [0]:
%run ../config/setup

In [0]:
import io
import os
import zipfile

import requests
from pyspark.sql.functions import col, current_timestamp, input_file_name

def download_data_to_volume(url, target_dir, timeout=(10, 120)):
    """Download a zip archive from ``url`` and extract it into ``target_dir``.

    When the notebook-level ``env`` equals ``'dev'`` and ``target_dir`` is a
    directory that already contains at least one entry, the download is
    skipped to save time.

    Args:
        url: Location of the zip archive to fetch with an HTTP GET.
        target_dir: Directory that receives the extracted files (a Volume
            path in this notebook). Created if it does not exist.
        timeout: Passed straight to ``requests.get``; either a single number
            or a ``(connect, read)`` tuple, in seconds.

    Raises:
        RuntimeError: If the response status code is not 200.
        Exception: Any other error raised while downloading or extracting is
            printed and then re-raised unchanged.
    """
    # In 'dev', skip the download only when data is actually present. A
    # directory that exists but is empty (e.g. left behind by a failed run)
    # must not count as "data already exists".
    if env == 'dev' and os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f"Skipping download in {env}. Data already exists.")
        return

    print(f"Downloading from {url}...")
    try:
        # Without a timeout a stalled connection would block the job forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            raise RuntimeError(
                f"Download failed with status code: {response.status_code}"
            )

        # Ensure the target directory exists in the Volume.
        os.makedirs(target_dir, exist_ok=True)

        # The whole archive is held in memory; the context manager makes sure
        # the ZipFile handle is released even if extraction fails.
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(target_dir)
        print(f"Download and extraction complete at {target_dir}")
    except Exception as e:
        print(f"Error during ingestion: {str(e)}")
        # Bare raise keeps the original traceback intact.
        raise

# Fetch the raw archive into the landing path. `raw_data_url` and
# `landing_path` are not defined in this notebook section; presumably they
# come from the setup notebook run above -- confirm there.
download_data_to_volume(url=raw_data_url, target_dir=landing_path)



### Read Raw Data from txt file

In [0]:
# Path of the extracted text file inside the landing directory.
csv_file_path = f"{landing_path}/household_power_consumption.txt"

# Reader settings: the file has a header row and uses ';' as the separator.
# Schema inference is switched off so every column is kept as a string,
# which avoids schema errors on load.
csv_read_options = {
    "header": "true",
    "delimiter": ";",
    "inferSchema": "false",
}

raw_reader = spark.read.format("csv").options(**csv_read_options)
df_raw = raw_reader.load(csv_file_path)

### Add metadata

In [0]:
# Append lineage columns to the raw rows:
#   _ingestion_timestamp -- value of current_timestamp() for this load
#   _source_file         -- the `_metadata.file_path` column of the file
#                           source, i.e. the file each row was read from
# `col` comes from pyspark.sql.functions and has to be imported alongside
# `current_timestamp`; otherwise this cell fails with a NameError.
df_bronze = (
    df_raw
    .withColumn("_ingestion_timestamp", current_timestamp())
    .withColumn("_source_file", col("_metadata.file_path"))
)

### Write to bronze layer

In [0]:
# Persist the bronze DataFrame as a Delta table, replacing any previous
# contents of the table named by `full_path_bronze`.
bronze_writer = df_bronze.write.format("delta").mode("overwrite")
bronze_writer.saveAsTable(full_path_bronze)
print(f"Table saved: {full_path_bronze}")