In [1]:
# import utils
from utils import ingest_landing, load_bronze, sanitize_columns

In [2]:
# Connect to Spark and Minio
import os

from minio import Minio
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pyspark.sql.functions as f

# Reuse the already-running Spark session when there is one (getOrCreate),
# otherwise start a new session named "Jupyter".
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# MinIO connection settings.
# Values are taken from the environment when present so real credentials never
# have to be saved in the notebook; the fallbacks are the previous hard-coded
# local-development values, so behaviour is unchanged when nothing is set.
access_key = os.environ.get("MINIO_ACCESS_KEY", "admin")
secret_key = os.environ.get("MINIO_SECRET_KEY", "password")
# Single source of truth for the endpoint: the client wants "host:port",
# while minio_api_host keeps the full URL form that was defined before.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "minio:9000")
minio_api_host = f"http://{minio_endpoint}"
# secure=False -> plain HTTP, matching the http:// scheme above.
minio_client = Minio(minio_endpoint, access_key=access_key, secret_key=secret_key, secure=False)

# Display the session summary as the cell output.
spark

24/08/05 08:41:41 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Ingestion Step - Landing Zone - taxi_zone_lookup
# The arguments are gathered in one mapping so the source URL and both
# destinations can be read at a glance.
# NOTE(review): presumably ingest_landing fetches `src` to `local_dest` and then
# uploads it to MinIO under `minio_dest` - confirm against utils.ingest_landing.
landing_args = {
    "src": "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv",
    "local_dest": "/home/iceberg/data/taxi_zone_lookup.csv",
    "minio_dest": "taxi_zone_lookup/taxi_zone_lookup.csv",
    "minio_client": minio_client,
}
ingest_landing(**landing_args)

Bucket already exists
Ingestion Successful


In [4]:
# Loading Step - Bronze Zone - taxi zone lookup
# Same file that the landing step produced, loaded into bronze.taxi_zone_lookup.
# NOTE(review): the exact load semantics (overwrite vs. append, added columns)
# live in utils.load_bronze and are not visible here - confirm there.
bronze_load_args = dict(
    local_src="/home/iceberg/data/taxi_zone_lookup.csv",
    minio_src="taxi_zone_lookup/taxi_zone_lookup.csv",
    dest_table="bronze.taxi_zone_lookup",
    file_type="csv",
    spark=spark,
    minio_client=minio_client,
)
load_bronze(**bronze_load_args)

                                                                                

In [5]:
# Silver Zone - schema definition - taxi zone lookup
# The statement uses IF NOT EXISTS, so re-running this cell is a no-op once the
# table is there. The target file size property is 5242880 bytes (5 MiB).
silver_taxi_zone_lookup_ddl = """
CREATE TABLE IF NOT EXISTS silver.taxi_zone_lookup (
  `location_id` BIGINT,
  `borough` STRING,
  `zone` STRING,
  `service_zone` STRING,
  `ingest_time` TIMESTAMP
  )
USING iceberg
TBLPROPERTIES(
  'write.target-file-size-bytes'='5242880'
)
"""

spark.sql(silver_taxi_zone_lookup_ddl)

DataFrame[]

In [7]:
# Silver Zone - transform and incremental load - taxi zone lookup
#
# Reads bronze.taxi_zone_lookup, normalises column names and the key type, keeps
# only the location_ids that silver does not have yet, stamps them with the
# current time and appends them to silver.taxi_zone_lookup.

# sanitize column names (bronze header -> silver column name)
columns_to_rename = {
    "LocationID": "location_id",
    "Borough": "borough",
    "Zone": "zone",
}

bronze_transform = sanitize_columns(
    spark.read.table("bronze.taxi_zone_lookup"), columns_to_rename
)

# silver declares location_id as BIGINT, so align the key type before comparing
bronze_transform = bronze_transform.withColumn(
    "location_id", f.col("location_id").cast("bigint")
)

# deduplicate against silver
# An anti join keeps exactly the bronze rows whose key is absent from silver.
# The previous union + "count(*) == 1" approach also kept rows that exist ONLY
# in silver (re-appending them as duplicates) and silently dropped new keys that
# occurred more than once in bronze. dropDuplicates collapses such repeats to a
# single row; eqNullSafe makes a NULL key match a NULL key so it is not
# re-inserted on every run.
silver = spark.read.table("silver.taxi_zone_lookup")
bronze_transform = (
    bronze_transform
    .dropDuplicates(["location_id"])
    .alias("b")
    .join(
        silver.select("location_id").alias("s"),
        on=f.col("b.location_id").eqNullSafe(f.col("s.location_id")),
        how="left_anti",  # returns only the left (bronze) columns
    )
)

# if transformed df is not empty then write to silver
if bronze_transform.head(1):
    # add ingestion time for silver
    bronze_transform = bronze_transform.withColumn("ingest_time", f.current_timestamp())
    bronze_transform.writeTo("silver.taxi_zone_lookup").append()
    print("Records written to silver.")
else:
    print("No new records to write to silver.")

No new records to write to silver.
