In [None]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Session settings that register the Delta SQL extension and the Delta catalog.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

builder = SparkSession.builder.appName("Bronze")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created (or an already running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Trip Type

In [None]:
# Read the raw trip_type CSV data with the header and schema-inference options enabled.
trip_type_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df_trip_type = trip_type_reader.load(
    r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_raw\trip_type"
)

In [None]:
# Write the trip_type data to the bronze location in Delta format, using overwrite mode.
trip_type_writer = df_trip_type.write.format("delta").mode("overwrite")
trip_type_writer.option(
    "path",
    r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_bronze\trip_type",
).save()

### Trip Zone

In [None]:
# Read the raw trip_zone CSV data with the header and schema-inference options enabled.
trip_zone_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df_trip_zone = trip_zone_reader.load(
    r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_raw\trip_zone"
)


In [None]:
# Write the trip_zone data to the bronze location in Delta format, using overwrite mode.
trip_zone_writer = df_trip_zone.write.format("delta").mode("overwrite")
trip_zone_writer.option(
    "path",
    r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_bronze\trip_zone",
).save()

### Trip Data

In [7]:
# Target column names and types for the trip data, expressed as a DDL string.
trip_data_ddl = """
    VendorID BIGINT,
    lpep_pickup_datetime TIMESTAMP,
    lpep_dropoff_datetime TIMESTAMP,
    store_and_fwd_flag STRING,
    RatecodeID BIGINT,
    PULocationID BIGINT,
    DOLocationID BIGINT,
    passenger_count BIGINT,
    trip_distance DOUBLE,
    fare_amount DOUBLE,
    extra DOUBLE,
    mta_tax DOUBLE,
    tip_amount DOUBLE,
    tolls_amount DOUBLE,
    ehail_fee DOUBLE,
    improvement_surcharge DOUBLE,
    total_amount DOUBLE,
    payment_type BIGINT,
    trip_type BIGINT,
    congestion_surcharge DOUBLE
"""

# fromDDL is invoked on the class directly; no StructType instance is needed.
mySchema = StructType.fromDDL(trip_data_ddl)


In [None]:
import os

raw_path = r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_raw\trip_data"

# Accumulate every raw file into ONE DataFrame. Reassigning df_trip_data on each
# iteration would leave only the last listed file's rows available to any
# later write, silently dropping all other files.
df_trip_data = None

# sorted() makes the processing order deterministic. Entries whose names start
# with "_" or "." (e.g. _SUCCESS markers, .crc checksum files) are skipped
# because they are not data files.
for file in sorted(os.listdir(raw_path)):
    if file.startswith(("_", ".")):
        continue

    full_path = os.path.join(raw_path, file)

    df_file = spark.read.parquet(full_path)

    # Cast every column according to target schema so that all files share the
    # same types for these columns before they are combined.
    for field in mySchema.fields:
        df_file = df_file.withColumn(field.name, col(field.name).cast(field.dataType))

    if df_trip_data is None:
        df_trip_data = df_file
    else:
        # Align columns by name rather than position; a column present in only
        # one of the two frames is filled with nulls on the other side.
        df_trip_data = df_trip_data.unionByName(df_file, allowMissingColumns=True)

# Fail here with a clear message instead of a NameError/AttributeError later on.
if df_trip_data is None:
    raise FileNotFoundError(f"No trip data files found in {raw_path}")

In [None]:

# Write the trip data to the bronze location in Delta format, using overwrite mode.
trip_data_writer = df_trip_data.write.format("delta").mode("overwrite")
trip_data_writer.option(
    "path",
    r"C:\Users\gauth\Desktop\Extended Medallion Architecture\storage_bronze\trip_data",
).save()