# Data Reading

In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

builder = (
    SparkSession.builder
        .appName("Silver")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [2]:
#importing modules
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
#Reading trip type CSV data

df_trip_type = spark.read\
                        .format("delta")\
                        .load(r"C:\Users\gauth\Desktop\Extended Meddalion Architecture\storage_bronze\trip_type")


In [4]:
df_trip_type.show()

+---------+-----------+
|trip_type|description|
+---------+-----------+
|        1|Street-hail|
|        2|   Dispatch|
+---------+-----------+



In [5]:
#reading trip zone CSV data

df_trip_zone = spark.read\
                        .format("delta")\
                        .load(r"C:\Users\gauth\Desktop\Extended Meddalion Architecture\storage_bronze\trip_zone")


In [6]:
df_trip_zone.show(truncate=False)

+----------+-------------+-----------------------+------------+
|LocationID|Borough      |Zone                   |service_zone|
+----------+-------------+-----------------------+------------+
|1         |EWR          |Newark Airport         |EWR         |
|2         |Queens       |Jamaica Bay            |Boro Zone   |
|3         |Bronx        |Allerton/Pelham Gardens|Boro Zone   |
|4         |Manhattan    |Alphabet City          |Yellow Zone |
|5         |Staten Island|Arden Heights          |Boro Zone   |
|6         |Staten Island|Arrochar/Fort Wadsworth|Boro Zone   |
|7         |Queens       |Astoria                |Boro Zone   |
|8         |Queens       |Astoria Park           |Boro Zone   |
|9         |Queens       |Auburndale             |Boro Zone   |
|10        |Queens       |Baisley Park           |Boro Zone   |
|11        |Brooklyn     |Bath Beach             |Boro Zone   |
|12        |Manhattan    |Battery Park           |Yellow Zone |
|13        |Manhattan    |Battery Park C

In [7]:
df_trip_data = spark.read\
                        .format("delta") \
                        .load(r"C:\Users\gauth\Desktop\Extended Meddalion Architecture\storage_bronze\trip_data")

In [8]:
df_trip_data.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|cbd_congestion_fee|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+------------------+
|       2| 2025-03-01 00:07:34|  2025-03-01 00:24:52|                 N|         1|          75|         2

# Data Transformations

### Trip Type

In [9]:
# description column is renamed to trip_description

df_trip_type = df_trip_type.withColumnRenamed("description","trip_description")

In [10]:
df_trip_type.show()

+---------+----------------+
|trip_type|trip_description|
+---------+----------------+
|        1|     Street-hail|
|        2|        Dispatch|
+---------+----------------+



In [11]:
# writing trip type data to silver layer

df_trip_type.write.format("delta")\
                    .mode("overwrite")\
                    .option("path",r"C:\Users\gauth\Desktop\Extended Meddalion Architecture\storage_silver\trip_type")\
                    .save()

### Trip Zone

In [12]:
df_trip_zone.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [13]:
''' 
There are two zones in the zone column in some of the records, 
so we are splitting the zone column into two columns. 
Records which have only one zone will have `null` in the second column.
'''
df_trip_zone = df_trip_zone \
    .withColumn("zone1", expr("get(split(Zone, '/'), 0)")) \
    .withColumn("zone2", expr("get(split(Zone, '/'), 1)"))


In [14]:
df_trip_zone = df_trip_zone.drop("Zone")

In [15]:
df_trip_zone.show()

+----------+-------------+------------+-----------------+--------------+
|LocationID|      Borough|service_zone|            zone1|         zone2|
+----------+-------------+------------+-----------------+--------------+
|         1|          EWR|         EWR|   Newark Airport|          NULL|
|         2|       Queens|   Boro Zone|      Jamaica Bay|          NULL|
|         3|        Bronx|   Boro Zone|         Allerton|Pelham Gardens|
|         4|    Manhattan| Yellow Zone|    Alphabet City|          NULL|
|         5|Staten Island|   Boro Zone|    Arden Heights|          NULL|
|         6|Staten Island|   Boro Zone|         Arrochar|Fort Wadsworth|
|         7|       Queens|   Boro Zone|          Astoria|          NULL|
|         8|       Queens|   Boro Zone|     Astoria Park|          NULL|
|         9|       Queens|   Boro Zone|       Auburndale|          NULL|
|        10|       Queens|   Boro Zone|     Baisley Park|          NULL|
|        11|     Brooklyn|   Boro Zone|       Bath 

In [16]:
# write trip zone data to silver layer
df_trip_zone.write.format("delta")\
                    .mode("overwrite")\
                    .option("path",r"C:\Users\gauth\Desktop\Extended Meddalion Architecture\storage_silver\trip_zone")\
                    .save()

### Trip Data

In [17]:
df_trip_data.count()

543139

In [18]:
df_trip_data.summary().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------+---------------------+------------------+-------------------+-------------------+--------------------+-------------------+
|summary|          VendorID|store_and_fwd_flag|        RatecodeID|      PULocationID|      DOLocationID|   passenger_count|     trip_distance|       fare_amount|             extra|            mta_tax|        tip_amount|      tolls_amount|ehail_fee|improvement_surcharge|      total_amount|       payment_type|          trip_type|congestion_surcharge| cbd_congestion_fee|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+---------+----------

In [21]:
df_trip_data.select([
    approx_count_distinct(c).alias(c)
    for c in df_trip_data.columns
]).show()


+--------+----------+---------+------------+------------+-------------+-----------+------------+
|VendorID|trip_month|trip_year|PULocationID|DOLocationID|trip_distance|fare_amount|total_amount|
+--------+----------+---------+------------+------------+-------------+-----------+------------+
|       3|        12|        2|         255|         262|         3333|       5706|        9730|
+--------+----------+---------+------------+------------+-------------+-----------+------------+



In [19]:
# convert timestamp to date, moth and year, and add them as columns
df_trip_data = df_trip_data.withColumn('trip_date',to_date('lpep_pickup_datetime'))\
                           .withColumn('trip_year',year('trip_date'))\
                           .withColumn('trip_month',month('trip_date'))
df_trip_data.show()

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+------------------+----------+---------+----------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|cbd_congestion_fee| trip_date|trip_year|trip_month|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+------------------+----------+---------+----------+
|       2|

In [20]:
df_trip_data = df_trip_data.select('VendorID','trip_month','trip_year','PULocationID','DOLocationID','trip_distance','fare_amount','total_amount')
df_trip_data.show()

+--------+----------+---------+------------+------------+-------------+-----------+------------+
|VendorID|trip_month|trip_year|PULocationID|DOLocationID|trip_distance|fare_amount|total_amount|
+--------+----------+---------+------------+------------+-------------+-----------+------------+
|       2|         5|     2025|          25|         216|         9.34|       44.3|        46.8|
|       2|         5|     2025|         160|         129|         2.95|       16.3|        18.8|
|       1|         5|     2025|         260|         179|          3.0|       18.4|        20.9|
|       2|         5|     2025|         130|         216|         1.61|        9.3|        11.8|
|       2|         5|     2025|         244|         151|         3.44|       15.6|       22.62|
|       2|         4|     2025|          42|          41|         0.66|        6.5|        11.0|
|       2|         5|     2025|         240|         265|         1.63|        9.3|        11.8|
|       2|         5|     2025

In [0]:
# filter out illogical negative values 
df_trip_data = df_trip_data \
                .filter(col("fare_amount") > lit(0)) \
                .filter(col("total_amount") > lit(0)) \
                .filter(col("trip_distance") > lit(0)) \
                .filter


In [0]:
df_trip_data.display()

VendorID,trip_date,PULocationID,DOLocationID,trip_distance,fare_amount,total_amount
2,2023-03-01,82,196,2.36,13.5,16.0
2,2023-03-01,7,7,0.78,6.5,9.0
2,2023-02-28,166,74,1.66,11.4,16.68
2,2023-03-01,236,229,3.14,15.6,25.02
2,2023-03-01,75,235,5.69,23.3,29.8
2,2023-03-01,260,160,2.92,17.7,24.24
2,2023-03-01,244,41,3.34,16.3,24.44
2,2023-03-01,83,7,1.75,10.7,15.84
2,2023-03-01,223,223,0.74,5.1,9.12
2,2023-03-01,260,260,0.63,6.5,10.8


In [0]:

df_trip_data.write.format("parquet")\
                  .mode("overwrite")\
                  .option("path","abfss://silver@capstoneprojectstoregt.dfs.core.windows.net/green_taxi_2023")\
                  .save()
