In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import hour, dayofweek, month, date_trunc, col

In [20]:
# Configure the Spark session that runs every job in this notebook
session_builder = (
    SparkSession.builder
    .appName("MAST30034 Tutorial 1")
    # Eager evaluation in the REPL/notebook
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Timestamps are interpreted in UTC for the whole session
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
)
# Reuse an already-running session if there is one, otherwise start a new one
spark = session_builder.getOrCreate()

In [21]:
# Load the raw TLC trip records (parquet) found under ../data/raw/tlc_data/
tlc_all = spark.read.parquet('../data/raw/tlc_data/')

## Feature Engineering

In [22]:
# Derive calendar features from the pickup timestamp
tlc_all = (
    tlc_all
    # Pickup time truncated to the start of its hour
    .withColumn("hourly_timestamp", date_trunc("hour", col("pickup_datetime")))
    .withColumn("pickup_hour_of_day", hour("pickup_datetime"))
    .withColumn("pickup_day_of_week", dayofweek("pickup_datetime"))
    # dayofweek numbers Sunday as 1 and Saturday as 7
    .withColumn("is_weekend", col("pickup_day_of_week").isin(1, 7))
    .withColumn("pickup_month", month("pickup_datetime"))
)
tlc_all.show(5)

+-----------------+--------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+-------------------+------------------+------------------+----------+------------+
|hvfhs_license_num|dispatching_base_num|   request_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|   hourly_timestamp|pickup_hour_of_day|pickup_day_of_week|is_weekend|pickup_month|
+-----------------+--------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+----

In [23]:
# Add the waiting time: pickup time minus request time, computed as a
# difference of Unix timestamps (i.e. in seconds)
tlc_all = tlc_all.withColumn("waiting_time", unix_timestamp("pickup_datetime") - unix_timestamp("request_datetime"))

In [24]:
# Load the taxi zone lookup table; the first CSV row holds the column names
zones = spark.read.option("header", True).csv("../data/taxi_zones/taxi+_zone_lookup.csv")
zones.show(n=10)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
+----------+-------------+--------------------+------------+


### Aggregating the movement data, to use in movement_plot.ipynb

In [25]:
# Look up the pickup borough and zone name in the taxi zone lookup table
tlc_all = (
    tlc_all.alias("tlc")
    .join(zones.alias("zone"), on=tlc_all.PULocationID == zones.LocationID, how='left')
    .select("tlc.*", "zone.Borough", "zone.Zone")
    .withColumnRenamed("Borough", "pickup_borough")
)
# Repeat the lookup for the dropoff location; only the borough is kept this time
tlc_all = (
    tlc_all.alias("tlc")
    .join(zones.alias("zone"), on=tlc_all.DOLocationID == zones.LocationID, how='left')
    .select("tlc.*", "zone.Borough")
    .withColumnRenamed("Borough", "dropoff_borough")
)

In [26]:
# Flag pickups at an airport: true when the pickup zone name contains the
# substring "Airport" (the match is case-sensitive)
tlc_all = tlc_all.withColumn("pickup_at_airport", col("Zone").contains("Airport"))

In [27]:
# Count trips for every (pickup borough, dropoff borough, pickup hour) combination.
# GroupedData.count() names its output column "count", so the rename below does not
# depend on Spark's auto-generated aggregate name ("count(1)"); withColumnRenamed
# silently does nothing when the source column is missing.
movement_aggregates = (
    tlc_all
    .groupBy('pickup_borough', 'dropoff_borough', 'pickup_hour_of_day')
    .count()
    .withColumnRenamed('count', 'num_trips')
)

In [28]:
# Save the aggregates as parquet, replacing any output from a previous run
movement_aggregates.write.mode('overwrite').parquet('../data/movement_aggregates')

                                                                                

In [29]:
# Hourly demand per pickup zone: one row per (zone, hour) with the number of trips.
# The remaining grouping columns are carried along so they stay available downstream.
# GroupedData.count() names its output column "count", so the rename does not depend
# on Spark's auto-generated aggregate name ("count(1)").
demand_aggregate = (
    tlc_all
    .groupBy(
        'PULocationID',
        'hourly_timestamp',
        'pickup_hour_of_day',
        'pickup_day_of_week',
        'pickup_month',
        'pickup_borough',
        'is_weekend',
        "pickup_at_airport",
    )
    .count()
    .withColumnRenamed('count', 'num_trips')
    # NOTE(review): this ordering may not survive the joins applied later - confirm it is needed
    .orderBy('hourly_timestamp')
)

### Combining with LOB data

In [30]:
# Load the LOB data (parquet) from ../data/raw/lob_data/
# NOTE(review): the name `subway_df` looks misleading - the path says LOB data and the
# `count` column is later renamed to pickup_num_businesses; consider renaming the variable
subway_df = spark.read.parquet("../data/raw/lob_data/")
subway_df.show(5)

+-----+----------+
|count|LocationID|
+-----+----------+
| 4996|         3|
| 3132|         4|
| 1753|         5|
| 2792|         6|
|21999|         7|
+-----+----------+


In [31]:
# Attach the per-zone business count to each demand row, keyed on the pickup zone.
# Left join: demand rows without a matching LocationID are kept with a null count.
combined_df = (
    demand_aggregate
    .alias("tlc")
    .join(
        subway_df
        .withColumnRenamed("count", "pickup_num_businesses")
        .withColumnRenamed("LocationID", "PULocationID"),
        on="PULocationID",
        how='left',
    )
)

### Combining with weather data

In [32]:
# Load the hourly weather observations; the first CSV row holds the column names
weather_df = spark.read.option("header", True).csv("../data/landing/weather/hourly_weather.csv")
# Attach the weather for each pickup hour. Left join: demand rows are kept even
# when no weather row matches their hourly timestamp.
# NOTE(review): no schema is given for the CSV, so `date` is presumably read as a
# string and compared with a timestamp column - confirm the implicit cast is intended
combined_df = (
    combined_df
    .alias("tlc")
    .join(
        weather_df.withColumnRenamed("date", "hourly_timestamp"),
        on="hourly_timestamp",
        how='left',
    )
)

In [33]:
# Convert the weather measurements to doubles in a single pass
combined_df = combined_df.withColumns({
    weather_column: col(weather_column).cast("double")
    for weather_column in (
        "temperature_2m",
        "relative_humidity_2m",
        "rain",
        "snowfall",
        "wind_speed_10m",
    )
})

In [34]:
# Check the column types after the joins and the casts to double
combined_df.printSchema()

root
 |-- hourly_timestamp: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- pickup_hour_of_day: integer (nullable = true)
 |-- pickup_day_of_week: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- is_weekend: boolean (nullable = true)
 |-- pickup_at_airport: boolean (nullable = true)
 |-- num_trips: long (nullable = false)
 |-- pickup_num_businesses: long (nullable = true)
 |-- temperature_2m: double (nullable = true)
 |-- relative_humidity_2m: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- snowfall: double (nullable = true)
 |-- wind_speed_10m: double (nullable = true)


In [35]:
# Preview the first five rows of the combined dataset
combined_df.show(5)



+-------------------+------------+------------------+------------------+------------+--------------+----------+-----------------+---------+---------------------+--------------+--------------------+----+--------+--------------+
|   hourly_timestamp|PULocationID|pickup_hour_of_day|pickup_day_of_week|pickup_month|pickup_borough|is_weekend|pickup_at_airport|num_trips|pickup_num_businesses|temperature_2m|relative_humidity_2m|rain|snowfall|wind_speed_10m|
+-------------------+------------+------------------+------------------+------------+--------------+----------+-----------------+---------+---------------------+--------------+--------------------+----+--------+--------------+
|2022-03-01 02:00:00|          28|                 2|                 3|           3|        Queens|     false|            false|        8|                 6571|    -3.2215002|           69.164276| 0.0|     0.0|      8.707237|
|2022-03-01 02:00:00|           9|                 2|                 3|           3|       

                                                                                

In [36]:
# Drop every row that has a null in any column.
# NOTE(review): the zone, LOB and weather joins are all left joins, so this also
# removes demand rows that had no match in one of those tables - confirm that is intended
combined_df = combined_df.dropna()

In [37]:
# Save the final combined dataset as parquet, replacing any output from a previous run
combined_df.write.mode('overwrite').parquet('../data/combined_data')

                                                                                