In [3]:
# import pyspark to process large files
from pyspark.sql import SparkSession

In [39]:
# start (or reuse) a local Spark session named 'process_tripdata'
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('process_tripdata')
    .getOrCreate()
)
print("Spark version: ", spark.version)

Spark version:  2.4.0


In [30]:
# load the 2019 FHV trip records; the first CSV line supplies the column names
fhv_data_2019 = spark.read.option("header", True).csv('csv/2019/fhv_data.csv')

In [31]:
# preview the first five rows of the raw 2019 FHV data
fhv_data_2019.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+--------------------+-------------------+-------------------+------------+------------+-------+
|              B00001|2019-01-01 00:30:00|2019-01-01 02:51:55|        null|        null|   null|
|              B00001|2019-01-01 00:45:00|2019-01-01 00:54:49|        null|        null|   null|
|              B00001|2019-01-01 00:15:00|2019-01-01 00:54:52|        null|        null|   null|
|              B00008|2019-01-01 00:19:00|2019-01-01 00:39:00|        null|        null|   null|
|              B00008|2019-01-01 00:27:00|2019-01-01 00:37:00|        null|        null|   null|
+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [34]:
# keep only rows where both the PU (pickup) and DO (dropoff) location IDs are present
filtered_fhv_data_2019 = fhv_data_2019.where(
    fhv_data_2019["PULocationID"].isNotNull()
    & fhv_data_2019["DOLocationID"].isNotNull()
)

In [35]:
# preview the first five rows that survived the null filter
filtered_fhv_data_2019.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+--------------------+-------------------+-------------------+------------+------------+-------+
|              B00254|2019-01-01 00:33:03|2019-01-01 01:37:24|         140|          52|   null|
|              B00254|2019-01-01 00:03:00|2019-01-01 00:34:25|         141|         237|   null|
|              B00254|2019-01-01 00:45:48|2019-01-01 01:26:01|         237|         236|   null|
|              B00254|2019-01-01 00:37:39|2019-01-01 01:44:59|         162|          85|   null|
|              B00254|2019-01-01 00:35:06|2019-01-01 01:30:21|         237|         246|   null|
+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [36]:
# remove the dispatching_base_num column, which is not used downstream
dropped_filtered_fhv_data_2019 = filtered_fhv_data_2019.drop("dispatching_base_num")

In [37]:
# preview the first five rows of the processed 2019 FHV data
dropped_filtered_fhv_data_2019.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2019-01-01 00:33:03|2019-01-01 01:37:24|         140|          52|   null|
|2019-01-01 00:03:00|2019-01-01 00:34:25|         141|         237|   null|
|2019-01-01 00:45:48|2019-01-01 01:26:01|         237|         236|   null|
|2019-01-01 00:37:39|2019-01-01 01:44:59|         162|          85|   null|
|2019-01-01 00:35:06|2019-01-01 01:30:21|         237|         246|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [40]:
# Write the processed 2019 FHV data back to CSV (with a header row).
# - Uses Spark's built-in csv writer rather than the legacy
#   "com.databricks.spark.csv" format name.
# - mode("overwrite") lets this cell be re-run: the default save mode raises an
#   error when the output path already exists.
# - The target path is a directory; repartition(1) collapses the data to a single
#   partition so that only one part file is written into it.
(
    dropped_filtered_fhv_data_2019
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("csv/2019/processed_fhv_data.csv", header=True)
)

In [41]:
# load the 2019 high-volume FHV trip records; the first CSV line supplies the column names
fhvhv_data_2019 = spark.read.option("header", True).csv('csv/2019/fhvhv_data.csv')

In [42]:
# preview the first five rows of the raw 2019 FHVHV data
fhvhv_data_2019.show(n=5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0003|              B02867|2019-02-01 00:05:18|2019-02-01 00:14:57|         245|         251|   null|
|           HV0003|              B02879|2019-02-01 00:41:29|2019-02-01 00:49:39|         216|         197|   null|
|           HV0005|              B02510|2019-02-01 00:51:34|2019-02-01 01:28:29|         261|         234|   null|
|           HV0005|              B02510|2019-02-01 00:03:51|2019-02-01 00:07:16|          87|          87|   null|
|           HV0005|              B02510|2019-02-01 00:09:44|2019-02-01 00:39:56|          87|         198|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows

In [43]:
# remove the two unused columns one at a time:
# first hvfhs_license_num, then dispatching_base_num
no_hvfhs_fhvhv_data_2019 = fhvhv_data_2019.drop("hvfhs_license_num")
dropped = no_hvfhs_fhvhv_data_2019.drop("dispatching_base_num")

In [44]:
# preview the first five rows once both columns have been dropped
dropped.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2019-02-01 00:05:18|2019-02-01 00:14:57|         245|         251|   null|
|2019-02-01 00:41:29|2019-02-01 00:49:39|         216|         197|   null|
|2019-02-01 00:51:34|2019-02-01 01:28:29|         261|         234|   null|
|2019-02-01 00:03:51|2019-02-01 00:07:16|          87|          87|   null|
|2019-02-01 00:09:44|2019-02-01 00:39:56|          87|         198|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [45]:
# keep only rows of `dropped` where both PU and DO location IDs are non-null
final_fhvhv = dropped.where(
    dropped["PULocationID"].isNotNull()
    & dropped["DOLocationID"].isNotNull()
)

In [46]:
# preview the first five rows of the final 2019 FHVHV data frame
final_fhvhv.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2019-02-01 00:05:18|2019-02-01 00:14:57|         245|         251|   null|
|2019-02-01 00:41:29|2019-02-01 00:49:39|         216|         197|   null|
|2019-02-01 00:51:34|2019-02-01 01:28:29|         261|         234|   null|
|2019-02-01 00:03:51|2019-02-01 00:07:16|          87|          87|   null|
|2019-02-01 00:09:44|2019-02-01 00:39:56|          87|         198|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [47]:
# Write the processed 2019 FHVHV data to CSV (with a header row).
# - Uses Spark's built-in csv writer rather than the legacy
#   "com.databricks.spark.csv" format name.
# - mode("overwrite") lets this cell be re-run: the default save mode raises an
#   error when the output path already exists.
# - The target path is a directory; repartition(1) collapses the data to a single
#   partition so that only one part file is written into it.
(
    final_fhvhv
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("csv/2019/processed_fhvhv_data.csv", header=True)
)

In [7]:
# load the 2020 FHV trip records; the first CSV line supplies the column names
fhv_2020_data = spark.read.option("header", True).csv('csv/2020/fhv_data.csv')

In [8]:
# preview the first five rows of the raw 2020 FHV data
fhv_2020_data.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+--------------------+-------------------+-------------------+------------+------------+-------+
|              B00001|2020-01-01 00:30:00|2020-01-01 01:44:00|         264|         264|   null|
|              B00001|2020-01-01 00:30:00|2020-01-01 00:47:00|         264|         264|   null|
|              B00009|2020-01-01 00:48:00|2020-01-01 01:19:00|         264|         264|   null|
|              B00009|2020-01-01 00:34:00|2020-01-01 00:43:00|         264|         264|   null|
|              B00009|2020-01-01 00:23:00|2020-01-01 00:32:00|         264|         264|   null|
+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [9]:
# remove the dispatching_base_num column from the 2020 FHV data
dropped_fhv_2020_data = fhv_2020_data.drop("dispatching_base_num")

In [10]:
# preview the first five rows after dropping the dispatching_base_num column
dropped_fhv_2020_data.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2020-01-01 00:30:00|2020-01-01 01:44:00|         264|         264|   null|
|2020-01-01 00:30:00|2020-01-01 00:47:00|         264|         264|   null|
|2020-01-01 00:48:00|2020-01-01 01:19:00|         264|         264|   null|
|2020-01-01 00:34:00|2020-01-01 00:43:00|         264|         264|   null|
|2020-01-01 00:23:00|2020-01-01 00:32:00|         264|         264|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [11]:
# keep only rows where both PU and DO location IDs are non-null
filtered_fhv_2020_data = dropped_fhv_2020_data.where(
    dropped_fhv_2020_data["PULocationID"].isNotNull()
    & dropped_fhv_2020_data["DOLocationID"].isNotNull()
)

In [13]:
# Write the filtered 2020 FHV data to CSV (with a header row).
# - Uses Spark's built-in csv writer rather than the legacy
#   "com.databricks.spark.csv" format name.
# - mode("overwrite") lets this cell be re-run: the default save mode raises an
#   error when the output path already exists.
# - The target path is a directory; repartition(1) collapses the data to a single
#   partition so that only one part file is written into it.
(
    filtered_fhv_2020_data
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("csv/2020/processed_fhv_data.csv", header=True)
)

In [14]:
# load the 2020 high-volume FHV trip records; the first CSV line supplies the column names
fhvhv_2020_data = spark.read.option("header", True).csv('csv/2020/fhvhv_data.csv')

In [15]:
# preview the first five rows of the raw 2020 FHVHV data
fhvhv_2020_data.show(n=5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0003|              B02864|2020-01-01 00:45:34|2020-01-01 01:02:20|         148|          90|   null|
|           HV0003|              B02682|2020-01-01 00:47:50|2020-01-01 00:53:23|         114|          79|   null|
|           HV0003|              B02764|2020-01-01 00:04:37|2020-01-01 00:21:49|           4|         125|   null|
|           HV0003|              B02764|2020-01-01 00:26:36|2020-01-01 00:33:00|         231|         113|   null|
|           HV0003|              B02764|2020-01-01 00:37:49|2020-01-01 00:46:59|         114|         144|   null|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows

In [17]:
# remove the hvfhs_license_num column from the 2020 FHVHV data
dropped_hvfhs = fhvhv_2020_data.drop("hvfhs_license_num")

In [18]:
# preview the first five rows after dropping hvfhs_license_num
dropped_hvfhs.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+--------------------+-------------------+-------------------+------------+------------+-------+
|              B02864|2020-01-01 00:45:34|2020-01-01 01:02:20|         148|          90|   null|
|              B02682|2020-01-01 00:47:50|2020-01-01 00:53:23|         114|          79|   null|
|              B02764|2020-01-01 00:04:37|2020-01-01 00:21:49|           4|         125|   null|
|              B02764|2020-01-01 00:26:36|2020-01-01 00:33:00|         231|         113|   null|
|              B02764|2020-01-01 00:37:49|2020-01-01 00:46:59|         114|         144|   null|
+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [19]:
# remove the dispatching_base_num column as well
dropped_disp_col = dropped_hvfhs.drop("dispatching_base_num")

In [20]:
# preview the first five rows after dropping the second column
dropped_disp_col.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2020-01-01 00:45:34|2020-01-01 01:02:20|         148|          90|   null|
|2020-01-01 00:47:50|2020-01-01 00:53:23|         114|          79|   null|
|2020-01-01 00:04:37|2020-01-01 00:21:49|           4|         125|   null|
|2020-01-01 00:26:36|2020-01-01 00:33:00|         231|         113|   null|
|2020-01-01 00:37:49|2020-01-01 00:46:59|         114|         144|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [21]:
# filter out rows where the PU or DO location ID is null
filtered_fhvhv_2020_data = dropped_disp_col.where(
    dropped_disp_col["PULocationID"].isNotNull()
    & dropped_disp_col["DOLocationID"].isNotNull()
)

In [22]:
# preview the first five rows that remain after the null filter
filtered_fhvhv_2020_data.show(n=5)

+-------------------+-------------------+------------+------------+-------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-------------------+-------------------+------------+------------+-------+
|2020-01-01 00:45:34|2020-01-01 01:02:20|         148|          90|   null|
|2020-01-01 00:47:50|2020-01-01 00:53:23|         114|          79|   null|
|2020-01-01 00:04:37|2020-01-01 00:21:49|           4|         125|   null|
|2020-01-01 00:26:36|2020-01-01 00:33:00|         231|         113|   null|
|2020-01-01 00:37:49|2020-01-01 00:46:59|         114|         144|   null|
+-------------------+-------------------+------------+------------+-------+
only showing top 5 rows



In [24]:
# Write the filtered 2020 FHVHV data to CSV (with a header row).
# - Uses Spark's built-in csv writer rather than the legacy
#   "com.databricks.spark.csv" format name.
# - mode("overwrite") lets this cell be re-run: the default save mode raises an
#   error when the output path already exists.
# - The target path is a directory; repartition(1) collapses the data to a single
#   partition so that only one part file is written into it.
(
    filtered_fhvhv_2020_data
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("csv/2020/processed_fhvhv_data.csv", header=True)
)