In [3]:
import pyspark
from pyspark.sql import SparkSession

In [5]:
# Configure a local-mode session named 'test', then create it (or reuse
# an already running one via getOrCreate).
session_builder = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
)
spark = session_builder.getOrCreate()

24/03/06 18:06:06 WARN Utils: Your hostname, Davids-MacBook-Pro-3.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
24/03/06 18:06:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/06 18:06:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the gzipped FHV trip CSV, taking column names from the header row.
# No schema is supplied here.
df = (
    spark.read
    .option("header", "true")
    .csv('fhv_tripdata_2019-10.csv.gz')
)

In [6]:
# Preview the first five rows of the raw trip data.
df.show(n=5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropOff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B00009|2019-10-01 00:23:00|2019-10-01 00:35:00|         264|         264|   NULL|                B00009|
|              B00013|2019-10-01 00:11:29|2019-10-01 00:13:22|         264|         264|   NULL|                B00013|
|              B00014|2019-10-01 00:11:43|2019-10-01 00:37:20|         264|         264|   NULL|                B00014|
|              B00014|2019-10-01 00:56:29|2019-10-01 00:57:47|         264|         264|   NULL|                B00014|
|              B00014|2019-10-01 00:23:09|2019-10-01 00:28:27|         264|         264|   NULL|                B00014|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+

## Question 2: Average size of the Parquet file

In [7]:
# Write the trip data as 6 parquet partitions.
# mode('overwrite') makes this cell safe to re-run (e.g. Restart & Run All):
# without an explicit save mode the write raises once the output directory
# already exists from a previous run.
df.repartition(6).write.mode('overwrite').parquet('../../data/pq/fhv/2019/10')

                                                                                

## Question 3: How many taxi trips were there on the 15th of October?

In [6]:
# Read the trip data back from the parquet output directory.
df = spark.read.format('parquet').load('../../data/pq/fhv/2019/10')

In [8]:
# Count trips whose pickup_datetime starts with the 15 October 2019 date prefix.
picked_up_on_oct_15 = df['pickup_datetime'].like('2019-10-15%')
df.where(picked_up_on_oct_15).count()

62610

## Question 4: What is the length of the longest trip in the dataset in hours?

In [33]:
# Start Question 4 from a fresh read of the parquet data.
fhv_parquet_dir = '../../data/pq/fhv/2019/10'
df = spark.read.parquet(fhv_parquet_dir)

In [34]:
# Cast both datetime columns to Spark timestamps, one column at a time.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df = df.withColumn(ts_col, df[ts_col].cast('timestamp'))

In [35]:
# trip_duration = dropoff minus pickup, each cast to long first
# (for timestamp columns that cast is epoch seconds, so this is in seconds).
dropoff_as_long = df['dropoff_datetime'].cast('long')
pickup_as_long = df['pickup_datetime'].cast('long')
df = df.withColumn('trip_duration', dropoff_as_long - pickup_as_long)

In [36]:
# Find the largest trip_duration and express it in hours.
# Note: max_trip_seconds is a one-row DataFrame, not a number; a later
# cell collects from it again, so it is kept under this name.
max_trip_seconds = df.agg({'trip_duration': 'max'})
max_row = max_trip_seconds.collect()[0]
max_trip_hours = max_row[0] / 3600  # 3600 seconds per hour
print(max_trip_hours)

631152.5


In [37]:
# Show every trip whose duration equals the maximum found above.
longest_duration = max_trip_seconds.collect()[0][0]
is_longest = df['trip_duration'] == longest_duration
df.filter(is_longest).show()

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PUlocationID|DOlocationID|SR_Flag|Affiliated_base_number|trip_duration|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-------------+
|              B02832|2019-10-11 18:00:00|2091-10-11 18:30:00|         264|         264|   NULL|                B02832|   2272149000|
|              B02832|2019-10-28 09:00:00|2091-10-28 09:30:00|         264|         264|   NULL|                B02832|   2272149000|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+-------------+


## Question 5: Least frequent pickup location zone

Using the zone lookup data and the FHV October 2019 data, what is the name of the LEAST frequent pickup location Zone?

In [38]:
# Load the zone lookup table from its parquet directory.
zones_path = '../../installation_test/zones'
zones_df = spark.read.parquet(zones_path)

In [39]:
# Preview the first five rows of the zone lookup table.
zones_df.show(n=5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+


In [40]:
# Re-read the parquet data so `df` no longer carries the columns
# cast or added in the Question 4 cells.
df = spark.read.format('parquet').load('../../data/pq/fhv/2019/10')

In [41]:
# Trips per pickup location id, least frequent first (top 5 shown).
pickup_counts = df.groupBy('PULocationID').count()
pickup_counts.orderBy('count').show(5)



+------------+-----+
|PULocationID|count|
+------------+-----+
|           2|    1|
|         105|    2|
|         111|    5|
|          30|    8|
|         120|   14|
+------------+-----+


                                                                                

In [42]:
# Attach zone names to each trip via the pickup location id, then count
# trips per Zone and list the five least frequent.
same_location = df['PULocationID'] == zones_df['LocationID']
trips_with_zone = df.join(zones_df, same_location)
zone_counts = trips_with_zone.groupBy('Zone').count()
zone_counts.orderBy('count').show(5)



+--------------------+-----+
|                Zone|count|
+--------------------+-----+
|         Jamaica Bay|    1|
|Governor's Island...|    2|
| Green-Wood Cemetery|    5|
|       Broad Channel|    8|
|     Highbridge Park|   14|
+--------------------+-----+


                                                                                