In [None]:
%%capture
!pip install awscli
!pip install boto3
!pip install awswrangler

In [None]:
# Create ~/.aws if needed and copy the files stored under the Drive folder
# into it (assumes Google Drive is already mounted at /content/drive — TODO confirm).
!mkdir -p ~/.aws && cp /content/drive/MyDrive/AWS/wys/* ~/.aws
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.aws/credentials

In [None]:
%%capture
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop3.2.tgz
!tar xf spark-3.0.0-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install -q pyspark

In [None]:
import os

# Tell the tooling where the JDK and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.0.0-bin-hadoop3.2",
})

# Initialise findspark after the environment variables are set and before
# pyspark is imported; the statement order in this cell is intentional.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session ("local[*]"), reused if one already exists.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
!wget -q --show-progress https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2021-02.parquet



In [None]:
# Confirm the download is present and check its size on disk.
!ls -lh fhvhv_tripdata_2021-02.parquet

-rw-r--r-- 1 root root 289M Jun 30 03:16 fhvhv_tripdata_2021-02.parquet


In [None]:
# Read the raw Parquet file and keep only the columns used in this notebook.
# Parquet files carry their own schema, so the CSV-oriented "header" and
# "inferSchema" reader options had no effect here and have been dropped.
df = (
    spark.read
    .parquet('fhvhv_tripdata_2021-02.parquet')
    .select(["hvfhs_license_num", "dispatching_base_num",
             "pickup_datetime", "dropoff_datetime",
             "PULocationID", "DOLocationID"])
)

# show() prints the rows itself and returns None, so wrapping it in display()
# only added a stray "None" beneath the table.
df.show(10)

+-----------------+--------------------+-------------------+-------------------+------------+------------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-----------------+--------------------+-------------------+-------------------+------------+------------+
|           HV0003|              B02764|2021-02-01 00:10:40|2021-02-01 00:21:09|          35|          39|
|           HV0003|              B02764|2021-02-01 00:27:23|2021-02-01 00:44:01|          39|          35|
|           HV0005|              B02510|2021-02-01 00:28:38|2021-02-01 00:38:27|          39|          91|
|           HV0005|              B02510|2021-02-01 00:43:37|2021-02-01 01:23:20|          91|         228|
|           HV0003|              B02872|2021-02-01 00:08:42|2021-02-01 00:17:57|         126|         250|
|           HV0003|              B02872|2021-02-01 00:26:02|2021-02-01 00:42:51|         208|         243|
|           HV0003|              B028

None

In [None]:
# Redistribute the rows over 24 partitions; each partition is written as its
# own part file.
df = df.repartition(24)

# mode='overwrite' replaces any output left by a previous run. Without it the
# write raises because the target path already exists, which breaks
# "Restart & Run All" once the directory has been created.
df.write.parquet('data/pq/fhvhv/2021/02/', mode='overwrite', compression='snappy')

In [None]:
df = spark.read.parquet('data/pq/fhvhv/2021/02/')

!ls -lh data/pq/fhvhv/2021/02/

total 219M
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00000-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00001-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00002-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00003-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00004-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00005-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00006-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00007-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r-- 1 root root 9.1M Jul 15 07:47 part-00008-2577a51c-f9d7-4e9a-b820-09010edc01b2-c000.snappy.parquet
-rw-r--r

### How many taxi trips were there on February 15?

In [None]:
from pyspark.sql import functions as F

In [None]:
# Load the partitioned data and expose it to Spark SQL under the name 'fhvhv'.
df = spark.read.parquet('data/pq/fhvhv/2021/02/')
# createOrReplaceTempView is the supported replacement for the deprecated
# registerTempTable; it also makes this cell safe to re-run.
df.createOrReplaceTempView('fhvhv')

In [None]:
# Derive the calendar date of each pickup, keep the rows for 2021-02-15 and
# count them.
(
    df
    .withColumn('pickup_date', F.to_date(F.col('pickup_datetime')))
    .where("pickup_date = '2021-02-15'")
    .count()
)

367170

In [None]:
# Count the rows of the 'fhvhv' temp table whose pickup timestamp falls on
# 2021-02-15, expressed in SQL; show() prints the one-row result.
spark.sql("""
SELECT
    COUNT(*)
FROM
    fhvhv
WHERE
    DATE(pickup_datetime) = '2021-02-15'
""").show()

+--------+
|count(1)|
+--------+
|  367170|
+--------+



### Calculate the duration for each trip

In [None]:
# Trip length in seconds: both timestamps are cast to long and subtracted,
# then the trips are listed longest first.
(
    df
    .withColumn(
        'duration_seconds',
        F.col('dropoff_datetime').cast('long') - F.col('pickup_datetime').cast('long'),
    )
    .orderBy(F.desc('duration_seconds'))
    .show()
)

+-----------------+--------------------+-------------------+-------------------+------------+------------+----------------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|duration_seconds|
+-----------------+--------------------+-------------------+-------------------+------------+------------+----------------+
|           HV0005|              B02510|2021-02-11 13:40:44|2021-02-12 10:39:44|         247|          41|           75540|
|           HV0004|              B02800|2021-02-17 15:54:53|2021-02-18 07:48:34|         242|         254|           57221|
|           HV0004|              B02800|2021-02-20 12:08:15|2021-02-21 00:22:14|         188|          55|           44039|
|           HV0003|              B02864|2021-02-03 20:24:25|2021-02-04 07:41:58|          51|         147|           40653|
|           HV0003|              B02887|2021-02-19 23:17:44|2021-02-20 09:44:01|         210|         149|           37577|
|       

In [None]:
# Trip duration in seconds per row, longest first.
# ORDER BY produces a total ordering of the result. The earlier SORT BY only
# orders rows within each partition, so the first rows shown were not
# guaranteed to be the globally longest trips (it omitted the 57221 s and
# 40653 s trips that the DataFrame version reports).
spark.sql("""
SELECT
    pickup_datetime, dropoff_datetime,
    (unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) AS duration
FROM
    fhvhv
ORDER BY
    duration DESC
""").show()

+-------------------+-------------------+--------+
|    pickup_datetime|   dropoff_datetime|duration|
+-------------------+-------------------+--------+
|2021-02-11 13:40:44|2021-02-12 10:39:44|   75540|
|2021-02-20 12:08:15|2021-02-21 00:22:14|   44039|
|2021-02-19 23:17:44|2021-02-20 09:44:01|   37577|
|2021-02-25 17:13:35|2021-02-26 02:57:05|   35010|
|2021-02-10 01:56:17|2021-02-10 10:57:33|   32476|
|2021-02-25 09:18:18|2021-02-25 18:18:57|   32439|
|2021-02-02 09:42:30|2021-02-02 18:17:43|   30913|
|2021-02-21 22:50:52|2021-02-22 07:21:52|   30660|
|2021-02-10 20:36:16|2021-02-11 05:00:38|   30262|
|2021-02-08 12:04:24|2021-02-08 20:26:10|   30106|
|2021-02-03 10:05:21|2021-02-03 17:57:04|   28303|
|2021-02-09 12:40:43|2021-02-09 20:04:03|   26600|
|2021-02-25 09:48:53|2021-02-25 17:07:07|   26294|
|2021-02-15 04:32:41|2021-02-15 11:43:55|   25874|
|2021-02-04 10:16:34|2021-02-04 17:23:06|   25592|
|2021-02-26 14:32:17|2021-02-26 21:19:19|   24422|
|2021-02-20 18:31:00|2021-02-21

### On which pickup day did the longest trip start?

In [None]:
# For every pickup date take the longest trip duration (in seconds), then
# print the five dates with the largest maximum.
(
    df
    .withColumn(
        'duration',
        F.col('dropoff_datetime').cast('long') - F.col('pickup_datetime').cast('long'),
    )
    .withColumn('pickup_date', F.to_date(F.col('pickup_datetime')))
    .groupBy('pickup_date')
    .agg(F.max('duration'))
    .orderBy(F.desc('max(duration)'))
    .limit(5)
    .show()
)

+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
| 2021-02-11|        75540|
| 2021-02-17|        57221|
| 2021-02-20|        44039|
| 2021-02-03|        40653|
| 2021-02-19|        37577|
+-----------+-------------+

