In [12]:
import pyspark
from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F
from pyspark.sql.functions import sha2, concat_ws

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('test') \
    .getOrCreate()

In [3]:
spark.sparkContext

In [4]:
!ls -lh data/raw/fhv/2019/10/fhv_tripdata_2019_10.csv.gz

-rw-r--r-- 1 jeff 197609 19M Dec  1  2022 data/raw/fhv/2019/10/fhv_tripdata_2019_10.csv.gz


In [5]:
df = spark.read.csv('data/raw/fhv/2019/10/fhv_tripdata_2019_10.csv.gz', header=True, inferSchema=True)

In [6]:
df.printSchema()

root
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropOff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [7]:
schema = types.StructType([
    types.StructField('dispatching_base_num', types.StringType(), True),
    types.StructField('pickup_datetime', types.TimestampType(), True),
    types.StructField('dropoff_datetime', types.TimestampType(), True),
    types.StructField('PULocationID', types.IntegerType(), True),
    types.StructField('DOLocationID', types.IntegerType(), True),
    types.StructField('SR_Flag', types.StringType(), True),
    types.StructField('Affiliated_base_number', types.StringType(), True)
])

In [9]:
df = spark.read \
    .option("header", "true") \
    .schema(schema) \
    .csv('data/raw/fhv/2019/10/fhv_tripdata_2019_10.csv.gz')

df = df.repartition(6)

df.write.parquet('data/pq/fhv/2019/10/', mode='overwrite')

In [10]:
df = spark.read.parquet('data/pq/fhv/2019/10/')

In [11]:
!ls -lhR data/pq/fhv

data/pq/fhv:
total 0
drwxr-xr-x 1 jeff 197609 0 Mar  4 07:51 2019

data/pq/fhv/2019:
total 8.0K
drwxr-xr-x 1 jeff 197609 0 Mar  4 07:51 10

data/pq/fhv/2019/10:
total 39M
-rw-r--r-- 1 jeff 197609    0 Mar  4 07:51 _SUCCESS
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00000-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00001-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00002-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00003-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00004-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet
-rw-r--r-- 1 jeff 197609 6.4M Mar  4 07:51 part-00005-2d4da35e-d801-42bd-baa7-7bb85e4df104-c000.snappy.parquet


In [18]:
from pyspark.sql import functions as F

In [13]:
df.withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .filter("pickup_date = '2019-10-15'") \
    .count()

62610

In [31]:
df.createOrReplaceTempView('fhv_2019_10')

In [32]:
spark.sql("""
            SELECT  TO_DATE(pickup_datetime) AS Pickup_Date,
                    MAX(DATEDIFF(HOUR, pickup_datetime, dropoff_datetime)) AS Longest_Trip
            FROM    fhv_2019_10
            GROUP BY
                    pickup_datetime
            ORDER BY
                    2 DESC
            ;
        """).show()

+-----------+------------+
|Pickup_Date|Longest_Trip|
+-----------+------------+
| 2019-10-28|      631152|
| 2019-10-11|      631152|
| 2019-10-31|       87672|
| 2019-10-01|       70128|
| 2019-10-17|        8794|
| 2019-10-26|        8784|
| 2019-10-30|        1464|
| 2019-10-25|        1056|
| 2019-10-01|         793|
| 2019-10-01|         793|
| 2019-10-01|         793|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
| 2019-10-01|         792|
+-----------+------------+
only showing top 20 rows



**Q4**: Longest trip for each day

In [18]:
df.columns

['dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag',
 'Affiliated_base_number']

In [19]:
df.withColumn('duration', df.dropoff_datetime.cast('long') - df.pickup_datetime.cast('long')) \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .groupBy('pickup_date') \
        .max('duration') \
    .orderBy('max(duration)', ascending=False) \
    .limit(5) \
    .show()

+-----------+-------------+
|pickup_date|max(duration)|
+-----------+-------------+
| 2019-10-28|   2272149000|
| 2019-10-11|   2272149000|
| 2019-10-31|    315620787|
| 2019-10-01|    252460901|
| 2019-10-17|     31658400|
+-----------+-------------+



In [33]:
spark.sql("""
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX((CAST(dropoff_datetime AS LONG) - CAST(pickup_datetime AS LONG)) / 60) AS duration
FROM 
    fhv_2019_10
GROUP BY
    1
ORDER BY
    2 DESC
LIMIT 10;
""").show()

+-----------+-----------------+
|pickup_date|         duration|
+-----------+-----------------+
| 2019-10-28|       3.786915E7|
| 2019-10-11|       3.786915E7|
| 2019-10-31|       5260346.45|
| 2019-10-01|4207681.683333334|
| 2019-10-17|         527640.0|
| 2019-10-26|         527050.0|
| 2019-10-30|87932.06666666667|
| 2019-10-25|          63469.6|
| 2019-10-02|46213.88333333333|
| 2019-10-23|          44797.0|
+-----------+-----------------+



In [25]:
df_zones = spark.read.parquet('zones')

In [26]:
df_zones.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [27]:
df.columns

['dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag',
 'Affiliated_base_number']

In [29]:
df_zones.createOrReplaceTempView('zones')

In [51]:
df_PUDO = spark.sql("""
                    SELECT
                        CONCAT(pu.Zone) AS PU_Loc,
                        COUNT(*) AS Cnt
                    FROM fhv_2019_10 fhv
                    INNER JOIN zones pu ON fhv.PULocationID = pu.LocationID
                    GROUP BY 1
                    ORDER BY 2
                    LIMIT 5;
                    """)

In [52]:
df_PUDO.show(truncate=False)

+---------------------------------------------+---+
|PU_Loc                                       |Cnt|
+---------------------------------------------+---+
|Jamaica Bay                                  |1  |
|Governor's Island/Ellis Island/Liberty Island|2  |
|Green-Wood Cemetery                          |5  |
|Broad Channel                                |8  |
|Highbridge Park                              |14 |
+---------------------------------------------+---+

