In [None]:
import pyspark
import os

# os.path.dirname("") is "", so the original expression resolves to the
# current working directory of the kernel.
WDIR = os.getcwd()
# Report the PySpark version and the working directory, one per line.
print(f"pyspark.__version__: {pyspark.__version__}", WDIR, sep="\n")

In [2]:
from pyspark.sql import SparkSession

# Create a SparkSession (or reuse one that already exists) with a local
# master and the application name 'test'.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

24/03/02 23:23:19 WARN Utils: Your hostname, Ronalds-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.227 instead (on interface en0)
24/03/02 23:23:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/02 23:23:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


24/03/02 23:23:32 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [3]:
# Read the Parquet data under the relative path 'bucket/fhvhv/2021/01/'
# into a Spark DataFrame.
# NOTE(review): presumably the January 2021 FHVHV trip records - confirm.
df = spark.read.parquet('bucket/fhvhv/2021/01/')

In [9]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



In [8]:
# Print the first 5 rows of the DataFrame as a text table.
df.show(5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2021-01-10 17:40:01|2021-01-10 17:48:10|          97|          25|   NULL|
|           HV0005|              B02510|2021-01-08 18:19:44|2021-01-08 18:55:57|         138|         265|   NULL|
|           HV0003|              B02876|2021-01-01 16:47:20|2021-01-01 16:58:28|          50|         163|   NULL|
|           HV0005|              B02510|2021-01-15 17:50:08|2021-01-15 18:07:24|         163|          79|   NULL|
|           HV0005|              B02510|2021-01-12 17:26:40|2021-01-12 17:57:57|          47|          74|   NULL|
+-----------------+--------------------+-------------------+-------------------+

Actions vs Transformations

In [16]:
# Transformations only: select/filter just describe the result, so nothing
# is executed here and the cell displays the DataFrame definition, not rows.
(
    df
    .select(
        'hvfhs_license_num',
        'pickup_datetime',
        'dropoff_datetime',
        'PULocationID',
        'DOLocationID',
    )
    .filter(df['hvfhs_license_num'] == 'HV0003')
)

DataFrame[hvfhs_license_num: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: int, DOLocationID: int]

In [15]:
# Ending the chain with show() (an action) triggers execution and prints
# the first 5 matching rows.
(
    df
    .select(
        'hvfhs_license_num',
        'pickup_datetime',
        'dropoff_datetime',
        'PULocationID',
        'DOLocationID',
    )
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show(5)
)

+-----------------+-------------------+-------------------+------------+------------+
|hvfhs_license_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-----------------+-------------------+-------------------+------------+------------+
|           HV0003|2021-01-01 16:47:20|2021-01-01 16:58:28|          50|         163|
|           HV0003|2021-01-05 02:00:14|2021-01-05 02:19:39|          48|          95|
|           HV0003|2021-01-02 00:34:43|2021-01-02 00:45:38|          63|          77|
|           HV0003|2021-01-02 16:20:11|2021-01-02 16:56:36|          63|         244|
|           HV0003|2021-01-24 16:00:53|2021-01-24 16:07:40|         210|         165|
+-----------------+-------------------+-------------------+------------+------------+
only showing top 5 rows



In [19]:
from pyspark.sql import functions as F

In [25]:
# Derive date-only columns from the pickup/dropoff timestamps, keep the
# columns of interest, and show the first 5 rows for license 'HV0003'.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .select(
        'hvfhs_license_num',
        'pickup_date',
        'dropoff_date',
        'PULocationID',
        'DOLocationID',
    )
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show(5)
)

+-----------------+-----------+------------+------------+------------+
|hvfhs_license_num|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------------+-----------+------------+------------+------------+
|           HV0003| 2021-01-01|  2021-01-01|          50|         163|
|           HV0003| 2021-01-05|  2021-01-05|          48|          95|
|           HV0003| 2021-01-02|  2021-01-02|          63|          77|
|           HV0003| 2021-01-02|  2021-01-02|          63|         244|
|           HV0003| 2021-01-24|  2021-01-24|         210|         165|
+-----------------+-----------+------------+------------+------------+
only showing top 5 rows



In [48]:
# UDF - for logic that is hard to express in SQL
def crazy_stuff(base_num):
    """Build a short hex tag from the numeric part of an identifier.

    The first two characters of ``base_num`` are dropped and the remainder is
    parsed as a base-10 integer. That integer is formatted as lowercase hex,
    zero-padded to at least three digits, and prefixed with ``'s/'`` when it
    is divisible by 7 and ``'e/'`` otherwise.

    Args:
        base_num: Identifier string such as ``'HV0003'`` or ``'BV02884'``,
            or ``None`` for a null column value.

    Returns:
        The tag string (e.g. ``'e/003'`` or ``'s/b44'``), or ``None`` when
        ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first two characters is not a
            valid integer.
    """
    # Spark hands SQL NULL to a Python UDF as None. Slicing None raises
    # TypeError and fails the whole job, so propagate the null instead.
    if base_num is None:
        return None

    num = int(base_num[2:])
    prefix = 's' if num % 7 == 0 else 'e'
    return f'{prefix}/{num:03x}'

In [49]:
# The UDF body is a plain Python function, so it can be called and checked
# directly without going through Spark
crazy_stuff('BV02884')

's/b44'

In [50]:
# Wrap crazy_stuff as a Spark UDF with a string return type so it can be
# applied to DataFrame columns
from pyspark.sql import types
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

In [51]:
# Add the date columns plus a 'base_id' column computed by the Python UDF
# from hvfhs_license_num, then show the first 5 rows for license 'HV0003'.
(
    df
    .withColumn('pickup_date', F.to_date(df['pickup_datetime']))
    .withColumn('dropoff_date', F.to_date(df['dropoff_datetime']))
    .withColumn('base_id', crazy_stuff_udf(df['hvfhs_license_num']))
    .select(
        'hvfhs_license_num',
        'base_id',
        'pickup_date',
        'dropoff_date',
        'PULocationID',
        'DOLocationID',
    )
    .filter(df['hvfhs_license_num'] == 'HV0003')
    .show(5)
)

+-----------------+-------+-----------+------------+------------+------------+
|hvfhs_license_num|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------------+-------+-----------+------------+------------+------------+
|           HV0003|  e/003| 2021-01-01|  2021-01-01|          50|         163|
|           HV0003|  e/003| 2021-01-05|  2021-01-05|          48|          95|
|           HV0003|  e/003| 2021-01-02|  2021-01-02|          63|          77|
|           HV0003|  e/003| 2021-01-02|  2021-01-02|          63|         244|
|           HV0003|  e/003| 2021-01-24|  2021-01-24|         210|         165|
+-----------------+-------+-----------+------------+------------+------------+
only showing top 5 rows

