In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder\
        .master("local[*]")\
        .appName('test')\
        .getOrCreate()

22/03/01 23:59:37 WARN Utils: Your hostname, ZPS713 resolves to a loopback address: 127.0.1.1; using 172.26.238.3 instead (on interface eth0)
22/03/01 23:59:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/01 23:59:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
spark.version

'3.0.3'

In [7]:
from pyspark.sql import types

In [8]:
schema = types.StructType([
    types.StructField('hvfhs_license_num',types.StringType(),True),
    types.StructField('dispatching_base_num',types.StringType(),True),
    types.StructField('pickup_datetime',types.TimestampType(),True),
    types.StructField('dropoff_datetime',types.TimestampType(),True),
    types.StructField('PULocationID',types.IntegerType(),True),
    types.StructField('DOLocationID',types.IntegerType(),True),
    types.StructField('SR_Flag',types.StringType(),True)
])

In [9]:
df = spark.read\
    .option("header","true")\
    .schema(schema)\
    .csv('fhvhv_tripdata_2021-02.csv')

In [17]:
df = df.repartition(24)

In [18]:
df.write.parquet('fhvhv/2021/02', mode="overwrite")

                                                                                

In [33]:
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



In [35]:
df.select('pickup_datetime','dropoff_datetime', 'PULocationID', 'DOLocationID')\
.filter(df.hvfhs_license_num == 'HV0003')\
.show()



+-------------------+-------------------+------------+------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|
+-------------------+-------------------+------------+------------+
|2021-01-04 16:35:05|2021-01-04 17:07:34|         246|         148|
|2021-01-03 04:37:27|2021-01-03 05:05:28|         100|         138|
|2021-01-02 01:23:18|2021-01-02 01:30:37|          42|         119|
|2021-01-03 14:49:27|2021-01-03 15:22:31|         100|         132|
|2021-01-05 06:27:16|2021-01-05 06:51:35|          77|          80|
|2021-01-07 08:03:54|2021-01-07 08:10:18|         210|          29|
|2021-01-04 18:09:42|2021-01-04 18:19:46|         107|         246|
|2021-01-04 07:28:15|2021-01-04 07:35:52|         137|          79|
|2021-01-06 19:47:04|2021-01-06 20:02:21|         174|         119|
|2021-01-06 08:28:27|2021-01-06 08:44:52|         223|          82|
|2021-01-06 09:18:49|2021-01-06 09:39:09|         126|         136|
|2021-01-03 08:49:07|2021-01-03 08:54:14|       

                                                                                

In [37]:
from pyspark.sql import functions as F

[Stage 10:>                                                         (0 + 4) / 6]

KeyboardInterrupt: 

In [39]:
def crazy_stuff(base_num):
    num = int(base_num[1:])
    if num % 7 == 0:
        return f's/{num:03x}'
    elif num % 3 == 0:
        return f'a/{num:03x}'
    else:
        return f'e/{num:03x}'

In [41]:
crazy_stuff('B02884')

's/b44'

[Stage 10:>                                                         (0 + 4) / 6]

In [44]:
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

[Stage 10:>                                                         (0 + 4) / 6]

In [45]:
df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .withColumn('dropoff_date', F.to_date(df.dropoff_datetime)) \
    .withColumn('base_id', crazy_stuff_udf(df.dispatching_base_num)) \
    .select('pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID') \
    .show()



+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2021-01-06|  2021-01-06|         225|         225|
| 2021-01-01|  2021-01-01|         146|          73|
| 2021-01-02|  2021-01-02|          49|         255|
| 2021-01-02|  2021-01-02|          42|         119|
| 2021-01-03|  2021-01-03|          75|         263|
| 2021-01-04|  2021-01-04|          34|          33|
| 2021-01-02|  2021-01-02|         198|          97|
| 2021-01-06|  2021-01-06|         186|         161|
| 2021-01-06|  2021-01-06|         249|         100|
| 2021-01-06|  2021-01-06|          75|         163|
| 2021-01-05|  2021-01-05|         136|         220|
| 2021-01-03|  2021-01-03|          60|         265|
| 2021-01-03|  2021-01-03|         254|         254|
| 2021-01-05|  2021-01-05|         189|         181|
| 2021-01-06|  2021-01-06|         126|         136|
| 2021-01-07|  2021-01-07|         201|       

                                                                                