In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

24/03/03 16:46:35 WARN Utils: Your hostname, Davids-MacBook-Pro-3.local resolves to a loopback address: 127.0.0.1; using 192.168.1.5 instead (on interface en0)
24/03/03 16:46:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/03 16:46:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Load every raw green-taxi parquet file (the glob spans three directory levels).
df = spark.read.parquet('../data/raw/green/*/*/*.parquet')

In [8]:
# Preview the first rows; the arguments spell out PySpark's defaults.
df.show(n=20, truncate=True)

+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|VendorID|lpep_pickup_datetime|lpep_dropoff_datetime|store_and_fwd_flag|RatecodeID|PULocationID|DOLocationID|passenger_count|trip_distance|fare_amount|extra|mta_tax|tip_amount|tolls_amount|ehail_fee|improvement_surcharge|total_amount|payment_type|trip_type|congestion_surcharge|
+--------+--------------------+---------------------+------------------+----------+------------+------------+---------------+-------------+-----------+-----+-------+----------+------------+---------+---------------------+------------+------------+---------+--------------------+
|       2| 2019-12-18 15:52:30|  2019-12-18 15:54:39|                 N|       1.0|         264|         264|            5.0|          0.0|        3.5|  0.5|    0.

In [6]:
# Print the column names, types and nullability of `df`.
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- lpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- lpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)


In [9]:
from pyspark.sql import types

# Column layout for green taxi trips: every field is nullable and listed as
# (column name, Spark type) in the order the columns should appear.
# NOTE(review): the printSchema() output for the raw parquet files shows
# different types for several of these columns (e.g. VendorID is long,
# passenger_count is double, ehail_fee is integer) -- confirm this schema
# against the data before passing it to a reader.
green_schema = types.StructType([
    types.StructField(column, spark_type(), True)
    for column, spark_type in (
        ("VendorID", types.IntegerType),
        ("lpep_pickup_datetime", types.TimestampType),
        ("lpep_dropoff_datetime", types.TimestampType),
        ("store_and_fwd_flag", types.StringType),
        ("RatecodeID", types.IntegerType),
        ("PULocationID", types.IntegerType),
        ("DOLocationID", types.IntegerType),
        ("passenger_count", types.IntegerType),
        ("trip_distance", types.DoubleType),
        ("fare_amount", types.DoubleType),
        ("extra", types.DoubleType),
        ("mta_tax", types.DoubleType),
        ("tip_amount", types.DoubleType),
        ("tolls_amount", types.DoubleType),
        ("ehail_fee", types.DoubleType),
        ("improvement_surcharge", types.DoubleType),
        ("total_amount", types.DoubleType),
        ("payment_type", types.IntegerType),
        ("trip_type", types.IntegerType),
        ("congestion_surcharge", types.DoubleType),
    )
])

# Column layout for yellow taxi trips: every field is nullable and listed as
# (column name, Spark type) in the order the columns should appear.
yellow_schema = types.StructType([
    types.StructField(column, spark_type(), True)
    for column, spark_type in (
        ("VendorID", types.IntegerType),
        ("tpep_pickup_datetime", types.TimestampType),
        ("tpep_dropoff_datetime", types.TimestampType),
        ("passenger_count", types.IntegerType),
        ("trip_distance", types.DoubleType),
        ("RatecodeID", types.IntegerType),
        ("store_and_fwd_flag", types.StringType),
        ("PULocationID", types.IntegerType),
        ("DOLocationID", types.IntegerType),
        ("payment_type", types.IntegerType),
        ("fare_amount", types.DoubleType),
        ("extra", types.DoubleType),
        ("mta_tax", types.DoubleType),
        ("tip_amount", types.DoubleType),
        ("tolls_amount", types.DoubleType),
        ("improvement_surcharge", types.DoubleType),
        ("total_amount", types.DoubleType),
        ("congestion_surcharge", types.DoubleType),
    )
])

In [13]:
def repartition_month(service, year, month, num_partitions=4):
    """Rewrite one month of raw taxi data as a fixed number of parquet files.

    Reads ``../data/raw/<service>/<year>/<MM>/`` and writes the same rows,
    repartitioned, to ``../data/pq/<service>/<year>/<MM>/``.

    Args:
        service: Dataset folder name, e.g. ``'green'`` or ``'yellow'``.
        year: Four-digit year of the month to process.
        month: Month number (1-12); zero-padded to two digits in the paths.
        num_partitions: Number of partitions (and so output files) to write.

    Returns:
        The DataFrame that was read from the raw input path.
    """
    print(f'processing {service.upper()} data for {year}/{month}')

    input_path = f'../data/raw/{service}/{year}/{month:02d}/'
    output_path = f'../data/pq/{service}/{year}/{month:02d}/'

    df_month = spark.read.parquet(input_path)

    # Overwrite instead of the default error-if-exists mode so the cell can be
    # re-run (e.g. Restart & Run All) after a previous full or partial run.
    df_month \
        .repartition(num_partitions) \
        .write \
        .mode('overwrite') \
        .parquet(output_path)

    return df_month


# Green repartitioned
for year in [2020, 2021]:
    for month in range(1, 13):
        df_green = repartition_month('green', year, month)

# Yellow repartitioned
for year in [2020, 2021]:
    for month in range(1, 13):
        df_yellow = repartition_month('yellow', year, month)

processing GREEN data for 2020/1


                                                                                

processing GREEN data for 2020/2
processing GREEN data for 2020/3
processing GREEN data for 2020/4
processing GREEN data for 2020/5
processing GREEN data for 2020/6
processing GREEN data for 2020/7
processing GREEN data for 2020/8
processing GREEN data for 2020/9
processing GREEN data for 2020/10
processing GREEN data for 2020/11
processing GREEN data for 2020/12
processing GREEN data for 2021/1
processing GREEN data for 2021/2
processing GREEN data for 2021/3
processing GREEN data for 2021/4
processing GREEN data for 2021/5
processing GREEN data for 2021/6
processing GREEN data for 2021/7
processing GREEN data for 2021/8
processing GREEN data for 2021/9
processing GREEN data for 2021/10
processing GREEN data for 2021/11
processing GREEN data for 2021/12
processing YELLOW data for 2020/1


                                                                                

processing YELLOW data for 2020/2


                                                                                

processing YELLOW data for 2020/3


                                                                                

processing YELLOW data for 2020/4
processing YELLOW data for 2020/5
processing YELLOW data for 2020/6


                                                                                

processing YELLOW data for 2020/7


                                                                                

processing YELLOW data for 2020/8


                                                                                

processing YELLOW data for 2020/9


                                                                                

processing YELLOW data for 2020/10


                                                                                

processing YELLOW data for 2020/11


                                                                                

processing YELLOW data for 2020/12


                                                                                

processing YELLOW data for 2021/1


                                                                                

processing YELLOW data for 2021/2


                                                                                

processing YELLOW data for 2021/3


                                                                                

processing YELLOW data for 2021/4


                                                                                

processing YELLOW data for 2021/5


                                                                                

processing YELLOW data for 2021/6


                                                                                

processing YELLOW data for 2021/7


                                                                                

processing YELLOW data for 2021/8


                                                                                

processing YELLOW data for 2021/9


                                                                                

processing YELLOW data for 2021/10


                                                                                

processing YELLOW data for 2021/11


                                                                                

processing YELLOW data for 2021/12


                                                                                