[https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/code/05_taxi_schema.ipynb](https://github.com/DataTalksClub/data-engineering-zoomcamp/blob/main/05-batch/code/05_taxi_schema.ipynb)

In [31]:
from pyspark.sql import SparkSession, types
import pandas as pd
import os

In [2]:
# Start a local Spark session (or reuse one that already exists in this
# kernel); "local[*]" runs Spark in-process on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

24/03/19 15:57:04 WARN Utils: Your hostname, eli-mac.local resolves to a loopback address: 127.0.0.1; using 192.168.1.6 instead (on interface en0)
24/03/19 15:57:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/19 15:57:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Define custom schemas for green and yellow Spark DataFrames

In [25]:
# Load just the first 1,000 rows of one green file with pandas; a small
# sample is enough to get a first guess at the column types.
pd_green_df = pd.read_csv(
    '../data/raw/green/2021/01/green_tripdata_2021_01.csv.gz',
    compression='gzip',
    nrows=1000,
)
# Convert the sample to a Spark DataFrame and display the schema Spark derives
# from it; this is the starting point for the hand-written schema.
spark.createDataFrame(pd_green_df).schema

StructType([StructField('VendorID', LongType(), True), StructField('lpep_pickup_datetime', StringType(), True), StructField('lpep_dropoff_datetime', StringType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('RatecodeID', LongType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('ehail_fee', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('payment_type', LongType(), True), StructField('trip_type', LongType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [26]:
# Same sampling step for the yellow data: first 1,000 rows of one file.
pd_yellow_df = pd.read_csv(
    '../data/raw/yellow/2021/01/yellow_tripdata_2021_01.csv.gz',
    compression='gzip',
    nrows=1000,
)
# Display the schema Spark derives from the pandas sample.
spark.createDataFrame(pd_yellow_df).schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', StringType(), True), StructField('tpep_dropoff_datetime', StringType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [27]:
# Hand-written schema for the green taxi CSVs. Every column is nullable.
# Relative to the schema derived from the pandas sample, the two datetime
# columns are timestamps (not strings) and the integer columns use
# IntegerType (not LongType).
green_schema = types.StructType([
    types.StructField(column, dtype, True)
    for column, dtype in (
        ('VendorID', types.IntegerType()),
        ('lpep_pickup_datetime', types.TimestampType()),
        ('lpep_dropoff_datetime', types.TimestampType()),
        ('store_and_fwd_flag', types.StringType()),
        ('RatecodeID', types.IntegerType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('passenger_count', types.IntegerType()),
        ('trip_distance', types.DoubleType()),
        ('fare_amount', types.DoubleType()),
        ('extra', types.DoubleType()),
        ('mta_tax', types.DoubleType()),
        ('tip_amount', types.DoubleType()),
        ('tolls_amount', types.DoubleType()),
        ('ehail_fee', types.DoubleType()),
        ('improvement_surcharge', types.DoubleType()),
        ('total_amount', types.DoubleType()),
        ('payment_type', types.IntegerType()),
        ('trip_type', types.IntegerType()),
        ('congestion_surcharge', types.DoubleType()),
    )
])

# Hand-written schema for the yellow taxi CSVs. Every column is nullable; the
# two datetime columns are parsed as timestamps.
# NOTE(review): passenger_count is LongType here, whereas green_schema declares
# the same column as IntegerType -- confirm the difference is intended.
yellow_schema = types.StructType([
    types.StructField(column, dtype, True)
    for column, dtype in (
        ('VendorID', types.IntegerType()),
        ('tpep_pickup_datetime', types.TimestampType()),
        ('tpep_dropoff_datetime', types.TimestampType()),
        ('passenger_count', types.LongType()),
        ('trip_distance', types.DoubleType()),
        ('RatecodeID', types.IntegerType()),
        ('store_and_fwd_flag', types.StringType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('payment_type', types.IntegerType()),
        ('fare_amount', types.DoubleType()),
        ('extra', types.DoubleType()),
        ('mta_tax', types.DoubleType()),
        ('tip_amount', types.DoubleType()),
        ('tolls_amount', types.DoubleType()),
        ('improvement_surcharge', types.DoubleType()),
        ('total_amount', types.DoubleType()),
        ('congestion_surcharge', types.DoubleType()),
    )
])

### Create green and yellow Spark DataFrames using custom schemas

In [29]:
# Read the green 2021/01 CSV directory with the explicit schema (the header
# row is skipped rather than parsed as data), then show the resulting types.
spark_green_df = (
    spark.read
    .option('header', 'true')
    .schema(green_schema)
    .csv('../data/raw/green/2021/01')
)
spark_green_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [30]:
# Read the yellow 2021/01 CSV directory with the explicit schema (the header
# row is skipped rather than parsed as data), then show the resulting types.
spark_yellow_df = (
    spark.read
    .option('header', 'true')
    .schema(yellow_schema)
    .csv('../data/raw/yellow/2021/01')
)
spark_yellow_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



### Convert green & yellow CSV files to Parquet files using custom schemas 

In [38]:
# Highest month number that has a raw green data directory for 2021.
# Only all-decimal entry names are parsed, so a stray non-numeric entry
# (e.g. a macOS .DS_Store file) cannot make int() raise ValueError.
res = max(
    int(entry)
    for entry in os.listdir('../data/raw/green/2021')
    if entry.isdecimal()
)
res

7

In [54]:
def csv_to_parquet(service_type, year, schema, mode='errorifexists'):
    """Convert the monthly CSV directories of one service/year to Parquet.

    Each month directory under ``../data/raw/{service_type}/{year}`` is read
    with ``schema``, repartitioned into 4 partitions and written to
    ``../data/pq/{service_type}/{year}/{MM}/``.

    Args:
        service_type: Directory name of the taxi service (e.g. 'green').
        year: Year directory name (e.g. '2020').
        schema: Spark schema applied when reading the CSV files.
        mode: Save mode passed to the Parquet writer. The default keeps
            Spark's standard behaviour of failing when the output path
            already exists; pass 'overwrite' to make re-runs replace it.

    Raises:
        FileNotFoundError: If the raw year directory does not exist.
    """
    year_dir = f'../data/raw/{service_type}/{year}'

    # Process only the month directories that actually exist. Entries whose
    # names are not all decimal digits (e.g. a macOS .DS_Store file) are
    # ignored instead of crashing int(), and gaps between months are skipped
    # instead of being read as non-existent paths.
    month_dirs = sorted(
        (
            entry
            for entry in os.listdir(year_dir)
            if entry.isdecimal() and os.path.isdir(os.path.join(year_dir, entry))
        ),
        key=int,
    )

    for month_dir in month_dirs:
        # Output directories are always zero-padded to two digits.
        fmonth = f'{int(month_dir):02d}'
        print(f'Processing data for {year}-{fmonth}')

        input_path = f'{year_dir}/{month_dir}/'
        output_path = f'../data/pq/{service_type}/{year}/{fmonth}/'

        spark_df = spark.read \
            .option("header", "true") \
            .schema(schema) \
            .csv(input_path)

        spark_df \
            .repartition(4) \
            .write \
            .mode(mode) \
            .parquet(output_path)

In [55]:
# Convert the 2020 green CSVs to Parquet using the green schema.
csv_to_parquet(service_type='green', year='2020', schema=green_schema)

Processing data for 2020-01


                                                                                

Processing data for 2020-02


                                                                                

Processing data for 2020-03


                                                                                

Processing data for 2020-04
Processing data for 2020-05


                                                                                

Processing data for 2020-06


                                                                                

Processing data for 2020-07
Processing data for 2020-08


                                                                                

Processing data for 2020-09
Processing data for 2020-10
Processing data for 2020-11


                                                                                

Processing data for 2020-12


                                                                                

In [56]:
# Convert the 2021 green CSVs to Parquet using the green schema.
csv_to_parquet(service_type='green', year='2021', schema=green_schema)

Processing data for 2021-01
Processing data for 2021-02
Processing data for 2021-03
Processing data for 2021-04
Processing data for 2021-05
Processing data for 2021-06
Processing data for 2021-07


In [57]:
# Convert the 2020 yellow CSVs to Parquet using the yellow schema.
csv_to_parquet(service_type='yellow', year='2020', schema=yellow_schema)

Processing data for 2020-01


                                                                                

Processing data for 2020-02


                                                                                

Processing data for 2020-03


                                                                                

Processing data for 2020-04


                                                                                

Processing data for 2020-05


                                                                                

Processing data for 2020-06


                                                                                

Processing data for 2020-07


                                                                                

Processing data for 2020-08


                                                                                

Processing data for 2020-09


                                                                                

Processing data for 2020-10


                                                                                

Processing data for 2020-11


                                                                                

Processing data for 2020-12


                                                                                

In [58]:
# Convert the 2021 yellow CSVs to Parquet using the yellow schema.
csv_to_parquet(service_type='yellow', year='2021', schema=yellow_schema)

Processing data for 2021-01


                                                                                

Processing data for 2021-02


                                                                                

Processing data for 2021-03


                                                                                

Processing data for 2021-04


                                                                                

Processing data for 2021-05


                                                                                

Processing data for 2021-06


                                                                                

Processing data for 2021-07


                                                                                