In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import os
import pyspark


# Instructions
Landing is where your "data lands" as a 1 to 1 with the source system. In this case, when you run the download script in Python, your "raw data lands into the landing bucket" (or landing layer).

The Raw bucket or layer (not to be confused with data) is where your data has simple transformations completed. These transformations are about consistent data types (correct schemas), consistent column names, dropping irrelevant columns (not rows, just columns). You may partition or shuffle the data around before saving it in this area.

From the Raw layer, business rules and transformations are applied such as filtering the data, aggregating stuff, creating new features, etc. This is then dumped into the Curated layer where it is ready for analysis, visualization, etc

In [2]:
from pyspark.sql import SparkSession, functions as F

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 landing to raw")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/08/21 03:12:40 WARN Utils: Your hostname, DESKTOP-LHMPQFC resolves to a loopback address: 127.0.1.1; using 172.19.194.216 instead (on interface eth0)
23/08/21 03:12:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 03:12:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
os.chdir("../")

In [4]:
!pwd

/home/ngocduyt/github-classroom/MAST30034-Applied-Data-Science/mast30034-project-1-dduygaucho


# Playground for detecting potential problems such as inconsistent data type

In [5]:
# note vendorID should be int
sdf = spark.read.parquet('../data/landing/*parquet')
print(sdf.show(2, vertical=True, truncate=100))
sdf.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

-RECORD 0------------------------------------
 VendorID              | 1                   
 tpep_pickup_datetime  | 2022-10-01 00:03:41 
 tpep_dropoff_datetime | 2022-10-01 00:18:39 
 passenger_count       | 1.0                 
 trip_distance         | 1.7                 
 RatecodeID            | 1.0                 
 store_and_fwd_flag    | N                   
 PULocationID          | 249                 
 DOLocationID          | 107                 
 payment_type          | 1                   
 fare_amount           | 9.5                 
 extra                 | 3.0                 
 mta_tax               | 0.5                 
 tip_amount            | 2.65                
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 15.95               
 congestion_surcharge  | 2.5                 
 airport_fee           | 0.0                 
-RECORD 1------------------------------------
 VendorID              | 2        

                                                                                

In [6]:
def inspect_schema(folder):
    for file in os.listdir(folder):
       print(file)
       if 'parquet' in file:
        sdf_sample = spark.read.parquet(f"../data/landing/{file}")
        print(sdf_sample.printSchema())

# Note: only 2023_feb has the right schema, in which all other months has long 
# data type for VendorID
inspect_schema("./data/landing/")


2022-10.parquet
root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

None
2023-02.parquet
root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nulla

In [6]:
# 2023_feb is the closest schema that we want

sdf_2023_feb = spark.read.parquet("../data/landing/2023-02.parquet")
# now, we want to ensure everything has consistent casing to make our lives easier
consistent_col_casing = [F.col(col_name).alias(col_name.lower()) \
                         for col_name in sdf_2023_feb.columns]
sdf_2023_feb = sdf_2023_feb.select(*consistent_col_casing)

# this will be used in the cell below when reading in
sdf_schema = sdf_2023_feb.schema
sdf_schema

StructType([StructField('vendorid', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('ratecodeid', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('pulocationid', IntegerType(), True), StructField('dolocationid', IntegerType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True)])

In [7]:
sdf_2023_feb.printSchema()

root
 |-- vendorid: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecodeid: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [8]:
from pyspark.sql.types import StructType, StructField, IntegerType, \
    TimestampNTZType, LongType, DoubleType, StringType

# redefined the desired schema
# Define the schema using StructType and StructField
schema = StructType([
    StructField('vendorid', IntegerType(), True),
    StructField('tpep_pickup_datetime', TimestampNTZType(), True),
    StructField('tpep_dropoff_datetime', TimestampNTZType(), True),
    StructField('passenger_count', IntegerType(), True),
    StructField('trip_distance', DoubleType(), True),
    StructField('ratecodeid', IntegerType(), True),
    StructField('store_and_fwd_flag', StringType(), True),
    StructField('pulocationid', IntegerType(), True),
    StructField('dolocationid', IntegerType(), True),
    StructField('payment_type', IntegerType(), True),
    StructField('fare_amount', DoubleType(), True),
    StructField('extra', DoubleType(), True),
    StructField('mta_tax', DoubleType(), True),
    StructField('tip_amount', DoubleType(), True),
    StructField('tolls_amount', DoubleType(), True),
    StructField('improvement_surcharge', DoubleType(), True),
    StructField('total_amount', DoubleType(), True),
    StructField('congestion_surcharge', DoubleType(), True),
    StructField('airport_fee', DoubleType(), True)
])

# Print the generated schema
print(schema)


StructType([StructField('vendorid', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', IntegerType(), True), StructField('trip_distance', DoubleType(), True), StructField('ratecodeid', IntegerType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('pulocationid', IntegerType(), True), StructField('dolocationid', IntegerType(), True), StructField('payment_type', IntegerType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True)])


## Parse malformed dataframes into correct data type and field names

In [9]:
os.getcwd()

'/home/ngocduyt/github-classroom/MAST30034-Applied-Data-Science/mast30034-project-1-dduygaucho'

In [13]:
for file in os.listdir("./data/landing/"):
    print(file)

2022-10.parquet
2023-02.parquet
2022-08.parquet
2022-06.parquet
2022-05.parquet
mta_2023.csv
weather_link_2023.csv
weather_link_2022.csv
taxi_zones
taxi_zones.csv
2022-02.parquet
2022-12.parquet
2022-09.parquet
2023-01.parquet
2022-07.parquet
2022-11.parquet
2022-04.parquet
2022-03.parquet


In [10]:
def parse_dataframes(folder, chosen_schema):
    for file in os.listdir(folder):
        print(file[:-8])
        if 'parquet' not in file:
            continue
        sdf_sample = spark.read.parquet(f"../data/landing/{file}")
        # select all columns from the existing malformed dataframe and 
        # cast it to the required schema
        sdf_sample = sdf_sample \
            .select([F.col(c).cast(chosen_schema[i].dataType) \
                     for i, c in enumerate(sdf_sample.columns)])
        sdf_sample \
            .coalesce(1) \
            .write \
            .mode('overwrite') \
            .parquet(f'../data/raw/{file[:-8]}')
        # break


parse_dataframes("./data/landing/", chosen_schema = schema)


2022-10


                                                                                

2023-02


                                                                                

2022-08


                                                                                

2022-06


                                                                                

2022-05


                                                                                

mta_
weather_link_
weather_link_
ta
taxi_z
2022-02


                                                                                

2022-12


                                                                                

2022-09


                                                                                

2023-01


                                                                                

2022-07


                                                                                

2022-11


                                                                                

2022-04


                                                                                

2022-03


                                                                                

In [14]:
# test whether read successfully
sdf = spark.read.schema(schema).parquet("../data/raw/2*")
sdf.show(1, vertical = True, truncate = 100)

-RECORD 0------------------------------------
 vendorid              | 1                   
 tpep_pickup_datetime  | 2022-10-01 00:03:41 
 tpep_dropoff_datetime | 2022-10-01 00:18:39 
 passenger_count       | 1                   
 trip_distance         | 1.7                 
 ratecodeid            | 1                   
 store_and_fwd_flag    | N                   
 pulocationid          | 249                 
 dolocationid          | 107                 
 payment_type          | 1                   
 fare_amount           | 9.5                 
 extra                 | 3.0                 
 mta_tax               | 0.5                 
 tip_amount            | 2.65                
 tolls_amount          | 0.0                 
 improvement_surcharge | 0.3                 
 total_amount          | 15.95               
 congestion_surcharge  | 2.5                 
 airport_fee           | 0.0                 
only showing top 1 row



In [15]:
# Get the number of rows (count) and columns
num_rows = sdf.count()
num_columns = len(sdf.columns)

# Print the number of rows and columns
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 43172888
Number of columns: 19


In [16]:
sample = spark.read.schema(schema).parquet("../data/raw/2022-11")
# Get the number of rows (count) and columns
num_rows = sample.count()
num_columns = len(sample.columns)

# Print the number of rows and columns
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 3252717
Number of columns: 19


In [17]:
def inspect_max_min_value(column_name, df):
    """
    Given a pyspark dataframe, output the min, max value for a column
    """
    max_value_row = df.agg(F.max(column_name)).collect()[0][0]
    print(max_value_row)
    min_value_row = df.agg(F.min(column_name)).collect()[0][0]
    print(f"Max value in {column_name}: {max_value_row}, \
          Min value: {min_value_row}")
inspect_max_min_value("passenger_count", sdf)
inspect_max_min_value("payment_type", sdf)
inspect_max_min_value("PULocationID", sdf)
inspect_max_min_value("DOLocationID", sdf)

9
Max value in passenger_count: 9, Min value: 0
5
Max value in payment_type: 5, Min value: 0
265
Max value in PULocationID: 265, Min value: 1
265
Max value in DOLocationID: 265, Min value: 1


In [18]:
spark.stop()

In [None]:
# Current problems with other dataframes:
# 1. VendorID: long --> int, only 2 vendors, integer is more than enough
# 2. PULocationID, DOLocationID: long --> int, 266 zones, integer is reasonable
# the last 2 is not dealt with 2023_feb
# 3. passenger_count: long/double --> int, integer is reasonable
# 4. payment_type: long --> int

# 2023 Feb
#  |-- VendorID: integer (nullable = true)
#  |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
#  |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
#  |-- passenger_count: long (nullable = true)     # change to int
#  |-- trip_distance: double (nullable = true)
#  |-- RatecodeID: integer (nullable = true)
#  |-- store_and_fwd_flag: string (nullable = true)
#  |-- PULocationID: integer (nullable = true)
#  |-- DOLocationID: integer (nullable = true)
#  |-- payment_type: long (nullable = true)
#  |-- fare_amount: double (nullable = true)     # change to int
#  |-- extra: double (nullable = true)
#  |-- mta_tax: double (nullable = true)
#  |-- tip_amount: double (nullable = true)
#  |-- tolls_amount: double (nullable = true)
#  |-- improvement_surcharge: double (nullable = true)
#  |-- total_amount: double (nullable = true)
#  |-- congestion_surcharge: double (nullable = true)
#  |-- Airport_fee: double (nullable = true)