In [1]:
import pandas as pd

import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *

import warnings
warnings.filterwarnings("ignore")

# Resource settings, registered as system properties before the context is built.
spark_properties = {
    'spark.executor.memory': '8g',
    'spark.executor.cores': '4',
    'spark.cores.max': '4',
    'spark.driver.memory': '8g',
}
for prop_name, prop_value in spark_properties.items():
    SparkContext.setSystemProperty(prop_name, prop_value)

# NOTE(review): master "local" is Spark's single-worker-thread mode, so the
# executor/core settings above may have no effect here — confirm whether
# "local[4]" was intended.
sc = SparkContext(master="local", appName="Data Preprocessing YTX")
spark = SparkSession.builder.getOrCreate()

# Display the session summary as the cell output.
spark

In [2]:
# Session-level SQL options: Arrow-backed pandas conversion, the legacy
# timestamp parser policy, and eager evaluation of DataFrames in the REPL.
session_options = [
    ("spark.sql.execution.arrow.pyspark.enabled", "true"),
    ("spark.sql.legacy.timeParserPolicy", "LEGACY"),
    ('spark.sql.repl.eagerEval.enabled', True),
]
for option_key, option_value in session_options:
    spark.conf.set(option_key, option_value)

### Global Hourly Climate Data 2018

In [3]:
# Load the preprocessed 2018 hourly climate data.
ghc_2018_path = '../preprocessed_data/nyc_ghc_2018_pre.parquet/'
ghc_2018_sdf = spark.read.parquet(ghc_2018_path)

### Yellow Taxi 2018

In [4]:
# Load the cleaned 2018 yellow-taxi trips.
ytx_2018_path = '../preprocessed_data/ytx_2018_cleaned.parquet/'
ytx_2018_sdf = spark.read.parquet(ytx_2018_path)

In [5]:
# Truncate each pickup timestamp to the hour; this becomes the join key
# against the climate data's 'closest_hour'.
pickup_hour = f.date_trunc("hour", f.col("tpep_pickup_datetime"))
ytx_2018_sdf = ytx_2018_sdf.withColumn("pickup_time_hour", pickup_hour)

In [6]:
# Inner join: pair every trip with the climate row whose 'closest_hour'
# equals the trip's pickup hour. The climate frame is broadcast to the join.
hour_matches = ytx_2018_sdf['pickup_time_hour'] == ghc_2018_sdf['closest_hour']
ytx_2018_sdf = ytx_2018_sdf.join(
    f.broadcast(ghc_2018_sdf),
    on=hour_matches,
    how='inner',
)

In [7]:
# Strip the 'avg(...)' wrapper from the aggregated weather column names.
weather_fields = [
    'wind_angle_degrees',
    'wind_speed_mps',
    'air_temp_celsius',
    'dew_temp_celsius',
    'sky_ceil_height_meters',
    'vis_distance_meters',
    'atp_hectopascals',
]
for field_name in weather_fields:
    ytx_2018_sdf = ytx_2018_sdf.withColumnRenamed('avg(' + field_name + ')', field_name)

In [8]:
# Print the schema of the joined frame to check column names and types
# after the join and renames.
ytx_2018_sdf.printSchema()

root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- avg_speed: double (nullable = true)
 |-- pickup_time_hour: timestamp (nullable = true)
 |-- closest_hour: timestamp (nullable = true)
 |-- wind_angle_degrees: double (nullable = true)
 |-- wind_speed_mps: double (nullable = true)
 |-- air_temp_celsius: double (nullable = true)
 |-- dew_temp_celsius: double (nullable = true)
 |-- sky_ceil_height_meters: double (nullable = true)
 |-- vis_distance_meters: double (nullable = true)
 |-- atp_hectopascals: double (nullable = true)



In [9]:
# The join keys are no longer needed once the weather columns are attached.
join_keys = ['pickup_time_hour', 'closest_hour']
ytx_2018_sdf = ytx_2018_sdf.drop(*join_keys)

In [10]:
from shutil import rmtree
from os import path

# Clear any output left by a previous run before writing
# (Spark's default save mode errors if the target path already exists).
fpath = '../preprocessed_data/ytx_2018_pre.parquet/'
previous_output_exists = path.exists(fpath)
if previous_output_exists:
    rmtree(fpath)

# Persist the joined 2018 trips + weather frame.
ytx_2018_sdf.write.parquet('../preprocessed_data/ytx_2018_pre.parquet')

### Sampling for Statistical Modelling

In [11]:
# Re-read the persisted 2018 frame and discard the timestamp and location-ID
# columns before sampling.
temp = (
    spark.read
    .parquet('../preprocessed_data/ytx_2018_pre.parquet/')
    .drop(
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'PULocationID',
        'DOLocationID',
    )
)

# Row count, shown as the cell output.
temp.count()

31456026

In [12]:
# Draw a sample with fraction 0.1 (about 10% of rows) and seed 17 for modelling.
sample_2018 = temp.sample(fraction=0.1, seed=17)

In [14]:
# Clear any sample written by a previous run, then persist the 2018 sample.
fpath = '../preprocessed_data/ytx_2018_sample.parquet/'
stale_sample_exists = path.exists(fpath)
if stale_sample_exists:
    rmtree(fpath)

sample_2018.write.parquet('../preprocessed_data/ytx_2018_sample.parquet')

### Global Hourly Climate Data 2019

In [15]:
# Load the preprocessed 2019 hourly climate data.
ghc_2019_path = '../preprocessed_data/nyc_ghc_2019_pre.parquet/'
ghc_2019_sdf = spark.read.parquet(ghc_2019_path)

### Yellow Taxi 2019

In [16]:
# Load the cleaned 2019 yellow-taxi trips.
ytx_2019_path = '../preprocessed_data/ytx_2019_cleaned.parquet/'
ytx_2019_sdf = spark.read.parquet(ytx_2019_path)

In [17]:
# Truncate each pickup timestamp to the hour; this becomes the join key
# against the climate data's 'closest_hour'.
pickup_hour = f.date_trunc("hour", f.col("tpep_pickup_datetime"))
ytx_2019_sdf = ytx_2019_sdf.withColumn("pickup_time_hour", pickup_hour)

In [18]:
# Inner join: pair every trip with the climate row whose 'closest_hour'
# equals the trip's pickup hour. The climate frame is broadcast to the join.
hour_matches = ytx_2019_sdf['pickup_time_hour'] == ghc_2019_sdf['closest_hour']
ytx_2019_sdf = ytx_2019_sdf.join(
    f.broadcast(ghc_2019_sdf),
    on=hour_matches,
    how='inner',
)

In [19]:
# Strip the 'avg(...)' wrapper from the aggregated weather column names.
weather_fields = [
    'wind_angle_degrees',
    'wind_speed_mps',
    'air_temp_celsius',
    'dew_temp_celsius',
    'sky_ceil_height_meters',
    'vis_distance_meters',
    'atp_hectopascals',
]
for field_name in weather_fields:
    ytx_2019_sdf = ytx_2019_sdf.withColumnRenamed('avg(' + field_name + ')', field_name)

In [20]:
# Print the schema of the joined frame to check column names and types
# after the join and renames.
ytx_2019_sdf.printSchema()

root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- avg_speed: double (nullable = true)
 |-- pickup_time_hour: timestamp (nullable = true)
 |-- closest_hour: timestamp (nullable = true)
 |-- wind_angle_degrees: double (nullable = true)
 |-- wind_speed_mps: double (nullable = true)
 |-- air_temp_celsius: double (nullable = true)
 |-- dew_temp_celsius: double (nullable = true)
 |-- sky_ceil_height_meters: double (nullable = true)
 |-- vis_distance_meters: double (nullable = true)
 |-- atp_hectopascals: double (nullable = true)



In [21]:
# Drop the join keys ('pickup_time_hour' and 'closest_hour'); they were only
# needed to match trips with the hourly climate records.
ytx_2019_sdf = ytx_2019_sdf.drop('pickup_time_hour', 'closest_hour') 

In [22]:
from shutil import rmtree
from os import path

# Clear any output left by a previous run before writing
# (Spark's default save mode errors if the target path already exists).
fpath = '../preprocessed_data/ytx_2019_pre.parquet/'
previous_output_exists = path.exists(fpath)
if previous_output_exists:
    rmtree(fpath)

# Persist the joined 2019 trips + weather frame.
ytx_2019_sdf.write.parquet('../preprocessed_data/ytx_2019_pre.parquet')

In [23]:
# Re-read the persisted 2019 frame and discard the timestamp and location-ID
# columns before sampling.
temp_1 = (
    spark.read
    .parquet('../preprocessed_data/ytx_2019_pre.parquet/')
    .drop(
        'tpep_pickup_datetime',
        'tpep_dropoff_datetime',
        'PULocationID',
        'DOLocationID',
    )
)

# Row count, shown as the cell output.
temp_1.count()

17460233

In [25]:
# Draw a sample with fraction 0.1 (about 10% of rows) and seed 17.
# Note: the sampled rows are not guaranteed to be the same on every run.
sample = temp_1.sample(fraction=0.1, seed=17)

In [27]:
# Persist the 10% 2019 sample used for statistical modelling.
fpath = '../preprocessed_data/ytx_2019_sample.parquet/'

# Remove a sample left by a previous run so the write does not hit an existing path.
if path.exists(fpath):
    rmtree(fpath)

# Write `sample` (the sampled frame), not `temp_1`: writing `temp_1` would store
# the full, unsampled dataset under the "sample" file name.
sample.write.parquet('../preprocessed_data/ytx_2019_sample.parquet')