# NYC Taxi Data

This data is freely available. You can find some interesting background information at https://chriswhong.com/open-data/foil_nyc_taxi/ . We will ask some relatively simple questions of this large data set (almost 18GB of gzipped data).

In [1]:
dwh_basedir = "file:///srv/jupyter/nyc-dwh"
integrated_basedir = dwh_basedir + "/integrated"

# 0 Setup Environment

Before we begin, we create a local Spark session

## 0.1 Spark Session

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

if not 'spark' in locals():
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory","64G") \
        .getOrCreate()

spark

'2.4.3'

## 0.2 Matplotlib

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# 1 Read Taxi Data

Now we can read in the taxi data from the structured zone.

In [7]:
hourly_taxi_trips = spark.read.parquet(integrated_basedir + "/taxi-trips-hourly")
hourly_taxi_trips.limit(10).toPandas()

Unnamed: 0,date,hour,lat_idx,long_idx,trip_count,passenger_count,fare_amount,tip_amount,total_amount,holiday_description,bank_holiday,hourly_wind_speed,hourly_temperature,hourly_precipitation,temperature,daily_wind_speed,daily_precipitation
0,2013-09-22,4,38,0,21,31,272.5,36.05,329.55,,,2.55,20.12,98.25,18.547014,2.490217,199.95
1,2013-09-22,4,64,9,7,20,105.0,15.6,127.1,,,2.55,20.12,98.25,18.547014,2.490217,199.95
2,2013-09-22,4,21,2,9,21,118.5,1.9,129.4,,,2.55,20.12,98.25,18.547014,2.490217,199.95
3,2013-09-22,4,50,5,2,2,55.5,0.0,62.33,,,2.55,20.12,98.25,18.547014,2.490217,199.95
4,2013-09-22,4,41,10,16,30,288.5,35.71,360.62,,,2.55,20.12,98.25,18.547014,2.490217,199.95
5,2013-09-22,4,57,15,5,17,46.5,7.95,59.45,,,2.55,20.12,98.25,18.547014,2.490217,199.95
6,2013-09-22,4,62,40,6,15,52.0,2.0,60.0,,,2.55,20.12,98.25,18.547014,2.490217,199.95
7,2013-09-22,4,88,14,6,10,70.5,9.07,90.9,,,2.55,20.12,98.25,18.547014,2.490217,199.95
8,2013-09-22,4,-1,22,25,41,325.5,28.38,384.21,,,2.55,20.12,98.25,18.547014,2.490217,199.95
9,2013-09-22,4,28,0,3,5,124.0,11.9,136.9,,,2.55,20.12,98.25,18.547014,2.490217,199.95


# 2. Split Training and Validation set

In [8]:
train_data, test_data = hourly_taxi_trips.randomSplit([0.8,0.2], seed=0)
print(train_data.count())
print(test_data.count())

11362335
2839303


# 3. Features

In [9]:
from pyspark.ml.feature import *
from pyspark.ml import Pipeline

tx = SQLTransformer(
    statement="""
        SELECT
            *,
            weekday(`date`) AS `weekday`,
            CASE
                WHEN lat_idx IS NULL THEN NULL
                WHEN long_idx IS NULL THEN NULL
                ELSE concat(lat_idx, "/", long_idx) 
            END AS geo_location,
            CASE 
                WHEN dayofweek(`date`) IN (2,3,4,5,6) THEN 1
                ELSE 0
            END AS workingday
        FROM __THIS__
    """
)
td = tx.transform(train_data)
td.printSchema()

tx2 = StringIndexer(
    inputCol="geo_location",
    outputCol="geo_location_idx",
    handleInvalid="keep"
)
tx2m = tx2.fit(td)
td2 = tx2m.transform(td)
td2.printSchema()

tx3 = OneHotEncoderEstimator(
    inputCols=["geo_location_idx"],
    outputCols=["geo_location_onehot"]
)
tx3m = tx3.fit(td2)
td3 = tx3m.transform(td2)
td3.printSchema()

tx8 = VectorAssembler(
    inputCols=[
        'season',
        'month',
        'hour',
        'bank_holiday',
        'weekday',
        'workingday',
        'geo_location_idx',
        'daily_temperature',
        'hourly_temperature',
        'daily_percipitation',
        'hourly_percipitation',
        'daily_wind',
        'hourly_wind'
    ],
    outputCol='features'
)


root
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- lat_idx: integer (nullable = true)
 |-- long_idx: integer (nullable = true)
 |-- trip_count: long (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- holiday_description: string (nullable = true)
 |-- bank_holiday: boolean (nullable = true)
 |-- hourly_wind_speed: double (nullable = true)
 |-- hourly_temperature: double (nullable = true)
 |-- hourly_precipitation: double (nullable = true)
 |-- temperature: double (nullable = true)
 |-- daily_wind_speed: double (nullable = true)
 |-- daily_precipitation: double (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- geo_location: string (nullable = true)
 |-- workingday: integer (nullable = false)

root
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- lat_idx: integer (nullable = t

## 6.2 Reference Model

## 6.2 Model

In [20]:
pipeline = Pipeline(
    stages = [
        SQLTransformer(
            statement="""
                SELECT
                    *,
                    month(`date`) - 1 AS `month_idx`,
                    dayofweek(`date`) - 1 AS `weekday_idx`,
                    CASE
                        WHEN lat_idx IS NULL THEN NULL
                        WHEN long_idx IS NULL THEN NULL
                        ELSE concat(lat_idx, "/", long_idx) 
                    END AS geo_location,
                    CASE 
                        WHEN dayofweek(`date`) IN (2,3,4,5,6) THEN 1
                        ELSE 0
                    END AS workingday
                FROM __THIS__
            """
        ),
        StringIndexer(
            inputCol="geo_location",
            outputCol="geo_location_idx",
            handleInvalid="keep"
        ),
        OneHotEncoderEstimator(
            inputCols=["geo_location_idx"],
            outputCols=["geo_location_onehot"]
        ),
        OneHotEncoderEstimator(
            inputCols=["hour"],
            outputCols=["hour_onehot"]
        ),
        OneHotEncoderEstimator(
            inputCols=["month_idx"],
            outputCols=["month_onehot"]
        ),
        OneHotEncoderEstimator(
            inputCols=["weekday_idx"],
            outputCols=["weekday_onehot"]
        ),
        VectorAssembler(
            inputCols=[
                'month_onehot',
                'weekday_onehot',
                'hour_onehot',
                'workingday',
                'bank_holiday',
                'geo_location_idx',
                'daily_temperature',
                'hourly_temperature',
                'daily_percipitation',
                'hourly_percipitation',
                'daily_wind',
                'hourly_wind'
            ],
            outputCol='features'
        )
    ]
)

model = pipeline.fit(train_data)

In [18]:
spark.sql("SELECT dayofweek('2019-10-05')").collect()

[Row(dayofweek(CAST(2019-10-05 AS DATE))=7)]

## 6.3 Validation