In [1]:
from functools import partial

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

from datetime import datetime
from pytz import timezone

import numpy as np
import pandas as pd

In [2]:
# Create (or reuse) the Spark session used by the Spark cells below:
# master "local[*]", application name "taxis".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("taxis")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

In [3]:
# Location of the raw trip records (relative to the notebook).
filepath = "../data/yellow_tripdata_2019-01.csv"

# strptime pattern of the pickup timestamps in the CSV.
datetime_format = "%Y-%m-%d %H:%M:%S"

# Month under analysis, written as "<year>-<month>" without zero padding.
QUERY_MONTH = "2019-1"

# Columns kept from the raw file; everything else is dropped right after loading.
columns_of_interest = [
    "VendorID",
    "PULocationID",
    "trip_distance",
    "fare_amount",
    "payment_type",
    "tpep_pickup_datetime",
]

In [4]:
# Load the raw trip CSV (first line is the header) and keep only the columns
# needed for the analysis. No schema is supplied, so by default Spark reads
# every column as a string.
df = (
    spark.read
    .option("header", "true")
    .csv(filepath)
    .select(columns_of_interest)
)

In [5]:
nytimezone = timezone("US/Eastern")


def _parse_pickup_time(raw):
    """Parse a pickup timestamp string into a US/Eastern-aware datetime.

    Args:
        raw: Timestamp text in ``datetime_format``, or None for a missing value.

    Returns:
        A timezone-aware ``datetime`` localized to ``nytimezone``, or None when
        ``raw`` is None (so null cells stay null instead of crashing the UDF).
    """
    if raw is None:
        return None
    return nytimezone.localize(datetime.strptime(raw, datetime_format))


# Declare the return type explicitly: without it `udf` falls back to a string
# return type, so the datetime objects would not reach later UDFs (which read
# `.year` / `.month`) as datetimes.
# NOTE(review): Spark stores timestamps as instants; downstream Python UDFs
# presumably see them in the worker's local timezone — confirm it matches
# US/Eastern if month boundaries matter.
to_datetime = udf(_parse_pickup_time, returnType="timestamp")

df = df.withColumn("tpep_pickup_datetime", to_datetime("tpep_pickup_datetime"))

In [6]:
def year_month(dt):
    """Format a date-like object as ``"<year>-<month>"``.

    The month is not zero-padded (January 2019 becomes ``"2019-1"``).

    Args:
        dt: Any object exposing ``year`` and ``month`` attributes.

    Returns:
        The year and month joined by a dash, as a string.
    """
    return "{}-{}".format(dt.year, dt.month)

# Wrap year_month as a Spark UDF and derive a "year_month" column from the
# pickup timestamp column.
year_month_str = udf(year_month)
df = df.withColumn(
    "year_month",
    year_month_str(df["tpep_pickup_datetime"]),
)

In [7]:
# Keep only pickups from the month under analysis. QUERY_MONTH uses the same
# unpadded "<year>-<month>" format that year_month() produces, so the month is
# configured in one place instead of being repeated as a literal here.
df_201901 = df.filter(df["year_month"] == QUERY_MONTH)

In [8]:
# Expose the trips to Spark SQL under the view name "taxi_drives".
# NOTE(review): this registers `df`, not the month-filtered `df_201901`, so the
# year_month filter never reaches the SQL query — confirm whether that is intended.
df.createOrReplaceTempView("taxi_drives")

In [9]:
# Sum fares and trip distances per (vendor, pickup location, payment type),
# considering only rows with a positive fare.
taxi_totals_query = """
    SELECT VendorID, PULocationID, payment_type, sum(fare_amount) as total_fare_amount, sum(trip_distance) as total_trip_distance 
    FROM taxi_drives 
    WHERE fare_amount > 0
    GROUP BY VendorID, PULocationID, payment_type"""
sqlDF = spark.sql(taxi_totals_query)

In [10]:
# Run the Spark query and collect the grouped totals as a pandas DataFrame.
pdf = sqlDF.toPandas()

In [11]:
# Persist the grouped totals so the pandas analysis below can start from this file.
# The DataFrame index is written too (as the first, unnamed column).
pdf.to_csv("../data/tripdata_grouped.csv")

In [2]:
# Reload the grouped totals from disk; the first row is the header and the
# first column is used as the index.
df = pd.read_csv(
    "../data/tripdata_grouped.csv",
    header=0,
    index_col=0,
)

In [87]:
# Discard groups whose summed trip distance is not strictly positive.
df = df.loc[df["total_trip_distance"].gt(0)]

### Scenario 1

In [88]:
# Scenario 1: a flat 1% (factor 0.01) of the summed fares, per vendor.
(
    df
    .groupby("VendorID")
    .agg({"total_fare_amount": lambda fares: 0.01 * np.sum(fares)})
)

Unnamed: 0_level_0,total_fare_amount
VendorID,Unnamed: 1_level_1
1,360705.0255
2,582639.1506
4,8879.6574


### Scenario 2

In [89]:
# Flat tax brackets as (lower bound, upper bound, rate) on total trip distance.
# The outer bounds are infinite so every finite distance falls in some bracket.
percentages = [
    (-np.inf, 10_000, 0.0),
    (10_000, 30_000, 0.1),
    (30_000, 70_000, 0.2),
    (70_000, np.inf, 0.3),
]
percentages

[(-inf, 10000, 0.0),
 (10000, 30000, 0.1),
 (30000, 70000, 0.2),
 (70000, inf, 0.3)]

In [90]:
# Total trip distance per vendor and pickup zone, counting only payment types 1 and 2.
# The aggregation is named by the string "sum": passing the NumPy callable
# (np.sum) to `agg` is deprecated in recent pandas and emits a FutureWarning.
df_distance = (
    df[df["payment_type"].isin([1, 2])]
    .groupby(["VendorID", "PULocationID"])
    .agg({"total_trip_distance": "sum"})
)

In [91]:
def add_tax_percentage(row, percentages):
    """Look up the flat tax rate of the bracket containing the row's distance.

    Args:
        row: Mapping-like object with a ``"total_trip_distance"`` entry.
        percentages: Iterable of ``(lower_bound, upper_bound, rate)`` tuples.
            A bracket matches when ``lower_bound <= distance < upper_bound``.

    Returns:
        The rate of the first matching bracket, or ``None`` when no bracket
        matches (for example a NaN distance).
    """
    matching_rates = (
        rate
        for lower, upper, rate in percentages
        if lower <= row["total_trip_distance"] < upper
    )
    return next(matching_rates, None)

In [92]:
# Attach a tax rate to every (vendor, zone) group, then turn the group keys
# back into ordinary columns so the result can be merged on them.
df_tax = (
    df_distance
    .assign(
        tax_percentage=lambda grouped: grouped.apply(
            add_tax_percentage, axis=1, percentages=percentages
        )
    )
    .reset_index()
)

In [93]:
# Left-join the group-level tax data onto the per-payment-type rows.
# Overlapping columns coming from `df` get the "_x" suffix and are dropped,
# so the group-level total_trip_distance replaces the row-level one.
merged_df = df.merge(
    df_tax,
    how="left",
    on=["VendorID", "PULocationID"],
    suffixes=("_x", ""),
)
merged_df = merged_df.drop(columns=merged_df.filter(regex="_x$").columns)

In [94]:
# Only payment type 1 rows are taxed; copy so later column assignments
# operate on an independent frame.
merged_df = merged_df.loc[merged_df["payment_type"].eq(1)].copy()

In [95]:
# Final tax per row: the group's tax rate times the row's summed fares.
merged_df["tax"] = merged_df["tax_percentage"].mul(merged_df["total_fare_amount"])

In [96]:
# Total tax per vendor. The aggregation is named by the string "sum": passing
# np.sum to `agg` is deprecated in recent pandas and emits a FutureWarning.
merged_df.groupby("VendorID").agg({"tax": "sum"})

Unnamed: 0_level_0,tax
VendorID,Unnamed: 1_level_1
1,7225625.0
2,12203510.0
4,10879.25


### Scenario 3

In [97]:
# Show the per-(VendorID, PULocationID) distance totals that the cells below reuse.
df_distance

Unnamed: 0_level_0,Unnamed: 1_level_0,total_trip_distance
VendorID,PULocationID,Unnamed: 2_level_1
1,1,739.70
1,2,32.70
1,3,384.80
1,4,12309.50
1,6,7.30
...,...,...
4,261,1513.34
4,262,2129.59
4,263,3089.93
4,264,5.24


In [98]:
# Progressive tax brackets as (lower bound, upper bound, rate).
# The first lower bound is a finite 0 because progressive_percentages
# computes `distance - lower bound` for the bracket the distance ends in.
percentages = [
    (0, 10_000, 0.0),
    (10_000, 30_000, 0.1),
    (30_000, 70_000, 0.2),
    (70_000, np.inf, 0.3),
]

In [99]:
def progressive_percentages(distance, percentages):
    """Return the effective (average) tax rate for ``distance`` under progressive brackets.

    Each bracket's rate applies only to the part of ``distance`` that lies
    inside that bracket; the accumulated amount is then divided by ``distance``.

    Args:
        distance: Total distance to tax.
        percentages: Iterable of ``(lower_bound, upper_bound, rate)`` tuples,
            walked in the given order. The walk stops at the first bracket
            whose upper bound ``distance`` does not exceed. Lower bounds must
            be finite, since ``rate * (distance - lower_bound)`` is evaluated.

    Returns:
        The effective rate as a float. ``0.0`` is returned for a zero or
        negative ``distance``: nothing is taxable, and dividing by zero would
        otherwise raise ZeroDivisionError (or yield NaN for NumPy floats).
    """
    if distance <= 0:
        return 0.0

    progressive_cut = 0.0
    for lb, ub, percentage in percentages:
        if distance > ub:
            # The whole bracket is covered by the distance.
            progressive_cut += percentage * (ub - lb)
        else:
            # The distance ends inside this bracket: tax only the covered part.
            progressive_cut += percentage * (distance - lb)
            break
    return progressive_cut / distance

In [101]:
# Spot check: 1000 lies entirely in the first bracket, whose rate is 0.0.
progressive_percentages(1000, percentages)

0.0

In [102]:
def add_tax_percentage(row, percentages):
    """Return the progressive effective tax rate for a row's total distance.

    This redefines the flat-bracket ``add_tax_percentage`` from earlier in the
    notebook; from here on the name refers to the progressive variant.

    Args:
        row: Mapping-like object with a ``"total_trip_distance"`` entry.
        percentages: Bracket tuples forwarded to ``progressive_percentages``.

    Returns:
        Whatever ``progressive_percentages`` returns for the row's distance.
    """
    distance = row["total_trip_distance"]
    return progressive_percentages(distance, percentages)

In [103]:
# Attach the progressive effective tax rate to every (vendor, zone) group,
# then turn the group keys back into ordinary columns for merging.
df_tax = (
    df_distance
    .assign(
        tax_percentage=lambda grouped: grouped.apply(
            add_tax_percentage, axis=1, percentages=percentages
        )
    )
    .reset_index()
)

In [105]:
# Left-join the group-level tax data onto the per-payment-type rows.
# Overlapping columns coming from `df` get the "_x" suffix and are dropped,
# so the group-level total_trip_distance replaces the row-level one.
merged_df = df.merge(
    df_tax,
    how="left",
    on=["VendorID", "PULocationID"],
    suffixes=("_x", ""),
)
merged_df = merged_df.drop(columns=merged_df.filter(regex="_x$").columns)

In [106]:
# Only payment type 1 rows are taxed; copy so later column assignments
# operate on an independent frame.
merged_df = merged_df.loc[merged_df["payment_type"].eq(1)].copy()

In [107]:
# Final tax per row: the group's effective rate times the row's summed fares.
merged_df["tax"] = merged_df["tax_percentage"].mul(merged_df["total_fare_amount"])

In [110]:
# Inspect one group (vendor 1, pickup location 4) to sanity-check its rate and tax.
merged_df.loc[merged_df["VendorID"].eq(1) & merged_df["PULocationID"].eq(4)]

Unnamed: 0,VendorID,PULocationID,payment_type,total_fare_amount,total_trip_distance,tax_percentage,tax
609,1,4,1,39847.06,12309.5,0.018762,747.607824


In [108]:
# Total tax per vendor. The aggregation is named by the string "sum": passing
# np.sum to `agg` is deprecated in recent pandas and emits a FutureWarning.
merged_df.groupby("VendorID").agg({"tax": "sum"})

Unnamed: 0_level_0,tax
VendorID,Unnamed: 1_level_1
1,5534186.0
2,10287640.0
4,6196.05
