In [2]:
from datetime import datetime
from functools import partial

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType
from pytz import timezone

In [2]:
# Local Spark session named "taxis", using every available local core.
session_builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("taxis")
    .config("spark.executor.cores", "4")
)
spark = session_builder.getOrCreate()

In [3]:
# Month label in the unpadded "<year>-<month>" form.
QUERY_MONTH = "2019-1"

# Input file and the strptime pattern used to parse its pickup timestamps.
filepath = "../data/yellow_tripdata_2019-01.csv"
datetime_format = "%Y-%m-%d %H:%M:%S"

# Columns selected from the raw CSV after loading.
columns_of_interest = [
    "VendorID", "PULocationID",
    "trip_distance", "fare_amount",
    "payment_type", "tpep_pickup_datetime",
]

In [4]:
# Read the CSV using its first line as the header, then keep only the columns of interest.
raw_trips = spark.read.csv(filepath, header=True)
df = raw_trips.select(*columns_of_interest)

In [5]:
nytimezone = timezone("US/Eastern")


def _parse_pickup_datetime(raw):
    """Parse a pickup timestamp string into a US/Eastern-localized datetime.

    Args:
        raw: Timestamp text in ``datetime_format``, or None for a null cell.

    Returns:
        A timezone-aware ``datetime``, or None when ``raw`` is None.
    """
    if raw is None:
        # Null cells stay null instead of crashing strptime with a TypeError.
        return None
    return nytimezone.localize(datetime.strptime(raw, datetime_format))


# udf() defaults to a StringType return, which would hand the next UDF a string
# rather than a datetime (so `dt.year` would fail). Declaring TimestampType keeps
# the column a real timestamp. Note that Spark passes timestamps back into Python
# UDFs as naive datetimes in the worker's local timezone.
to_datetime = udf(_parse_pickup_datetime, returnType=TimestampType())

df = df.withColumn("tpep_pickup_datetime", to_datetime("tpep_pickup_datetime"))

In [6]:
def year_month(dt):
    """Return the "<year>-<month>" label of ``dt``; the month is not zero-padded."""
    return "{}-{}".format(dt.year, dt.month)

# Wrap year_month as a Spark UDF and store its label for each pickup time.
year_month_str = udf(year_month)
year_month_column = year_month_str("tpep_pickup_datetime")
df = df.withColumn("year_month", year_month_column)

In [7]:
# Keep only the rows whose year_month label equals the configured QUERY_MONTH
# (previously the same "2019-1" literal was repeated here and the constant was unused).
df_201901 = df.filter(df["year_month"] == QUERY_MONTH)

In [8]:
# Expose the frame to Spark SQL under the view name "taxi_drives".
# NOTE(review): this registers `df`, not the month-filtered `df_201901` built just
# before, so the SQL query runs over every row in the file and the year_month filter
# is never applied. Registering `df_201901` looks like the intent — confirm. Be aware
# that doing so makes the query depend on the tpep_pickup_datetime / year_month UDF
# columns, which the current query does not read.
df.createOrReplaceTempView("taxi_drives")

In [9]:
# Sum fare_amount and trip_distance per (VendorID, PULocationID, payment_type),
# keeping only rows with a positive fare_amount.
# NOTE(review): the CSV is loaded with only the header option (no schema), so
# fare_amount and trip_distance are presumably string columns here and sum() / ">"
# rely on Spark's implicit casts — confirm, or cast explicitly.
sqlDF = spark.sql("""
    SELECT VendorID, PULocationID, payment_type, sum(fare_amount) as total_fare_amount, sum(trip_distance) as total_trip_distance 
    FROM taxi_drives 
    WHERE fare_amount > 0
    GROUP BY VendorID, PULocationID, payment_type""")

In [10]:
# Collect the grouped Spark result into a pandas DataFrame.
pdf = sqlDF.toPandas()

In [11]:
# Save the grouped table (index included) so it can be reloaded from CSV below.
pdf.to_csv("../data/tripdata_grouped.csv")

In [3]:
# Reload the grouped table; the first CSV column holds the saved index.
pdf = pd.read_csv(
    "../data/tripdata_grouped.csv",
    index_col=0,
    header=0,
)

In [4]:
# Display the reloaded grouped table.
pdf

Unnamed: 0,VendorID,PULocationID,payment_type,total_fare_amount,total_trip_distance
0,2,186,2,523404.01,98862.11
1,2,114,1,480583.69,103292.28
2,1,195,1,1659.50,398.00
3,2,4,1,74945.68,17436.17
4,1,119,2,340.50,68.30
...,...,...,...,...,...
1668,2,158,2,98906.70,21214.11
1669,4,158,2,2002.50,416.97
1670,2,15,1,1829.63,528.56
1671,1,157,1,3557.50,573.80


### Scenario 1

Flat rate: 1% of the total fare amount, summed per vendor.

In [5]:
def one_percent_of_total(fares):
    """Return 1% of the sum of ``fares``."""
    return 0.01 * np.sum(fares)


pdf.groupby("VendorID").agg({"total_fare_amount": one_percent_of_total})

Unnamed: 0_level_0,total_fare_amount
VendorID,Unnamed: 1_level_1
1,360713.7305
2,582640.6756
4,8883.4374


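### Scenario 2

Tiered rate: the percentage depends on the total trip distance (payment types 1 and 2) per vendor and pickup location, and is applied to the total fare amount of payment type 1.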

In [6]:
# Distance brackets as (lower bound, upper bound, percentage) tuples, built by
# pairing consecutive bracket edges with the rate that applies between them.
bracket_edges = [-np.inf, 10000, 30000, 70000, np.inf]
bracket_rates = [0.0, 0.1, 0.2, 0.3]
percentages = list(zip(bracket_edges[:-1], bracket_edges[1:], bracket_rates))
percentages

[(-inf, 10000, 0.0),
 (10000, 30000, 0.1),
 (30000, 70000, 0.2),
 (70000, inf, 0.3)]

In [7]:
# Total trip distance per (VendorID, PULocationID), counting only payment types 1 and 2.
# The aggregation is named by the string "sum" instead of passing np.sum: pandas
# deprecated handing NumPy reducers to groupby.agg and emits a FutureWarning for it.
pdf_distance = (
    pdf.loc[pdf["payment_type"].isin([1, 2])]
    .groupby(["VendorID", "PULocationID"])
    .agg({"total_trip_distance": "sum"})
)

In [8]:
# Display the distance totals per (VendorID, PULocationID).
pdf_distance

Unnamed: 0_level_0,Unnamed: 1_level_0,total_trip_distance
VendorID,PULocationID,Unnamed: 2_level_1
1,1,739.70
1,2,32.70
1,3,384.80
1,4,12309.50
1,6,7.30
...,...,...
4,261,1513.34
4,262,2129.59
4,263,3089.93
4,264,5.24


In [9]:
def add_tax_percentage(row, percentages):
    """Look up the percentage whose bracket contains the row's total trip distance.

    Args:
        row: Mapping-like object with a "total_trip_distance" entry.
        percentages: Iterable of (lower, upper, percentage) tuples; a bracket
            matches when lower <= distance < upper.

    Returns:
        The percentage of the first matching bracket, or None if none matches.
    """
    matching_rates = (
        rate
        for lower, upper, rate in percentages
        if lower <= row["total_trip_distance"] < upper
    )
    return next(matching_rates, None)

In [10]:
# Look up each row's bracket percentage, attach it as "tax_percentage", and turn
# the (VendorID, PULocationID) index back into ordinary columns.
tax_percentages = pdf_distance.apply(add_tax_percentage, axis=1, percentages=percentages)
pdf_distance = pdf_distance.assign(tax_percentage=tax_percentages).reset_index()

In [11]:
# Left-join the per-(VendorID, PULocationID) distance and percentage onto every
# grouped row; the overlapping left-hand column gets the "_x" suffix.
merged_df = pd.merge(
    pdf,
    pdf_distance,
    how="left",
    on=["VendorID", "PULocationID"],
    suffixes=("_x", ""),
)

In [12]:
# Drop the left-hand duplicate columns that the merge tagged with "_x".
# The name is rebound instead of using inplace=True, so the frame is not
# mutated in place.
duplicate_columns = merged_df.filter(regex='_x$').columns.to_list()
merged_df = merged_df.drop(columns=duplicate_columns)

In [13]:
# Spot-check one group: VendorID 1, PULocationID 4.
vendor1_location4 = (merged_df["VendorID"] == 1) & (merged_df["PULocationID"] == 4)
merged_df.loc[vendor1_location4]

Unnamed: 0,VendorID,PULocationID,payment_type,total_fare_amount,total_trip_distance,tax_percentage
326,1,4,2,16000.5,12309.5,0.1
483,1,4,3,682.3,12309.5,0.1
622,1,4,1,39847.06,12309.5,0.1
983,1,4,4,221.5,12309.5,0.1


In [14]:
# Keep only payment type 1 rows, as an independent copy of the frame.
is_payment_type_1 = merged_df["payment_type"] == 1
merged_df = merged_df.loc[is_payment_type_1].copy()

In [15]:
# tax = tax percentage x total fare amount, row by row.
merged_df["tax"] = merged_df["tax_percentage"] * merged_df["total_fare_amount"]

In [16]:
# Same spot-check (VendorID 1, PULocationID 4), now including the tax column.
vendor1_location4_taxed = (merged_df["VendorID"] == 1) & (merged_df["PULocationID"] == 4)
merged_df.loc[vendor1_location4_taxed]

Unnamed: 0,VendorID,PULocationID,payment_type,total_fare_amount,total_trip_distance,tax_percentage,tax
622,1,4,1,39847.06,12309.5,0.1,3984.706
