In [1]:
import pandas as pd

In [2]:
# CSV with the green-taxi trip records for 2019-01; the path is relative to the notebook's working directory.
INPUT_FILE = 'green_tripdata_2019-01.csv'

# Populate the DB

In [3]:
# Load the raw trip records into a pandas DataFrame; the homework questions below all read from `df`.
df = pd.read_csv(filepath_or_buffer=INPUT_FILE)

In [4]:
# Convert both trip timestamp columns with pd.to_datetime so the `.dt` accessor
# can be used on them in the questions below.
for ts_col in ("lpep_pickup_datetime", "lpep_dropoff_datetime"):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [5]:
# Zone lookup table (columns used below: LocationID, Zone); named constant to match INPUT_FILE.
ZONES_FILE = 'taxi+_zone_lookup.csv'
df_zones = pd.read_csv(ZONES_FILE)

# Homework (parse data)

## Question 3. Count records

How many taxi trips were made in total on January 15?

Tip: started and finished on 2019-01-15.

Remember that the `lpep_pickup_datetime` and `lpep_dropoff_datetime` columns are timestamps (date and hour+min+sec), not dates.

- 20689
- 20530
- 17630
- 21090


In [6]:
# A trip counts only if it both started and finished on the query day,
# so compare the calendar-date part of each timestamp.
query_date = pd.to_datetime("2019-01-15").date()

finished_on_day = df["lpep_dropoff_datetime"].dt.date == query_date
started_on_day = df["lpep_pickup_datetime"].dt.date == query_date
len(df[finished_on_day & started_on_day])

20530

**Answer is 20530**

## Question 4. Largest trip for each day

Which was the day with the largest trip distance?
Use the pick up time for your calculations.

- 2019-01-18
- 2019-01-28
- 2019-01-15
- 2019-01-10

In [7]:
# Solution 1: hacky but simple
# Mask every row whose distance equals the overall maximum and show its pickup timestamp.
longest_distance = df["trip_distance"].max()
df.loc[df["trip_distance"] == longest_distance, "lpep_pickup_datetime"]

297377   2019-01-15 19:27:58
Name: lpep_pickup_datetime, dtype: datetime64[ns]

In [8]:
# Solution 2: idiomatic?
# idxmax() returns the index label of the longest trip; use it to look up that row's pickup timestamp.
longest_trip_label = df["trip_distance"].idxmax()
df.loc[longest_trip_label, "lpep_pickup_datetime"]

Timestamp('2019-01-15 19:27:58')

### Answer is 2019-01-15

## Question 5. The number of passengers

On 2019-01-01, how many trips had 2 and 3 passengers?

- 2: 1282 ; 3: 266
- 2: 1532 ; 3: 126
- 2: 1282 ; 3: 254
- 2: 1282 ; 3: 274

In [9]:
# Keep only the trips whose pickup happened on the query day.
query_date = pd.Timestamp("2019-01-01").date()
picked_up_on_day = df["lpep_pickup_datetime"].dt.date.eq(query_date)
df_date_filtered = df.loc[picked_up_on_day]

In [10]:
# Number of trips on the selected day for each passenger_count value.
trips_per_passenger_count = df_date_filtered.groupby("passenger_count").size()
trips_per_passenger_count

passenger_count
0       21
1    12415
2     1282
3      254
4      129
5      616
6      273
dtype: int64

**Answer is 2: 1282 ; 3: 254**

## Question 6. Largest tip

For the passengers picked up in the Astoria zone, which drop-off zone had the largest tip?
We want the name of the zone, not the id.

Note: it's not a typo, it's `tip` , not `trip`

- Central Park
- Jamaica
- South Ozone Park
- Long Island City/Queens Plaza

In [11]:
# Solution 1: Procedural

# Step 1: resolve the pickup zone name to its LocationID.
is_astoria = df_zones["Zone"] == 'Astoria'
astoria_zone_id = df_zones.loc[is_astoria, "LocationID"].iloc[0]

# Step 2: keep the trips that started there and find the row with the largest tip.
astoria_trips = df.loc[df["PULocationID"] == astoria_zone_id]
top_tip_label = astoria_trips["tip_amount"].idxmax()
target_zone_id = astoria_trips.loc[top_tip_label, "DOLocationID"]

# Step 3: map that trip's drop-off id back to a zone name.
df_zones.loc[df_zones["LocationID"] == target_zone_id, "Zone"].iloc[0]

'Long Island City/Queens Plaza'

In [12]:
# Solution 2: Ask ChatGPT to do pandas using Dimmy's SQL solution

# Prompt: 
'''
Can you translate this SQL into pandas, assuming the two tables are contained in different dataframes? 

SELECT zdo."Zone", MAX(td.tip_amount)
FROM tripdata td 
	JOIN zones zpu ON td."PULocationID" = zpu."LocationID" 
	JOIN zones zdo ON td."DOLocationID" = zdo."LocationID"
WHERE LOWER(zpu."Zone") LIKE 'astoria'
GROUP BY zdo."LocationID", zdo."Zone"
ORDER BY MAX(td.tip_amount) DESC;
'''

# Short aliases standing in for the two "tables" of the prompt. These bind new names to
# the same DataFrame objects; no data is copied.
df1 = df
df2 = df_zones

# Answer below (crashes so we don't run it)
# NOTE(review): presumably the crash is a KeyError on "Zone_y" / "LocationID_y" - only one
# merge is performed here, so the zone columns likely never receive a "_y" suffix. Confirm by running.

# df1 = df1[df1["PULocationID"].isin(df2[df2["Zone"].str.contains("astoria", case=False)]["LocationID"])]
# df3 = df1.merge(df2, left_on="DOLocationID", right_on="LocationID")
# df3 = df3.groupby(["Zone_y", "LocationID_y"])["tip_amount"].agg("max").reset_index()
# df3 = df3.sort_values("tip_amount", ascending=False)

In [13]:
# Looks good but fortunately it crashed (yey, we still have a job!)
# Let's fix it
#
# Pandas translation of the SQL above. It reads straight from `df` / `df_zones`, so the
# cell runs on its own and never rebinds an alias of the full trip table to a subset.

# fixed: exact match on the zone name
astoria_ids = df_zones.loc[df_zones["Zone"] == "Astoria", "LocationID"]
trips_from_astoria = df[df["PULocationID"].isin(astoria_ids)]

# Join the zone names on the drop-off id; the zone columns are then addressed by their
# plain names ("Zone", "LocationID").
trips_with_dropoff_zone = trips_from_astoria.merge(
    df_zones, left_on="DOLocationID", right_on="LocationID"
)

# Largest tip per drop-off zone, ranked from highest to lowest (the SQL's GROUP BY / ORDER BY).
max_tip_by_dropoff_zone = (
    trips_with_dropoff_zone
    .groupby(["Zone", "LocationID"])["tip_amount"]
    .max()
    .reset_index()
    .sort_values("tip_amount", ascending=False)
)
max_tip_by_dropoff_zone["Zone"].iloc[0]  # added: only the top-ranked zone name is needed

# Maybe it will take our jobs?

'Long Island City/Queens Plaza'

#### Answer is Long Island City/Queens Plaza

## Speed tho

In [14]:
%%timeit

# Benchmark of solution 2: filter Astoria pickups, merge the zone names on the drop-off id,
# take the max tip per zone, sort descending, read the top zone name.
df1 = df
df2 = df_zones
df1 = df1[df1["PULocationID"].isin(df2[df2["Zone"] == "Astoria"]["LocationID"])]
df3 = df1.merge(df2, left_on="DOLocationID", right_on="LocationID")
df3 = df3.groupby(["Zone", "LocationID"])["tip_amount"].agg("max").reset_index()
df3 = df3.sort_values("tip_amount", ascending=False)
df3["Zone"].head(1).values[0]

11.4 ms ± 39 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
%%timeit

# Benchmark of solution 1: resolve Astoria's LocationID, filter its trips, locate the
# max-tip row with idxmax, then map that row's drop-off id back to a zone name.
astoria_zone_id = df_zones.loc[df_zones["Zone"] == 'Astoria', "LocationID"].values[0]
astoria_trips = df[df["PULocationID"] == astoria_zone_id]
target_zone_id = astoria_trips.loc[astoria_trips["tip_amount"].idxmax(), "DOLocationID"]
df_zones.loc[df_zones["LocationID"] == target_zone_id, "Zone"].values[0]

4.43 ms ± 35.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


**Hmm, maybe merges aren't so fast huh?**

## Can we do it a bit faster? In Polars maybe?

In [16]:
!pip install polars pyarrow

Defaulting to user installation because normal site-packages is not writeable


In [17]:
import polars as pl

In [18]:
# Convert both pandas frames to Polars once, up front, so the conversion cost is not
# part of the timed solution further down.
df_pl, df_zones_pl = map(pl.from_pandas, (df, df_zones))

In [19]:
# Solution3: Polars equivalent of my original pandas

# Resolve the pickup zone name to its LocationID (scalar).
astoria_zone_id = df_zones_pl.filter(pl.col("Zone") == "Astoria")["LocationID"][0]
astoria_trips = df_pl.filter(pl.col("PULocationID") == astoria_zone_id)

# Rows holding the maximum tip. The trailing [0] extracts a scalar drop-off id (first
# match if several trips tie), mirroring the scalar the pandas version gets via idxmax.
# Without it the id stays a Series and the comparison below depends on it having one element.
target_zone_id = astoria_trips.filter(pl.col("tip_amount") == pl.col("tip_amount").max())["DOLocationID"][0]

# Map the drop-off id back to a zone name.
df_zones_pl.filter(pl.col("LocationID") == target_zone_id)["Zone"][0]

'Long Island City/Queens Plaza'

In [20]:
%%timeit

# Benchmark of solution 3: the same lookup -> filter -> max-tip -> lookup steps, on the Polars frames.
astoria_zone_id = df_zones_pl.filter(pl.col("Zone") == "Astoria")["LocationID"][0]
astoria_trips = df_pl.filter(pl.col("PULocationID") == astoria_zone_id)
# NOTE(review): no [0] here, so target_zone_id is presumably a Series rather than a scalar;
# the comparison on the next line then relies on it holding exactly one value - confirm.
target_zone_id = astoria_trips.filter(pl.col("tip_amount") == pl.col("tip_amount").max())["DOLocationID"]
df_zones_pl.filter(pl.col("LocationID") == target_zone_id)["Zone"][0]

2.76 ms ± 135 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Final results (for today! :))

- Solution 1 (procedural pandas): 4.43 ms ± 35.8 µs
- Solution 2 (ChatGPT-ed SQL code): 11.4 ms ± 39 µs
- Solution 3 (polars): **2.76 ms ± 135 µs**