In [10]:
!pip install pyarrow pandas numpy boto3



In [11]:

from pathlib import Path

import numpy
import pandas as pd
import requests
from sklearn.model_selection import train_test_split


In [4]:
tmpdir = Path.cwd().parent / "tmp"

filepath = tmpdir / "yellow_tripdata_2024-07.parquet"
if not filepath.is_file():
    endpoint = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-07.parquet"
    with requests.get(endpoint) as response:
        response.raise_for_status()
        with filepath.open("wb") as f:
            f.write(response.content)

# Even if file was downloaded, read parquet from it to check it was written correctly
with filepath.open("rb") as f:
    d = pd.read_parquet(f)
assert type(d['PULocationID'][0]) is numpy.int32

In [9]:
d.describe()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
count,3076903.0,3076903,3076903,2797914.0,3076903.0,2797914.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,3076903.0,2797914.0,2797914.0
mean,1.766745,2024-07-17 04:52:59.937344,2024-07-17 05:10:13.448311,1.354542,5.111787,2.581814,162.5984,161.9581,1.137432,19.53443,1.401216,0.4776966,3.263495,0.5924703,0.9547924,28.08619,2.195965,0.167845
min,1.0,2009-01-01 00:02:24,2009-01-01 00:13:25,0.0,0.0,1.0,1.0,1.0,0.0,-2261.2,-7.5,-0.5,-93.42,-69.0,-1.0,-2265.45,-2.5,-1.75
25%,2.0,2024-07-10 10:55:59,2024-07-10 11:14:03.500000,1.0,1.04,1.0,132.0,113.0,1.0,9.3,0.0,0.5,0.0,0.0,1.0,15.6,2.5,0.0
50%,2.0,2024-07-17 11:25:26,2024-07-17 11:44:51,1.0,1.8,1.0,161.0,162.0,1.0,13.5,1.0,0.5,2.52,0.0,1.0,21.0,2.5,0.0
75%,2.0,2024-07-24 13:46:57,2024-07-24 14:05:16.500000,1.0,3.51,1.0,231.0,233.0,1.0,22.6,2.5,0.5,4.2,0.0,1.0,30.61,2.5,0.0
max,2.0,2024-08-01 23:51:57,2024-08-02 14:48:03,9.0,326505.5,99.0,265.0,265.0,5.0,2261.2,16.19,10.5,300.0,106.0,1.0,2265.45,2.5,1.75
std,0.4229035,,,0.8377762,407.3898,11.96155,63.78071,69.47267,0.6667458,20.61159,1.836983,0.1364387,4.141983,2.300883,0.2752434,25.32898,0.9199493,0.5353739


In [10]:
d["tpep_dropoff_datetime"] - d["tpep_pickup_datetime"]

0         0 days 00:11:53
1         0 days 00:39:06
2         0 days 00:06:33
3         0 days 00:16:58
4         0 days 00:26:39
                ...      
3076898   0 days 00:32:00
3076899   0 days 00:20:01
3076900   0 days 00:22:00
3076901   0 days 00:18:06
3076902   0 days 00:18:25
Length: 3076903, dtype: timedelta64[us]

In [11]:
(d["tpep_dropoff_datetime"] - d["tpep_pickup_datetime"]).apply(lambda timediff: timediff.total_seconds())


0           713.0
1          2346.0
2           393.0
3          1018.0
4          1599.0
            ...  
3076898    1920.0
3076899    1201.0
3076900    1320.0
3076901    1086.0
3076902    1105.0
Length: 3076903, dtype: float64

In [12]:
X = pd.DataFrame(index=d.index)
X = X.assign(distance_km=d["trip_distance"])
X

Unnamed: 0,distance_km
0,3.20
1,19.48
2,1.18
3,9.10
4,17.70
...,...
3076898,5.99
3076899,4.43
3076900,4.80
3076901,3.09


In [13]:
duration_min = (d["tpep_dropoff_datetime"] - d["tpep_pickup_datetime"]).apply(
    lambda timediff: timediff.total_seconds() / 60)
X["duration_min"] = duration_min[
    (duration_min >= 1) & (duration_min <= 60)]  # Only use data for trips lasting max. an hour
X.head()

Unnamed: 0,distance_km,duration_min
0,3.2,11.883333
1,19.48,39.1
2,1.18,6.55
3,9.1,16.966667
4,17.7,26.65


In [24]:
X = X.assign(start_end_code=(d["PULocationID"].astype(str) + "-" + d["DOLocationID"].astype(str)))
X.head()

Unnamed: 0,distance_km,duration_sec,start_end_code
0,3.2,713.0,140-79
1,19.48,2346.0,132-113
2,1.18,393.0,237-145
3,9.1,1018.0,138-164
4,17.7,1599.0,132-263


In [25]:
categorical = ["start_end_code"]
numerical = ["distance_km", "duration_min"]

In [24]:
df1 = pd.DataFrame([["a", 1], ["b", 2], ["c", 3], ["d", 4], ["e", 5], ["f", 6], ["g", 7], ["h", 8]],
                   columns=["letter", "number"])
train, test = train_test_split(df1, test_size=0.2, shuffle=False)
display(train)
display(test)

Unnamed: 0,letter,number
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5
5,f,6


Unnamed: 0,letter,number
6,g,7
7,h,8
