In [1]:
import pathlib
import pickle

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
# Load the January 2023 yellow_tripdata parquet file into memory for exploration.
data = pd.read_parquet('../data/yellow_tripdata/yellow_tripdata_2023-01.parquet')

In [3]:
# Print the row count, column names and dtypes of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

19 columns

In [4]:
# Trip duration in minutes: drop-off timestamp minus pickup timestamp,
# converted from a timedelta to seconds and then scaled to minutes.
data['duration'] = (
    data['tpep_dropoff_datetime']
    .sub(data['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [5]:
# Standard deviation of the trip durations (minutes), computed before any filtering.
data['duration'].std()

42.59435124195458

In [6]:
# Percentage of trips whose duration lies in [1, 60] minutes (both bounds inclusive).
# Summing the boolean mask counts the matching rows directly, so no filtered
# copy of the whole frame has to be materialised just to read its length.
# int(...) keeps the result a plain Python float, as before.
int(data['duration'].between(1, 60).sum()) / len(data) * 100

98.1220282212598

In [7]:
def prepare_data(
    path: pathlib.Path,
    min_duration: float = 1,
    max_duration: float = 60,
) -> pd.DataFrame:
    """Load a parquet file of trip records and prepare it for modelling.

    Reads the file at ``path``, adds a ``duration`` column (drop-off minus
    pickup, in minutes), keeps only the trips whose duration lies in
    ``[min_duration, max_duration]`` and casts the two location-ID columns
    to strings.

    Args:
        path: Parquet file containing the ``tpep_pickup_datetime``,
            ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``
            columns.
        min_duration: Shortest trip to keep, in minutes (inclusive).
        max_duration: Longest trip to keep, in minutes (inclusive).

    Returns:
        The filtered frame with the additional ``duration`` column. Rows keep
        their original index labels; rows with a missing duration are dropped
        because they fail the range check.
    """
    return (
        pd.read_parquet(path)
        .assign(duration=lambda df: (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60)
        # Inclusive on both ends, equivalent to (>= min_duration) & (<= max_duration).
        .loc[lambda df: df['duration'].between(min_duration, max_duration)]
        # Location IDs are labels, not quantities; store them as strings.
        .astype({'PULocationID': str, 'DOLocationID': str})
    )

In [8]:
# The 2023-01 file is used for fitting; the 2023-02 file is held out for evaluation.
train_data = prepare_data(
    path=pathlib.Path('../data/yellow_tripdata/yellow_tripdata_2023-01.parquet'),
)
eval_data = prepare_data(
    path=pathlib.Path('../data/yellow_tripdata/yellow_tripdata_2023-02.parquet'),
)

In [9]:
# Column the model predicts, and the columns it predicts it from.
target = 'duration'
features = [
    'PULocationID',
    'DOLocationID',
]

In [10]:
# Fit the vectorizer on the training rows only; the fitted `dv` is what later
# cells reuse to encode the evaluation data with the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(
    train_data.loc[:, features].to_dict('records')  # one {column: value} dict per row
)
y_train = train_data[target].values

In [11]:
# (number of training rows, number of encoded feature columns).
X_train.shape

(3009173, 515)

In [12]:
# Fit a linear regression on the encoded training matrix. `fit` returns the
# estimator itself, so the bare `lr` on the last line displays the same object
# the cell showed before.
lr = LinearRegression().fit(X_train, y_train)
lr

In [13]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = lr.predict(X_train)

In [14]:
# Training RMSE, in the units of the target (minutes).
root_mean_squared_error(y_train, y_pred)

7.649261927686161

In [15]:
# Encode the evaluation rows with the already-fitted vectorizer: `transform`
# only, never `fit_transform`, so the columns line up with X_train.
X_val = dv.transform(
    eval_data.loc[:, features].to_dict('records')
)
y_val = eval_data[target].values

In [16]:
# Predictions for the evaluation set. NOTE: this rebinds `y_pred`, replacing
# the training-set predictions computed earlier.
y_pred = lr.predict(X_val)

In [17]:
# Evaluation RMSE, in the units of the target (minutes).
root_mean_squared_error(y_val, y_pred)

7.811817957524739

In [18]:
# Persist the fitted vectorizer together with the model, since predictions
# need both: `dv` to encode the inputs and `lr` to score them.
# open(..., 'wb') does not create missing parent directories, so make sure
# the output directory exists first (no-op when it is already there).
pathlib.Path('../models').mkdir(parents=True, exist_ok=True)
with open('../models/lr.bin', 'wb') as f:
    pickle.dump((dv, lr), f)