In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Fetch the 2022-01 and 2022-02 yellow_tripdata parquet files. Both are read
# straight from the remote URL, so this cell needs network access on every run.
jan, feb = (
    pd.read_parquet(url)
    for url in (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet",
    )
)

In [3]:
def add_duration(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``duration`` column, in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` converted
    from seconds to minutes. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain datetime-typed ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` columns.

    Returns
    -------
    pd.DataFrame
        New frame with every original column plus a float ``duration`` column.
        An existing ``duration`` column is overwritten.
    """
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    # Use the vectorised `.dt` accessor rather than a per-row `.apply`: it is
    # far faster on large frames and always yields a float column, even when
    # the frame is empty (where `.apply` would keep the timedelta dtype).
    return df.assign(duration=elapsed.dt.total_seconds() / 60)

In [4]:
# Attach the `duration` column to each month's frame.
jan = jan.pipe(add_duration)
feb = feb.pipe(add_duration)

In [5]:
# Standard deviation of the January trip durations (minutes), computed
# before the outlier filter is applied.
jan.loc[:, "duration"].std()

46.44530513776802

In [6]:
def drop_outliers(
    df: pd.DataFrame,
    min_duration: float = 1,
    max_duration: float = 60,
) -> pd.DataFrame:
    """Keep only the rows whose ``duration`` lies in ``[min_duration, max_duration]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``duration`` column.
    min_duration : float, default 1
        Smallest duration to keep (inclusive).
    max_duration : float, default 60
        Largest duration to keep (inclusive).

    Returns
    -------
    pd.DataFrame
        New frame with the surviving rows and a fresh 0..n-1 index. The input
        frame is not modified. Rows with a missing ``duration`` are dropped,
        because a NaN never satisfies the range check.
    """
    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    in_range = df["duration"].between(min_duration, max_duration)
    return df.loc[in_range].reset_index(drop=True)

In [7]:
# Fraction of January trips that survive the outlier filter.
drop_outliers(jan).shape[0] / jan.shape[0]

0.9827547930522406

In [8]:
# Replace each month's frame with its outlier-free version.
jan = jan.pipe(drop_outliers)
feb = feb.pipe(drop_outliers)

In [9]:
# Pickup and drop-off location IDs are the only model inputs. They are cast to
# strings so that DictVectorizer one-hot encodes them instead of passing them
# through as numeric values.
cat_features = ["PULocationID", "DOLocationID"]

jan_dicts = jan[cat_features].astype(str).to_dict(orient="records")

# The vectorizer is fitted on January only, so the feature space is fixed by
# the training month.
dv = DictVectorizer().fit(jan_dicts)

# Number of one-hot columns learned from the January data.
len(dv.feature_names_)

515

In [10]:
# Encode the January records with the fitted vectorizer and use the trip
# duration (minutes) as the regression target.
x_train = dv.transform(jan_dicts)
y_train = jan["duration"]

lr = LinearRegression()
lr.fit(x_train, y_train)

# RMSE on the training data. The square root is taken explicitly because
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; this form gives the same value on every version.
np.sqrt(mean_squared_error(y_train, lr.predict(x_train)))

6.986190963982251

In [11]:
# Encode the February records with the vectorizer fitted on January.
# `transform` (not `fit_transform`) keeps the January feature space, so
# location IDs not seen during fitting are silently ignored.
feb_dicts = feb[cat_features].astype(str).to_dict(orient="records")
x_val = dv.transform(feb_dicts)
y_val = feb["duration"]

# RMSE on the validation month. The square root is taken explicitly because
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; this form gives the same value on every version.
np.sqrt(mean_squared_error(y_val, lr.predict(x_val)))

7.786416487365596