# Setup

In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

# Ingesting Data

In [2]:
df_jan = pd.read_parquet(path="../data/yellow/yellow_tripdata_2023-01.parquet")
df_feb = pd.read_parquet(path="../data/yellow/yellow_tripdata_2023-02.parquet")

# Questions

In [3]:
# Read the data for January. How many columns are there?
_, number_of_cols = df_jan.shape
print(f"{number_of_cols} columns.")

19 columns.


In [4]:
# What's the standard deviation of the trips duration in January?
df_jan = df_jan.assign(duration=df_jan["tpep_dropoff_datetime"].sub(df_jan["tpep_pickup_datetime"]).dt.seconds.div(60))
print(f"Standard deviation of {df_jan['duration'].std():.0f}.")

Standard deviation of 42.


In [5]:
# What fraction of the records left after you dropped the outliers? (1min to 60min trips [inclusive])
fraction_without_outliers = len(df_jan.loc[df_jan["duration"].between(1,60), :]) / len(df_jan)
df_jan = df_jan.loc[df_jan["duration"].between(1,60), :]
print(f"{fraction_without_outliers*100:.0f}%.")

98%.


In [6]:
# What's the dimensionality of this matrix (number of columns)?
# Only use pickup and dropoff location IDs features
# Turn the dataframe into a list of dictionaries - not my favorite way but let's do it
train_dicts = df_jan.loc[:, ["PULocationID", "DOLocationID"]].astype(str).to_dict(orient="records")

dict_vectorizer = DictVectorizer()

X_train = dict_vectorizer.fit_transform(train_dicts)
y_train = df_jan.loc[:, "duration"].values

_, X_train_dimensionality = X_train.shape
print(f"{X_train_dimensionality} columns.")

515 columns.


In [7]:
# Train a plain linear regression model with default parameters
# Calculate the RMSE of the model on the training data
model = LinearRegression()
model.fit(X_train, y_train)
print(f"RMSE: {root_mean_squared_error(y_train, model.predict(X_train)):.2f}")

RMSE: 7.65


In [9]:
# Apply model to validation set. What is the RMSE?
df_feb = (
    df_feb
    .assign(duration=df_feb["tpep_dropoff_datetime"].sub(df_feb["tpep_pickup_datetime"]).dt.seconds.div(60))
    .loc[lambda df: df["duration"].between(1,60), :]
)

val_dicts = df_feb.loc[:, ["PULocationID", "DOLocationID"]].astype(str).to_dict(orient="records")

X_val = dict_vectorizer.transform(val_dicts)
y_val = df_feb["duration"].values

print(f"RMSE: {root_mean_squared_error(y_val, model.predict(X_val)):.2f}.")

RMSE: 7.81.
