In [1]:
import pandas as pd
pd.__version__

'2.3.0'

In [8]:
!pip install pyarrow
import os
print(os.getcwd())
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import numpy as np
from sklearn.feature_extraction import DictVectorizer


/Users/jordanharris/Code/mlops_zoomcamp/module_1


In [3]:
#Question 1
df_jan = pd.read_parquet("../data/yellow_tripdata_2023-01.parquet")
df_feb = pd.read_parquet("../data/yellow_tripdata_2023-02.parquet")

# number of columns
n_cols = df_jan.shape[1]
print(f"January 2023 dataset has {n_cols} columns.")

df_all = pd.concat([df_jan, df_feb], ignore_index=True)
print(f"Combined dataset shape: {df_all.shape}")
extra_cols = set(df_feb.columns) - set(df_jan.columns)
print("Columns only in February:", extra_cols)



January 2023 dataset has 19 columns.
Combined dataset shape: (5980721, 20)
Columns only in February: {'Airport_fee'}


In [4]:
df_all.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,


In [5]:
#Question 2
std_dev_jan = (
    (pd.to_datetime(df_all.tpep_dropoff_datetime)
     - pd.to_datetime(df_all.tpep_pickup_datetime))
    .dt.total_seconds().div(60)
)[pd.to_datetime(df_all.tpep_pickup_datetime).dt.month.eq(1)].std()

print(f"Std dev (Jan): {std_dev_jan:.2f} minutes")

Std dev (Jan): 42.59 minutes


In [6]:
#Question 3

df_all['pickup_dt']  = pd.to_datetime(df_all.tpep_pickup_datetime)
df_all['dropoff_dt'] = pd.to_datetime(df_all.tpep_dropoff_datetime)

df_all['duration'] = (
    df_all['dropoff_dt']
    - df_all['pickup_dt']
).dt.total_seconds() / 60


fraction_kept = df_all['duration'].between(1, 60).mean()
print(f"{fraction_kept*100:.0f}%")

98%


In [10]:
#Question 4

dicts = df_all[['PULocationID','DOLocationID']].astype(str).to_dict(orient='records')

dv = DictVectorizer()
X = dv.fit_transform(dicts)

print("Number of one-hot features:", X.shape[1])
# → 515

Number of one-hot features: 521


In [14]:
#Question 5
df_all['pickup_dt']  = pd.to_datetime(df_all.tpep_pickup_datetime)
df_all['dropoff_dt'] = pd.to_datetime(df_all.tpep_dropoff_datetime)
df_all['duration']   = (df_all.dropoff_dt - df_all.pickup_dt).dt.total_seconds() / 60

#FILTERRRR!
df_all = df_all[df_all['duration'].between(1, 60)]

jan_mask = df_all['pickup_dt'].dt.month.eq(1)
feb_mask = df_all['pickup_dt'].dt.month.eq(2)

print("Jan rows:", jan_mask.sum())
print("df_all rows:", len(df_all))

dicts  = df_all[['PULocationID','DOLocationID']].astype(str)\
                 .to_dict(orient='records')
dv     = DictVectorizer()
X_full = dv.fit_transform(dicts)

X_train = X_full[jan_mask.values, :]
y_train = df_all.loc[jan_mask, 'duration'].values

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

Jan rows: 3009145
df_all rows: 5865124
X_train shape: (3009145, 519)
y_train shape: (3009145,)


In [18]:
mse  = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)
print(f"Train MSE: {mse:.2f} minutes")
print(f"Train RMSE: {rmse:.2f} minutes")

Train MSE: 58.51 minutes
Train RMSE: 7.65 minutes


In [None]:
#Question 6
df_all = df_all[df_all['duration'].between(1, 60)]
feb_mask = df_all['pickup_dt'].dt.month.eq(2)
df_feb = df_all.loc[feb_mask].copy()

val_dicts = df_feb[['PULocationID','DOLocationID']].astype(str).to_dict('records')
X_val     = dv.transform(val_dicts)
y_val     = df_feb['duration'].values
y_pred    = lr.predict(X_val)

mse_val  = mean_squared_error(y_val, y_pred)
rmse_val = np.sqrt(mse_val)

print(f"Validation MSE:  {mse_val:.2f} (minutes²)")
print(f"Validation RMSE: {rmse_val:.2f} minutes")
