# Homework - MLOps Zoomcamp 2025

This notebook contains the solution to the homework for Week 1 of the MLOps Zoomcamp 2025.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer

# Locations of the monthly yellow_tripdata parquet files for January and
# February 2023.
# NOTE: the literals must not contain stray whitespace. A trailing space is
# not part of the resource path, so a URL that carries one only resolves if
# the HTTP client happens to strip it before sending the request.
url_jan = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
url_feb = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

# Read each month into its own DataFrame (this downloads over the network)
df_jan = pd.read_parquet(url_jan)
df_feb = pd.read_parquet(url_feb)

In [None]:
# Count the columns of the raw January frame by measuring its column index
num_columns = len(df_jan.columns)
print(f"Number of columns in January: {num_columns}")

In [None]:
# Trip duration: drop-off timestamp minus pickup timestamp, converted from
# seconds to minutes. The result is stored as a new 'duration' column on df_jan.
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime']
    .sub(df_jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Standard deviation of the January durations, computed before any filtering
std_duration = df_jan['duration'].std()
print(f"Standard deviation of trip durations in January: {std_duration:.2f}")

In [None]:
# Keep only trips whose duration lies between 1 and 60 minutes, both bounds
# inclusive (Series.between is inclusive on both ends by default).
# The explicit .copy() makes df_jan_filtered an independent frame: columns are
# added to it later in the notebook, and assigning to an un-copied selection
# triggers pandas' SettingWithCopyWarning when Copy-on-Write is not enabled.
df_jan_filtered = df_jan[df_jan['duration'].between(1, 60)].copy()

# Share of the original January rows that survive the duration filter
fraction_remaining = len(df_jan_filtered) / len(df_jan)
print(f"Fraction of remaining records: {fraction_remaining:.2%}")

In [None]:
# Build the training inputs from the filtered January trips.
# The pickup / drop-off location IDs are cast to strings and stored in two new
# columns. Note the lower-case "l": 'PUlocationID' / 'DOlocationID' are the
# string copies, while 'PULocationID' / 'DOLocationID' are the source columns.
df_jan_filtered['PUlocationID'] = df_jan_filtered['PULocationID'].astype(str)
df_jan_filtered['DOlocationID'] = df_jan_filtered['DOLocationID'].astype(str)

# One dict per trip, holding only the two string-valued location features
train_dicts = df_jan_filtered.loc[:, ['PUlocationID', 'DOlocationID']].to_dict('records')

# Fit the vectorizer on the training dicts and encode them in the same pass;
# `dv` is reused later to encode the validation data with the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Regression target: trip duration in minutes
y_train = df_jan_filtered['duration'].to_numpy()

# Width of the encoded feature matrix (number of columns)
num_features = X_train.shape[1]
print(f"Number of columns in the feature matrix: {num_features}")

In [None]:
# Fit a plain linear regression on the encoded January features
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(X_train, y_train)

# Training-set RMSE: predict on the same data the model was fitted on,
# then take the square root of the mean squared error.
y_train_pred = model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
print(f"RMSE on the training set: {rmse_train:.2f}")

In [None]:
# Prepare the February data as the validation set, using the same steps that
# were applied to January: duration in minutes, 1-60 minute filter, string
# location IDs, dict records.
df_feb['duration'] = (df_feb['tpep_dropoff_datetime'] - df_feb['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep trips lasting 1 to 60 minutes (inclusive). The explicit .copy() makes
# the filtered frame independent of df_feb, so the column assignments below do
# not trigger pandas' SettingWithCopyWarning when Copy-on-Write is not enabled.
df_feb_filtered = df_feb[df_feb['duration'].between(1, 60)].copy()

# String copies of the location IDs (note the lower-case "l" in the new names)
df_feb_filtered['PUlocationID'] = df_feb_filtered['PULocationID'].astype(str)
df_feb_filtered['DOlocationID'] = df_feb_filtered['DOLocationID'].astype(str)

val_dicts = df_feb_filtered[['PUlocationID', 'DOlocationID']].to_dict(orient='records')

# transform() only - the vectorizer fitted on January defines the columns.
# Per the scikit-learn documentation, feature values not seen during fitting
# are silently ignored by DictVectorizer.transform.
X_val = dv.transform(val_dicts)
y_val = df_feb_filtered['duration'].values

# Validation RMSE of the model trained on January
y_val_pred = model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"RMSE on the validation set: {rmse_val:.2f}")