In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Q1. Downloading the data

In [7]:
# Read the January 2023 yellow-taxi trip file directly from its remote parquet URL.
df = pd.read_parquet(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

In [8]:
# Number of columns in the raw frame (second entry of the shape tuple).
df.shape[1]

19

### Answer Q1: 19 columns

## Q2. Computing duration

In [9]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp,
# expressed in seconds and then scaled down by 60.
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Spread of the unfiltered durations (this is taken before any outlier removal).
duration_std = df['duration'].std()

In [10]:
# Show the standard deviation of the trip durations (in minutes) as the cell output.
duration_std

42.594351241920904

### Answer Q2: 42.59

## Q3. Dropping outliers

In [11]:
# Row count before filtering, used as the denominator of the retained fraction.
original_count = df.shape[0]

# Keep trips whose duration lies in [1, 60] minutes; `between` includes both
# endpoints by default. The copy detaches the result from `df`.
df_filtered = df.loc[df['duration'].between(1, 60)].copy()

filtered_count = df_filtered.shape[0]
fraction = filtered_count / original_count

In [12]:
# Show the share of records kept after the duration filter as the cell output.
fraction

0.9812202822125979

### Answer Q3: 98%

## Q4. One-hot encoding

In [13]:
# Pick-up and drop-off location IDs are the only model features.
categorical = ['PULocationID', 'DOLocationID']

# Vectorizer that is fitted here on the training records.
dv = DictVectorizer()

# Cast the IDs to strings before building the per-row dictionaries that
# are handed to the vectorizer.
df_filtered[categorical] = df_filtered[categorical].astype(str)
train_dicts = df_filtered[categorical].to_dict(orient='records')

# Fit the vectorizer on the training dictionaries and build the feature matrix.
X_train = dv.fit_transform(train_dicts)

print(
    f"Feature matrix shape: {X_train.shape}",
    f"Number of columns (features): {X_train.shape[1]}",
    sep="\n",
)

Feature matrix shape: (3009173, 515)
Number of columns (features): 515


### Answer Q4: 515

## Q5. Training a model

In [14]:
# Regression target: trip duration in minutes as a NumPy array.
y_train = df_filtered['duration'].to_numpy()

# `fit` returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the same data it was trained on.
y_pred_train = lr.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

print(f"RMSE on training data: {rmse_train:.2f}")

RMSE on training data: 7.65


### Answer Q5: 7.65

## Q6. Evaluating the model

In [None]:
# Read the February 2023 yellow-taxi trip file; it serves as the validation set.
df_feb = pd.read_parquet(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
)

# Same preparation steps as the training data: duration in minutes,
# keep trips within [1, 60] minutes (inclusive), cast location IDs to strings.
df_feb['duration'] = (
    df_feb['tpep_dropoff_datetime']
    .sub(df_feb['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
df_feb_filtered = df_feb.loc[df_feb['duration'].between(1, 60)].copy()
df_feb_filtered[categorical] = df_feb_filtered[categorical].astype(str)

# Only `transform` is called: the vectorizer fitted on the training data is
# reused as-is, not refitted on the validation records.
val_dicts = df_feb_filtered[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_feb_filtered['duration'].to_numpy()

# Evaluate the already-trained model on the validation month.
y_pred_val = lr.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

print(
    f"Validation records: {len(df_feb_filtered):,}",
    f"RMSE on validation data: {rmse_val:.2f}",
    sep="\n",
)

### Answer Q6: 7.81