In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import root_mean_squared_error

In [2]:
!pip install pyarrow


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
jan_yellow_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
feb_yellow_df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

## Q1. Read the data for January. How many columns are there?

In [4]:
len(jan_yellow_df.columns)

19

## Q2. What's the standard deviation of the trips duration in January?

In [5]:
jan_yellow_df.loc[:, 'tpep_pickup_datetime'] = pd.to_datetime(jan_yellow_df['tpep_pickup_datetime'])
jan_yellow_df.loc[:, 'tpep_dropoff_datetime'] = pd.to_datetime(jan_yellow_df['tpep_dropoff_datetime'])

jan_yellow_df.loc[:, 'duration'] = (jan_yellow_df['tpep_dropoff_datetime'] - jan_yellow_df['tpep_pickup_datetime']).dt.total_seconds() / 60

std_duration = jan_yellow_df['duration'].std()
print(f"Standard deviation of trip duration (in minutes): {std_duration:.2f}")

Standard deviation of trip duration (in minutes): 42.59


## Q3. What fraction of the records left after you dropped the outliers?

In [6]:
before_filter = len(jan_yellow_df)

jan_yellow_df = jan_yellow_df[(jan_yellow_df['duration'] > 0) & (jan_yellow_df['duration'] < 60)]
after_filter = len(jan_yellow_df)

In [7]:
percentage = (after_filter/ before_filter) * 100
percentage

99.17104858994786

## Q4. What's the dimensionality of this matrix (number of columns)?

In [9]:
from sklearn.feature_extraction import DictVectorizer

# Recast IDs as strings
jan_yellow_df['PULocationID'] = jan_yellow_df['PULocationID'].astype(str)
jan_yellow_df['DOLocationID'] = jan_yellow_df['DOLocationID'].astype(str)

# Convert to list of dicts
records = jan_yellow_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Use sparse matrix
dv = DictVectorizer(sparse=True)
X_sparse = dv.fit_transform(records)

print(X_sparse.shape[1])  # Number of features (i.e., one-hot encoded columns)



517


## Q5. What's the RMSE on train?

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

y = jan_yellow_df['duration'].values

model = LinearRegression()
model.fit(X_sparse, y)

y_pred = model.predict(X_sparse)

mse = mean_squared_error(y, y_pred)

rmse = np.sqrt(mse)
print(f"RMSE on training data: {rmse:.2f} minutes")


RMSE on training data: 7.92 minutes


In [15]:
## Q6. What's the RMSE on validation?

In [14]:
# Step 1: Convert datetime columns
feb_yellow_df.loc[:, 'tpep_pickup_datetime'] = pd.to_datetime(feb_yellow_df['tpep_pickup_datetime'])
feb_yellow_df.loc[:, 'tpep_dropoff_datetime'] = pd.to_datetime(feb_yellow_df['tpep_dropoff_datetime'])

# Step 2: Compute trip duration in minutes
feb_yellow_df.loc[:, 'duration'] = (feb_yellow_df['tpep_dropoff_datetime'] - feb_yellow_df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Step 3: Filter out unreasonable durations
feb_yellow_df = feb_yellow_df[(feb_yellow_df['duration'] > 0) & (feb_yellow_df['duration'] < 180)]

# Step 4: Convert PULocationID and DOLocationID to string
feb_yellow_df['PULocationID'] = feb_yellow_df['PULocationID'].astype(str)
feb_yellow_df['DOLocationID'] = feb_yellow_df['DOLocationID'].astype(str)

# Step 5: Transform features using the previously fitted DictVectorizer (no refit!)
val_records = feb_yellow_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X_val = dv.transform(val_records)

# Step 6: Get the target variable
y_val = feb_yellow_df['duration'].values

# Step 7: Predict using the trained model
y_pred_val = model.predict(X_val)

# Step 8: Compute RMSE
from sklearn.metrics import mean_squared_error
import numpy as np

mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
print(f"RMSE on validation data (February): {rmse_val:.2f} minutes")


RMSE on validation data (February): 9.04 minutes
