In [167]:
# Report the version of the interpreter the kernel itself is running.
# `!python -V` asks whichever `python` is first on the shell PATH, which is
# not necessarily the kernel's interpreter.
import platform
print('Python', platform.python_version())

# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever pip the shell finds first.
# scikit-learn >= 1.4 is needed because the import cell below uses
# `sklearn.metrics.root_mean_squared_error`, which older releases do not have.
%pip install -q pandas seaborn "scikit-learn>=1.4" pyarrow fastparquet

Python 3.9.12
You should consider upgrading via the '/Users/kgi/.pyenv/versions/3.9.12/bin/python3.9 -m pip install --upgrade pip' command.

In [168]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

In [169]:
# Read the January and February 2023 yellow-taxi trip files into memory.
jan_path = './data/yellow_tripdata_2023-01.parquet'
feb_path = './data/yellow_tripdata_2023-02.parquet'
df_jan = pd.read_parquet(jan_path)
df_feb = pd.read_parquet(feb_path)

# Report how many columns each month's file has.
n_cols_jan = len(df_jan.columns)
n_cols_feb = len(df_feb.columns)
print('Jan:', n_cols_jan)
print('Feb:', n_cols_feb)

Jan: 19
Feb: 19


In [170]:
# Parse both timestamp columns as datetimes
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df_jan[ts_col] = pd.to_datetime(df_jan[ts_col])

# Trip length in minutes: (dropoff - pickup) in seconds, divided by 60
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime']
    .sub(df_jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Spread of the trip lengths over all January records
std_duration = df_jan['duration'].std()
print('Standard deviation of trip duration (in minutes):', round(std_duration, 2))

Standard deviation of trip duration (in minutes): 42.59


In [171]:
# Convert pickup and dropoff columns to datetime
df_jan['tpep_pickup_datetime'] = pd.to_datetime(df_jan['tpep_pickup_datetime'])
df_jan['tpep_dropoff_datetime'] = pd.to_datetime(df_jan['tpep_dropoff_datetime'])

# Calculate trip duration in minutes
df_jan['duration'] = (df_jan['tpep_dropoff_datetime'] - df_jan['tpep_pickup_datetime']).dt.total_seconds() / 60

# Keep only trips with duration between 1 and 60 minutes (inclusive)
condition = (df_jan['duration'] >= 1) & (df_jan['duration'] <= 60)
filtered_df = df_jan[condition]

# Calculate the fraction of records left
fraction_left = len(filtered_df) / len(df_jan)
print('Fraction of records left:', round(fraction_left, 3))  # Rounded to 3 decimals for clarity

# If you want the percentage:
print('Percentage of records left:', round(fraction_left * 100, 2), '%')

Fraction of records left: 0.981
Percentage of records left: 98.12 %


In [172]:
# Build the one-hot feature matrix for January WITHOUT modifying `df_jan`.
# Rebinding `df_jan` to the filtered frame would make the earlier cells
# (standard deviation, fraction of records left) give different answers when
# re-run, and assigning columns onto a boolean-indexed selection is the
# pattern pandas flags as chained assignment.

# Trip length in minutes, computed as a standalone Series
duration_jan = (
    pd.to_datetime(df_jan['tpep_dropoff_datetime'])
    - pd.to_datetime(df_jan['tpep_pickup_datetime'])
).dt.total_seconds() / 60

# Keep only trips lasting 1 to 60 minutes (inclusive on both ends)
is_valid_trip = duration_jan.between(1, 60)

# Cast location IDs to string (important for one-hot encoding!).
# `.loc[...]` followed by `.astype(str)` yields a new frame, so nothing is
# written back into `df_jan`.
categorical = ['PULocationID', 'DOLocationID']
locations_jan = df_jan.loc[is_valid_trip, categorical].astype(str)

# Create a list of dictionaries for DictVectorizer
features = locations_jan.to_dict(orient='records')

# One-hot encode
dv = DictVectorizer()
X = dv.fit_transform(features)

# Print the dimensionality (number of columns)
print('Feature matrix shape:', X.shape)
print('Number of columns:', X.shape[1])


Feature matrix shape: (3009173, 515)
Number of columns: 515


In [173]:
# Fit a linear-regression baseline on January trips and report its RMSE on
# the same data. Everything is derived from `df_jan` without rebinding or
# mutating it, so this cell gives the same result however many times it runs
# and does not change what the earlier cells see.

# Trip length in minutes, computed as a standalone Series
duration_jan = (
    pd.to_datetime(df_jan['tpep_dropoff_datetime'])
    - pd.to_datetime(df_jan['tpep_pickup_datetime'])
).dt.total_seconds() / 60

# Keep only trips lasting 1 to 60 minutes (inclusive on both ends)
is_valid_trip = duration_jan.between(1, 60)

# Prepare features for one-hot encoding: location IDs as strings.
# `.loc[...]` followed by `.astype(str)` yields a new frame, so there is no
# assignment onto a filtered selection of `df_jan`.
categorical = ['PULocationID', 'DOLocationID']
features = (
    df_jan.loc[is_valid_trip, categorical]
    .astype(str)
    .to_dict(orient='records')
)

# Vectorize; the target is the duration of exactly the rows kept above
dv = DictVectorizer()
X_train = dv.fit_transform(features)
y_train = duration_jan[is_valid_trip].values

# Train linear regression
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on train
y_pred = model.predict(X_train)

# Calculate RMSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
print('RMSE on train:', round(rmse_train, 2))

RMSE on train: 7.65


In [174]:
# Validation set: February trips, read fresh from disk
df_feb = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df_feb[ts_col] = pd.to_datetime(df_feb[ts_col])

# Trip length in minutes: (dropoff - pickup) in seconds, divided by 60
df_feb['duration'] = (
    df_feb['tpep_dropoff_datetime']
    .sub(df_feb['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Drop trips shorter than 1 minute or longer than 60 minutes
df_feb = df_feb[df_feb['duration'].between(1, 60)]

# Location IDs go to DictVectorizer as strings
for id_col in ('PULocationID', 'DOLocationID'):
    df_feb[id_col] = df_feb[id_col].astype(str)
features_val = df_feb[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Encode with the vectorizer already fitted in the training step
# (transform only, no refit), using the `dv` defined there
X_val = dv.transform(features_val)
y_val = df_feb['duration'].values

# Score the model from the training step on the validation data
y_pred_val = model.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(mse_val)
print('RMSE on validation:', round(rmse_val, 2))

RMSE on validation: 7.81
