In [1]:
import pandas as pd

In [2]:
trip_df = pd.read_parquet('yellow_tripdata_2023-01.parquet')

#### Q1. Read the data for January. How many columns are there?

In [3]:
print("Columns in yellow_tripdata_2023-01:", trip_df.shape[1])

Columns in yellow_tripdata_2023-01: 19


#### Q2. What's the standard deviation of the trips duration in January?

In [4]:
trip_df['tpep_pickup_datetime'] = pd.to_datetime(trip_df['tpep_pickup_datetime'])
trip_df['tpep_dropoff_datetime'] = pd.to_datetime(trip_df['tpep_dropoff_datetime'])

trip_df['duration'] = (trip_df['tpep_dropoff_datetime'] - trip_df['tpep_pickup_datetime']).dt.total_seconds() / 60

january_trips = trip_df[trip_df['tpep_pickup_datetime'].dt.month == 1]

std_dev_duration_january = january_trips['duration'].std()

print("Standard deviation of trip duration in January:", std_dev_duration_january)

Standard deviation of trip duration in January: 42.58564176425905


#### Q3. What fraction of the records left after you dropped the outliers?

In [5]:
filtered_df = trip_df[(trip_df['duration'] >= 1) & (trip_df['duration'] <= 60)]

fraction_left = len(filtered_df) / len(trip_df)

print("Fraction of records left after dropping outliers:", fraction_left)

Fraction of records left after dropping outliers: 0.9812202822125979


#### Q4. What's the dimensionality of this matrix (number of columns)?

In [6]:
from sklearn.feature_extraction import DictVectorizer

trip_df['PULocationID'] = trip_df['PULocationID'].astype(str)
trip_df['DOLocationID'] = trip_df['DOLocationID'].astype(str)

location_dicts = trip_df[['PULocationID', 'DOLocationID']].to_dict(orient='records')

dv = DictVectorizer(sparse=False)
X_encoded = dv.fit_transform(location_dicts)

num_columns = X_encoded.shape[1]

print("Dimensionality of the feature matrix:", num_columns)

Dimensionality of the feature matrix: 518


#### Q5. What's the RMSE on train?

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Convert datetime columns
trip_df['tpep_pickup_datetime'] = pd.to_datetime(trip_df['tpep_pickup_datetime'])
trip_df['tpep_dropoff_datetime'] = pd.to_datetime(trip_df['tpep_dropoff_datetime'])

# Calculate duration in minutes
trip_df['duration'] = (trip_df['tpep_dropoff_datetime'] - trip_df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Filter the dataset to remove outliers
trip_df = trip_df[(trip_df['duration'] >= 1) & (trip_df['duration'] <= 60)]

# Convert categorical variables to string
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
trip_df[categorical] = trip_df[categorical].astype(str)

# Prepare data for DictVectorizer
train_dicts = trip_df[categorical + numerical].to_dict(orient='records')

# Vectorize features
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Target variable
target = 'duration'
y_train = trip_df[target].values

# Train linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on training data
y_pred = lr.predict(X_train)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print("RMSE on train:", rmse)


RMSE on train: 7.65840506917767


#### Q6. What's the RMSE on validation?

In [13]:
import pandas as pd
from sklearn.metrics import mean_squared_error
import numpy as np

df_val = pd.read_parquet('yellow_tripdata_2023-02.parquet')

df_val['tpep_pickup_datetime'] = pd.to_datetime(df_val['tpep_pickup_datetime'])
df_val['tpep_dropoff_datetime'] = pd.to_datetime(df_val['tpep_dropoff_datetime'])

df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60

df_val = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)]

df_val[categorical] = df_val[categorical].astype(str)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')

X_val = dv.transform(val_dicts)

y_val = df_val[target].values

y_val_pred = lr.predict(X_val)

rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("RMSE on validation:", rmse_val)


RMSE on validation: 7.8199774540504405
