In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## Q1. Downloading the data

In [2]:
def read_dataframe(filename, data_dir='data'):
    """Load one trip-record parquet file and parse its timestamp columns.

    Parameters
    ----------
    filename : str
        Name of the parquet file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the parquet files. Defaults to ``'data'``, so
        existing one-argument calls read from the same place as before.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` passed through ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If either timestamp column is missing from the file.
    """
    df = pd.read_parquet(f'{data_dir}/{filename}')

    # errors='coerce' converts unparseable values to NaT instead of raising.
    for column in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
        df[column] = pd.to_datetime(df[column], errors='coerce')
    return df

In [3]:
# January is used below to fit the model; February is used to evaluate it.
jan_record = read_dataframe('yellow_tripdata_2023-01.parquet')
feb_record = read_dataframe('yellow_tripdata_2023-02.parquet')

In [4]:
# Preview the first three January rows.
jan_record.head(3)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0


In [5]:
# (rows, columns) of the January frame as loaded.
jan_record.shape

(3066766, 19)

In [6]:
# (rows, columns) of the February frame as loaded.
feb_record.shape

(2913955, 19)

In [7]:
# Q1 answer: number of columns in the January frame.
print(f'The number of columns are : {jan_record.shape[1]}')

The number of columns are : 19


## Q2. Computing duration

In [8]:
def compute_duration(df):
    """Add a ``duration`` column holding each trip's length in minutes.

    The column is written onto ``df`` itself, so callers do not need to
    reassign the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        The first two rows of ``df`` (a preview only, not the full frame).
    """
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    # Vectorised conversion via the .dt accessor; avoids calling a Python
    # lambda once per row and always yields a float column (NaT -> NaN).
    df['duration'] = elapsed.dt.total_seconds() / 60
    return df.head(2)

In [9]:
# Both calls add 'duration' to their frame in place. Only the last expression
# is rendered, so the table shown below is the February preview.
compute_duration(jan_record)
compute_duration(feb_record)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0,1.683333
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0,0.233333


In [10]:
# Q2 answer. np.std uses ddof=0 by default, i.e. the population standard deviation.
duration_std = np.std(jan_record['duration'])
print(f'The standard deviation of the trips duration in January is: {duration_std:.2f}')

The standard deviation of the trips duration in January is: 42.59


## Q3. Dropping outliers

In [11]:
def drop_outliers(df):
    """Keep only trips whose duration is between 1 and 60 minutes, inclusive.

    Prints the percentage of input rows that survive the filter and returns
    the surviving rows as an independent copy, so later column assignments on
    the result do not act on a slice of the input frame.

    Rows with a missing duration fail the range check and are dropped; they
    count towards the total when the percentage is computed. For an empty
    input the percentage is undefined and is reported as ``nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``duration`` column (minutes).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``1 <= duration <= 60``.
    """
    # between() is inclusive on both ends by default; NaN compares as False.
    in_range = df['duration'].between(1, 60)
    kept = df[in_range].copy()

    total = df.shape[0]
    # Guard the empty frame: the original division raised ZeroDivisionError.
    fraction_left = kept.shape[0] / total * 100 if total else float('nan')
    print(f'The fraction left is {fraction_left:.0f}%')
    return kept

In [12]:
# Rebind both names to the filtered frames; the unfiltered frames are no
# longer reachable after this cell.
jan_record = drop_outliers(jan_record)
feb_record = drop_outliers(feb_record)

The fraction left is 98%
The fraction left is 98%


In [13]:
# January shape after filtering (includes the added 'duration' column).
jan_record.shape

(3009173, 20)

In [14]:
# February shape after filtering (includes the added 'duration' column).
feb_record.shape

(2855951, 20)

## Q4. One-hot encoding

In [15]:
# Cast the location IDs to strings so DictVectorizer one-hot encodes them
# (one column per distinct value) instead of treating them as numeric features.
cols_to_encode = ['PULocationID', 'DOLocationID']
jan_record[cols_to_encode] = jan_record[cols_to_encode].astype(str)
feb_record[cols_to_encode] = feb_record[cols_to_encode].astype(str)

In [16]:
# Fit the vectorizer on January only; its vocabulary defines the feature columns.
dv = DictVectorizer(sparse=True)

# One {column: value} dict per trip.
train_dicts = jan_record[cols_to_encode].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [17]:
# Q4 answer: number of one-hot feature columns learned from January.
print("Feature matrix dimensionality(number of cols):", X_train.shape[1])

Feature matrix dimensionality(number of cols): 515


## Q5. Training a model

In [18]:
# Regression target: trip duration in minutes for the filtered January trips.
y_train = jan_record['duration']

In [19]:
# Ordinary least-squares fit on the one-hot pickup/drop-off features.
model = LinearRegression()
model.fit(X_train,y_train)

In [20]:
# Predictions on the training matrix itself (used for the training RMSE below).
y_pred = model.predict(X_train)

In [21]:
# Q5 answer: training error. RMSE is in the same unit as the target (minutes).
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)

print(f'Mean Squared Error: {mse:.2f}')
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

Mean Squared Error: 58.51
Root Mean Squared Error (RMSE): 7.65


In [22]:
# R² computed on the training data (same rows the model was fitted on).
r2 = r2_score(y_train, y_pred)
print(f'R² Score: {r2:.2f}')

R² Score: 0.41


## Q6. Evaluating the model

In [23]:
# validation_dicts = feb_record[cols_to_encode].to_dict(orient='records')

In [24]:
# X_val = dv.fit_transform(val_dicts)
# print("Feature matrix dimensionality(number of cols):", X_val.shape[1])
# The feature matrix had 5114 columns, so I added a dummy feature to match January's.

In [25]:
# NOTE(review): DictVectorizer.transform silently ignores feature names it did
# not see during fit, so this constant column should have no effect on X_val.
# The column count matches January because `dv.transform` (not `fit_transform`)
# is used below — confirm and consider removing this column.
feb_record['dummy_feature'] = 1

In [26]:
# Encode February with the vectorizer that was fitted on January. `transform`
# (unlike `fit_transform`) reuses January's vocabulary, so X_val has exactly the
# same columns as X_train; feature names not seen during fit are silently
# ignored. No padding column is needed in the records, only the location columns.
validation_dicts = feb_record[cols_to_encode].to_dict(orient='records')
X_val = dv.transform(validation_dicts)
print("Feature matrix dimensionality(number of cols):", X_val.shape[1])

Feature matrix dimensionality(number of cols): 515


In [27]:
# Validation target: trip duration in minutes for the filtered February trips.
y_val = feb_record['duration']

In [28]:
# Apply the January-trained model to the February features.
val_predictions = model.predict(X_val)

In [30]:
# Q6 answer: RMSE (minutes) of the January-trained model on February trips.
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

print(f"RMSE on validation data: { val_rmse:.2f}")

RMSE on validation data: 7.81
