In [3]:
## Install Packages
%pip install numpy pandas seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
## Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
## Download Yellow Taxi Trips Files
! curl -o ./data/jan_yellow.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
! curl -o ./data/feb_yellow.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 45.4M  100 45.4M    0     0   136M      0 --:--:-- --:--:-- --:--:--  136M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 45.5M  100 45.5M    0     0   163M      0 --:--:-- --:--:-- --:--:--  163M


In [4]:
# Read the January yellow-taxi trips downloaded above.
jan_path = "./data/jan_yellow.parquet"
jan_df = pd.read_parquet(jan_path)

# Preview the first rows transposed, so each column is shown as a row.
jan_df.head().T

Unnamed: 0,0,1,2,3,4
VendorID,2,2,2,1,2
tpep_pickup_datetime,2023-01-01 00:32:10,2023-01-01 00:55:08,2023-01-01 00:25:04,2023-01-01 00:03:48,2023-01-01 00:10:29
tpep_dropoff_datetime,2023-01-01 00:40:36,2023-01-01 01:01:27,2023-01-01 00:37:49,2023-01-01 00:13:25,2023-01-01 00:21:19
passenger_count,1.0,1.0,1.0,0.0,1.0
trip_distance,0.97,1.1,2.51,1.9,1.43
RatecodeID,1.0,1.0,1.0,1.0,1.0
store_and_fwd_flag,N,N,N,N,N
PULocationID,161,43,48,138,107
DOLocationID,141,237,238,7,79
payment_type,2,1,1,1,1


In [5]:
## Cleaning/Wrangling
# Coerce both timestamp columns to datetime, then derive the trip length in minutes.
timestamp_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
jan_df[timestamp_cols] = jan_df[timestamp_cols].apply(pd.to_datetime)

trip_length = jan_df["tpep_dropoff_datetime"] - jan_df["tpep_pickup_datetime"]
jan_df["duration"] = trip_length.dt.total_seconds() / 60

jan_df[timestamp_cols + ["duration"]].head().T

Unnamed: 0,0,1,2,3,4
tpep_pickup_datetime,2023-01-01 00:32:10,2023-01-01 00:55:08,2023-01-01 00:25:04,2023-01-01 00:03:48,2023-01-01 00:10:29
tpep_dropoff_datetime,2023-01-01 00:40:36,2023-01-01 01:01:27,2023-01-01 00:37:49,2023-01-01 00:13:25,2023-01-01 00:21:19
duration,8.433333,6.316667,12.75,9.616667,10.833333


Standard deviation
Formula:
$$SD = \sqrt{\frac{\sum (x - \mu)^2}{N}}$$

Note: this is the population form (divide by $N$). The `Series.std()` call used in the EDA cell below defaults to the sample form (`ddof=1`, divide by $N - 1$); with roughly three million rows the two values are practically identical.

In [6]:
# Keep only the trips that lasted between 1 and 60 minutes (both bounds inclusive).
duration_in_range = jan_df['duration'].between(1, 60)
filtered_duration = jan_df[duration_in_range]

# Share of the original rows that survive the duration filter.
clean_prop = len(filtered_duration) / len(jan_df)

In [7]:
## Exploratory Data Analysis (EDA)
n_rows, n_cols = jan_df.shape
print(f"1, Data Dimension: {n_rows} rows | {n_cols} columns \n")

# Series.std() is the sample standard deviation (ddof=1, i.e. it divides by N - 1).
print(f"2, Duration Standard Deviation: {jan_df['duration'].std()} \n")

# clean_prop is the share of trips KEPT by the 1-60 minute duration filter, not the
# share flagged as outliers, so the label states that explicitly.
print(f"3, Proportion of Records Kept After Removing Outliers: {clean_prop} \n")

1, Data Dimension: 3066766 rows | 20 columns 

2, Duration Standard Deviation: 42.594351241920904 

3, Outlier Proportion: 0.9812202822125979 



In [8]:
## Apply One-Hot Encoding
# Modelling frame: the two location IDs as strings plus the numeric target.
# String-valued IDs are what DictVectorizer one-hot encodes; numeric values
# would be passed through as plain numbers instead.
ml_df = filtered_duration[['PULocationID', 'DOLocationID', 'duration']].astype(
    {'PULocationID': str, 'DOLocationID': str}
)
ml_df.head()

Unnamed: 0,PULocationID,DOLocationID,duration
0,161,141,8.433333
1,43,237,6.316667
2,48,238,12.75
3,138,7,9.616667
4,107,79,10.833333


In [9]:
## Dictionaries
# One {column: value} record per trip, holding only the two location features.
location_features = ml_df[['PULocationID', 'DOLocationID']]
dicts_train = location_features.to_dict(orient='records')

# Peek at a few records.
dicts_train[1:5]

[{'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'}]

In [10]:
# Learn the one-hot feature space from the training records and encode them
# as a sparse matrix in a single pass.
vec = DictVectorizer(sparse=True)
feature_matrix = vec.fit_transform(dicts_train)

# Rows = trips, columns = one-hot encoded features.
print(f"4, Dimension of feature_matrix: {feature_matrix.shape} \n")

4, Dimension of feature_matrix: (3009173, 515) 



In [11]:
## Linear Regression Model
# LinearRegression and mean_squared_error come from the notebook's import cell,
# so the notebook's dependencies are all declared in one place.

# Target: trip duration in minutes.
y = ml_df['duration']

# Fit an ordinary least-squares model on the one-hot encoded pickup/drop-off locations.
model = LinearRegression()
model.fit(feature_matrix, y)

# RMSE measured on the same data the model was trained on (training error).
y_pred = model.predict(feature_matrix)
rmse = np.sqrt(mean_squared_error(y, y_pred))

print(f"5, RMSE: {rmse}")

5, RMSE: 7.649261027919939


In [13]:
# Load the February trips and apply the same duration wrangling used for January.
val_df = pd.read_parquet("./data/feb_yellow.parquet")

timestamp_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]
val_df[timestamp_cols] = val_df[timestamp_cols].apply(pd.to_datetime)

elapsed = val_df["tpep_dropoff_datetime"] - val_df["tpep_pickup_datetime"]
val_df["duration"] = elapsed.dt.total_seconds() / 60

# Keep trips lasting 1-60 minutes (both bounds inclusive).
val_df = val_df[val_df["duration"].between(1, 60)]

val_df.shape

(2855951, 20)

In [15]:
def rmse_validation(df_pth: str, vectorizer=None, regressor=None) -> float:
    """Compute the RMSE of a fitted duration model on a trips parquet file.

    The file is read, the trip duration in minutes is derived from the pickup
    and drop-off timestamps, and only trips lasting 1-60 minutes (inclusive)
    are scored. Pickup and drop-off location IDs are cast to strings and
    encoded with the already-fitted vectorizer.

    Parameters
    ----------
    df_pth : str
        Path to a parquet file containing the `tpep_pickup_datetime`,
        `tpep_dropoff_datetime`, `PULocationID` and `DOLocationID` columns.
    vectorizer : optional
        Fitted vectorizer exposing `transform`. When omitted, the
        notebook-level `vec` is used.
    regressor : optional
        Fitted model exposing `predict`. When omitted, the notebook-level
        `model` is used.

    Returns
    -------
    float
        Root mean squared error between the actual and predicted durations.
    """
    # Fall back to the objects fitted earlier in the notebook so existing
    # single-argument calls keep working.
    if vectorizer is None:
        vectorizer = vec
    if regressor is None:
        regressor = model

    timestamp_cols = ["tpep_pickup_datetime", "tpep_dropoff_datetime"]

    trips = pd.read_parquet(df_pth)
    trips[timestamp_cols] = trips[timestamp_cols].apply(pd.to_datetime)
    trips["duration"] = (trips["tpep_dropoff_datetime"] - trips["tpep_pickup_datetime"]).dt.total_seconds() / 60
    trips = trips[trips["duration"].between(1, 60)]

    # Build the string-typed features in a new frame rather than assigning columns
    # back into the filtered slice, which can raise SettingWithCopyWarning.
    location_features = trips[['PULocationID', 'DOLocationID']].astype(str)
    dicts_val = location_features.to_dict(orient='records')

    # Use `transform`, not `fit_transform`: the validation data must be encoded in
    # the feature space that was learned from the training data.
    feature_matrix_val = vectorizer.transform(dicts_val)
    print(f"Dimension of feature_matrix: {feature_matrix_val.shape} \n")

    y_val = trips['duration']
    y_pred = regressor.predict(feature_matrix_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    return rmse

# Score the model on the February trips and report the validation error.
result_feb_df = rmse_validation(df_pth="./data/feb_yellow.parquet")
print(f"6, Validation_RMSE: {result_feb_df}")

Dimension of feature_matrix: (2855951, 515) 

6, Validation_RMSE: 7.811832638273232
