## Import dependencies

In [1]:
import pandas as pd

import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

import warnings
warnings.filterwarnings("ignore")

## Data Handling

In [2]:
def read_and_handling(filepath):

    df = pd.read_parquet(filepath)
    print(f"Data loaded from {filepath}...")

    # Data Preprocessing
    print("Preprocessing data...\n")

    print(f"Initial shape: {df.shape}")
    print(f"Number of columns: {df.shape[1]}\n")

    # Feature Engineering
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    print(f"Standard deviation of the trips duration: {round(df.duration.std(), 2)}\n")

    categorical = ['PULocationID', 'DOLocationID']
    numerical = []

    df[categorical] = df[categorical].astype(str)

    # Anomaly Handling
    print("Handling outliers...\n")
    initial_rows = df.shape[0]
    df = df[((df.duration >= 1) & (df.duration <= 60))]
    final_rows = df.shape[0]

    print(f"Initial records: {initial_rows}")
    print(f"Final records: {final_rows}")
    print(f"Fraction of the records after outliers handling: {100-(100*(initial_rows - final_rows)/final_rows):.2f}%\n")

    print("----------------------\n")

    return df

In [3]:
my_df = [df_train:=read_and_handling('./data/yellow_tripdata_2023-01.parquet'), 
         df_val:=read_and_handling('./data/yellow_tripdata_2023-02.parquet')]

Data loaded from ./data/yellow_tripdata_2023-01.parquet...
Preprocessing data...

Initial shape: (3066766, 19)
Number of columns: 19

Standard deviation of the trips duration: 42.59

Handling outliers...

Initial records: 3066766
Final records: 3009173
Fraction of the records after outliers handling: 98.09%

----------------------

Data loaded from ./data/yellow_tripdata_2023-02.parquet...
Preprocessing data...

Initial shape: (2913955, 19)
Number of columns: 19

Standard deviation of the trips duration: 42.84

Handling outliers...

Initial records: 2913955
Final records: 2855951
Fraction of the records after outliers handling: 97.97%

----------------------



## ML Model Development

In [4]:
def train_eval_export(df_train, df_val):
    # Model Development
    print("Model Development...\n")
    
    target = 'duration'
    numerical = []
    categorical = ['PULocationID', 'DOLocationID']

    # One-hot encoding
    print("One-hot encoding...\n")
    
    dv = DictVectorizer()
    X_train = dv.fit_transform(df_train[categorical + numerical].to_dict(orient='records'))
    y_train = df_train[target].values

    print(f"Number of features: {len(dv.get_feature_names_out())}")
    print(f"Training shape: {X_train.shape}")    
    print(f"Dimensionality of this matrix (number of columns): {X_train.shape[1]}\n")    

    # Training the model
    print("Training the model...")
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    print(f"Model coefficients: {model.coef_}")
    print(f"Model intercept: {model.intercept_}\n")

    y_test = model.predict(X_train)
    train_rmse = root_mean_squared_error(y_train, y_test)

    print(f"Training RMSE: {train_rmse:.2f}\n")

    # Model Validation
    print("Model Validation...")
    
    X_val = dv.transform(df_val[categorical + numerical].to_dict(orient='records'))
    y_val = df_val[target].values
    
    print(f"Validation shape: {X_val.shape}")
    print(f"Dimensionality of this matrix (number of columns): {X_val.shape[1]}")

    y_pred = model.predict(X_val)
    val_rmse = root_mean_squared_error(y_val, y_pred)

    print(f"Validation RMSE: {val_rmse:.2f}\n")
    
    # Save the model
    with open('models/A01_lr_model.bin', 'wb') as f_out:
        pickle.dump((dv, model), f_out)

In [5]:
train_eval_export(df_train, df_val)
print("Model training and evaluation completed.\n")

Model Development...

One-hot encoding...

Number of features: 515
Training shape: (3009173, 515)
Dimensionality of this matrix (number of columns): 515

Training the model...
Model coefficients: [ 1.83367608e+01 -2.49270965e+01 -4.55936183e+00 -3.60105790e+00
 -1.54317501e+00  7.60917865e+00 -6.24910979e+00  6.43596205e+00
  1.49584124e+01  7.10627338e+00  9.54751629e+00  4.33830021e-01
 -4.83794460e+00 -4.16522548e+00  6.01004780e+00  1.54592744e+00
  3.39314157e+00  9.07356798e+00  1.12106838e+00  8.62341816e-01
  2.61295049e+00 -7.71467791e+00 -4.99971898e+00  3.77974139e+00
 -1.90957576e+01 -3.59892914e+00  1.71916365e+00  4.18671219e+00
  5.67316469e+00 -1.01388823e+00 -1.47053361e-01 -1.59136656e+01
 -7.98151201e+00  1.50643809e+01  7.74893452e+00 -1.00104511e+01
 -9.90187211e+00  6.72813930e+00 -6.34130387e+00  7.12427305e+00
 -1.80078282e+01  8.04129618e+00 -4.34062628e+00 -5.97033002e+00
 -5.34166328e+00 -4.54928187e+00 -2.19108058e+00 -2.30905408e+00
 -1.83935821e+00  2.5361