## Import dependencies

In [1]:
import pandas as pd

import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

import mlflow

import warnings
warnings.filterwarnings("ignore")

## Data Handling

In [2]:
def read_and_handling(filepath):
    df = pd.read_parquet(filepath)
    print(f"Data loaded from {filepath}...")

    # Data Preprocessing
    print("Preprocessing data...\n")
    print(f"Initial shape: {df.shape}")
    print(f"Number of columns: {df.shape[1]}\n")

    # mlflow.log_param("initial_shape", df.shape)
    # mlflow.log_param("initial_columns", df.shape[1])


    # Feature Engineering
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    # mlflow.log_param("tpep_dropoff_datetime", df.tpep_dropoff_datetime)
    # mlflow.log_param("tpep_pickup_datetime", df.tpep_pickup_datetime)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    # mlflow.log_param("duration", df.duration)

    print(f"Standard deviation of the trips duration: {round(df.duration.std(), 2)}\n")
    
    # mlflow.log_param("std_duration", df.duration.std())

    categorical = ['PULocationID', 'DOLocationID']
    numerical = []

    df[categorical] = df[categorical].astype(str)


    # Anomaly Handling
    print("Handling outliers...\n")
    initial_rows = df.shape[0]
    df = df[((df.duration >= 1) & (df.duration <= 60))]
    final_rows = df.shape[0]
    # mlflow.log_param("initial_rows", initial_rows)
    # mlflow.log_param("final_rows", final_rows)
    # mlflow.log_param("fraction_of_records_after_outliers_handling", 100-(100*(initial_rows - final_rows)/final_rows))

    print(f"Initial records: {initial_rows}")
    print(f"Final records: {final_rows}")
    print(f"Fraction of the records after outliers handling: {100-(100*(initial_rows - final_rows)/final_rows):.2f}%\n")

    print("----------------------\n")

    return df

## ML Model Development

In [3]:
def train_eval_export(df_train, df_val):
    # Model Development
    print("Model Development...\n")
    
    target = 'duration'
    numerical = []
    categorical = ['PULocationID', 'DOLocationID']
    mlflow.log_param("target", target)
    mlflow.log_param("numerical_columns", numerical)
    mlflow.log_param("categorical_columns", categorical)

    # One-hot encoding
    print("One-hot encoding...\n")
    
    dv = DictVectorizer()
    X_train = dv.fit_transform(df_train[categorical + numerical].to_dict(orient='records'))
    y_train = df_train[target].values
    mlflow.log_param("X_train_shape", X_train.shape)
    mlflow.log_param("y_train_shape", y_train.shape)
    mlflow.log_param("X_train_columns", X_train.shape[1])

    print(f"Number of features: {len(dv.get_feature_names_out())}")
    print(f"Training shape: {X_train.shape}")    
    print(f"Dimensionality of this matrix (number of columns): {X_train.shape[1]}\n")    

    # Training the model
    print("Training the model...")
    
    model = LinearRegression()
    mlflow.log_param("model_type", type(model).__name__)
    mlflow.log_param("model_name", model.__class__.__name__)
    mlflow.log_param("model_params", model.get_params())

    model.fit(X_train, y_train)
    mlflow.log_param("model_coefficients", model.coef_)
    mlflow.log_param("model_intercept", model.intercept_)
    
    # print(f"Model coefficients: {model.coef_}")
    # print(f"Model intercept: {model.intercept_}\n")

    y_test = model.predict(X_train)
    train_rmse = root_mean_squared_error(y_train, y_test)
    mlflow.log_metric("train_rmse", train_rmse)

    print(f"Training RMSE: {train_rmse:.2f}\n")

    # Model Validation
    print("Model Validation...")
    
    X_val = dv.transform(df_val[categorical + numerical].to_dict(orient='records'))
    y_val = df_val[target].values
    mlflow.log_param("X_val_shape", X_val.shape)
    mlflow.log_param("y_val_shape", y_val.shape)
    mlflow.log_param("X_val_columns", X_val.shape[1])
    
    print(f"Validation shape: {X_val.shape}")
    print(f"Dimensionality of this matrix (number of columns): {X_val.shape[1]}")

    y_pred = model.predict(X_val)
    val_rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("val_rmse", val_rmse)

    print(f"Validation RMSE: {val_rmse:.2f}\n")
    
    # Save the model
    with open('models/A01_lr_model.bin', 'wb') as f_out:
        mlflow.log_artifact('models/A01_lr_model.bin')
        pickle.dump((dv, model), f_out)

In [4]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("TrialExperimentNYC_Taxi")

with mlflow.start_run():
    mlflow.set_tag("developer", "sutiwas-jitsopak")

    mlflow.log_param("train-data-path", "../01-intro/data/yellow_tripdata_2023-01.parquet")
    mlflow.log_param("valid-data-path", "../01-intro/data/yellow_tripdata_2023-02.parquet")

    df_train = read_and_handling("../01-intro/data/yellow_tripdata_2023-01.parquet")
    df_val = read_and_handling("../01-intro/data/yellow_tripdata_2023-02.parquet")

    my_df = [df_train, df_val]
    
    mlflow.log_param("list_of_train_and_validation_pair", my_df)

    # Train/Validation Datasets
    mlflow.log_param("train_shape", df_train.shape)
    mlflow.log_param("valid_shape", df_val.shape)

    train_eval_export(df_train, df_val)
    print("Model training and evaluation completed.\n")

Data loaded from ../01-intro/data/yellow_tripdata_2023-01.parquet...
Preprocessing data...

Initial shape: (3066766, 19)
Number of columns: 19

Standard deviation of the trips duration: 42.59

Handling outliers...

Initial records: 3066766
Final records: 3009173
Fraction of the records after outliers handling: 98.09%

----------------------

Data loaded from ../01-intro/data/yellow_tripdata_2023-02.parquet...
Preprocessing data...

Initial shape: (2913955, 19)
Number of columns: 19

Standard deviation of the trips duration: 42.84



0               2 ...' (8420 characters) is truncated to 6000 characters to meet the length limit.


Handling outliers...

Initial records: 2913955
Final records: 2855951
Fraction of the records after outliers handling: 97.97%

----------------------

Model Development...

One-hot encoding...

Number of features: 515
Training shape: (3009173, 515)
Dimensionality of this matrix (number of columns): 515

Training the model...


 -1.53939821e+00  7.61572866e+00 -6...' (8369 characters) is truncated to 6000 characters to meet the length limit.


Training RMSE: 7.65

Model Validation...
Validation shape: (2855951, 515)
Dimensionality of this matrix (number of columns): 515
Validation RMSE: 7.81

Model training and evaluation completed.

