In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. `src.*`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` modules used below resolve. The membership check keeps this cell
# idempotent: re-running it does not append a duplicate entry to sys.path.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
from src.config import TRANSFORMED_DATA_DIR

# Read the tabular feature table from the transformed-data directory and show
# it as the cell output.
tabular_path = TRANSFORMED_DATA_DIR / "tabular_data.parquet"
df = pd.read_parquet(tabular_path)
df

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-29,190 Morgan,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-30,190 Morgan,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-31,190 Morgan,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-01,190 Morgan,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-02,190 Morgan,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137083,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-03-27,SYS038,0
137084,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-03-28,SYS038,0
137085,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-03-29,SYS038,0
137086,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-03-30,SYS038,0


In [4]:
from datetime import datetime

from src.data_utils import split_time_series_data

# Split the table into train/test features and targets around a fixed cutoff
# timestamp (midnight, 15 March 2024).
# NOTE(review): which side of the split the cutoff instant itself falls on is
# decided inside split_time_series_data — confirm there.
cutoff = datetime(2024, 3, 15)
X_train, y_train, X_test, y_test = split_time_series_data(
    df, cutoff_date=cutoff, target_column="target"
)

# Print the shape of each partition: train X, train y, test X, test y.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

(100096, 674)
(100096,)
(36992, 674)
(36992,)


In [5]:
# Keep only the columns whose name starts with "rides_" (the lagged ride
# counts); every other column of the feature frames is left out.
past_ride_columns = [name for name in X_train.columns if name.startswith("rides_")]
X_train_only_numeric = X_train.loc[:, past_ride_columns]
X_test_only_numeric = X_test.loc[:, past_ride_columns]

## Top 10 Feature Selection Using Feature Importance

In [6]:
import lightgbm as lgb
# Reference model: a default-configured LightGBM regressor fitted on all
# "rides_" columns. Its feature importances drive the selection further down.
model = lgb.LGBMRegressor()
model.fit(X=X_train_only_numeric, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.301916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20877
[LightGBM] [Info] Number of data points in the train set: 100096, number of used features: 672
[LightGBM] [Info] Start training from score 0.382932


In [10]:
# Pair each training column with the importance the fitted `model` assigned it.
feature_importances = pd.Series(
    data=model.feature_importances_,
    index=X_train_only_numeric.columns,
)

# Rank from most to least important and keep the names of the first ten.
ranked_importances = feature_importances.sort_values(ascending=False)
top_features = ranked_importances.head(10).index

# Training matrix restricted to the selected columns, in ranked order.
X_top = X_train_only_numeric[top_features]

In [11]:
# Display the training matrix restricted to the ten selected columns.
X_top

Unnamed: 0,rides_t-1,rides_t-336,rides_t-168,rides_t-648,rides_t-2,rides_t-672,rides_t-3,rides_t-144,rides_t-328,rides_t-671
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
100091,0,0,0,0,0,0,0,0,0,0
100092,0,0,0,0,0,0,0,0,0,0
100093,0,0,0,0,0,0,0,0,0,0
100094,0,0,0,0,0,0,0,0,0,0


In [20]:
# Fit a second default-configured LightGBM regressor, this time using only
# the ten selected columns held in X_top.
model_top = lgb.LGBMRegressor()
model_top.fit(X=X_top, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 206
[LightGBM] [Info] Number of data points in the train set: 100096, number of used features: 10
[LightGBM] [Info] Start training from score 0.382932


In [21]:
from sklearn.metrics import mean_absolute_error
# Restrict the test frame to the selected columns (same order as X_top)
# before asking the reduced model for predictions.
X_test_top = X_test_only_numeric[top_features]
predictions = model_top.predict(X_test_top)

In [22]:
# Mean absolute error of the top-feature model on the test partition,
# printed with four decimal places.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(format(test_mae, ".4f"))

0.5033


In [23]:
from src.experiments_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
# Load tracking credentials from the local .env file before configuring MLflow.
load_dotenv()

mlflow = set_mlflow_tracking()

# Log the estimator that produced `test_mae`: `model_top`, together with the
# matching ten-column test input. Passing the full-feature `model` and the
# full test frame here would store an artifact (and an inferred signature)
# that does not correspond to the logged score or to the experiment name.
log_model_to_mlflow(
    model_top,
    X_test_only_numeric[top_features],
    "LGBMRegressorCitiBike_With_Top10_Features",
    "mean_absolute_error",
    score=test_mae,
)

INFO:src.experiments_utils:MLflow tracking URI and credentials set.
2025/05/10 11:32:08 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorCitiBike_With_Top10_Features' does not exist. Creating a new experiment.
INFO:src.experiments_utils:Experiment set to: LGBMRegressorCitiBike_With_Top10_Features
INFO:src.experiments_utils:Logged mean_absolute_error: 0.5033053893392305
INFO:src.experiments_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/05/10 11:34:12 INFO mlflow.models.model: Found the following environment variables used during model inference: [HOPSWORK_API_KEY]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/10 11:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 3
Created version '3' of model 'LGBMRegressor'.
INFO:src.experiments_utils:Model logged with name: LGBMRegressor


🏃 View run powerful-gnu-889 at: https://dagshub.com/dushyanth9652narravula/dnarravu_NYC_Yellow_Cab_Taxi_Project.mlflow/#/experiments/9/runs/77b59686d25e467f9f294115fa707aaf
🧪 View experiment at: https://dagshub.com/dushyanth9652narravula/dnarravu_NYC_Yellow_Cab_Taxi_Project.mlflow/#/experiments/9


<mlflow.models.model.ModelInfo at 0x2a740265720>

## Principal Component Analysis

In [24]:
from sklearn.decomposition import PCA

# Project the "rides_" columns onto 10 principal components. The projection
# is fitted on the training rows only.
# random_state pins any randomized SVD solver that PCA may select for an
# input of this size, so re-running the notebook yields the same components
# (and therefore the same downstream score).
pca = PCA(n_components=10, random_state=42)
X_pca = pca.fit_transform(X_train_only_numeric)

# Train model on PCA features
model_pca = lgb.LGBMRegressor()
model_pca.fit(X_pca, y_train)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009362 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 100096, number of used features: 10
[LightGBM] [Info] Start training from score 0.382932


In [25]:
from sklearn.metrics import mean_absolute_error
# Apply the already-fitted projection to the test rows (transform only, no
# re-fitting), then predict with the PCA-based model.
X_test_pca = pca.transform(X_test_only_numeric)
predictions = model_pca.predict(X_test_pca)

In [26]:
# Mean absolute error of the PCA-based model on the test partition,
# printed with four decimal places.
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(format(test_mae, ".4f"))

0.5251


In [27]:
from src.experiments_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
import os
# Load tracking credentials from the local .env file before configuring MLflow.
load_dotenv()

mlflow = set_mlflow_tracking()

# Log the estimator that produced `test_mae`: `model_pca`, together with
# PCA-transformed test rows as its input. Passing the full-feature `model`
# and the raw lag columns here would store an artifact that does not
# correspond to the logged score or to the experiment name.
# NOTE(review): the fitted `pca` transformer itself is not logged by this
# call, so consumers of the stored model must reproduce the projection —
# confirm whether it should be persisted alongside the model.
log_model_to_mlflow(
    model_pca,
    pca.transform(X_test_only_numeric),
    "LGBMRegressorCitiBike_With_PCA10_Features",
    "mean_absolute_error",
    score=test_mae,
)

INFO:src.experiments_utils:MLflow tracking URI and credentials set.
2025/05/10 11:37:23 INFO mlflow.tracking.fluent: Experiment with name 'LGBMRegressorCitiBike_With_PCA10_Features' does not exist. Creating a new experiment.
INFO:src.experiments_utils:Experiment set to: LGBMRegressorCitiBike_With_PCA10_Features
INFO:src.experiments_utils:Logged mean_absolute_error: 0.5250519678666451
INFO:src.experiments_utils:Model signature inferred.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'LGBMRegressor' already exists. Creating a new version of this model...
2025/05/10 11:41:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMRegressor, version 4
Created version '4' of model 'LGBMRegressor'.
INFO:src.experiments_utils:Model logged with name: LGBMRegressor


🏃 View run lyrical-ray-673 at: https://dagshub.com/dushyanth9652narravula/dnarravu_NYC_Yellow_Cab_Taxi_Project.mlflow/#/experiments/10/runs/9e8a9eafd1df4243a665d95fde77e63b
🧪 View experiment at: https://dagshub.com/dushyanth9652narravula/dnarravu_NYC_Yellow_Cab_Taxi_Project.mlflow/#/experiments/10


<mlflow.models.model.ModelInfo at 0x2a6c98f97b0>