<a href="https://colab.research.google.com/github/gyasifred/NLP-Techniques/blob/main/hyper_parameter_tuning_ray.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install 'ray[tune]'

Collecting ray[tune]
  Downloading ray-2.42.1-cp311-cp311-manylinux2014_x86_64.whl.metadata (18 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ray-2.42.1-cp311-cp311-manylinux2014_x86_64.whl (67.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.4/67.4 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX, ray
Successfully installed ray-2.42.1 tensorboardX-2.6.2.2


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import ray
from ray import train, tune
from ray.tune.schedulers import ASHAScheduler

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=100, num_filters=100, kernel_sizes=[3, 4, 5], dropout_rate=0.5):
        super(TextCNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, kernel_size=k) for k in kernel_sizes
        ])
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(num_filters * len(kernel_sizes), 200)
        self.fc2 = nn.Linear(200, 1)
        self.dropout = nn.Dropout(dropout_rate)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = [self.pool(conv(x)).squeeze(-1) for conv in self.convs]
        x = torch.cat(x, dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

def get_data_loaders():
    # Generate sample dataset (replace with your actual dataset)
    X = np.random.randint(0, 5000, (1000, 8000))
    y = np.random.randint(0, 2, 1000)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.long),
                                torch.tensor(y_train, dtype=torch.float))
    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.long),
                              torch.tensor(y_val, dtype=torch.float))

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32)

    return train_loader, val_loader

def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y.unsqueeze(1).float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

def evaluate(model, val_loader, device):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            outputs = model(batch_x).cpu().numpy()
            preds = (outputs > 0.5).astype(int)
            all_preds.extend(preds)
            all_labels.extend(batch_y.numpy())

    return accuracy_score(all_labels, all_preds)

def train_textcnn(config):
    # Device configuration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Get data loaders
    train_loader, val_loader = get_data_loaders()

    # Initialize model with config
    model = TextCNN(
        vocab_size=5000,
        embed_dim=config["embed_dim"],
        num_filters=config["num_filters"],
        dropout_rate=config["dropout_rate"]
    ).to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    # Training loop
    for epoch in range(5):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        val_accuracy = evaluate(model, val_loader, device)

        # Report metrics using train.report
        train.report({
            "accuracy": float(val_accuracy),
            "loss": float(train_loss)
        })

def main():
    # Parameter space definition
    param_space = {
        "embed_dim": tune.choice([50, 100, 150, 200, 250, 300]),
        "num_filters": tune.choice([50, 100, 150, 200, 250, 300]),
        "dropout_rate": tune.uniform(0.2, 0.6),
        "lr": tune.loguniform(1e-4, 1e-3)
    }

    # Initialize Ray
    ray.init(ignore_reinit_error=True)

    # Create tuner
    train_textcnn_with_resources = tune.with_resources(train_textcnn, {"gpu": 1})
    tuner = tune.Tuner(
        train_textcnn_with_resources,
        tune_config=tune.TuneConfig(
            metric="accuracy",
            mode="max",
            scheduler=ASHAScheduler(
                max_t=5,
                grace_period=3,
                reduction_factor=2
            ),
            num_samples=10
        ),
        param_space=param_space
    )

    # Run the hyperparameter search
    results = tuner.fit()

    # Get best result
    best_result = results.get_best_result(metric="accuracy", mode="max")
    print(f"Best trial config: {best_result.config}")
    print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}")

if __name__ == "__main__":
    main()

2025-02-23 03:25:44,496	INFO worker.py:1672 -- Calling ray.init() again after it has already been called.


+----------------------------------------------------------------------+
| Configuration for experiment     train_textcnn_2025-02-23_03-25-44   |
+----------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator               |
| Scheduler                        AsyncHyperBandScheduler             |
| Number of trials                 10                                  |
+----------------------------------------------------------------------+

View detailed results here: /root/ray_results/train_textcnn_2025-02-23_03-25-44
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-02-23_03-22-49_708978_2797/artifacts/2025-02-23_03-25-44/train_textcnn_2025-02-23_03-25-44/driver_artifacts`

Trial status: 10 PENDING
Current time: 2025-02-23 03:25:44. Total running time: 0s
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+---------------------------------------------

2025-02-23 03:28:43,174	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_textcnn_2025-02-23_03-25-44' in 0.0051s.



Trial train_textcnn_ddcb3_00009 completed after 5 iterations at 2025-02-23 03:28:43. Total running time: 2min 58s
+----------------------------------------------------+
| Trial train_textcnn_ddcb3_00009 result             |
+----------------------------------------------------+
| checkpoint_dir_name                                |
| time_this_iter_s                           3.79318 |
| time_total_s                               20.6413 |
| training_iteration                               5 |
| accuracy                                     0.515 |
| loss                                       0.56052 |
+----------------------------------------------------+

Trial status: 10 TERMINATED
Current time: 2025-02-23 03:28:43. Total running time: 2min 58s
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
Current best trial: ddcb3_00004 with accuracy=0.54 and params={'embed_dim': 100, 'num_filters': 300, 'dropout_rate': 0.35317795658945417, 'lr': 0.0009872003310010895}

In [6]:
import xgboost as xgb
import joblib
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from ray import tune
from ray.data import from_pandas
from ray.train import RunConfig, ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray.tune.tuner import Tuner

def get_datasets():
    # Create a dummy classification dataset.
    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=10,
        n_redundant=5,
        random_state=42
    )
    df = pd.DataFrame(X, columns=[f"f_{i}" for i in range(X.shape[1])])
    df["target"] = y

    # Split into training and validation sets.
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

    # Convert pandas DataFrames into Ray Datasets.
    train_ds = from_pandas(train_df)
    val_ds = from_pandas(val_df)
    return train_ds, val_ds

# Retrieve the dummy training and validation datasets.
train_ds, val_ds = get_datasets()

# Define the hyperparameter search space, including GPU resource allocation.
param_space = {
    # The scaling config handles the resource allocation.
    "scaling_config": ScalingConfig(
        num_workers=1,  # Use 1 worker for the task.
        resources_per_worker={
            "CPU": 1,  # Allocate 1 CPU per worker.
            "GPU": 1,  # Allocate 1 GPU per worker.
        },
        use_gpu=True,  # Ensure GPU usage.
    ),
    "params": {
        # XGBoost hyperparameters to tune.
        "objective": "binary:logistic",
        "tree_method": 'gpu_hist',  # Using GPU-accelerated tree method.
        "eval_metric": ["logloss", "error"],
        "eta": tune.loguniform(1e-4, 1e-1),  # Learning rate.
        "subsample": tune.uniform(0.5, 1.0),  # Fraction of samples to use.
        "max_depth": tune.randint(3, 10),  # Maximum depth of the tree.
        "min_child_weight": tune.randint(1, 10),  # Minimum sum of instance weight for a child.
        "gamma": tune.uniform(0, 5),  # Minimum loss reduction required.
        "colsample_bytree": tune.uniform(0.3, 1.0),  # Fraction of features to use per tree.
        "reg_alpha": tune.loguniform(1e-4, 1e-1),  # L1 regularization term.
        "reg_lambda": tune.loguniform(1e-4, 1e-1),  # L2 regularization term.
        "max_bin": tune.randint(100, 300),  # Maximum number of bins.
    },
}

# Initialize the XGBoostTrainer with both training and validation datasets.
trainer = XGBoostTrainer(
    label_column="target",  # Column name of the target variable.
    params={},  # Hyperparameters will be provided by Tune.
    datasets={"train": train_ds, "validation": val_ds},  # Train and validation datasets.
)

# Create the Tuner with a RunConfig for naming the run.
tuner = Tuner(
    trainable=trainer,
    param_space=param_space,
    run_config=RunConfig(name="xgboost_gpu_tune_run_dummy")  # Name for the run.
)

# Execute the hyperparameter search.
results = tuner.fit()

# Retrieve and print the best trial's configuration and evaluation metric.
best_result = results.get_best_result(metric="validation-logloss", mode="min")
print("Best trial config:", best_result.config)
print("Best trial final evaluation logloss:", best_result.metrics["validation-logloss"])


+---------------------------------------------------------------+
| Configuration for experiment     xgboost_gpu_tune_run_dummy   |
+---------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator        |
| Scheduler                        FIFOScheduler                |
| Number of trials                 1                            |
+---------------------------------------------------------------+

View detailed results here: /root/ray_results/xgboost_gpu_tune_run_dummy
To visualize your results with TensorBoard, run: `tensorboard --logdir /tmp/ray/session_2025-02-23_04-15-16_854610_2585/artifacts/2025-02-23_04-19-25/xgboost_gpu_tune_run_dummy/driver_artifacts`

Trial status: 1 PENDING
Current time: 2025-02-23 04:19:25. Total running time: 0s
Logical resource usage: 1.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:L4)
+-----------------------------------------------------------------------------------------------------------

[36m(XGBoostTrainer pid=5188)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=5188)[0m - (node_id=33f33a9de2320a2f663ff102120357946c3abd30f12ee79c196edcb0, ip=172.28.0.12, pid=5253) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=5253)[0m [04:19:30] Task [xgboost.ray-rank=00000000]:b62e1e66b43c7f758bcda65401000000 got rank 0
[36m(SplitCoordinator pid=5307)[0m Starting execution of Dataset. Full logs are in /tmp/ray/session_2025-02-23_04-15-16_854610_2585/logs/ray-data
[36m(SplitCoordinator pid=5307)[0m Execution plan of Dataset: InputDataBuffer[Input] -> OutputSplitter[split(1, equal=True)]


[2m[36m(pid=5307) [0mRunning 0: 0.00 row [00:00, ? row/s]

[2m[36m(pid=5307) [0m- split(1, equal=True) 1: 0.00 row [00:00, ? row/s]

[2m[36m(pid=5305) [0mRunning 0: 0.00 row [00:00, ? row/s]

[2m[36m(pid=5305) [0m- split(1, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=5253)[0m 
[36m(RayTrainWorker pid=5253)[0m     E.g. tree_method = "hist", device = "cuda"
[36m(RayTrainWorker pid=5253)[0m 
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [0]	train-logloss:0.69292	train-error:0.49375	validation-logloss:0.69335	validation-error:0.51000
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [1]	train-logloss:0.69275	train-error:0.49375	validation-logloss:0.69320	validation-error:0.51000
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [2]	train-logloss:0.69255	train-error:0.49375	validation-logloss:0.69307	validation-error:0.51000
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [3]	train-logloss:0.69240	train-error:0.49375	validation-logloss:0.69292	validation-error:0.51000
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [4]	train-logloss:0.69220	train-error:0.49375	validation-logloss:0.69275	validation-error:0.51000
[36m(XGBoostTrainer pid=5188)[0m [04:19:31] [5]	train-logloss:0.69199	train-error:0.49375	validation-logloss:0.69258	validat


Trial XGBoostTrainer_5dd44_00000 completed after 11 iterations at 2025-02-23 04:19:32. Total running time: 6s
+---------------------------------------------------------------+
| Trial XGBoostTrainer_5dd44_00000 result                       |
+---------------------------------------------------------------+
| checkpoint_dir_name                         checkpoint_000000 |
| time_this_iter_s                                      0.00223 |
| time_total_s                                          3.33527 |
| training_iteration                                         11 |
| train-error                                           0.49375 |
| train-logloss                                         0.69128 |
| validation-error                                         0.51 |
| validation-logloss                                    0.69194 |
+---------------------------------------------------------------+

Trial status: 1 TERMINATED
Current time: 2025-02-23 04:19:32. Total running time: 7s
Logical res