<img src="https://cdn.comet.ml/img/notebook_logo.png">

[Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=ray_train&utm_medium=colab) is an MLOps platform designed to help data scientists and teams build better models faster! Comet provides tooling to track, explain, manage, and monitor your models in a single place! It works with Jupyter Notebooks and scripts and, most importantly, it's 100% free to get started!

[Ray Train](https://docs.ray.io/en/latest/train/train.html) abstracts away the complexity of setting up a distributed training system.

Instrument your runs with Comet to start managing experiments, create dataset versions and track hyperparameters for faster and easier reproducibility and collaboration.

[Find more information about our integration with Ray Train](https://www.comet.ml/docs/v2/integrations/ml-frameworks/ray/)

Get a preview of what's to come. Check out a completed experiment created from this notebook [here](https://www.comet.com/examples/comet-example-ray-train-keras/99d169308c854be7ac222c995a2bfa26?experiment-tab=systemMetrics).

This example is based on the [following Ray Train Tensorflow example](https://docs.ray.io/en/latest/train/examples/tf/tensorflow_mnist_example.html).

# Install Dependencies

In [None]:
%pip install "comet_ml>=3.31.5" "ray[air]>=2.1.0" "transformers>=4.43.0" "accelerate>=0.12.0" "datasets" "sentencepiece" scipy "scikit-learn" protobuf "torch>=1.3" evaluate

# Initialize Comet

In [None]:
import comet_ml
# Imported explicitly because the notebook later accesses
# `comet_ml.integration.ray.CometTrainLoggerCallback` through this path.
import comet_ml.integration.ray

# NOTE(review): presumably sets up the Comet credentials/configuration for
# this session — confirm against the comet_ml documentation.
comet_ml.init()

# Import Dependencies

In [None]:
import os
import threading

import evaluate
import numpy as np
from datasets import load_dataset

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    enable_full_determinism,
)

import ray.train.huggingface.transformers
from ray.train import ScalingConfig, RunConfig
from ray.train.torch import TorchTrainer

In [None]:
# Models
# Identifier of the pretrained checkpoint; the same string is what
# `get_dataset` and `train_func` pass to `from_pretrained`.
PRE_TRAINED_MODEL_NAME = "google-bert/bert-base-cased"
# Seed shared by the dataset shuffles and by `TrainingArguments`.
SEED = 42

# NOTE(review): presumably seeds the random number generators and switches on
# deterministic settings so runs are reproducible — confirm against the
# transformers documentation.
enable_full_determinism(SEED)

# Prepare your dataset

In [None]:
def get_dataset():
    """Load, subsample and tokenize the ``yelp_review_full`` dataset.

    The train and test splits are each shuffled with ``SEED`` and cut down to
    their first 100 rows so the example stays fast, then tokenized with the
    tokenizer of ``PRE_TRAINED_MODEL_NAME``.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple of tokenized datasets.
    """
    # Use the module-level model name so the tokenizer always matches the
    # checkpoint configured at the top of the notebook.
    tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    def tokenize_function(examples):
        # Pad/truncate every review to the model's maximum length so that all
        # rows end up with the same number of tokens.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    dataset = load_dataset("yelp_review_full")
    dataset["train"] = dataset["train"].shuffle(seed=SEED).select(range(100))
    dataset["test"] = dataset["test"].shuffle(seed=SEED).select(range(100))

    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    small_train_dataset = tokenized_datasets["train"]
    small_eval_dataset = tokenized_datasets["test"]
    return small_train_dataset, small_eval_dataset

# Define your distributed training function

This function is sent to, and executed on, each distributed worker.

In [None]:
def train_func(config):
    """Fine-tune the sequence classifier on one Ray Train worker.

    The whole training run happens inside ``comet_worker_logger`` so the
    worker is attached to Comet. A Hugging Face ``Trainer`` is built, hooked
    up to Ray Train through ``RayTrainReportCallback`` and ``prepare_trainer``,
    and then trained.

    Args:
        config: The ``train_loop_config`` dictionary. It is handed to
            ``comet_worker_logger`` and must contain an ``"epochs"`` entry,
            which is used as ``num_train_epochs``.
    """
    from comet_ml import get_running_experiment
    from comet_ml.integration.ray import comet_worker_logger

    with comet_worker_logger(config):
        small_train_dataset, small_eval_dataset = get_dataset()

        # Model
        model = AutoModelForSequenceClassification.from_pretrained(
            PRE_TRAINED_MODEL_NAME, num_labels=5
        )

        # Evaluation Metrics
        metric = evaluate.load("accuracy")

        def compute_metrics(eval_pred):
            """Log a confusion matrix to Comet and return the accuracy."""
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)

            running_experiment = get_running_experiment()
            if running_experiment is not None:
                # Pass by keyword: `log_confusion_matrix` takes the true
                # labels first, so positional (predictions, labels) would
                # log a transposed matrix.
                running_experiment.log_confusion_matrix(
                    y_true=labels, y_predicted=predictions
                )

            return metric.compute(predictions=predictions, references=labels)

        # Hugging Face Trainer
        training_args = TrainingArguments(
            do_eval=True,
            do_train=True,
            eval_strategy="epoch",
            num_train_epochs=config["epochs"],
            output_dir="./results",
            overwrite_output_dir=True,
            per_device_eval_batch_size=4,
            per_device_train_batch_size=4,
            report_to=["comet_ml"],
            seed=SEED,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=small_train_dataset,
            eval_dataset=small_eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Report Metrics and Checkpoints to Ray Train
        callback = ray.train.huggingface.transformers.RayTrainReportCallback()
        trainer.add_callback(callback)

        # Prepare Transformers Trainer
        trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)

        # Start Training
        trainer.train()

    # End whichever experiment is still running on this worker. Guard against
    # `None` so an already-ended experiment does not raise AttributeError.
    running_experiment = get_running_experiment()
    if running_experiment is not None:
        running_experiment.end()

# Define the function that schedules the distributed job

In [None]:
def train(num_workers: int = 2, use_gpu: bool = False, epochs: int = 1):
    """Schedule the distributed fine-tuning job with Ray Train.

    Args:
        num_workers: Number of Ray Train workers to request.
        use_gpu: Whether the workers should be scheduled with GPUs.
        epochs: Number of training epochs each worker runs.

    Returns:
        The object returned by ``TorchTrainer.fit()``.
    """
    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)
    # Forward the caller's epoch count; `train_func` reads it back from
    # `config["epochs"]`.
    config = {"use_gpu": use_gpu, "epochs": epochs}

    # The same config is given to the Comet callback and to the workers.
    callback = comet_ml.integration.ray.CometTrainLoggerCallback(
        config, project_name="comet-example-ray-train-hugginface-transformers"
    )

    ray_trainer = TorchTrainer(
        train_func,
        scaling_config=scaling_config,
        train_loop_config=config,
        run_config=RunConfig(callbacks=[callback]),
    )
    return ray_trainer.fit()

# Train the model

Ray will wait indefinitely if we request more workers than the available resources allow. The code below ensures we never request more CPUs than are available locally.

In [None]:
ideal_num_workers = 2

# `os.cpu_count()` returns None when the CPU count cannot be determined;
# fall back to 1 so the subtraction below cannot raise a TypeError.
# NOTE(review): one CPU is held back, presumably for the Ray Train
# coordinator itself — confirm.
available_local_cpu_count = (os.cpu_count() or 1) - 1

# Never request more workers than spare CPUs, but always at least one.
num_workers = max(1, min(ideal_num_workers, available_local_cpu_count))

train(num_workers, use_gpu=False, epochs=5)