# Ray AI Runtime Quick Start
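To use Ray’s AI Runtime, install Ray with the optional `air` extra: `pip install -U "ray[air]"`. The XGBoost example below additionally needs the `xgboost` and `xgboost_ray` packages (`pip install xgboost xgboost_ray`); without them the training cell fails with `ModuleNotFoundError: No module named 'xgboost'`.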

In [1]:
# Show which interpreter this kernel is running. `sys.executable` reports the
# kernel's own Python, needs no external tool such as pyenv, and works on any
# platform, whereas a shell lookup only reflects the shell's PATH.
import sys

sys.executable

/Users/caihaocui/.pyenv/versions/ray-example/bin/python


Load data into a Dataset. 

In [4]:
import ray

# Read the breast-cancer CSV from S3 (anonymous access) into a Ray Dataset.
dataset = ray.data.read_csv(
    "s3://anonymous@air-example-data/breast_cancer.csv"
)

# Hold out 30% of the rows for validation; the remainder is the training set.
train_dataset, valid_dataset = dataset.train_test_split(
    test_size=0.3,
)

# The test set is the validation data without its "target" label column.
test_dataset = valid_dataset.drop_columns(
    cols=["target"],
)

2023-03-05 12:56:48,927 INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265


[dataset]: Run `pip install tqdm` to enable progress reporting.


In [2]:
from ray.data.preprocessors import StandardScaler

# Standard-scale only the listed feature columns. The preprocessor is handed
# to the trainer later in the notebook.
preprocessor = StandardScaler(
    columns=[
        "mean radius",
        "mean texture",
    ],
)

Train a model with an XGBoostTrainer.

In [3]:
!pip list

Package                  Version
------------------------ ---------
aiohttp                  3.8.4
aiohttp-cors             0.7.0
aiorwlock                1.3.0
aiosignal                1.3.1
anyio                    3.6.2
appnope                  0.1.3
asttokens                2.2.1
async-timeout            4.0.2
attrs                    22.2.0
backcall                 0.2.0
blessed                  1.20.0
cachetools               5.3.0
certifi                  2022.12.7
charset-normalizer       3.0.1
click                    8.1.3
colorful                 0.5.5
comm                     0.1.2
debugpy                  1.6.6
decorator                5.1.1
distlib                  0.3.6
executing                1.2.0
fastapi                  0.92.0
filelock                 3.9.0
frozenlist               1.3.3
fsspec                   2023.3.0
google-api-core          2.11.0
google-auth              2.16.2
googleapis-common-protos 1.58.0
gpustat                  1.0.0
grpcio              

In [1]:
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer

# How training is distributed across the cluster.
scaling_config = ScalingConfig(
    num_workers=2,  # data-parallel workers
    use_gpu=False,  # set to True for GPU acceleration
    _max_cpu_fraction_per_node=0.9,  # leave some CPUs free for Ray Data operations
)

# Parameters forwarded to XGBoost itself.
# To train on GPUs, also add "tree_method": "gpu_hist" to this dict.
xgboost_params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
    scaling_config=scaling_config,
    label_column="target",
    num_boost_round=20,
    params=xgboost_params,
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
)

# Run training and print the metrics reported for the run.
result = trainer.fit()
print(result.metrics)

ModuleNotFoundError: No module named 'xgboost'

Configure the parameters for tuning:



In [None]:
from ray import tune

# Search space: the tree depth is sampled with tune.randint(1, 9)
# (upper bound exclusive per the Tune docs, i.e. depths 1 through 8).
param_space = {"params": {"max_depth": tune.randint(1, 9)}}

# Select hyperparameters on the held-out "valid" dataset rather than on the
# training data: training loss keeps improving as trees get deeper, so
# minimizing "train-logloss" would simply reward the largest max_depth.
metric = "valid-logloss"

In [None]:
from ray.tune.tuner import Tuner, TuneConfig

# Run 5 trials and rank them by `metric`, where lower is better.
tune_config = TuneConfig(
    num_samples=5,
    metric=metric,
    mode="min",
)

tuner = Tuner(
    trainer,
    param_space=param_space,
    tune_config=tune_config,
)

# Execute the trials, then pick the best one according to the tune config.
result_grid = tuner.fit()
best_result = result_grid.get_best_result()
print("Best result:", best_result)

Use the trained model for batch prediction with a BatchPredictor.



In [None]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

# Take the checkpoint of the best tuning trial. A checkpoint can also be
# built from an already-trained model with `XGBoostCheckpoint.from_model`.
checkpoint = best_result.checkpoint

# Build a batch predictor around that checkpoint.
batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint,
    XGBoostPredictor,
)

# Predict on the label-free test dataset and display the predictions.
predicted_probabilities = batch_predictor.predict(test_dataset)
predicted_probabilities.show()