In [1]:
import ray
from ray.data.preprocessors import MinMaxScaler
from ray.train.xgboost import XGBoostTrainer

In [11]:
# Toy regression data on the line y = 2x. The train and validation splits use
# interleaved x values (0, 3, 6, ... and 1, 4, 7, ...), so they never overlap.
train_items = [dict(x=value, y=2 * value) for value in range(0, 32, 3)]
valid_items = [dict(x=value, y=2 * value) for value in range(1, 32, 3)]

### Create Ray Dataset

In [12]:
# Wrap each list of row dicts in a Ray Dataset, then display both datasets.
train_dataset, valid_dataset = map(ray.data.from_items, (train_items, valid_items))
(train_dataset, valid_dataset)

(Dataset(num_blocks=11, num_rows=11, schema={x: int64, y: int64}),
 Dataset(num_blocks=11, num_rows=11, schema={x: int64, y: int64}))

In [14]:
# Peek at the first two rows to sanity-check the dataset contents.
train_dataset.take(2)

[ArrowRow({'x': 0, 'y': 0}), ArrowRow({'x': 3, 'y': 6})]

In [13]:
# Create the preprocessor: a MinMaxScaler configured for the "x" feature column.
preprocessor = MinMaxScaler(["x"])

### Use the XGBoostTrainer 

In [15]:
# Configure an XGBoost regression run: two workers, "y" as the label column,
# and the preprocessor attached so it travels with the trainer.
trainer = XGBoostTrainer(
    datasets={"train": train_dataset, "valid": valid_dataset},
    label_column="y",
    params={"objective": "reg:squarederror"},
    preprocessor=preprocessor,
    scaling_config={"num_workers": 2},
)
# Run training; `result` carries the final metrics and the checkpoint.
result = trainer.fit()

== Status ==
Current time: 2022-07-13 09:04:42 (running for 00:00:01.19)
Memory usage on this node: 23.6/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 3.0/10 CPUs, 0/0 GPUs, 0.0/35.53 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jules/ray_results/XGBoostTrainer_2022-07-13_09-04-41
Number of trials: 1/1 (1 RUNNING)
+----------------------------+----------+-----------------+
| Trial name                 | status   | loc             |
|----------------------------+----------+-----------------|
| XGBoostTrainer_811d0_00000 | RUNNING  | 127.0.0.1:13557 |
+----------------------------+----------+-----------------+




[2m[36m(XGBoostTrainer pid=13557)[0m 2022-07-13 09:04:43,783	INFO main.py:980 -- [RayXGBoost] Created 2 new actors (2 total actors). Waiting until actors are ready for training.
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m   File "/Users/jules/git-repos/ray/python/ray/_private/workers/default_worker.py", line 237, in <module>
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m     ray._private.worker.global_worker.main_loop()
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m   File "/Users/jules/git-repos/ray/python/ray/_private/worker.py", line 754, in main_loop
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m     self.core_worker.run_task_loop()
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m   File "/Users/jules/git-repos/ray/python/ray/_private/function_manager.py", line 674, in actor_method_executor
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m     return method(__ray_actor, *args, **kwargs)
[2m[36m(_RemoteRayXGBoostActor pid=13589)[0m   File "/Users/jules/git-repos/ray/python

Result for XGBoostTrainer_811d0_00000:
  date: 2022-07-13_09-04-46
  done: false
  experiment_id: a70fccfdec104e52ab05a369a4a008fa
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 1
  node_ip: 127.0.0.1
  pid: 13557
  should_checkpoint: true
  time_since_restore: 3.626943826675415
  time_this_iter_s: 3.626943826675415
  time_total_s: 3.626943826675415
  timestamp: 1657728286
  timesteps_since_restore: 0
  train-rmse: 26.757627185845564
  training_iteration: 1
  trial_id: 811d0_00000
  valid-rmse: 28.41558771704671
  warmup_time: 0.0032470226287841797
  


[2m[36m(XGBoostTrainer pid=13557)[0m 2022-07-13 09:04:46,562	INFO main.py:1516 -- [RayXGBoost] Finished XGBoost training on training data with total N=11 in 2.79 seconds (1.66 pure XGBoost training time).
2022-07-13 09:04:47,516	INFO tune.py:737 -- Total run time: 5.89 seconds (5.76 seconds for the tuning loop).


Result for XGBoostTrainer_811d0_00000:
  date: 2022-07-13_09-04-46
  done: true
  experiment_id: a70fccfdec104e52ab05a369a4a008fa
  experiment_tag: '0'
  hostname: Juless-MacBook-Pro-16
  iterations_since_restore: 10
  node_ip: 127.0.0.1
  pid: 13557
  should_checkpoint: true
  time_since_restore: 3.717350721359253
  time_this_iter_s: 0.008311986923217773
  time_total_s: 3.717350721359253
  timestamp: 1657728286
  timesteps_since_restore: 0
  train-rmse: 3.3435785038526604
  training_iteration: 10
  trial_id: 811d0_00000
  valid-rmse: 4.805372691443562
  warmup_time: 0.0032470226287841797
  
== Status ==
Current time: 2022-07-13 09:04:47 (running for 00:00:05.76)
Memory usage on this node: 23.7/64.0 GiB
Using FIFO scheduling algorithm.
Resources requested: 0/10 CPUs, 0/0 GPUs, 0.0/35.53 GiB heap, 0.0/2.0 GiB objects
Result logdir: /Users/jules/ray_results/XGBoostTrainer_2022-07-13_09-04-41
Number of trials: 1/1 (1 TERMINATED)
+----------------------------+------------+-----------------

In [17]:
# Display the training result (final metrics, checkpoint, error, log directory).
result

Result(metrics={'train-rmse': 3.3435785038526604, 'valid-rmse': 4.805372691443562, 'time_this_iter_s': 0.008311986923217773, 'should_checkpoint': True, 'done': True, 'timesteps_total': None, 'episodes_total': None, 'training_iteration': 10, 'trial_id': '811d0_00000', 'experiment_id': 'a70fccfdec104e52ab05a369a4a008fa', 'date': '2022-07-13_09-04-46', 'timestamp': 1657728286, 'time_total_s': 3.717350721359253, 'pid': 13557, 'hostname': 'Juless-MacBook-Pro-16', 'node_ip': '127.0.0.1', 'config': {}, 'time_since_restore': 3.717350721359253, 'timesteps_since_restore': 0, 'iterations_since_restore': 10, 'warmup_time': 0.0032470226287841797, 'experiment_tag': '0'}, checkpoint=<ray.air.checkpoint.Checkpoint object at 0x136324f70>, error=None, log_dir=PosixPath('/Users/jules/ray_results/XGBoostTrainer_2022-07-13_09-04-41/XGBoostTrainer_811d0_00000_0_2022-07-13_09-04-41'), metrics_dataframe=   train-rmse  valid-rmse  ...  iterations_since_restore  warmup_time
0   26.757627   28.415588  ...       

### Construct a predictor
Since the preprocessor is saved with the checkpoint, it is applied to the inference data automatically, without you having to call it explicitly. Not bad, eh!

In [18]:
from ray.train.batch_predictor import BatchPredictor
from ray.train.xgboost import XGBoostPredictor

In [19]:
# Unlabelled inference rows: x = 2, 5, ..., 29, disjoint from the train/valid x values.
test_dataset = ray.data.from_items([dict(x=value) for value in range(2, 32, 3)])

In [20]:
# Rebuild a batch predictor from the checkpoint produced by training.
checkpoint = result.checkpoint
batch_predictor = BatchPredictor.from_checkpoint(checkpoint, XGBoostPredictor)
# NOTE(review): despite its name, `predicted_prob` holds regression outputs
# (the training objective is reg:squarederror), not probabilities.
predicted_prob = batch_predictor.predict(test_dataset)
# Print the predicted rows.
predicted_prob.show()

Map Progress (1 actors 0 pending): 100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 13.91it/s]

{'predictions': 0.09843720495700836}
{'predictions': 5.604666709899902}
{'predictions': 11.405311584472656}
{'predictions': 15.684700012207031}
{'predictions': 23.990947723388672}
{'predictions': 29.900211334228516}
{'predictions': 34.59944152832031}
{'predictions': 40.6968994140625}
{'predictions': 45.68107604980469}
{'predictions': 50.29003143310547}





### Chaining Preprocessors
Often, you will have a sequence of transformations to apply to your dataset. You can achieve this by using the `Chain` preprocessor.

In [21]:
from ray.data.preprocessors import Chain, MinMaxScaler, SimpleImputer

In [22]:
# Five single-column rows; the last one is missing its value (`None`).
dataset = ray.data.from_items([{"value": v} for v in (0, 1, 2, 3, None)])
print(dataset.take())

[{'value': 0}, {'value': 1}, {'value': 2}, {'value': 3}, {'value': None}]


In [26]:
# Build (but do not yet apply) the chained preprocessor: SimpleImputer fills the
# missing entries of "value" first, then MinMaxScaler rescales the column.
# NOTE(review): this rebinds `preprocessor`, which earlier held the MinMaxScaler
# passed to the trainer; re-running earlier cells afterwards would pick up the Chain.
preprocessor  = Chain(SimpleImputer(["value"]), MinMaxScaler(["value"]))

In [27]:
# Fit the chained preprocessor on `dataset` and transform it in a single call,
# then print the transformed rows.
dataset_transformed = preprocessor.fit_transform(dataset)
print(dataset_transformed.take())

Shuffle Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  9.12it/s]
Shuffle Reduce: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 282.96it/s]
Map_Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 328.21it/s]
Shuffle Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 803.04it/s]
Shuffle Reduce: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 233.71it/s]
Map_Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 444.02it/s]

[{'value': 0.0}, {'value': 0.3333333333333333}, {'value': 0.6666666666666666}, {'value': 1.0}, {'value': 0.5}]



