# NYC Taxi Example Playground
***
This is the notebook where Emmy tests out her NYC Taxi code before taking the cleaned up bits for the tutorial on Intro to Ray AIR.

In [2]:
# import your packages
import ray

if ray.is_initialized:
    ray.shutdown()

ray.init()

2022-10-24 18:08:02,427	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.13
Ray version:,2.0.0
Dashboard:,http://127.0.0.1:8265


In [3]:
# we use the June 2021 dataset for training and the June 2022 dataset for batch inference later
dataset = ray.data.read_parquet("data/nyc_taxi_2021.parquet")

# split data into training and validation subsets
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.3)
valid_dataset = valid_dataset.drop_columns(["is_big_tip"])

# repartition the dataset for maximum parallelism
# train_dataset.repartition(100)
# valid_dataset.repartition(100)

Read progress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  6.10it/s]
Map_Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.00it/s]


In [4]:
train_dataset.take(2)

[ArrowRow({'passenger_count': 1.0,
           'trip_distance': 0.9,
           'fare_amount': 5.0,
           'trip_duration': 228,
           'hour': 0,
           'day_of_week': 1,
           'is_big_tip': True,
           '__index_level_0__': 0}),
 ArrowRow({'passenger_count': 1.0,
           'trip_distance': 23.0,
           'fare_amount': 61.5,
           'trip_duration': 2081,
           'hour': 0,
           'day_of_week': 1,
           'is_big_tip': False,
           '__index_level_0__': 1})]

In [5]:
valid_dataset.take(2)

[PandasRow({'passenger_count': 1.0,
            'trip_distance': 1.2,
            'fare_amount': 8.5,
            'trip_duration': 611,
            'hour': 12,
            'day_of_week': 1}),
 PandasRow({'passenger_count': 1.0,
            'trip_distance': 1.4,
            'fare_amount': 6.5,
            'trip_duration': 351,
            'hour': 12,
            'day_of_week': 1})]

So something we might want to do is inspect both "trip_distance" and "trip_duration" to see if they're approximately normal, and if the StandardScaler is the right choice.

In [6]:
# we're going to use MinMaxScaler becaues we aren't sure what the data looks like.
# this scales each column by its range, but maybe we want to cut off some really long trip durations and distances?

from ray.data.preprocessors import MinMaxScaler

# create a preprocessor to scale some columns
preprocessor = MinMaxScaler(columns=["trip_distance", "trip_duration"])

So something to change is that the code in the snippet shows a TorchTrainer, which is maybe not what we're going for here. Another thing is that we've now arrived at the code that we don't understand, so you gotta break it down.

In [7]:
from ray.train.xgboost import XGBoostTrainer
from ray.air.config import ScalingConfig

trainer = XGBoostTrainer(
    label_column="is_big_tip",
    params={"objective": "binary:logistic", "eval_metric": ["logloss", "error"], "tree_method": "approx"},
    scaling_config=ScalingConfig(num_workers=6),
    datasets={"train": train_dataset, "valid": valid_dataset},
    preprocessor=preprocessor,
    num_boost_round=10
)

In [8]:
result = trainer.fit()

Trial name,status,loc
XGBoostTrainer_8c207_00000,ERROR,127.0.0.1:64705

Trial name,# failures,error file
XGBoostTrainer_8c207_00000,1,/Users/jules/ray_results/XGBoostTrainer_2022-10-24_18-08-34/XGBoostTrainer_8c207_00000_0_2022-10-24_18-08-34/error.txt


[2m[36m(XGBoostTrainer pid=64705)[0m 2022-10-24 18:08:41,838	INFO elastic.py:155 -- Actor status: 6 alive, 0 dead (6 total)
[2m[36m(XGBoostTrainer pid=64705)[0m 2022-10-24 18:08:46,840	ERROR function_trainable.py:298 -- Runner Thread raised error.
[2m[36m(XGBoostTrainer pid=64705)[0m Traceback (most recent call last):
[2m[36m(XGBoostTrainer pid=64705)[0m   File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 1144, in _train
[2m[36m(XGBoostTrainer pid=64705)[0m     ray.get(ready)
[2m[36m(XGBoostTrainer pid=64705)[0m   File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
[2m[36m(XGBoostTrainer pid=64705)[0m     return func(*args, **kwargs)
[2m[36m(XGBoostTrainer pid=64705)[0m   File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/_private/worker.py", line

Result for XGBoostTrainer_8c207_00000:
  date: 2022-10-24_18-08-35
  experiment_id: c18f160899fd4a02bc676c897363133e
  hostname: Juless-MacBook-Pro-16
  node_ip: 127.0.0.1
  pid: 64705
  timestamp: 1666660115
  trial_id: 8c207_00000
  


2022-10-24 18:08:47,142	ERROR tune.py:754 -- Trials did not complete: [XGBoostTrainer_8c207_00000]
2022-10-24 18:08:47,143	INFO tune.py:758 -- Total run time: 13.05 seconds (12.92 seconds for the tuning loop).


RayTaskError(RuntimeError): [36mray::GBDTTrainable.train()[39m (pid=64705, ip=127.0.0.1, repr=XGBoostTrainer)
ray.exceptions.RayTaskError(KeyError): [36mray::_RemoteRayXGBoostActor.train()[39m (pid=64725, ip=127.0.0.1, repr=<xgboost_ray.main._RemoteRayXGBoostActor object at 0x134d88c10>)
  File "pandas/_libs/index.pyx", line 138, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 165, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 5745, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 5753, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'is_big_tip'

The above exception was the direct cause of the following exception:

[36mray::_RemoteRayXGBoostActor.train()[39m (pid=64725, ip=127.0.0.1, repr=<xgboost_ray.main._RemoteRayXGBoostActor object at 0x134d88c10>)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 599, in train
    self.load_data(deval)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 561, in load_data
    param = data.get_data(self.rank, self.num_actors)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/matrix.py", line 832, in get_data
    self.load_data(num_actors=num_actors, rank=rank)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/matrix.py", line 818, in load_data
    refs, self.n = self.loader.load_data(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/matrix.py", line 548, in load_data
    x, y, w, b, ll, lu, qid = self._split_dataframe(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/matrix.py", line 263, in _split_dataframe
    label, exclude = data_source.get_column(local_data, self.label)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/data_sources/data_source.py", line 125, in get_column
    return data[column], column
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/pandas/core/frame.py", line 3805, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 3802, in get_loc
    raise KeyError(key) from err
KeyError: 'is_big_tip'

The above exception was the direct cause of the following exception:

[36mray::GBDTTrainable.train()[39m (pid=64705, ip=127.0.0.1, repr=XGBoostTrainer)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 1457, in train
    bst, train_evals_result, train_additional_results = _train(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 1169, in _train
    raise RayActorError from exc
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.

The above exception was the direct cause of the following exception:

[36mray::GBDTTrainable.train()[39m (pid=64705, ip=127.0.0.1, repr=XGBoostTrainer)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/trainable.py", line 347, in train
    result = self.step()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 417, in step
    self._report_thread_runner_error(block=True)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 589, in _report_thread_runner_error
    raise e
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 289, in run
    self._entrypoint()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 362, in entrypoint
    return self._trainable_func(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/train/base_trainer.py", line 460, in _trainable_func
    super()._trainable_func(self._merged_config, reporter, checkpoint_dir)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/tune/trainable/function_trainable.py", line 684, in _trainable_func
    output = fn()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/train/base_trainer.py", line 375, in train_func
    trainer.training_loop()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/train/gbdt_trainer.py", line 246, in training_loop
    model = self._train(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/ray/train/xgboost/xgboost_trainer.py", line 77, in _train
    return xgboost_ray.train(**kwargs)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/ray-core-tutorial/lib/python3.8/site-packages/xgboost_ray/main.py", line 1534, in train
    raise RuntimeError(
RuntimeError: A Ray actor died during training and the maximum number of retries (0) is exhausted.