# Batch Tuning with Ray AIR Tune

### Load and Prepare Data

In [1]:
import os
print(f'Number of CPUs in this system: {os.cpu_count()}')
from typing import Tuple, List, Union, Optional, Callable
import time
import pandas as pd
import numpy as np
import pyarrow
import pyarrow.parquet as pq
import pyarrow.dataset as pds

print(f"pyarrow: {pyarrow.__version__}")

Number of CPUs in this system: 8
pyarrow: 6.0.1


In [2]:
import ray

if ray.is_initialized():
    ray.shutdown()
ray.init()

2022-11-06 17:08:15,620	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.13
Ray version:,2.0.1
Dashboard:,http://127.0.0.1:8265


In [3]:
# For benchmarking purposes, we can print the times of various operations.
# In order to reduce clutter in the output, this is set to False by default.
PRINT_TIMES = False

def print_time(msg: str):
    if PRINT_TIMES:
        print(msg)
        
# To speed things up, we’ll only use a small subset of the full dataset consisting of two last months of 2019.
# You can choose to use the full dataset for 2018-2019 by setting the SMOKE_TEST variable to False.
SMOKE_TEST = True


In [4]:
# Define some global variables.
target = "trip_duration"
s3_partitions = pds.dataset(
    "s3://anonymous@air-example-data/ursa-labs-taxi-data/by_year/",
    partitioning=["year", "month"],
)
s3_files = [f"s3://{file}" for file in s3_partitions.files]

# Obtain all location IDs
all_location_ids = (
    pq.read_table(s3_files[0], columns=["dropoff_location_id"])["dropoff_location_id"]
    .unique()
    .to_pylist()
)

# Use smoke testing or not.
starting_idx = -1 if SMOKE_TEST else 0
sample_locations = [145, 166, 152] if SMOKE_TEST else all_location_ids

# Display what data will be used.
s3_files = s3_files[starting_idx:]
print(f"NYC Taxi using {len(s3_files)} file(s)!")
print(f"s3_files: {s3_files}")
print(f"Locations: {sample_locations}")

NYC Taxi using 1 file(s)!
s3_files: ['s3://air-example-data/ursa-labs-taxi-data/by_year/2019/06/data.parquet/ab5b9d2b8cc94be19346e260b543ec35_000000.parquet']
Locations: [145, 166, 152]


In [5]:
def read_data(file: str, sample_id: np.int32) -> pd.DataFrame:
    # start = time.time()
    
    df = pq.read_table(
        file,
        filters=[
            ("passenger_count", ">", 0),
            ("trip_distance", ">", 0),
            ("fare_amount", ">", 0),
            ("pickup_location_id", "in", [264, 265]),
            ("dropoff_location_id", "not in", [264, 265]), 
            ("dropoff_location_id", "=", sample_id)
        ],
        columns=[
            "pickup_at",
            "dropoff_at",
            "pickup_location_id",
            "dropoff_location_id",
            "passenger_count",
            "trip_distance",
            "fare_amount",
        ],
    ).to_pandas()

    # data_loading_time = time.time() - start
    # print(f"Data loading time: {data_loading_time:.2f} seconds")
    return df

# A pandas DataFrame UDF for transforming the Dataset in parallel.
def transform_batch(the_df: pd.DataFrame) -> pd.DataFrame:
    df = the_df.copy()
    
    df["trip_duration"] = (df["dropoff_at"] - df["pickup_at"]).dt.seconds
    df = df[df["trip_duration"] > 60]
    df = df[df["trip_duration"] < 24 * 60 * 60] 
    df.drop(["dropoff_at", "pickup_at", "pickup_location_id", "fare_amount"]
            , axis=1, inplace=True)
    df["dropoff_location_id"] = df["dropoff_location_id"].fillna(-1)
    return df

In [6]:
# %%time

# # Test reading data.
# import itertools
# my_list = itertools.product(s3_files, sample_locations)

# # [print(f[0], f[1]) for f in my_list]  
# df_list = [read_data(f[0], f[1]) for f in my_list]
# df_raw = pd.concat(df_list, ignore_index=True)

# # pq.read_table() one location at a time
# # CPU times: user 5.23 s, sys: 9.14 s, total: 14.4 s
# # Wall time: 5min 34s

In [7]:
# %%time 

# # Test transforming data
# print(f"Number of rows before transformation: {df_raw.shape}")

# # Transform data.
# df = transform_batch(df_raw)

# print(f"Number of rows after transformation: {df.shape}")

In [8]:
# # Inspect the pandas dataframe.
# print(df.shape)
# print(df.dtypes)
# df.head()

### Tuning

In [9]:
# import standard sklearn libraries
import sklearn
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
print(f"sklearn: {sklearn.__version__}")

# import ray AIR libraries
from ray import air, tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.air.config import ScalingConfig

# set global random seed for sklearn models
np.random.seed(415)

sklearn: 1.1.2


Step 1: First, we define the model training function that we want to run variations of. The function takes in a config dictionary as argument, and returns a simple dict output. Learn more about logging Tune results at [How to configure logging in Tune?](https://docs.ray.io/en/master/tune/tutorials/tune-output.html#tune-logging).

In [10]:
# 1. Define a custom train function
def train_model(config: dict):

    model = config['models']
    the_location = config['locations']
    
    # Load data.
    df_list = [read_data(f, the_location) for f in s3_files]
    # print(f"train_model finished reading files: {len(df_list)}")    
    df_raw = pd.concat(df_list, ignore_index=True)
    # print(f"train_model df_raw.shape: {df_raw.shape}")
    df = transform_batch(df_raw)
    # print(f"train_model df.shape: {df.shape}")
    
    # Train/test split.
    train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True)
    train_X = train_df[["passenger_count", "trip_distance"]]
    train_y = train_df.trip_duration
    test_X = test_df[["passenger_count", "trip_distance"]]
    test_y = test_df.trip_duration

    # Train model.
    model = model.fit(train_X, train_y)
    pred_y = model.predict(test_X)
    
    # Evaluate.
    error = sklearn.metrics.mean_absolute_error(test_y, pred_y)
    
    # Return final stats. You can also return intermediate progress
    # using ray.air.session.report() if needed.
    # To return your model, you could write it to storage and return its
    # URI in this dict, or return it as a Tune Checkpoint:
    # https://docs.ray.io/en/latest/tune/tutorials/tune-checkpoints.html
    # score = 1000.0
    return {'model': model, 'score': error}

Step 2: Next, define the space of trials to run. Here, we define a simple grid sweep from 0..NUM_MODELS, which will generate the config dicts to be passed to each model function. Learn more about what features Tune offers for defining spaces at [Working with Tune Search Spaces](https://docs.ray.io/en/master/tune/tutorials/tune-search-spaces.html#tune-search-space-tutorial).

In [11]:
# 2. Define a search space.
sample_locations = [145, 166, 152] if SMOKE_TEST else all_location_ids
search_space = {
    "models": tune.grid_search([LinearRegression(fit_intercept=True), 
                                DecisionTreeRegressor(max_depth=3)]),
    "locations": tune.grid_search(sample_locations),
}

Step 3: Optionally, configure the resources allocated per trial. Tune uses this resources allocation to control the parallelism. For example, if each trial was configured to use 4 CPUs, and the cluster had only 32 CPUs, then Tune will limit the number of concurrent trials to 8 to avoid overloading the cluster. For more information, see [A Guide To Parallelism and Resources](https://docs.ray.io/en/master/tune/tutorials/tune-resources.html#tune-parallelism).

In [12]:
# 3. Can customize resources per trial, here we set 1 CPU each.
train_model = tune.with_resources(train_model, {"cpu": 1})

Step 4: Run the trial with Tune. Tune will report on experiment status, and after the experiment finishes, you can inspect the results. Tune can retry failed trials automatically, as well as entire experiments; see [Stopping and Resuming a Tune Run](https://docs.ray.io/en/master/tune/tutorials/tune-stopping.html#tune-stopping-guide).

In [13]:
# Define a tuner object using Ray AIR Tuner API
stop_criteria = {
    "done": True,
    "training_iteration": 1 if SMOKE_TEST else 3,
}
tuner = tune.Tuner(
    train_model, 
    param_space=search_space,
    run_config=air.RunConfig(
        
        #redirect logs to relative path instead of default ~/ray_results/
        local_dir = "my_Tune_logs",
        name = "batch_tuning",

        # Stopping criteria whichever occurs first: average reward over training episodes, or ...
        stop=stop_criteria,

        # Set Ray Tune verbosity.  Summary table only with levels 2 or 3.
        verbose=2,
        )
)

# 4. Run the trial with Ray Tune
results = tuner.fit()
print("Best result:")
print(results.get_best_result(metric="score", mode="min").config)

# Total run time: 481.19 seconds (481.06 seconds for the tuning loop).
# 8 minutes




Trial name,status,loc,locations,models,iter,total time (s),score
train_model_b0f17_00000,TERMINATED,127.0.0.1:46536,145,LinearRegression(),1,348.86,191.841
train_model_b0f17_00001,TERMINATED,127.0.0.1:46539,166,LinearRegression(),1,322.047,270.643
train_model_b0f17_00002,TERMINATED,127.0.0.1:46540,152,LinearRegression(),1,327.369,448.861
train_model_b0f17_00003,TERMINATED,127.0.0.1:46541,145,DecisionTreeReg_3340,1,389.588,792.726
train_model_b0f17_00004,TERMINATED,127.0.0.1:46542,166,DecisionTreeReg_3700,1,221.907,259.403
train_model_b0f17_00005,TERMINATED,127.0.0.1:46543,152,DecisionTreeReg_3880,1,310.555,326.3


2022-11-06 17:12:15,992	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': DecisionTreeRegressor(max_depth=3)}


Trial train_model_b0f17_00004 reported model=DecisionTreeRegressor(max_depth=3),score=259.40304132405646 with parameters={'models': DecisionTreeRegressor(max_depth=3), 'locations': 166}. This trial completed.


2022-11-06 17:13:44,636	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': DecisionTreeRegressor(max_depth=3)}


Trial train_model_b0f17_00005 reported model=DecisionTreeRegressor(max_depth=3),score=326.3 with parameters={'models': DecisionTreeRegressor(max_depth=3), 'locations': 152}. This trial completed.


2022-11-06 17:13:56,107	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': LinearRegression()}


Trial train_model_b0f17_00001 reported model=LinearRegression(),score=270.64341735839844 with parameters={'models': LinearRegression(), 'locations': 166}. This trial completed.


2022-11-06 17:14:01,388	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': LinearRegression()}


Trial train_model_b0f17_00002 reported model=LinearRegression(),score=448.86126098632815 with parameters={'models': LinearRegression(), 'locations': 152}. This trial completed.


2022-11-06 17:14:20,847	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': LinearRegression()}


Trial train_model_b0f17_00000 reported model=LinearRegression(),score=191.8406982421875 with parameters={'models': LinearRegression(), 'locations': 145}. This trial completed.


2022-11-06 17:15:03,660	INFO tensorboardx.py:267 -- Removed the following hyperparameter values when logging to tensorboard: {'models': DecisionTreeRegressor(max_depth=3)}


Trial train_model_b0f17_00003 reported model=DecisionTreeRegressor(max_depth=3),score=792.7261904761905 with parameters={'models': DecisionTreeRegressor(max_depth=3), 'locations': 145}. This trial completed.


2022-11-06 17:15:03,778	INFO tune.py:758 -- Total run time: 393.97 seconds (393.28 seconds for the tuning loop).


Best result:
{'models': LinearRegression(), 'locations': 145}


In [14]:
# Access individual results.
print(f"{results[0]}")
print(f"\n{results[1]}")
print(f"\n{results[2]}")

Result(metrics={'model': LinearRegression(), 'score': 191.8406982421875, 'done': True, 'trial_id': 'b0f17_00000', 'experiment_tag': '0_locations=145,models=LinearRegression'}, error=None, log_dir=PosixPath('/Users/christy/Documents/github_ray_temp/ray/doc/source/data/examples/my_Tune_logs/batch_tuning/train_model_b0f17_00000_0_locations=145,models=LinearRegression_2022-11-06_17-08-30'))

Result(metrics={'model': LinearRegression(), 'score': 270.64341735839844, 'done': True, 'trial_id': 'b0f17_00001', 'experiment_tag': '1_locations=166,models=LinearRegression'}, error=None, log_dir=PosixPath('/Users/christy/Documents/github_ray_temp/ray/doc/source/data/examples/my_Tune_logs/batch_tuning/train_model_b0f17_00001_1_locations=166,models=LinearRegression_2022-11-06_17-08-31'))

Result(metrics={'model': LinearRegression(), 'score': 448.86126098632815, 'done': True, 'trial_id': 'b0f17_00002', 'experiment_tag': '2_locations=152,models=LinearRegression'}, error=None, log_dir=PosixPath('/Users/ch