# Batch Tuning with Ray AIR Tune

### Load and Prepare Data

In [1]:
import os
print(f'Number of CPUs in this system: {os.cpu_count()}')
from typing import Tuple, List, Union, Optional, Callable
import time
import pandas as pd
import numpy as np
import pyarrow.dataset as pds
from pyarrow import fs
from pyarrow import parquet as pq
from ray.data import Dataset

Number of CPUs in this system: 8


In [2]:
import ray

# Shut down any Ray instance left over from a previous execution of this cell
# so that ray.init() below always starts a fresh local instance.
if ray.is_initialized():
    ray.shutdown()
ray.init()

2022-11-02 13:57:13,506	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.13
Ray version:,2.0.1
Dashboard:,http://127.0.0.1:8265


In [3]:
# To speed things up, we’ll only use a small subset of the full dataset: the last file in the listing (a single month of 2019) and three drop-off locations.
# You can choose to use the full dataset for 2018-2019 by setting the SMOKE_TEST variable to False.

SMOKE_TEST = True

In [4]:
# Global settings shared by the cells below.
target = "trip_duration"

# Enumerate every parquet file under the year/month-partitioned S3 prefix.
s3_partitions = pds.dataset(
    "s3://anonymous@air-example-data/ursa-labs-taxi-data/by_year/",
    partitioning=["year", "month"],
)
s3_files = [f"s3://{file}" for file in s3_partitions.files]

# Distinct drop-off location IDs, taken from the first file in the listing only.
location_ids = (
    pq.read_table(s3_files[0], columns=["dropoff_location_id"])["dropoff_location_id"]
    .unique()
    .to_pylist()
)

# Smoke test: keep only the last file and three hand-picked locations.
# Full run: keep every file and every location ID found above.
if SMOKE_TEST:
    starting_idx = -1
    sample_locations = [145, 166, 152]
else:
    starting_idx = 0
    sample_locations = location_ids

# Report which files and locations the rest of the notebook will use.
s3_files = s3_files[starting_idx:]
print(f"NYC Taxi using {len(s3_files)} file(s)!")
print(f"s3_files: {s3_files}")
print(f"Locations: {sample_locations}")

NYC Taxi using 1 file(s)!
s3_files: ['s3://air-example-data/ursa-labs-taxi-data/by_year/2019/06/data.parquet/ab5b9d2b8cc94be19346e260b543ec35_000000.parquet']
Locations: [145, 166, 152]


In [5]:
def pushdown_read_data(files_list: list, sample_ids: list) -> Dataset:
    """Create a Ray Dataset over the taxi parquet files with column and row pushdown.

    Args:
        files_list: Parquet file paths handed to ``ray.data.read_parquet``.
        sample_ids: Drop-off location IDs to keep; rows with any other
            ``dropoff_location_id`` are filtered out.

    Returns:
        The Dataset returned by ``ray.data.read_parquet``, restricted to the
        seven columns listed below and to rows matching the filter.

    Side effects:
        Prints the wall-clock time spent inside this function.
    """
    start = time.time()

    # Only these columns are requested from the parquet files.
    wanted_columns = [
        "pickup_at",
        "dropoff_at",
        "pickup_location_id",
        "dropoff_location_id",
        "passenger_count",
        "trip_distance",
        "fare_amount",
    ]

    # Location IDs excluded on both ends of a trip.
    # NOTE(review): presumably the "unknown" taxi zones -- confirm.
    excluded_ids = [264, 265]

    # Build the row filter one clause at a time; all clauses are AND-ed.
    has_passengers = pds.field("passenger_count") > 0
    has_distance = pds.field("trip_distance") > 0
    has_fare = pds.field("fare_amount") > 0
    pickup_allowed = ~pds.field("pickup_location_id").isin(excluded_ids)
    dropoff_allowed = ~pds.field("dropoff_location_id").isin(excluded_ids)
    dropoff_sampled = pds.field("dropoff_location_id").isin(sample_ids)
    row_filter = (
        has_passengers
        & has_distance
        & has_fare
        & pickup_allowed
        & dropoff_allowed
        & dropoff_sampled
    )

    taxi_ds = ray.data.read_parquet(
        files_list,
        columns=wanted_columns,
        filter=row_filter,
    )

    data_loading_time = time.time() - start
    print(f"Data loading time: {data_loading_time:.2f} seconds")

    return taxi_ds

# A pandas DataFrame UDF for transforming the Dataset in parallel.
def transform_batch(the_df: pd.DataFrame) -> pd.DataFrame:
    """Derive ``trip_duration`` (whole seconds) and drop implausible trips.

    Args:
        the_df: Batch with at least the columns ``pickup_at``, ``dropoff_at``
            (datetime-like), ``pickup_location_id``, ``dropoff_location_id``
            and ``fare_amount``. The input frame is not modified.

    Returns:
        A new DataFrame containing only trips strictly longer than 60 seconds
        and strictly shorter than 24 hours, with ``dropoff_at``, ``pickup_at``,
        ``pickup_location_id`` and ``fare_amount`` removed, an int64
        ``trip_duration`` column appended, and missing
        ``dropoff_location_id`` values replaced by -1.
    """
    elapsed = the_df["dropoff_at"] - the_df["pickup_at"]

    # Use the *total* elapsed seconds. `.dt.seconds` is only the seconds
    # component of the timedelta (0..86399): a trip of 1 day + 100 s would be
    # reported as 100 s and a negative duration as a large positive one, so
    # the 24-hour upper bound below could never reject anything.
    # Flooring to whole seconds keeps the integer-second semantics.
    duration_s = elapsed.dt.floor("s").dt.total_seconds()

    # Missing timestamps yield NaN, which fails both comparisons and is dropped.
    keep = (duration_s > 60) & (duration_s < 24 * 60 * 60)

    # Build a fresh frame instead of mutating a filtered slice in place.
    df = (
        the_df.loc[keep]
        .drop(columns=["dropoff_at", "pickup_at", "pickup_location_id", "fare_amount"])
        .assign(
            # to_numpy() assigns positionally, so a non-unique index is harmless.
            trip_duration=duration_s[keep].astype("int64").to_numpy(),
            dropoff_location_id=lambda d: d["dropoff_location_id"].fillna(-1),
        )
    )
    return df

In [6]:
# Test the pushdown_read_data function
# NOTE: the "Data loading time" printed by this call does not cover the actual
# parquet read -- in the recorded run it reports ~5 s, while the Read stage
# (~4 min) only shows up in the output of the next cell.
ds_raw = pushdown_read_data(s3_files, sample_locations)



Data loading time: 5.18 seconds


In [7]:
%%time 

# Test the transform UDF.
# NOTE(review): in the recorded run this prints 6,941,024 rows, which appears
# to be the unfiltered row count of the file -- presumably derived from parquet
# metadata before the pushdown filter is applied. Confirm before relying on it.
print(f"Number of rows before transformation: {ds_raw.count()}")

# Repartition the dataset into 5 blocks to allow for higher parallelism.
ds = ds_raw.repartition(5, shuffle=False) 

# .map_batches applies a UDF to each partition of the data in parallel.
ds = ds.map_batches(transform_batch, batch_format="pandas")

# Verify row count (this reflects both the read filter and transform_batch).
print(f"Number of rows after transformation: {ds.count()}")

Number of rows before transformation: 6941024


Read: 100%|██████████████████████████████████████| 1/1 [03:58<00:00, 238.70s/it]
Repartition: 100%|████████████████████████████████| 5/5 [00:00<00:00, 19.69it/s]
Map_Batches: 100%|████████████████████████████████| 5/5 [00:00<00:00, 49.04it/s]

Number of rows after transformation: 82704
CPU times: user 1.87 s, sys: 828 ms, total: 2.7 s
Wall time: 3min 59s





In [8]:
# Inspect the transformed Ray Dataset: size, schema and one example row.
print(f"Number of rows: {ds.count()}")
print(f"Size bytes (from parquet metadata): {ds.size_bytes()}")

print("\nSchema data types:")
# Fetch the schema once and pair each column name with its type.
ds_schema = ds.schema()
data_types = list(zip(ds_schema.names, ds_schema.types))
for column_name, column_type in data_types:
    print(f"{column_name}: {column_type}")

print("\nSample row:")
ds.take(1)

Number of rows: 82704
Size bytes (from parquet metadata): 1406608

Schema data types:
dropoff_location_id: int32
passenger_count: int8
trip_distance: float32
trip_duration: int64

Sample row:


[PandasRow({'dropoff_location_id': 166,
            'passenger_count': 1,
            'trip_distance': 6.5,
            'trip_duration': 1248})]

### Tuning

In [9]:
# import standard sklearn libraries
import sklearn
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
print(f"sklearn: {sklearn.__version__}")

# import ray AIR libraries
from ray import air, tune
from ray.air import session
from ray.air.checkpoint import Checkpoint
from ray.air.config import ScalingConfig

# set global random seed for sklearn models
np.random.seed(415)

sklearn: 1.1.2


Step 1: First, we define the model training function that we want to run variations of. The function takes in a config dictionary as argument, and returns a simple dict output. Learn more about logging Tune results at [How to configure logging in Tune?](https://docs.ray.io/en/master/tune/tutorials/tune-output.html#tune-logging).

In [10]:
# TODO add the scikit-learn model training, eval
def evaluation_fn(test_y: pd.Series, pred_y: pd.Series):
    """Return the mean absolute error between true and predicted targets.

    Args:
        test_y: Ground-truth target values.
        pred_y: Model predictions, passed as the second argument to
            ``sklearn.metrics.mean_absolute_error``.

    Returns:
        Whatever ``sklearn.metrics.mean_absolute_error`` returns for the pair.
    """
    return sklearn.metrics.mean_absolute_error(test_y, pred_y)
        
# 1. Define an objective function.
def objective(config: dict):
    """Placeholder Tune objective: list every model/parameter choice and return a score.

    Args:
        config: Mapping of model name -> ``{param_name: [candidate values]}``.
            Only the first parameter of each model is read; models with an
            empty parameter dict are skipped.

    Returns:
        dict: ``{"score": <float>}``. The score is returned (rather than only
        assigned to a local) so that the caller -- including Ray Tune when this
        function is used as a trainable -- receives a ``score`` metric.
    """
    # TODO put more meaningful score here later
    score = 1000.0

    # Model choices come from the top-level dictionary keys.
    for model, params in config.items():
        if not params:
            # Nothing to sweep for this model; avoid indexing into an empty dict.
            continue

        # Parameter choices come from the nested dictionary (first key only).
        param_name, param_values = next(iter(params.items()))
        for value in param_values:
            print(f"model: {model}, key: {param_name}, value: {value}")

    return {"score": score}
    
# 2. Define a search space.
# Top-level keys are model names; each maps one hyperparameter name to the list
# of candidate values that objective() loops over.
# NOTE(review): the values are plain Python lists, not tune.grid_search(...)
# entries. In the recorded run further below, Tune launched a single trial and
# the "best" config printed is this entire dictionary, i.e. no per-value trials
# were generated.
search_space = {
    'LinearRegression()': {
        "fit_intercept": [True, False]},
    'DecisionTreeRegressor()': {
        "max_depth": [2,4,6]}
}

# Test objective function call.
objective(search_space)

model: LinearRegression(), key: fit_intercept, value: True
model: LinearRegression(), key: fit_intercept, value: False
model: DecisionTreeRegressor(), key: max_depth, value: 2
model: DecisionTreeRegressor(), key: max_depth, value: 4
model: DecisionTreeRegressor(), key: max_depth, value: 6


In [11]:
def train_model(train_df: pd.DataFrame, 
                test_df: pd.DataFrame,
                config: dict):
    """Run a Ray Tune experiment over ``config`` using ``objective`` as the trainable.

    Args:
        train_df: Training rows; must contain ``passenger_count``,
            ``trip_distance`` and ``trip_duration``.
        test_df: Held-out rows with the same columns as ``train_df``.
        config: Search space handed to ``tune.Tuner`` as ``param_space``.

    Returns:
        The object returned by ``tuner.fit()``.
    """
    # Assemble train/test features and targets.
    # NOTE: these frames are not yet passed to `objective`; they are kept as
    # scaffolding for the model training/evaluation that is still TODO.
    feature_columns = ["passenger_count", "trip_distance"]
    train_X = train_df[feature_columns]
    train_y = train_df.trip_duration
    test_X = test_df[feature_columns]
    test_y = test_df.trip_duration

    # 3. Define a tuner using Ray AIR Tuner API
    stop_criteria = {
        "done": True,
        "training_iteration": 1 if SMOKE_TEST else 4,
    }
    tuner = tune.Tuner(
        objective,
        # Use the search space supplied by the caller instead of silently
        # falling back to the notebook-global `search_space`.
        param_space=config,
        run_config=air.RunConfig(
            # Redirect logs to a relative path instead of the default ~/ray_results/
            local_dir="my_Tune_logs",
            name="batch_tuning",

            # A trial stops as soon as any one of the criteria above is met.
            stop=stop_criteria,

            # Set Ray Tune verbosity.  Summary table only with levels 2 or 3.
            verbose=2,
        ),
    )
    print(f"type(tuner): {type(tuner)}")
    results = tuner.fit()

    # Return final stats. You can also return intermediate progress
    # using ray.air.session.report() if needed.
    # To return your model, you could write it to storage and return its
    # URI in this dict, or return it as a Tune Checkpoint:
    # https://docs.ray.io/en/latest/tune/tutorials/tune-checkpoints.html
    return results

In [14]:
# test the function call

# Randomly split the data into 80/20 train/test.
# No random_state is passed, so the shuffle draws from NumPy's global RNG,
# which an earlier cell seeds with np.random.seed(415).
df = ds.to_pandas()
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True)
    
results = train_model(train_df, test_df, search_space)
# get_best_result ranks trials by the "score" metric (lower is better), so the
# trainable has to report a value under that key for this lookup to be meaningful.
print(results.get_best_result(metric="score", mode="min").config)



(66163, 4) (16541, 4)
type(tuner): <class 'ray.tune.tuner.Tuner'>


Trial name,status,loc
objective_efd1d_00000,TERMINATED,127.0.0.1:16185


Trial objective_efd1d_00000 completed. Last result: 


2022-11-02 14:04:29,792	INFO tune.py:758 -- Total run time: 2.25 seconds (1.55 seconds for the tuning loop).


[2m[36m(objective pid=16185)[0m model: LinearRegression(), key: fit_intercept, value: True
[2m[36m(objective pid=16185)[0m model: LinearRegression(), key: fit_intercept, value: False
[2m[36m(objective pid=16185)[0m model: DecisionTreeRegressor(), key: max_depth, value: 2
[2m[36m(objective pid=16185)[0m model: DecisionTreeRegressor(), key: max_depth, value: 4
[2m[36m(objective pid=16185)[0m model: DecisionTreeRegressor(), key: max_depth, value: 6
{'LinearRegression()': {'fit_intercept': [True, False]}, 'DecisionTreeRegressor()': {'max_depth': [2, 4, 6]}}


Step 2: Next, define the space of trials to run. Here, we define a simple grid sweep from 0..NUM_MODELS, which will generate the config dicts to be passed to each model function. Learn more about what features Tune offers for defining spaces at [Working with Tune Search Spaces](https://docs.ray.io/en/master/tune/tutorials/tune-search-spaces.html#tune-search-space-tutorial).

In [None]:
# TODO

# # Define trial parameters as a single grid sweep.
# trial_space = {
#     # This is an example parameter. You could replace it with filesystem paths,
#     # model types, or even full nested Python dicts of model configurations, etc.,
#     # that enumerate the set of trials to run.
#     "model_id": tune.grid_search([
#         "model_{}".format(i)
#         for i in range(NUM_MODELS)
#     ])
# }

Step 3: Optionally, configure the resources allocated per trial. Tune uses this resource allocation to control parallelism. For example, if each trial is configured to use 4 CPUs and the cluster has only 32 CPUs, then Tune will limit the number of concurrent trials to 8 to avoid overloading the cluster. For more information, see [A Guide To Parallelism and Resources](https://docs.ray.io/en/master/tune/tutorials/tune-resources.html#tune-parallelism).

In [None]:
# TODO

# print(type(train_model))

# # Can customize resources per trial, here we set 1 CPU each.
# train_model = tune.with_resources(train_model, {"cpu": 1})

# print(type(train_model))

Step 4: Run the trial with Tune. Tune will report on experiment status, and after the experiment finishes, you can inspect the results. Tune can retry failed trials automatically, as well as entire experiments; see [Stopping and Resuming a Tune Run](https://docs.ray.io/en/master/tune/tutorials/tune-stopping.html#tune-stopping-guide).

In [None]:
# TODO

# # Start a Tune run and print the best result.
# tuner = tune.Tuner(train_model, param_space=trial_space)
# results = tuner.fit()

# # Access individual results.
# print(results[0])
# print(results[1])
# print(results[2])