In [17]:
#!pip install --user "tensorflow>=2.8.0"
!pip install --user "scikit-learn"

Collecting scikit-learn
  Downloading scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.5 MB)
     |████████████████████████████████| 9.5 MB 5.3 MB/s eta 0:00:01
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
     |████████████████████████████████| 297 kB 109.2 MB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.0 threadpoolctl-3.1.0


In [4]:
import ray

runtime_env = {
    "working_dir": ".",
    "excludes":['/data/','/.ipynb_checkpoints/']
}
ray.init(runtime_env=runtime_env)

2022-12-19 08:48:46,259	INFO worker.py:1230 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS
2022-12-19 08:48:46,520	INFO worker.py:1352 -- Connecting to existing Ray cluster at address: 10.0.7.140:9031...
2022-12-19 08:48:46,529	INFO worker.py:1529 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://console.anyscale.com/api/v2/sessions/ses_buwxbm99nq8dryqg6p8sbytw/services?redirect_to=dashboard [39m[22m
2022-12-19 08:48:46,534	INFO packaging.py:546 -- Creating a file package for local directory '.'.
2022-12-19 08:48:46,537	INFO packaging.py:373 -- Pushing file package 'gcs://_ray_pkg_64f7bff8f540b0eb.zip' (0.14MiB) to Ray cluster...
2022-12-19 08:48:46,539	INFO packaging.py:386 -- Successfully pushed file package 'gcs://_ray_pkg_64f7bff8f540b0eb.zip'.


0,1
Python version:,3.9.12
Ray version:,2.2.0
Dashboard:,http://console.anyscale.com/api/v2/sessions/ses_buwxbm99nq8dryqg6p8sbytw/services?redirect_to=dashboard


In [9]:
from pprint import pprint
pprint(ray.cluster_resources())

{'CPU': 8.0,
 'memory': 18155232462.0,
 'node:10.0.7.140': 1.0,
 'object_store_memory': 9077616230.0}


In [10]:
import pandas as pd

INPUT = "input"
LABEL = "is_big_tip"

def get_data() -> pd.DataFrame:
    """Fetch the taxi fare data to work on."""
    _data = pd.read_csv(
        "https://raw.githubusercontent.com/tensorflow/tfx/master/"
        "tfx/examples/chicago_taxi_pipeline/data/simple/data.csv"
    )
    _data[LABEL] = _data["tips"] / _data["fare"] > 0.2
    # We drop some columns here for the sake of simplicity.
    return _data.drop(
        [
            "tips",
            "fare",
            "dropoff_latitude",
            "dropoff_longitude",
            "pickup_latitude",
            "pickup_longitude",
            "pickup_census_tract",
        ],
        axis=1,
    )

In [11]:
data = get_data()
data.head(5)

Unnamed: 0,pickup_community_area,trip_start_month,trip_start_hour,trip_start_day,trip_start_timestamp,trip_miles,dropoff_census_tract,payment_type,company,trip_seconds,dropoff_community_area,is_big_tip
0,,5,19,6,1400269500,0.0,,Credit Card,Chicago Elite Cab Corp. (Chicago Carriag,0.0,,False
1,,3,19,5,1362683700,0.0,,Unknown,Chicago Elite Cab Corp.,300.0,,False
2,60.0,10,2,3,1380593700,12.6,,Cash,Taxi Affiliation Services,1380.0,,False
3,10.0,10,1,2,1382319000,0.0,,Cash,Taxi Affiliation Services,180.0,,False
4,14.0,5,7,5,1369897200,0.0,,Cash,Dispatch Taxi Affiliation,1080.0,,False


In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Tuple


def split_data(data: pd.DataFrame) -> Tuple[ray.data.Dataset, pd.DataFrame, np.array]:
    """Split the data in a stratified way.

    Returns:
        A tuple containing train dataset, test data and test label.
    """
    # There is a native offering in Ray Dataset for split as well.
    # However, supporting stratification is a TODO there. So use
    # scikit-learn equivalent here.
    train_data, test_data = train_test_split(
        data, stratify=data[[LABEL]], random_state=1113
    )
    _train_ds = ray.data.from_pandas(train_data)
    _test_label = test_data[LABEL].values
    _test_df = test_data.drop([LABEL], axis=1)
    return _train_ds, _test_df, _test_label

train_ds, test_df, test_label = split_data(data)
print(f"There are {train_ds.count()} samples for training and {test_df.shape[0]} samples for testing.")

There are 11251 samples for training and 3751 samples for testing.


In [20]:
from ray.data.preprocessors import (
    BatchMapper,
    Chain,
    OneHotEncoder,
    SimpleImputer,
)

def get_preprocessor():
    """Construct a chain of preprocessors."""
    imputer1 = SimpleImputer(
        ["dropoff_census_tract"], strategy="most_frequent"
    )
    imputer2 = SimpleImputer(
        ["pickup_community_area", "dropoff_community_area"],
        strategy="most_frequent",
    )
    imputer3 = SimpleImputer(["payment_type"], strategy="most_frequent")
    imputer4 = SimpleImputer(
        ["company"], strategy="most_frequent")
    imputer5 = SimpleImputer(
        ["trip_start_timestamp", "trip_miles", "trip_seconds"], strategy="mean"
    )

    ohe = OneHotEncoder(
        columns=[
            "trip_start_hour",
            "trip_start_day",
            "trip_start_month",
            "dropoff_census_tract",
            "pickup_community_area",
            "dropoff_community_area",
            "payment_type",
            "company",
        ],
        max_categories={
            "dropoff_census_tract": 25,
            "pickup_community_area": 20,
            "dropoff_community_area": 20,
            "payment_type": 2,
            "company": 7,
        },
    )

    def batch_mapper_fn(df):
        df["trip_start_year"] = pd.to_datetime(df["trip_start_timestamp"], unit="s").dt.year
        df = df.drop(["trip_start_timestamp"], axis=1)
        return df

    def concat_for_tensor(dataframe):
        from ray.data.extensions import TensorArray
        result = {}
        feature_cols = [col for col in dataframe.columns if col != LABEL]
        result[INPUT] = TensorArray(dataframe[feature_cols].to_numpy(dtype=np.float32))
        if LABEL in dataframe.columns:
            result[LABEL] = dataframe[LABEL]
        return  pd.DataFrame(result)

    chained_pp = Chain(
        imputer1,
        imputer2,
        imputer3,
        imputer4,
        imputer5,
        ohe,
        BatchMapper(batch_mapper_fn, batch_format="pandas"),
        BatchMapper(concat_for_tensor, batch_format="pandas")
    )
    return chained_pp

In [21]:
# Note that `INPUT_SIZE` here is corresponding to the output dimension
# of the previously defined processing steps.
# This is used to specify the input shape of Keras model.
INPUT_SIZE = 120
# The training batch size. Based on `NUM_WORKERS`, each worker
# will get its own share of this batch size. For example, if
# `NUM_WORKERS = 2`, each worker will work on 4 samples per batch.
BATCH_SIZE = 8
# Number of epoch. Adjust it based on how quickly you want the run to be.
EPOCH = 1
# Number of training workers.
# Adjust this accordingly based on the resources you have!
NUM_WORKERS = 2

# Input

In [22]:
import tensorflow as tf

def build_model():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(INPUT_SIZE,)))
    model.add(tf.keras.layers.Dense(50, activation="relu"))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    return model

2022-12-19 10:29:18.627573: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-19 10:29:22.116166: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-19 10:29:22.116188: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-19 10:29:31.774059: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directo

In [23]:
from ray.air import session, Checkpoint
from ray.train.tensorflow import TensorflowCheckpoint

def train_loop_per_worker():
    dataset_shard = session.get_dataset_shard("train")

    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = build_model()
        model.compile(
            loss="binary_crossentropy",
            optimizer="adam",
            metrics=["accuracy"],
        )

    for epoch in range(EPOCH):            
        tf_dataset = dataset_shard.to_tf(feature_columns=INPUT, label_columns=LABEL, batch_size=BATCH_SIZE, drop_last=True)

        model.fit(tf_dataset, verbose=0)
        # This saves checkpoint in a way that can be used by Ray Serve coherently.
        session.report(
            {},
            checkpoint=TensorflowCheckpoint.from_model(model),
        )


In [24]:
from ray.train.tensorflow import TensorflowTrainer
from ray.air.config import ScalingConfig

trainer = TensorflowTrainer(
    train_loop_per_worker=train_loop_per_worker,
    scaling_config=ScalingConfig(num_workers=NUM_WORKERS),
    datasets={"train": train_ds},
    preprocessor=get_preprocessor(),
)
result = trainer.fit()

  from .autonotebook import tqdm as notebook_tqdm


0,1
Current time:,2022-12-19 10:59:33
Running for:,00:00:54.30
Memory:,3.3/30.9 GiB

Trial name,status,loc,iter,total time (s),_timestamp,_time_this_iter_s,_training_iteration
TensorflowTrainer_1ea52_00000,TERMINATED,10.0.7.140:30179,1,34.606,1671476369,20.2285,1


(pid=30179) 2022-12-19 10:58:41.422374: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
(pid=30179) To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
(pid=30179) 2022-12-19 10:58:41.599689: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
(pid=30179) 2022-12-19 10:58:41.599728: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
(pid=30179) 2022-12-19 10:58:46.457945: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.s

Trial name,_time_this_iter_s,_timestamp,_training_iteration,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
TensorflowTrainer_1ea52_00000,20.2285,1671476369,1,2022-12-19_10-59-30,True,,4d9e5f8df04e49a297d89c4cf977437a,0,ip-10-0-7-140,1,10.0.7.140,30179,True,34.606,34.606,34.606,1671476370,0,,1,1ea52_00000,0.0104268


2022-12-19 10:59:33,161	INFO tune.py:762 -- Total run time: 66.42 seconds (54.29 seconds for the tuning loop).


# Serving

In [25]:
from fastapi import Request

async def dataframe_adapter(request: Request):
    """Serve HTTP Adapter that reads JSON and converts to pandas DataFrame."""
    content = await request.json()
    return pd.DataFrame.from_dict(content)

In [26]:
from ray import serve
from ray.air.checkpoint import Checkpoint
from ray.train.tensorflow import TensorflowPredictor
from ray.serve import PredictorDeployment


def serve_model(checkpoint: Checkpoint, model_definition, adapter, name="Model") -> str:
    """Expose a serve endpoint.

    Returns:
        serve URL.
    """
    serve.run(
        PredictorDeployment.options(name=name).bind(
            TensorflowPredictor,
            checkpoint,
            batching_params=dict(max_batch_size=2, batch_wait_timeout_s=5),
            model_definition=model_definition,
            http_adapter=adapter,
        )
    )
    return f"http://localhost:8000/"

In [27]:
import ray
# Generally speaking, training and serving are done in totally different ray clusters.
# To simulate that, let's shutdown the old ray cluster in preparation for serving.
#ray.shutdown()

endpoint_uri = serve_model(result.checkpoint, build_model, dataframe_adapter)

(ServeController pid=32331) INFO 2022-12-19 11:06:16,518 controller 32331 http_state.py:129 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-de6187171ed3603eaa1ff6f2b892d34e6142d8769fd54d626724975c' on node 'de6187171ed3603eaa1ff6f2b892d34e6142d8769fd54d626724975c' listening on '127.0.0.1:8000'
(HTTPProxyActor pid=32379) INFO:     Started server process [32379]
(ServeController pid=32331) INFO 2022-12-19 11:06:18,778 controller 32331 deployment_state.py:1310 - Adding 1 replica to deployment 'Model'.
(ServeReplica:Model pid=32425) 2022-12-19 11:06:20.936481: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
(ServeReplica:Model pid=32425) To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
(ServeReplica:Model pid=32425) 2022-12-19 11:06:21.083073:

In [28]:
import json
import requests
import pandas as pd
import numpy as np

NUM_SERVE_REQUESTS = 10

def send_requests(df: pd.DataFrame, label: np.array):
    for i in range(NUM_SERVE_REQUESTS):
        one_row = df.iloc[[i]].to_dict()
        serve_result = requests.post(endpoint_uri, data=json.dumps(one_row), headers={"Content-Type": "application/json"}).json()
        print(
            f"request{i} prediction: {serve_result[0]['predictions']} "
            f"- label: {str(label[i])}"
        )
        
send_requests(test_df, test_label)

request0 prediction: [0.0001859385083662346] - label: True


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:06:39,969 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5161.2ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:06:39,958 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5147.3ms


request1 prediction: [2.9279172508722695e-07] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:06:45,059 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5088.4ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:06:45,057 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5084.3ms


request2 prediction: [3.692715324632445e-07] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:06:50,174 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5109.4ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:06:50,172 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5105.8ms


request3 prediction: [5.959282134426758e-08] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:06:55,263 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5082.7ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:06:55,261 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5079.2ms


request4 prediction: [8.020435871003428e-08] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:00,354 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5086.0ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:00,352 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5082.9ms


request5 prediction: [1.6037300554216927e-07] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:05,445 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5087.1ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:05,444 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5084.1ms


request6 prediction: [1.2155658168921946e-07] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:10,547 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5097.7ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:10,546 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5094.6ms


request7 prediction: [6.367843070620438e-07] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:15,661 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5108.4ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:15,659 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5104.9ms


request8 prediction: [8.011139840391479e-08] - label: False


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:20,771 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5104.3ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:20,769 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5100.5ms


request9 prediction: [1.0172703696298413e-05] - label: True


(HTTPProxyActor pid=32379) INFO 2022-12-19 11:07:25,882 http_proxy 10.0.7.140 http_proxy.py:361 - POST / 200 5104.9ms
(ServeReplica:Model pid=32425) INFO 2022-12-19 11:07:25,880 Model Model#rgTZgm replica.py:505 - HANDLE __call__ OK 5100.8ms
