In [1]:
from typing import Any, Dict, List, Optional, Tuple, Union
import logging

import numpy as np

import albumentations as A
import cv2
from PIL.Image import Image
import pandas as pd
import ray
from ray.data.datasource.partitioning import Partitioning
from ray.data.preprocessors import LabelEncoder

import torch
from torch import Tensor
from torch.utils.data import Dataset
import torchvision
import torchvision.transforms as T

logger = logging.getLogger()


In [2]:
TransformTypes = Optional[Union[A.Compose, T.Compose]]
EPOCHS = 2
BATCH_SIZE = 10
LEARNING_RATE = 0.003
TRAIN_DATA_PATH = "../data/train/"
TEST_DATA_PATH = "../data/val/"


TRANSFORM_IMG = T.Compose(
    [
        T.Resize(224),
        T.CenterCrop(224),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)


In [3]:
# train_data = torchvision.datasets.ImageFolder(
#     root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG
# )
# train_data_loader = torch.utils.data.DataLoader(
#     train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4
# )
# test_data = torchvision.datasets.ImageFolder(
#     root=TEST_DATA_PATH, transform=TRANSFORM_IMG
# )
# test_data_loader = torch.utils.data.DataLoader(
#     test_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=4
# )


In [4]:
train_dataset = torchvision.datasets.ImageFolder(
    root=TRAIN_DATA_PATH, transform=TRANSFORM_IMG
)
test_dataset = torchvision.datasets.ImageFolder(
    root=TEST_DATA_PATH, transform=TRANSFORM_IMG
)

train_dataset: ray.data.Dataset = ray.data.from_torch(train_dataset)
test_dataset: ray.data.Dataset = ray.data.from_torch(test_dataset)

2023-03-17 14:15:43,055	INFO worker.py:1544 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


In [5]:
# train_partitioning = Partitioning(
#     "dir", field_names=["label"], base_dir=TRAIN_DATA_PATH
# )
# train_dataset = ray.data.read_images(
#     TRAIN_DATA_PATH, size=(224, 224), partitioning=train_partitioning
# )
# test_partitioning = Partitioning("dir", field_names=["label"], base_dir=TEST_DATA_PATH)
# test_dataset = ray.data.read_images(
#     TEST_DATA_PATH, size=(224, 224), partitioning=test_partitioning
# )


In [6]:
# encoder = LabelEncoder(label_column="label")
# encoder.fit_transform(train_dataset)
# encoder.transform(test_dataset)


[2m[36m(raylet)[0m Spilled 2996 MiB, 201 objects, write throughput 2357 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.


In [7]:
def convert_batch_to_numpy(batch: Tuple[Image, int]) -> Dict[str, np.ndarray]:
    images = np.stack([np.array(image) for image, _ in batch])
    labels = np.array([label for _, label in batch])
    return {"image": images, "label": labels}


train_dataset = train_dataset.map_batches(convert_batch_to_numpy).fully_executed()
test_dataset = test_dataset.map_batches(convert_batch_to_numpy).fully_executed()

2023-03-17 14:15:48,129	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(convert_batch_to_numpy)]
MapBatches(convert_batch_to_numpy):   8%|▊         | 17/201 [00:15<01:55,  1.60it/s][2m[36m(raylet)[0m Spilled 5060 MiB, 356 objects, write throughput 2350 MiB/s.
MapBatches(convert_batch_to_numpy): 100%|██████████| 201/201 [00:16<00:00, 12.38it/s]
2023-03-17 14:16:04,459	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(convert_batch_to_numpy)]
MapBatches(convert_batch_to_numpy): 100%|██████████| 16/16 [00:00<00:00, 339.40it/s]


In [8]:
test_dataset.show(1)#[0]['image']

{'image': array([[[-0.01155927,  0.00556549, -0.08005828, ...,  0.31381115,
          0.33093593,  0.33093593],
        [-0.02868402, -0.04580877, -0.06293353, ...,  0.33093593,
          0.33093593,  0.31381115],
        [-0.06293353, -0.04580877, -0.06293353, ...,  0.3651854 ,
          0.3651854 ,  0.34806067],
        ...,
        [-2.0151556 , -1.7754089 , -1.5870366 , ..., -1.1246684 ,
         -1.3472902 , -1.5014129 ],
        [-2.0322802 , -1.7925336 , -1.6212862 , ..., -1.1589178 ,
         -1.3815396 , -1.5185376 ],
        [-2.0494049 , -1.6384109 , -1.5699118 , ..., -1.1760426 ,
         -1.3815396 , -1.5356624 ]],

       [[ 0.11764706,  0.13515405,  0.04761905, ...,  0.45028022,
          0.4677872 ,  0.4677872 ],
        [ 0.10014006,  0.08263306,  0.06512605, ...,  0.4677872 ,
          0.4677872 ,  0.45028022],
        [ 0.06512605,  0.08263306,  0.06512605, ...,  0.50280124,
          0.50280124,  0.48529422],
        ...,
        [-1.9306722 , -1.6855742 , -1.492997

In [9]:
# from PIL import Image
# Image.fromarray(test_dataset.take(1)[0]['image'])

In [10]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self, num_classes: int = 1000, dropout: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(16 * 7 * 7, 120),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(120, 84),
            nn.ReLU(True),
            nn.Dropout(p=dropout),
            nn.Linear(84, num_classes),
        )

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.avgpool(x)
        x = torch.flatten(x, 1)  # flatten all dimensions except batch
        x = self.classifier(x)
        return x


In [11]:
from ray import train
from ray.air import session, Checkpoint
from ray.train.torch import TorchCheckpoint
import torch.nn as nn
import torch.optim as optim
import torchvision


def train_loop_per_worker(config):
    model = train.torch.prepare_model(Net(num_classes=2))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    train_dataset_shard = session.get_dataset_shard("train")

    for epoch in range(2):
        running_loss = 0.0
        train_dataset_batches = train_dataset_shard.iter_torch_batches(
            batch_size=config["batch_size"], device=train.torch.get_device()
        )
        for i, batch in enumerate(train_dataset_batches):
            # get the inputs and labels
            inputs, labels = batch["image"], batch["label"]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}")
                running_loss = 0.0

        metrics = dict(running_loss=running_loss)
        checkpoint = TorchCheckpoint.from_state_dict(model.state_dict())
        session.report(metrics, checkpoint=checkpoint)


2023-03-17 14:16:04,698	INFO instantiator.py:21 -- Created a temporary directory at /var/folders/p4/kcmtkxw53z54k341vwwykts80000gn/T/tmpeena33u7
2023-03-17 14:16:04,699	INFO instantiator.py:76 -- Writing /var/folders/p4/kcmtkxw53z54k341vwwykts80000gn/T/tmpeena33u7/_remote_module_non_scriptable.py


In [12]:
# from ray.data.preprocessors import TorchVisionPreprocessor

# preprocessor = TorchVisionPreprocessor(columns=["image"], transform=TRANSFORM_IMG)


In [13]:
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

use_gpu = ray.available_resources().get("GPU", 0) >= 2

trainer = TorchTrainer(
    train_loop_per_worker=train_loop_per_worker,
    train_loop_config={"batch_size": 2},
    datasets={"train": train_dataset},
    scaling_config=ScalingConfig(num_workers=2, use_gpu=use_gpu),
    # preprocessor=preprocessor,
)
result = trainer.fit()
latest_checkpoint = result.checkpoint


0,1
Current time:,2023-03-17 14:16:07
Running for:,00:00:02.32
Memory:,10.3/16.0 GiB

Trial name,status,loc
TorchTrainer_32a86_00000,RUNNING,127.0.0.1:15275


[2m[36m(TorchTrainer pid=15275)[0m 2023-03-17 14:16:09,203	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[randomize_block_order]
[2m[36m(RayTrainWorker pid=15276)[0m 2023-03-17 14:16:09,184	INFO config.py:86 -- Setting up process group for: env:// [rank=0, world_size=2]
[2m[36m(RayTrainWorker pid=15276)[0m 2023-03-17 14:16:10,356	INFO train_loop_utils.py:255 -- Moving model to device: cpu
[2m[36m(RayTrainWorker pid=15276)[0m 2023-03-17 14:16:10,357	INFO train_loop_utils.py:315 -- Wrapping provided model in DistributedDataParallel.


Trial name,_time_this_iter_s,_timestamp,_training_iteration,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,node_ip,pid,running_loss,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
TorchTrainer_32a86_00000,26.5209,1679033815,2,2023-03-17_14-16-55,False,,7286415d5e604a12be5ea19ec1342106,mac.local,2,127.0.0.1,15275,367.159,True,48.1177,26.5227,48.1177,1679033815,0,,2,32a86_00000,0.0846431


2023-03-17 14:16:57,305	INFO tune.py:798 -- Total run time: 52.53 seconds (52.51 seconds for the tuning loop).


In [14]:
from ray.train.torch import TorchPredictor
from ray.train.batch_predictor import BatchPredictor

batch_predictor = BatchPredictor.from_checkpoint(
    checkpoint=latest_checkpoint,
    predictor_cls=TorchPredictor,
    model=Net(num_classes=2),
)

outputs: ray.data.Dataset = batch_predictor.predict(
    data=test_dataset,
    dtype=torch.float,
    feature_columns=["image"],
    keep_columns=["label"],
    # We will use GPU if available.
    num_gpus_per_worker=ray.available_resources().get("GPU", 0)
)

2023-03-17 14:16:57,350	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> ActorPoolMapOperator[MapBatches(ScoringWrapper)]
MapBatches(ScoringWrapper), 0 actors:   6%|▋         | 1/16 [00:01<00:27,  1.83s/it]


In [15]:
import numpy as np


def convert_logits_to_classes(df):
    best_class = df["predictions"].map(lambda x: x.argmax())
    df["prediction"] = best_class
    return df[["prediction", "label"]]


predictions = outputs.map_batches(convert_logits_to_classes)

predictions.show(1)


2023-03-17 14:16:59,212	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(convert_logits_to_classes)]
MapBatches(convert_logits_to_classes): 100%|██████████| 1/1 [00:00<00:00, 114.57it/s]

{'prediction': 0, 'label': 0}





In [16]:
def calculate_prediction_scores(df):
    df["correct"] = df["prediction"] == df["label"]
    return df


scores = predictions.map_batches(calculate_prediction_scores)

scores.show(1)

2023-03-17 14:16:59,251	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[MapBatches(calculate_prediction_scores)]
MapBatches(calculate_prediction_scores): 100%|██████████| 1/1 [00:00<00:00, 215.51it/s]

{'prediction': 0, 'label': 0, 'correct': True}





In [17]:
scores.sum(on="correct") / scores.count()

2023-03-17 14:16:59,281	INFO bulk_executor.py:39 -- Executing DAG InputDataBuffer[Input] -> AllToAllOperator[aggregate]
Shuffle Map: 100%|██████████| 1/1 [00:00<00:00, 130.60it/s]
Shuffle Reduce: 100%|██████████| 1/1 [00:00<00:00, 214.87it/s]


0.75

In [18]:
from ray import serve
from ray.serve import PredictorDeployment
from ray.serve.http_adapters import json_to_ndarray


serve.run(
    PredictorDeployment.bind(
        TorchPredictor,
        latest_checkpoint,
        model=Net(num_classes=2),
        http_adapter=json_to_ndarray,
    )
)

[2m[36m(ServeController pid=15295)[0m INFO 2023-03-17 14:17:00,042 controller 15295 http_state.py:129 - Starting HTTP proxy with name 'SERVE_CONTROLLER_ACTOR:SERVE_PROXY_ACTOR-c8e692bf02bc194459e2ad8b498820ed92bde5060beda79e4bcec9f4' on node 'c8e692bf02bc194459e2ad8b498820ed92bde5060beda79e4bcec9f4' listening on '127.0.0.1:8000'
2023-03-17 14:17:00,723	INFO api.py:254 -- Started detached Serve instance in namespace "serve".
2023-03-17 14:17:00,741	INFO client.py:540 -- Updating deployment 'PredictorDeployment'. component=serve deployment=PredictorDeployment
[2m[36m(ServeController pid=15295)[0m INFO 2023-03-17 14:17:00,781 controller 15295 deployment_state.py:1333 - Adding 1 replica to deployment 'PredictorDeployment'.
[2m[36m(HTTPProxyActor pid=15296)[0m INFO:     Started server process [15296]
2023-03-17 14:17:02,761	INFO client.py:555 -- Deployment 'PredictorDeployment' is ready at `http://127.0.0.1:8000/`. component=serve deployment=PredictorDeployment


RayServeSyncHandle(deployment='PredictorDeployment')

In [19]:
image = test_dataset.take(1)[0]["image"]

In [20]:
import requests

payload = {"array": image.tolist(), "dtype": "float32"}
response = requests.post("http://localhost:8000/", json=payload)
response.json()

{'predictions': [0.3135456144809723, -0.3151269555091858]}

[2m[36m(HTTPProxyActor pid=15296)[0m INFO 2023-03-17 14:17:03,149 http_proxy 127.0.0.1 http_proxy.py:373 - POST / 200 124.9ms
[2m[36m(ServeReplica:PredictorDeployment pid=15297)[0m INFO 2023-03-17 14:17:03,147 PredictorDeployment PredictorDeployment#RUznkJ replica.py:518 - HANDLE __call__ OK 117.3ms
