In [1]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

from functools import partial

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

import torchvision
from torchvision import datasets
from torchvision import transforms

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = torch.device("cpu" if torch.cuda.device_count() < 1 else "cuda:0")
batch_size = 4
CLASSES = 2
DIR = os.getcwd()
EPOCHS = 5
N_TRAIN_EXAMPLES = batch_size * 30
N_VALID_EXAMPLES = batch_size * 10

print(f"Device: {DEVICE}\nBatch size: {batch_size}\nClasses: {CLASSES}\n\
Dir: {DIR}\nEpochs: {EPOCHS}\n\
Number of training examples: {N_TRAIN_EXAMPLES}\n\
Number of validation examples: {N_VALID_EXAMPLES}")

Device: cuda:0
Batch size: 4
Classes: 2
Dir: /home/david/Documents/iVision/patch-1
Epochs: 5
Number of training examples: 120
Number of validation examples: 40


In [3]:
model = torchvision.models.efficientnet_b0(weights=torchvision.models.EfficientNet_B0_Weights.DEFAULT)
model.classifier[1] = nn.Linear(model.classifier[1].in_features, 2)
print(model)

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [4]:
from torchvision import transforms as T
img_size = 224

transform = T.Compose([
                T.Resize((img_size,img_size)),
                T.ToTensor(),
                T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
            ])

In [5]:
from torch.utils.data import DataLoader
from configs import Inputs
from utils.augmentations import get_transforms
from utils.data import FullRadiographSexDataset

val_dataset = FullRadiographSexDataset(root_dir=Inputs.DATASET_DIR,
                                       fold_nums=Inputs().val_folds,
                                       transforms=get_transforms(Inputs(), subset=["train"]))

val_dataloader = DataLoader(val_dataset,
                            batch_size=batch_size,
                            shuffle=False,
                            num_workers=0)

train_dataset = FullRadiographSexDataset(root_dir=Inputs.DATASET_DIR,
                                         fold_nums=Inputs().train_folds,
                                         transforms=get_transforms(Inputs(), subset=["train"]))

train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=False,
                              num_workers=0)

Using only horizontal flip augmentation.
Using only horizontal flip augmentation.


In [6]:
config = {
    "lr": tune.loguniform(1e-5, 1e-1),
    "optimizer_name": tune.choice(["Adam", "RMSprop", "SGD"])
}

In [7]:
def objective(config):

    # Gerar o modelo
    #model = define_model(trial).to(DEVICE)
    model = torchvision.models.efficientnet_b0(weights=torchvision.models.EfficientNet_B0_Weights.IMAGENET1K_V1)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, 2)
    model = model.to(DEVICE)

    # Gerar optimizer
    optimizer_name = config['optimizer_name']
    lr = config["lr"]
    
    print("opt_name:", optimizer_name, "\nlr:", lr)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    train_loader, valid_loader = train_dataloader, val_dataloader

    criterion = F.nll_loss

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(train_dataloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
            running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(val_dataloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1
        
        accuracy= correct / total

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=loss, accuracy=accuracy)

    return accuracy

In [8]:
scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=EPOCHS,
        grace_period=1,
        reduction_factor=2)

reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])

result = tune.run(partial(objective),
                  resources_per_trial={"cpu": 0, "gpu": 1},
                  config=config,
                  num_samples=N_TRAIN_EXAMPLES,
                  scheduler=scheduler,
                  progress_reporter=reporter,
)

2022-12-16 18:17:05,804	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


== Status ==
Current time: 2022-12-16 18:17:08 (running for 00:00:00.34)
Memory usage on this node: 7.6/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 1.0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 16/120 (15 PENDING, 1 RUNNING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+--------------------+-------------+------------------|
| objective_ff121_00000 | RUNNING  | 192.168.1.20:19167 | 0.0336051   | RMSprop          |
| objective_ff121_00001 | PENDING  |                    | 0.0158522   | RMSprop          |
| objective_ff121_00002 | PENDING  |                    | 0.0491941   | SGD              |
| objectiv

2022-12-16 18:17:15,290	ERROR trial_runner.py:1088 -- Trial objective_ff121_00000: Error processing event.
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=19167, ip=192.168.1.20, repr=func)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/tmp/ipykernel_17478/4035745101.py", line 32, in objective
  File "/home/david/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/david/.local/lib/python3.10/site-packages/torchvision/models/efficientnet.py",

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
objective_ff121_00000,2022-12-16_18-17-12,9f5385acbe3d4961b4eef2310a24ee3a,arch-pc,192.168.1.20,19167,1671225432,ff121_00000
objective_ff121_00001,2022-12-16_18-17-20,5c75179660d143c5ae5f5dfcb44ef3b3,arch-pc,192.168.1.20,19278,1671225440,ff121_00001
objective_ff121_00002,2022-12-16_18-17-28,0a8376ff118b412a9d18d7d3dbfbd78c,arch-pc,192.168.1.20,19376,1671225448,ff121_00002
objective_ff121_00003,2022-12-16_18-17-36,f5954d0f41ba4f34a3b8a306516d7486,arch-pc,192.168.1.20,19469,1671225456,ff121_00003


== Status ==
Current time: 2022-12-16 18:17:15 (running for 00:00:07.14)
Memory usage on this node: 8.7/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 17/120 (1 ERROR, 16 PENDING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+--------------------+-------------+------------------|
| objective_ff121_00001 | PENDING  |                    | 0.0158522   | RMSprop          |
| objective_ff121_00002 | PENDING  |                    | 0.0491941   | SGD              |
| objective_ff121_00003 | PENDING  |                    | 0.00128543  | RMSprop          |
| objective_ff

2022-12-16 18:17:23,487	ERROR trial_runner.py:1088 -- Trial objective_ff121_00001: Error processing event.
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=19278, ip=192.168.1.20, repr=func)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/tmp/ipykernel_17478/4035745101.py", line 32, in objective
  File "/home/david/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/david/.local/lib/python3.10/site-packages/torchvision/models/efficientnet.py",

== Status ==
Current time: 2022-12-16 18:17:28 (running for 00:00:20.81)
Memory usage on this node: 8.0/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 1.0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 19/120 (2 ERROR, 16 PENDING, 1 RUNNING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+--------------------+-------------+------------------|
| objective_ff121_00002 | RUNNING  | 192.168.1.20:19376 | 0.0491941   | SGD              |
| objective_ff121_00003 | PENDING  |                    | 0.00128543  | RMSprop          |
| objective_ff121_00004 | PENDING  |                    | 0.000577491 | Adam             |
|

2022-12-16 18:17:30,343	ERROR trial_runner.py:1088 -- Trial objective_ff121_00002: Error processing event.
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=19376, ip=192.168.1.20, repr=func)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/tmp/ipykernel_17478/4035745101.py", line 32, in objective
  File "/home/david/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/david/.local/lib/python3.10/site-packages/torchvision/models/efficientnet.py",

== Status ==
Current time: 2022-12-16 18:17:35 (running for 00:00:27.81)
Memory usage on this node: 7.5/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 1.0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 20/120 (3 ERROR, 16 PENDING, 1 RUNNING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+--------------------+-------------+------------------|
| objective_ff121_00003 | RUNNING  | 192.168.1.20:19469 | 0.00128543  | RMSprop          |
| objective_ff121_00004 | PENDING  |                    | 0.000577491 | Adam             |
| objective_ff121_00005 | PENDING  |                    | 0.00598084  | Adam             |
|

2022-12-16 18:17:39,304	ERROR trial_runner.py:1088 -- Trial objective_ff121_00003: Error processing event.
ray.exceptions.RayTaskError(OutOfMemoryError): [36mray::ImplicitFunc.train()[39m (pid=19469, ip=192.168.1.20, repr=func)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/trainable.py", line 367, in train
    raise skipped from exception_cause(skipped)
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 335, in entrypoint
    return self._trainable_func(
  File "/home/david/.local/lib/python3.10/site-packages/ray/tune/trainable/function_trainable.py", line 652, in _trainable_func
    output = fn()
  File "/tmp/ipykernel_17478/4035745101.py", line 32, in objective
  File "/home/david/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/david/.local/lib/python3.10/site-packages/torchvision/models/efficientnet.py",

== Status ==
Current time: 2022-12-16 18:17:44 (running for 00:00:36.81)
Memory usage on this node: 6.6/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 1.0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 21/120 (4 ERROR, 16 PENDING, 1 RUNNING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+--------------------+-------------+------------------|
| objective_ff121_00004 | RUNNING  | 192.168.1.20:19573 | 0.000577491 | Adam             |
| objective_ff121_00005 | PENDING  |                    | 0.00598084  | Adam             |
| objective_ff121_00006 | PENDING  |                    | 8.8022e-05  | RMSprop          |
|



[2m[36m(func pid=19573)[0m [1,    45] loss: -0.337
[2m[36m(func pid=19573)[0m [1,    46] loss: -0.297
[2m[36m(func pid=19573)[0m [1,    47] loss: -0.260
[2m[36m(func pid=19573)[0m [1,    48] loss: -0.303
[2m[36m(func pid=19573)[0m [1,    49] loss: -0.328
[2m[36m(func pid=19573)[0m [1,    50] loss: -0.289
== Status ==
Current time: 2022-12-16 18:18:10 (running for 00:01:01.86)
Memory usage on this node: 7.2/11.6 GiB 
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/8 CPUs, 1.0/1 GPUs, 0.0/2.99 GiB heap, 0.0/1.49 GiB objects (0.0/1.0 accelerator_type:G)
Result logdir: /home/david/ray_results/objective_2022-12-16_18-17-07
Number of trials: 21/120 (4 ERROR, 16 PENDING, 1 RUNNING)
+-----------------------+----------+--------------------+-------------+------------------+
| Trial name            | status   | loc                |          lr | optimizer_name   |
|-----------------------+----------+-----