In [1]:
import os
from sklearn import datasets
import numpy as np

import torch
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer, LightningDataModule
from torchmetrics import Accuracy
from torch import nn
from torch.nn import functional as F

from torch.utils.data import DataLoader, random_split
from torchvision import transforms

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

import mlflow

from omegaconf import DictConfig, ListConfig, OmegaConf
import hydra

import optuna

# Writer@MLFlowの実装

In [3]:
# 低レベルAPIでのMLFlowの使用

class MlflowWriter():
    def __init__(self, experiment_name):
        self.client = mlflow.tracking.MlflowClient()

        # 新規 experiment の作成
        try: 
            self.experiment_id = self.client.create_experiment(experiment_name)
        except Exception as e:
            print(e)
            self.experiment_id = self.client.get_experiment_by_name(experiment_name).experiment_id

        self.experiment = self.client.get_experiment(self.experiment_id)
        print("New experiment started")
        print(f"Name: {self.experiment.name}")
        print(f"Experiment_id: {self.experiment.experiment_id}")
        #print(f"Artifact Location: {self.experiment.artifact_location}")
    
    # 新規RUNの作成
    def create_new_run(self, tags=None):
        self.run = self.client.create_run(self.experiment_id, tags=tags)
        self.run_id = self.run.info.run_id
        #print(f"New run started: {tags['mlflow.runName']}")

    # OmegaConf形式のparamsを記録する
    def log_params_from_omegaconf_dict(self, params):
        for param_name, element in params.items():
            self._explore_recursive(param_name, element)
    def _explore_recursive(self, parent_name, element):
        if isinstance(element, DictConfig):
            for k, v in element.items():
                if isinstance(v, DictConfig) or isinstance(v, ListConfig):
                    self._explore_recursive(f'{parent_name}.{k}', v)
                else:
                    self.client.log_param(self.run_id, f'{parent_name}.{k}', v)
        elif isinstance(element, ListConfig):
            for i, v in enumerate(element):
                self.client.log_param(self.run_id, f'{parent_name}.{i}', v)
        else:
            self.client.log_param(self.run_id, f'{parent_name}', element)

    # 通常形式での保存メソッドのラッパー
    def log_param(self, key, value):
        self.client.log_param(self.run_id, key, value)
    def log_metric(self, key, value):
        self.client.log_metric(self.run_id, key, value)
    def log_metric_step(self, key, value, step): #stepアリの場合
        self.client.log_metric(self.run_id, key, value, step=step)
    def log_artifact(self, local_path):
        self.client.log_artifact(self.run_id, local_path)
    def log_dict(self, dictionary, file):
        self.client.log_dict(self.run_id, dictionary, file)
    def log_figure(self, figure, file):
        self.client.log_figure(self.run_id, figure, file)

    # 必ず終了時に呼び出す
    def set_terminated(self):
        self.client.set_terminated(self.run_id)

# DataModuleの準備

In [4]:
# 1. DataSetの作成
#   ここは従来通り、任意のデータに対応するDatasetを作成する
class IrisDataset(torch.utils.data.Dataset):
    def __init__(self, transforms=None):
        super().__init__()
        iris = datasets.load_iris()
        self.X, self.y = iris["data"], iris["target"]
        self.transforms = transforms

    def __getitem__(self, idx):
        data, label = self.X[idx], self.y[idx]
        if self.transforms is not None:
            data  = self.transforms(data)
            label = self.transforms(label)
        return data, label

    def __len__(self):
        return self.X.shape[0]

In [5]:
# 2. pl.DataModuleの準備
#   DataLoadersを作成するpl.DataModuleを作成する

class PLIrisData(pl.LightningDataModule):
    def __init__(self, BATCH_SIZE=16):
        super().__init__()
        self.batch_size = BATCH_SIZE
        self.transforms=None
         #self.transforms = transforms.Compose( [ transforms.ToTensor() ] )  #画像などで使う
    
    def setup(self, stage=None): #stageの引数は必須　
        all_data = IrisDataset(transforms=self.transforms)
        self.trn_data, self.val_data = random_split(all_data, [120,30])
    
    def train_dataloader(self):
        return DataLoader( dataset=self.trn_data, batch_size=self.batch_size ,shuffle=True)
    
    def val_dataloader(self):
        return DataLoader( dataset=self.val_data, batch_size=self.batch_size ,shuffle=False)

# Modelの作成

In [6]:
# 1. 従来通りのModelを作成する
#    for Optuna コンストラクタの引数に Trial　を追加

# add trial
class IrisNet(nn.Module):
    def __init__(self, trial, cfg):
        super().__init__()
        
        ### Optuna ###
        self.hidden_size  = trial.suggest_int( **cfg.model.hidden_size )
        self.x1   = nn.Linear(in_features=4, out_features=self.hidden_size )

        self.act1 = nn.ReLU()
        self.x2   = nn.Linear(in_features=self.hidden_size, out_features=3)
        self.act2 = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.x1(x)
        x = self.act1(x)
        x = self.x2(x)
        x = self.act2(x)
        return x

In [7]:
#2. train/valid stepを設定する、pl.moduleを作成する

class PLIrisModel(pl.LightningModule):
    # add trial
    def __init__(self, trial, cfg: DictConfig, experiment_name="test1"):
        super().__init__()
        self.cfg     = cfg
        self.trial   = trial ### added !!!

        self.net     = IrisNet(trial=trial, cfg=cfg) ### added trial !!!
        self.mtrics  = Accuracy()

        ### MLFlow ###
        self.writer = MlflowWriter(experiment_name=experiment_name)
        self.writer.create_new_run()
        self.writer.log_params_from_omegaconf_dict(cfg)

    def forward(self, x):
        return self.net(x.float())

    def training_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        loss = F.nll_loss(pred, y)
        batch_loss = loss * x.size(0)
        return {"loss": loss, "y": y, "pred": pred.detach(), "batch_loss": batch_loss.detach()}
    
    def training_epoch_end(self, train_step_outputs):
        preds      = torch.cat( [trn["pred"] for trn in train_step_outputs], dim=0 )
        ys         = torch.cat( [trn["y"] for trn in train_step_outputs], dim=0 )
        epoch_loss = sum( [trn["batch_loss"] for trn in train_step_outputs] ) / ys.size(0)

        acc = self.mtrics(preds, ys)
        print('-------- Current Epoch {} --------'.format(self.current_epoch + 1))
        print('train Loss: {:.4f} train Acc: {:.4f}'.format(epoch_loss, acc))

        ### MLFlow ###
        self.writer.log_metric("trn_loss", float(epoch_loss) )
        self.writer.log_metric("trn_acc",  float(acc))

    def validation_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        loss = F.nll_loss(pred, y)
        batch_loss = loss * x.size(0)
        return {"y": y, "pred": pred.detach(), "batch_loss": batch_loss.detach()}
    
    def validation_epoch_end(self, valid_step_outputs):
        preds      = torch.cat( [val["pred"] for val in valid_step_outputs], dim=0 )
        ys         = torch.cat( [val["y"] for val in valid_step_outputs], dim=0 )
        epoch_loss = sum( [val["batch_loss"] for val in valid_step_outputs] ) / ys.size(0)

        acc = self.mtrics(preds, ys)
        print('-------- Current Epoch {} --------'.format(self.current_epoch + 1))
        print('valid Loss: {:.4f} valid Acc: {:.4f}'.format(epoch_loss, acc))

        ### for CallBacks ###
        self.log("val_loss", epoch_loss)
        self.log("val_acc", acc)
        
        ### MLFlow ###
        self.writer.log_metric("val_loss", float(epoch_loss) )
        self.writer.log_metric("val_acc",  float(acc))
    
    def configure_optimizers(self):
        #lr         = self.cfg.optim.lr
        #optim_name = self.cfg.optim.optim_name
        
        ### Optuna ###
        lr         = self.trial.suggest_loguniform( **self.cfg.optim.lr )
        optim_name = self.trial.suggest_categorical( **self.cfg.optim.optim_name )
        optimizer  = getattr(torch.optim, optim_name)(self.parameters(), lr=lr)
        return optimizer

In [8]:
# CallBacksの設定

# モデルチェックポイント val_lossが最低となるモデルを保存
checkpoint_callback = ModelCheckpoint(
        dirpath=f"./models",
        filename="best-checkpoint",
        monitor="val_loss",
        mode="min",
    )

# EarlyStop 一定エポックval_lossの改善がなければ学習打ち切り
early_stopping_callback = EarlyStopping(
        monitor="val_loss", patience=3, verbose=True, mode="min"
    )

In [9]:
cfg = OmegaConf.load("configs/config.yaml")

def objective(trial):
    model  = PLIrisModel(trial=trial, cfg=cfg)
    data   = PLIrisData()

    trainer = Trainer(
        gpus=1,
        max_epochs=15,
        callbacks=[ checkpoint_callback, early_stopping_callback ],
    )
    trainer.fit(model, data)

    ### MLFlow ###
    model.writer.set_terminated() # 必ず呼び出す！！
    return trainer.callback_metrics["val_acc"].item()

In [10]:
#@hydra.main(config_path="./configs", config_name="config")
#def main(cfg):
def main():
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10, timeout=60 * 1)

    print('Number of finished trials: {}'.format(len(study.trials)))
    print('Best trial:')
    trial = study.best_trial
    print('  Value: {}'.format(trial.value))
    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))

In [11]:
main()

[32m[I 2021-12-09 14:02:23,682][0m A new study created in memory with name: no-name-f5a50a63-53c4-4076-ac8f-f95f6a75dfd0[0m


New experiment started
Name: test1
Experiment_id: 1


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 147   
1 | mtrics | Accuracy | 0     
------------------------------------
147       Trainable params
0         Non-trainable params
147       Total params
0.001     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


-------- Current Epoch 1 --------
valid Loss: -0.3037 valid Acc: 0.2333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: -0.529


-------- Current Epoch 1 --------
valid Loss: -0.5285 valid Acc: 0.5000
-------- Current Epoch 1 --------
train Loss: -0.4920 train Acc: 0.5250


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 2 --------
valid Loss: -0.5213 valid Acc: 0.5000
-------- Current Epoch 2 --------
train Loss: -0.7069 train Acc: 0.7083


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.239 >= min_delta = 0.0. New best score: -0.768


-------- Current Epoch 3 --------
valid Loss: -0.7677 valid Acc: 0.7667
-------- Current Epoch 3 --------
train Loss: -0.7464 train Acc: 0.7333


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.106 >= min_delta = 0.0. New best score: -0.874


-------- Current Epoch 4 --------
valid Loss: -0.8737 valid Acc: 1.0000
-------- Current Epoch 4 --------
train Loss: -0.7842 train Acc: 0.7917


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 5 --------
valid Loss: -0.8570 valid Acc: 0.9667
-------- Current Epoch 5 --------
train Loss: -0.8781 train Acc: 0.9583


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 6 --------
valid Loss: -0.8264 valid Acc: 0.9000
-------- Current Epoch 6 --------
train Loss: -0.8984 train Acc: 0.9500


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.079 >= min_delta = 0.0. New best score: -0.952


-------- Current Epoch 7 --------
valid Loss: -0.9522 valid Acc: 1.0000
-------- Current Epoch 7 --------
train Loss: -0.8999 train Acc: 0.9000


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.020 >= min_delta = 0.0. New best score: -0.972


-------- Current Epoch 8 --------
valid Loss: -0.9718 valid Acc: 1.0000
-------- Current Epoch 8 --------
train Loss: -0.9279 train Acc: 0.9500


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.006 >= min_delta = 0.0. New best score: -0.978


-------- Current Epoch 9 --------
valid Loss: -0.9783 valid Acc: 1.0000
-------- Current Epoch 9 --------
train Loss: -0.9276 train Acc: 0.9417


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 10 --------
valid Loss: -0.9341 valid Acc: 0.9333
-------- Current Epoch 10 --------
train Loss: -0.9298 train Acc: 0.9417


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 11 --------
valid Loss: -0.7766 valid Acc: 0.7333
-------- Current Epoch 11 --------
train Loss: -0.8464 train Acc: 0.8417


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.005 >= min_delta = 0.0. New best score: -0.983


-------- Current Epoch 12 --------
valid Loss: -0.9829 valid Acc: 1.0000
-------- Current Epoch 12 --------
train Loss: -0.9289 train Acc: 0.9417


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 13 --------
valid Loss: -0.9685 valid Acc: 0.9667
-------- Current Epoch 13 --------
train Loss: -0.9633 train Acc: 0.9750


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 14 --------
valid Loss: -0.9599 valid Acc: 0.9667
-------- Current Epoch 14 --------
train Loss: -0.9515 train Acc: 0.9583


Validating: 0it [00:00, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.0. New best score: -0.985
[32m[I 2021-12-09 14:02:40,064][0m Trial 0 finished with value: 1.0 and parameters: {'hidden_size': 18, 'lr': 0.06812094178489812, 'optimizer': 'Adam'}. Best is trial 0 with value: 1.0.[0m


-------- Current Epoch 15 --------
valid Loss: -0.9848 valid Acc: 1.0000
-------- Current Epoch 15 --------
train Loss: -0.9514 train Acc: 0.9500


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 99    
1 | mtrics | Accuracy | 0     
------------------------------------
99        Trainable params
0         Non-trainable params
99        Total params
0.000     Total estimated model params size (MB)


Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


-------- Current Epoch 1 --------
valid Loss: -0.3364 valid Acc: 0.4333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3198 valid Acc: 0.2000
-------- Current Epoch 1 --------
train Loss: -0.3562 train Acc: 0.4500


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 2 --------
valid Loss: -0.3129 valid Acc: 0.2000
-------- Current Epoch 2 --------
train Loss: -0.3975 train Acc: 0.3667


Validating: 0it [00:00, ?it/s]

-------- Current Epoch 3 --------
valid Loss: -0.3281 valid Acc: 0.2000


Monitored metric val_loss did not improve in the last 3 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:40,949][0m Trial 1 finished with value: 0.20000000298023224 and parameters: {'hidden_size': 12, 'lr': 0.0700397407945558, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 187   
1 | mtrics | Accuracy | 0     
------------------------------------
187       Trainable params
0         Non-trainable params
187       Total params
0.001     Total estimated model params size (MB)


-------- Current Epoch 3 --------
train Loss: -0.4154 train Acc: 0.3667
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3577 valid Acc: 0.3333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 4 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:41,312][0m Trial 2 finished with value: 0.3333333432674408 and parameters: {'hidden_size': 23, 'lr': 0.0003525286012729594, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 139   
1 | mtrics | Accuracy | 0     
------------------------------------
139       Trainable params
0         Non-trainable params
139       Total params
0.001     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.3577 valid Acc: 0.3333
-------- Current Epoch 1 --------
train Loss: -0.3513 train Acc: 0.3333
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.4588 valid Acc: 0.4667


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:41,756][0m Trial 3 finished with value: 0.46666666865348816 and parameters: {'hidden_size': 17, 'lr': 0.008084044320514941, 'optimizer': 'Adam'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 91    
1 | mtrics | Accuracy | 0     
------------------------------------
91        Trainable params
0         Non-trainable params
91        Total params
0.000     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.4715 valid Acc: 0.4667
-------- Current Epoch 1 --------
train Loss: -0.3086 train Acc: 0.3000
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:05, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


-------- Current Epoch 1 --------
valid Loss: -0.3333 valid Acc: 0.3333


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 6 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:47,793][0m Trial 4 finished with value: 0.3333333432674408 and parameters: {'hidden_size': 11, 'lr': 0.0035679428857446354, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 99    
1 | mtrics | Accuracy | 0     
------------------------------------
99        Trainable params
0         Non-trainable params
99        Total params
0.000     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.3377 valid Acc: 0.3333
-------- Current Epoch 1 --------
train Loss: -0.3111 train Acc: 0.3333
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3217 valid Acc: 0.2333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 7 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:48,168][0m Trial 5 finished with value: 0.23333333432674408 and parameters: {'hidden_size': 12, 'lr': 0.0008799350368770233, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 147   
1 | mtrics | Accuracy | 0     
------------------------------------
147       Trainable params
0         Non-trainable params
147       Total params
0.001     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.3221 valid Acc: 0.2333
-------- Current Epoch 1 --------
train Loss: -0.3662 train Acc: 0.3583
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3267 valid Acc: 0.3333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3267 valid Acc: 0.3333


Monitored metric val_loss did not improve in the last 8 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:48,513][0m Trial 6 finished with value: 0.3333333432674408 and parameters: {'hidden_size': 18, 'lr': 0.00011943711431321242, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 187   
1 | mtrics | Accuracy | 0     
------------------------------------
187       Trainable params
0         Non-trainable params
187       Total params
0.001     Total estimated model params size (MB)


-------- Current Epoch 1 --------
train Loss: -0.3335 train Acc: 0.3333
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.2710 valid Acc: 0.2667


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 9 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:48,917][0m Trial 7 finished with value: 0.2666666805744171 and parameters: {'hidden_size': 23, 'lr': 2.0000765889904796e-05, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 147   
1 | mtrics | Accuracy | 0     
------------------------------------
147       Trainable params
0         Non-trainable params
147       Total params
0.001     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.2710 valid Acc: 0.2667
-------- Current Epoch 1 --------
train Loss: -0.2921 train Acc: 0.3500
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3325 valid Acc: 0.3333


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 10 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:49,294][0m Trial 8 finished with value: 0.6333333253860474 and parameters: {'hidden_size': 18, 'lr': 0.05346128654749146, 'optimizer': 'SGD'}. Best is trial 0 with value: 1.0.[0m
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type     | Params
------------------------------------
0 | net    | IrisNet  | 91    
1 | mtrics | Accuracy | 0     
------------------------------------
91        Trainable params
0         Non-trainable params
91        Total params
0.000     Total estimated model params size (MB)


-------- Current Epoch 1 --------
valid Loss: -0.3857 valid Acc: 0.6333
-------- Current Epoch 1 --------
train Loss: -0.3563 train Acc: 0.4000
Experiment 'test1' already exists.
New experiment started
Name: test1
Experiment_id: 1


Validation sanity check: 0it [00:00, ?it/s]

-------- Current Epoch 1 --------
valid Loss: -0.3555 valid Acc: 0.3667


  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."
  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 11 records. Best score: -0.985. Signaling Trainer to stop.
[32m[I 2021-12-09 14:02:49,685][0m Trial 9 finished with value: 0.36666664481163025 and parameters: {'hidden_size': 11, 'lr': 0.0002633098324676185, 'optimizer': 'Adam'}. Best is trial 0 with value: 1.0.[0m


-------- Current Epoch 1 --------
valid Loss: -0.3580 valid Acc: 0.3667
-------- Current Epoch 1 --------
train Loss: -0.3274 train Acc: 0.3250
Number of finished trials: 10
Best trial:
  Value: 1.0
  Params: 
    hidden_size: 18
    lr: 0.06812094178489812
    optimizer: Adam


In [11]:
!mlflow ui --port 5000
""" on Colab
# mlflow ui --port 5000
# のコマンドを裏で実行して、裏で実行して、MLFlow UIをバックグラウンドで実行
get_ipython().system_raw("mlflow ui --port 5000 &")

# Colab上の上の5000番ポートののHTTPサービスをサービスをngrokに登録
from pyngrok import ngrok
ngrok.kill()
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)

# 登録されたパブリックパブリックURLを表示
print("MLFlow UI", ngrok_tunnel.public_url)

# ngrokの終了
ngrok.kill()
""" 

[2021-12-09 13:49:30 +0000] [1512] [INFO] Starting gunicorn 20.1.0
[2021-12-09 13:49:30 +0000] [1512] [INFO] Listening at: http://127.0.0.1:5000 (1512)
[2021-12-09 13:49:30 +0000] [1512] [INFO] Using worker: sync
[2021-12-09 13:49:30 +0000] [1515] [INFO] Booting worker with pid: 1515
^C
[2021-12-09 13:50:22 +0000] [1512] [INFO] Handling signal: int
[2021-12-09 13:50:22 +0000] [1515] [INFO] Worker exiting (pid: 1515)


' on Colab\n# mlflow ui --port 5000\n# のコマンドを裏で実行して、裏で実行して、MLFlow UIをバックグラウンドで実行\nget_ipython().system_raw("mlflow ui --port 5000 &")\n\n# Colab上の上の5000番ポートののHTTPサービスをサービスをngrokに登録\nfrom pyngrok import ngrok\nngrok.kill()\nngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)\n\n# 登録されたパブリックパブリックURLを表示\nprint("MLFlow UI", ngrok_tunnel.public_url)\n\n# ngrokの終了\nngrok.kill()\n'