In [2]:
# Work from the project root on Drive so that the relative paths built by the
# training cell (config/train_config.yaml, results/train/<serial>) resolve.
%cd /content/drive/MyDrive/template

/content/drive/MyDrive/template


In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the project directory
# used by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Sanity check: display the current working directory.
%pwd

'/content/drive/My Drive/template'

In [None]:
# NOTE(review): this invocation cannot work. A .ipynb file is a JSON document,
# not Python source, so the interpreter fails on line 1 (see the traceback in
# this cell's output). Run the training cell of this notebook directly, or
# point this command at a plain .py script instead.
!python train.ipynb

Traceback (most recent call last):
  File "/content/drive/MyDrive/template/train.ipynb", line 1, in <module>
    {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train.ipynb","provenance":[],"mount_file_id":"1VP3203moc0kDNHgdf6gloch_AUm5oVTy","authorship_tag":"ABX9TyNYtp0rGvRRhlMqHdG0m/OA"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[{"cell_type":"code","source":["%cd /content/drive/MyDrive/template"],"metadata":{"id":"DNQvYPrRZFiC","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1657214064134,"user_tz":-540,"elapsed":508,"user":{"displayName":"roombi choi","userId":"13120785569977849883"}},"outputId":"0ca56088-0095-40a1-d636-ed353cf91257"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/template\n"]}]},{"cell_type":"code","source":["%pwd"],"metadata":{"colab":{"base_uri":"htt

In [None]:
pip show Pyyaml

Name: PyYAML
Version: 6.0
Summary: YAML parser and emitter for Python
Home-page: https://pyyaml.org/
Author: Kirill Simonov
Author-email: xi@resolvent.net
License: MIT
Location: /usr/local/lib/python3.9/site-packages
Requires: 
Required-by: 


In [5]:
!pip install timm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting timm
  Downloading timm-0.6.5-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 7.5 MB/s 
Installing collected packages: timm
Successfully installed timm-0.6.5


In [11]:
!pip install Pyyaml

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Pyyaml
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 6.9 MB/s 
[?25hInstalling collected packages: Pyyaml
Successfully installed Pyyaml-6.0


In [None]:
# Version of the `python` executable found on the shell PATH. This is not
# necessarily the interpreter running the kernel; compare with sys.executable.
!python -V

Python 3.9.13


In [None]:
import sys

# Version string of the interpreter that runs this kernel. The output recorded
# for this cell is a version string, and the interpreter path is already
# printed by a separate cell, so this cell reports sys.version.
print(sys.version)

3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]


In [None]:
import sys

# Filesystem path of the interpreter that runs this kernel.
print(sys.executable)

/usr/bin/python3


In [3]:
from modules.utils import load_yaml, save_yaml, get_logger

from modules.earlystoppers import EarlyStopper
from modules.recorders import Recorder
from modules.datasets import CowDataset
from modules.trainer import Trainer

#from modules.preprocessor import get_preprocessor
from modules.optimizers import get_optimizer
from modules.metrics import get_metric
from modules.losses import get_loss

from models.utils import get_model

from torch.utils.data import DataLoader
import torch

from datetime import datetime, timezone, timedelta
import numpy as np
import random
import os
import copy


# ---------------------------------------------------------------------------
# Paths and configuration
# ---------------------------------------------------------------------------
# The argument is the literal string "__file__" (not the module attribute), so
# dirname() yields '' and every path below is relative to the working directory.
PROJECT_DIR = os.path.dirname("__file__")

config_path = os.path.join(PROJECT_DIR, 'config', 'train_config.yaml')
config = load_yaml(config_path)

# ---------------------------------------------------------------------------
# Run identifier: wall-clock timestamp in UTC+9
# ---------------------------------------------------------------------------
kst = timezone(timedelta(hours=9))
train_serial = datetime.now(tz=kst).strftime("%Y%m%d_%H%M%S")

# Every run writes its artefacts into its own timestamped directory.
RECORDER_DIR = os.path.join(PROJECT_DIR, 'results', 'train', train_serial)
os.makedirs(RECORDER_DIR, exist_ok=True)

# Dataset root, taken from the config file.
DATA_DIR = config['DIRECTORY']['dataset']

# ---------------------------------------------------------------------------
# Reproducibility: one seed for torch, numpy and the stdlib RNG
# ---------------------------------------------------------------------------
_seed = config['TRAINER']['seed']
torch.manual_seed(_seed)
np.random.seed(_seed)
random.seed(_seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# ---------------------------------------------------------------------------
# Device selection
# ---------------------------------------------------------------------------
# The visible-GPU mask is exported before the CUDA availability query below.
os.environ['CUDA_VISIBLE_DEVICES'] = str(config['TRAINER']['gpu'])
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


if __name__ == '__main__':
    # Training driver: builds the logger, data loaders, model, optimizer, loss,
    # metrics, early stopper, trainer and recorder from `config`, then runs the
    # train/validation loop for up to config['TRAINER']['n_epochs'] epochs,
    # recording one row per epoch and stopping early when the stopper says so.

    # ---- Logger ---------------------------------------------------------
    logger = get_logger(name='train', dir_=RECORDER_DIR, stream=False)
    logger.info(f"Set Logger {RECORDER_DIR}")

    # ---- Data -----------------------------------------------------------
    # Each split reads images from <split>/images and labels from
    # <split>/grade_labels.csv under DATA_DIR.
    train_dataset = CowDataset(img_folder=os.path.join(DATA_DIR, 'train', 'images'),
                               dfpath=os.path.join(DATA_DIR, 'train', 'grade_labels.csv'))
    val_dataset = CowDataset(img_folder=os.path.join(DATA_DIR, 'val', 'images'),
                             dfpath=os.path.join(DATA_DIR, 'val', 'grade_labels.csv'))

    # Validation shares the training loader settings but is never shuffled.
    # NOTE(review): drop_last from the config is applied to validation too, so
    # a trailing partial validation batch is skipped when it is enabled.
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=config['DATALOADER']['batch_size'],
                                  num_workers=config['DATALOADER']['num_workers'],
                                  shuffle=config['DATALOADER']['shuffle'],
                                  pin_memory=config['DATALOADER']['pin_memory'],
                                  drop_last=config['DATALOADER']['drop_last'])
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=config['DATALOADER']['batch_size'],
                                num_workers=config['DATALOADER']['num_workers'],
                                shuffle=False,
                                pin_memory=config['DATALOADER']['pin_memory'],
                                drop_last=config['DATALOADER']['drop_last'])

    logger.info(f"Load data, train:{len(train_dataset)} val:{len(val_dataset)}")

    # ---- Model ----------------------------------------------------------
    # The model name selects both the architecture and its argument section.
    model_name = config['TRAINER']['model']
    model_args = config['MODEL'][model_name]
    model = get_model(model_name=model_name, model_args=model_args).to(device)

    # ---- Optimizer / loss / metrics / early stopping ----------------------
    # get_optimizer returns a factory that is then called with the parameters.
    optimizer = get_optimizer(optimizer_name=config['TRAINER']['optimizer'])
    optimizer = optimizer(params=model.parameters(), lr=config['TRAINER']['learning_rate'])

    loss = get_loss(loss_name=config['TRAINER']['loss'])

    metrics = {metric_name: get_metric(metric_name) for metric_name in config['TRAINER']['metric']}

    early_stopper = EarlyStopper(patience=config['TRAINER']['early_stopping_patience'],
                                 mode=config['TRAINER']['early_stopping_mode'],
                                 logger=logger)

    # ---- Mixed precision (optional) ---------------------------------------
    # `amp` is always bound: None when disabled, the apex module when enabled.
    # apex is imported lazily so it is only required when the option is on.
    amp = None
    if config['TRAINER']['amp']:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # ---- Trainer ----------------------------------------------------------
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      loss=loss,
                      metrics=metrics,
                      device=device,
                      logger=logger,
                      amp=amp,
                      interval=config['LOGGER']['logging_interval'])

    # ---- Recorder ---------------------------------------------------------
    recorder = Recorder(record_dir=RECORDER_DIR,
                        model=model,
                        optimizer=optimizer,
                        scheduler=None,
                        amp=amp,
                        logger=logger)

    # Keep a copy of the exact configuration next to the run's results.
    save_yaml(os.path.join(RECORDER_DIR, 'train_config.yml'), config)

    # ---- Train / validate loop ----------------------------------------------
    n_epochs = config['TRAINER']['n_epochs']
    # Key of the per-epoch row that the early stopper monitors.
    early_stopping_target = config['TRAINER']['early_stopping_target']

    for epoch_index in range(n_epochs):

        # One recorder row per epoch.
        row_dict = {'epoch_index': epoch_index, 'train_serial': train_serial}

        # Train
        print(f"Train {epoch_index}/{n_epochs}")
        logger.info(f"--Train {epoch_index}/{n_epochs}")
        trainer.train(dataloader=train_dataloader, epoch_index=epoch_index, mode='train')

        row_dict['train_loss'] = trainer.loss_mean
        row_dict['train_elapsed_time'] = trainer.elapsed_time

        for metric_str, score in trainer.score_dict.items():
            row_dict[f"train_{metric_str}"] = score
        # Reset the trainer's accumulated state before the validation pass.
        trainer.clear_history()

        # Validation
        print(f"Val {epoch_index}/{n_epochs}")
        logger.info(f"--Val {epoch_index}/{n_epochs}")
        # No gradients are needed for evaluation; disabling autograd keeps the
        # forward pass from retaining graph activations in GPU memory.
        # NOTE(review): assumes Trainer.train(mode='val') never calls
        # backward() — confirm against modules/trainer.py.
        with torch.no_grad():
            trainer.train(dataloader=val_dataloader, epoch_index=epoch_index, mode='val')

        row_dict['val_loss'] = trainer.loss_mean
        row_dict['val_elapsed_time'] = trainer.elapsed_time

        for metric_str, score in trainer.score_dict.items():
            row_dict[f"val_{metric_str}"] = score
        trainer.clear_history()

        # Record
        recorder.add_row(row_dict)
        recorder.save_plot(config['LOGGER']['plot'])

        # Early stopping
        early_stopper.check_early_stopping(loss=row_dict[early_stopping_target])

        # Save weights when the patience counter is 0 after the check, and
        # always on the final epoch.
        if (early_stopper.patience_counter == 0) or (epoch_index == n_epochs-1):
            recorder.save_weight(epoch=epoch_index)
            # Snapshot of the row saved with the weights (not read in this cell).
            best_row_dict = copy.deepcopy(row_dict)

        if early_stopper.stop:
            logger.info(f"Eearly stopped, counter {early_stopper.patience_counter}/{config['TRAINER']['early_stopping_patience']}")
            # Actually leave the epoch loop; logging alone would keep training.
            break
            


Downloading: "https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth" to /root/.cache/torch/hub/checkpoints/convnext_xlarge_22k_1k_224_ema.pth


Train 0/30


100%|██████████| 500/500 [24:42<00:00,  2.97s/it]


Val 0/30


  1%|          | 1/125 [00:21<44:04, 21.32s/it]


RuntimeError: ignored

In [5]:
!nvidia-smi

Wed Jul 13 15:36:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    35W / 250W |  16179MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

88