In [1]:
import os
import cv2
import time
import random
import logging
import easydict
import numpy as np
import pandas as pd
from tqdm import tqdm
from os.path import join as opj
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from PIL import Image

import timm
import torch
import torch.nn as nn
import torch_optimizer as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, grad_scaler
from torchvision import transforms
from torch import Tensor
from torchvision.transforms import functional as F

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder that holds the CSV files read below.
DATA_DIR = './open'

# Training label table and the list of test files.
train_df = pd.read_csv(opj(DATA_DIR, 'train_df2.csv'))
test_df = pd.read_csv(opj(DATA_DIR, 'test_df.csv'))

# Preview both frames first, then report their shapes (one item per line).
print(train_df.head(), test_df.head(), sep='\n')
print(train_df.shape, test_df.shape, sep='\n')

   file_name       class state            label  label2  class2  state2
0  10000.png  transistor  good  transistor-good      72      12      25
1  10001.png     capsule  good     capsule-good      15       2      25
2  10002.png  transistor  good  transistor-good      72      12      25
3  10003.png        wood  good        wood-good      76      13      25
4  10004.png      bottle  good      bottle-good       3       0      25
   index  file_name
0      0  20000.png
1      1  20001.png
2      2  20002.png
3      3  20003.png
4      4  20004.png
(4277, 7)
(2154, 2)


In [3]:
# # train_y = pd.read_csv("./open/train_df.csv")

# train_labels = train_y["label"]

# label_unique = sorted(np.unique(train_labels))
# label_unique = {key:value for key,value in zip(label_unique, range(len(label_unique)))}

# train_labels = [label_unique[k] for k in train_labels]
# train_labels

In [4]:
# label_unique

In [5]:
# train_df['label2'] = train_labels

In [6]:
# train_df.to_csv('./open/train_df2.csv')

In [7]:
# Number of distinct values in the integer-encoded `label2` column.
class_num = len(pd.unique(train_df['label2']))

In [8]:
# Experiment configuration consumed by the network, data pipeline and Trainer.
args = easydict.EasyDict(dict(
    exp_num='0',

    # Path settings
    data_path='./open',
    Kfold=5,
    model_path='label_results/',
    image_type='train_1024',
    class_num=class_num,

    # Model parameter settings
    model_name='regnety_040',
    drop_path_rate=0.2,

    # Training parameter settings
    ## Base parameters
    img_size=288,
    batch_size=16,
    epochs=100,
    optimizer='Lamb',
    initial_lr=1e-5,
    weight_decay=1e-3,

    ## Augmentation
    aug_ver=2,

    ## Scheduler ('cycle' selects OneCycleLR in the Trainer)
    scheduler='cycle',
    warm_epoch=3,
    max_lr=1e-3,

    ### Cosine annealing (read only when scheduler == 'cos')
    min_lr=5e-5,
    tmax=145,

    ## etc.
    patience=7,
    clipping=None,

    # Hardware settings
    amp=True,
    multi_gpu=True,
    logging=False,
    num_workers=4,
    seed=42,
))

In [9]:
import torch
import torch.nn as nn

from einops import rearrange
from einops.layers.torch import Rearrange


def conv_3x3_bn(inp, oup, image_size, downsample=False):
    """Build a 3x3 convolution -> BatchNorm -> GELU block.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        image_size: accepted so the signature matches the other block
            constructors; it is not used here.
        downsample: the convolution uses stride 1 only when this compares
            equal to ``False``; any other value gives stride 2.

    Returns:
        An ``nn.Sequential`` of ``Conv2d`` (no bias), ``BatchNorm2d``, ``GELU``.
    """
    stride = 2 if downsample != False else 1
    layers = [
        nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False),
        nn.BatchNorm2d(oup),
        nn.GELU(),
    ]
    return nn.Sequential(*layers)


class PreNorm(nn.Module):
    """Normalise the input, then hand it to a wrapped callable.

    Args:
        dim: size passed to the normalisation layer constructor.
        fn: module (or callable) applied to the normalised input.
        norm: normalisation layer factory, called as ``norm(dim)``.
    """

    def __init__(self, dim, fn, norm):
        super().__init__()
        self.norm = norm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(norm(x), **kwargs)``."""
        normed = self.norm(x)
        return self.fn(normed, **kwargs)


class SE(nn.Module):
    """Squeeze-and-excitation channel gate.

    The gate operates on ``oup`` channels (the channel count of the tensor
    passed to ``forward``); the bottleneck width is ``int(inp * expansion)``.
    """

    def __init__(self, inp, oup, expansion=0.25):
        super().__init__()
        bottleneck = int(inp * expansion)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        gate_layers = [
            nn.Linear(oup, bottleneck, bias=False),
            nn.GELU(),
            nn.Linear(bottleneck, oup, bias=False),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Scale every channel of a 4-D input by its learned gate in (0, 1)."""
        batch, channels, _height, _width = x.size()
        squeezed = self.avg_pool(x).view(batch, channels)
        gate = self.fc(squeezed).view(batch, channels, 1, 1)
        return x * gate


class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability used after both linear layers.
    """

    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        stages = [
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)


class MBConv(nn.Module):
    """Pre-normalised inverted-residual convolution block with a skip path.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        image_size: accepted for signature parity with the other blocks;
            not used by this block.
        downsample: when truthy the block halves the spatial resolution and
            the skip path becomes max-pool followed by a 1x1 projection.
        expansion: channel multiplier for the hidden width.
    """

    def __init__(self, inp, oup, image_size, downsample=False, expansion=4):
        super().__init__()
        self.downsample = downsample
        stride = 2 if self.downsample != False else 1
        hidden_dim = int(inp * expansion)

        if self.downsample:
            # Skip path used when the main branch changes resolution/channels.
            self.pool = nn.MaxPool2d(3, 2, 1)
            self.proj = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)

        if expansion == 1:
            stages = [
                # depthwise
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride,
                          1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                # pointwise-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            ]
        else:
            stages = [
                # pointwise expansion; the stride (down-sampling) lives here
                nn.Conv2d(inp, hidden_dim, 1, stride, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                # depthwise
                nn.Conv2d(hidden_dim, hidden_dim, 3, 1, 1,
                          groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.GELU(),
                SE(inp, hidden_dim),
                # pointwise-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            ]

        # BatchNorm is applied to the input before the convolution stack.
        self.conv = PreNorm(inp, nn.Sequential(*stages), nn.BatchNorm2d)

    def forward(self, x):
        """Return skip(x) + conv(x); the skip is the identity unless down-sampling."""
        if not self.downsample:
            return x + self.conv(x)
        shortcut = self.proj(self.pool(x))
        return shortcut + self.conv(x)


class Attention(nn.Module):
    """Multi-head self-attention with a learned relative position bias.

    Args:
        inp: feature size of each input token.
        oup: feature size produced by the output projection.
        image_size: ``(ih, iw)`` grid size; the bias table and index buffer
            are built for exactly ``ih * iw`` tokens.
        heads: number of attention heads.
        dim_head: feature size per head.
        dropout: dropout probability applied after the output projection.
    """

    def __init__(self, inp, oup, image_size, heads=8, dim_head=32, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        # The output projection is skipped only for a single head whose width
        # already equals `inp`; in that case `to_out` is an identity.
        project_out = not (heads == 1 and dim_head == inp)

        self.ih, self.iw = image_size

        self.heads = heads
        self.scale = dim_head ** -0.5

        # parameter table of relative position bias
        # one row per possible (dy, dx) offset, one column per head
        self.relative_bias_table = nn.Parameter(
            torch.zeros((2 * self.ih - 1) * (2 * self.iw - 1), heads))

        # Pairwise coordinate differences between every two grid positions.
        coords = torch.meshgrid((torch.arange(self.ih), torch.arange(self.iw)))
        coords = torch.flatten(torch.stack(coords), 1)
        relative_coords = coords[:, :, None] - coords[:, None, :]

        # Shift offsets to be non-negative, then fold (dy, dx) into a single
        # row index of the bias table: dy * (2 * iw - 1) + dx.
        relative_coords[0] += self.ih - 1
        relative_coords[1] += self.iw - 1
        relative_coords[0] *= 2 * self.iw - 1
        relative_coords = rearrange(relative_coords, 'c h w -> h w c')
        relative_index = relative_coords.sum(-1).flatten().unsqueeze(1)
        # Registered as a buffer: stored with the module but not trained.
        self.register_buffer("relative_index", relative_index)

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(inp, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, oup),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        """Attend over the token axis of ``x`` (laid out as ``b n c``).

        The added bias has shape ``(1, heads, ih*iw, ih*iw)``, so the number
        of tokens must equal ``ih * iw``.
        """
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(
            t, 'b n (h d) -> b h n d', h=self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        # Use "gather" for more efficiency on GPUs
        relative_bias = self.relative_bias_table.gather(
            0, self.relative_index.repeat(1, self.heads))
        relative_bias = rearrange(
            relative_bias, '(h w) c -> 1 c h w', h=self.ih*self.iw, w=self.ih*self.iw)
        dots = dots + relative_bias

        attn = self.attend(dots)
        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return out


class Transformer(nn.Module):
    """Pre-norm attention + feed-forward block operating on 2-D feature maps.

    Args:
        inp: number of input channels.
        oup: number of output channels.
        image_size: ``(ih, iw)`` size of the map the attention operates on.
        heads: number of attention heads.
        dim_head: feature size per head.
        downsample: when truthy, both the skip path and the attention input
            are max-pooled, and the skip path is projected to ``oup``.
        dropout: dropout probability for attention output and feed-forward.
    """

    def __init__(self, inp, oup, image_size, heads=8, dim_head=32, downsample=False, dropout=0.):
        super().__init__()
        self.ih, self.iw = image_size
        self.downsample = downsample
        hidden_dim = int(inp * 4)

        if self.downsample:
            self.pool1 = nn.MaxPool2d(3, 2, 1)
            self.pool2 = nn.MaxPool2d(3, 2, 1)
            self.proj = nn.Conv2d(inp, oup, 1, 1, 0, bias=False)

        attention = Attention(inp, oup, image_size, heads, dim_head, dropout)
        feed_forward = FeedForward(oup, hidden_dim, dropout)

        self.attn = self._on_tokens(PreNorm(inp, attention, nn.LayerNorm))
        self.ff = self._on_tokens(PreNorm(oup, feed_forward, nn.LayerNorm))

    def _on_tokens(self, module):
        """Wrap ``module`` so it runs on flattened tokens and returns a 2-D map."""
        return nn.Sequential(
            Rearrange('b c ih iw -> b (ih iw) c'),
            module,
            Rearrange('b (ih iw) c -> b c ih iw', ih=self.ih, iw=self.iw),
        )

    def forward(self, x):
        """Apply the attention residual, then the feed-forward residual."""
        if self.downsample:
            shortcut, attn_input = self.proj(self.pool1(x)), self.pool2(x)
        else:
            shortcut, attn_input = x, x
        x = shortcut + self.attn(attn_input)
        return x + self.ff(x)


class CoAtNet(nn.Module):
    """Five-stage network mixing convolution ('C') and attention ('T') blocks.

    Args:
        image_size: ``(ih, iw)`` of the input images. Each stage halves the
            resolution, so the final map is ``(ih // 32, iw // 32)``.
            NOTE(review): both sides are presumably expected to be multiples
            of 32 — confirm against callers.
        in_channels: number of input image channels.
        num_blocks: five block counts, one per stage (s0..s4).
        channels: five channel widths, one per stage (s0..s4).
        num_classes: size of the final linear layer.
        block_types: four letters choosing ``MBConv`` ('C') or ``Transformer``
            ('T') for stages s1..s4; s0 always uses ``conv_3x3_bn``.
    """

    def __init__(self, image_size, in_channels, num_blocks, channels, num_classes=1000, block_types=['C', 'C', 'T', 'T']):
        super().__init__()
        ih, iw = image_size
        block = {'C': MBConv, 'T': Transformer}

        self.s0 = self._make_layer(
            conv_3x3_bn, in_channels, channels[0], num_blocks[0], (ih // 2, iw // 2))
        self.s1 = self._make_layer(
            block[block_types[0]], channels[0], channels[1], num_blocks[1], (ih // 4, iw // 4))
        self.s2 = self._make_layer(
            block[block_types[1]], channels[1], channels[2], num_blocks[2], (ih // 8, iw // 8))
        self.s3 = self._make_layer(
            block[block_types[2]], channels[2], channels[3], num_blocks[3], (ih // 16, iw // 16))
        self.s4 = self._make_layer(
            block[block_types[3]], channels[3], channels[4], num_blocks[4], (ih // 32, iw // 32))

        # Pool over the whole final map. The kernel must cover both axes:
        # using only ih // 32 leaves a non-1x1 map for non-square inputs and
        # the flattening in forward() would then mix spatial positions into
        # the batch dimension. Identical to the previous kernel when ih == iw.
        self.pool = nn.AvgPool2d((ih // 32, iw // 32), 1)
        self.fc = nn.Linear(channels[-1], num_classes, bias=False)

    def forward(self, x):
        """Run the five stages, global-average-pool and classify."""
        x = self.s0(x)
        x = self.s1(x)
        x = self.s2(x)
        x = self.s3(x)
        x = self.s4(x)

        # (B, C, 1, 1) -> (B, C)
        x = self.pool(x).view(-1, x.shape[1])
        x = self.fc(x)
        return x

    def _make_layer(self, block, inp, oup, depth, image_size):
        """Stack ``depth`` blocks; only the first one down-samples and changes width."""
        layers = nn.ModuleList([])
        for i in range(depth):
            if i == 0:
                layers.append(block(inp, oup, image_size, downsample=True))
            else:
                layers.append(block(oup, oup, image_size))
        return nn.Sequential(*layers)


def coatnet_0():
    """Build the ``coatnet_0`` configuration: 224x224 RGB input, 1000 classes."""
    depths = [2, 2, 3, 5, 2]              # blocks per stage (L)
    widths = [64, 96, 192, 384, 768]      # channels per stage (D)
    return CoAtNet(image_size=(224, 224), in_channels=3, num_blocks=depths,
                   channels=widths, num_classes=1000)


def coatnet_1():
    """Build the ``coatnet_1`` configuration: 224x224 RGB input, 1000 classes."""
    depths = [2, 2, 6, 14, 2]             # blocks per stage (L)
    widths = [64, 96, 192, 384, 768]      # channels per stage (D)
    return CoAtNet(image_size=(224, 224), in_channels=3, num_blocks=depths,
                   channels=widths, num_classes=1000)


def coatnet_2():
    """Build the ``coatnet_2`` configuration: 224x224 RGB input, 1000 classes.

    The last stage width was previously 1026, which looks like a typo for
    1024 (every other stage doubles the previous width).
    NOTE(review): checkpoints trained with the 1026-wide head will no longer
    load into this model — confirm none exist before merging.
    """
    num_blocks = [2, 2, 6, 14, 2]           # L
    channels = [128, 128, 256, 512, 1024]   # D
    return CoAtNet((224, 224), 3, num_blocks, channels, num_classes=1000)


def coatnet_3():
    """Build the ``coatnet_3`` configuration: 224x224 RGB input, 1000 classes."""
    depths = [2, 2, 6, 14, 2]             # blocks per stage (L)
    widths = [192, 192, 384, 768, 1536]   # channels per stage (D)
    return CoAtNet(image_size=(224, 224), in_channels=3, num_blocks=depths,
                   channels=widths, num_classes=1000)


def coatnet_4():
    """Build the ``coatnet_4`` configuration: 224x224 RGB input, 1000 classes."""
    depths = [2, 2, 12, 28, 2]            # blocks per stage (L)
    widths = [192, 192, 384, 768, 1536]   # channels per stage (D)
    return CoAtNet(image_size=(224, 224), in_channels=3, num_blocks=depths,
                   channels=widths, num_classes=1000)


def count_parameters(model):
    """Return the total number of elements in parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [10]:
# Warmup Learning rate scheduler
from torch.optim.lr_scheduler import _LRScheduler
class WarmUpLR(_LRScheduler):
    """Linear learning-rate warm-up scheduler.

    Args:
        optimizer: wrapped optimizer (e.g. SGD).
        total_iters: number of scheduler steps in the warm-up phase.
        last_epoch: index of the last step, forwarded to the base class.
    """

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        # Must be set before the base constructor, which calls get_lr().
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return ``base_lr * m / total_iters`` for step ``m`` of the warm-up.

        A tiny epsilon in the denominator avoids division by zero when
        ``total_iters`` is 0.
        """
        step = self.last_epoch
        denominator = self.total_iters + 1e-8
        return [base_lr * step / denominator for base_lr in self.base_lrs]

# Logging
def get_root_logger(logger_name='basicsr',
                    log_level=logging.INFO,
                    log_file=None):
    """Return a named logger that writes to the console and, optionally, a file.

    Unlike a plain ``logger.hasHandlers()`` check (which also sees handlers
    on ancestor loggers and therefore short-circuits once anything has been
    configured), this inspects only the named logger's own handlers. Calling
    it again with a different ``log_file`` re-points the logger at the new
    file, so each run/fold gets its own log instead of all of them writing
    into the first one.

    Args:
        logger_name: name passed to ``logging.getLogger``.
        log_level: level for the logger, the console config and the file handler.
        log_file: path of the log file, or ``None`` for console output only.

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger(logger_name)
    format_str = '%(asctime)s %(levelname)s: %(message)s'

    # Console output is handled by the root logger; basicConfig does nothing
    # if the root logger already has handlers.
    logging.basicConfig(format=format_str, level=log_level)
    # Set the level on the named logger too, so records are not dropped when
    # the root logger was configured elsewhere with a stricter level.
    logger.setLevel(log_level)

    if log_file is None:
        return logger

    target = os.path.abspath(log_file)
    for handler in list(logger.handlers):
        if isinstance(handler, logging.FileHandler):
            if handler.baseFilename == target:
                # Already writing to this file: keep it, do not truncate.
                return logger
            # A previous call attached a different file; detach and close it.
            logger.removeHandler(handler)
            handler.close()

    file_handler = logging.FileHandler(log_file, 'w')
    file_handler.setFormatter(logging.Formatter(format_str))
    file_handler.setLevel(log_level)
    logger.addHandler(file_handler)

    return logger

class AvgMeter(object):
    """Running weighted average that also keeps every raw value it was given."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0
        self.losses = []

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count
        self.losses.append(val)

In [11]:
class RandomRotation(transforms.RandomRotation):
    """Random rotation that is applied only with probability ``p``.

    Args:
        p: probability of rotating the image.
        degrees: rotation range, forwarded to the parent constructor.
    """

    def __init__(self, p: float, degrees: int):
        super(RandomRotation, self).__init__(degrees)
        self.p = p

    def forward(self, img):
        """Return ``img`` rotated by a random angle, or unchanged."""
        # One torch RNG draw decides whether this call rotates at all.
        if not (torch.rand(1) < self.p):
            return img

        fill = self.fill
        if isinstance(img, Tensor):
            # Tensor images need one float fill value per channel.
            if isinstance(fill, (int, float)):
                fill = [float(fill)] * F.get_image_num_channels(img)
            else:
                fill = [float(component) for component in fill]

        angle = self.get_params(self.degrees)
        # NOTE(review): relies on the parent class exposing `self.resample`;
        # confirm the installed torchvision version still provides it.
        return F.rotate(img, angle, self.resample, self.expand, self.center, fill)

In [12]:
class Train_Dataset(Dataset):
    """Labelled image dataset backed by a DataFrame.

    Args:
        df: frame with a ``file_name`` column (image file names) and a
            ``label2`` column (targets).
        transform: optional callable applied to the loaded RGB PIL image.
            When ``None`` the PIL image is returned unchanged.
        img_dir: directory that contains the image files.
    """

    def __init__(self, df, transform=None, img_dir='./open/train/'):
        self.img_path = df['file_name'].values
        self.target = df['label2'].values
        self.transform = transform
        self.img_dir = img_dir

        print(f'Dataset size:{len(self.img_path)}')

    def __getitem__(self, idx):
        """Return ``(image, target)`` for sample ``idx``."""
        # Decode the file once with PIL. An earlier cv2 decode of the same
        # file was discarded immediately, so it only cost I/O and CPU time.
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(opj(self.img_dir, self.img_path[idx])) as raw:
            image = raw.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)

        target = self.target[idx]
        return image, target

    def __len__(self):
        return len(self.img_path)

class Test_dataset(Dataset):
    """Unlabelled image dataset backed by a DataFrame.

    Args:
        df: frame with a ``file_name`` column (image file names).
        transform: optional callable applied to the loaded RGB PIL image.
            When ``None`` the PIL image is returned unchanged.
        img_dir: directory that contains the image files.
    """

    def __init__(self, df, transform=None, img_dir='./open/test/'):
        self.img_path = df['file_name'].values
        self.transform = transform
        self.img_dir = img_dir

        print(f'Test Dataset size:{len(self.img_path)}')

    def __getitem__(self, idx):
        """Return the (optionally transformed) image for sample ``idx``."""
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(opj(self.img_dir, self.img_path[idx])) as raw:
            image = raw.convert('RGB')

        # `transform` defaults to None, so it must not be called blindly.
        if self.transform is not None:
            image = self.transform(image)

        return image

    def __len__(self):
        return len(self.img_path)

def get_loader(df, phase: str, batch_size, shuffle,
               num_workers, transform):
    """Create a ``DataLoader`` over ``df``.

    ``phase == 'test'`` wraps the frame in ``Test_dataset`` (images only);
    every other phase uses ``Train_Dataset`` (image, target) with
    ``drop_last=False``. Memory pinning is always enabled.
    """
    loader_options = dict(batch_size=batch_size, shuffle=shuffle,
                          num_workers=num_workers, pin_memory=True)
    if phase == 'test':
        return DataLoader(Test_dataset(df, transform), **loader_options)
    return DataLoader(Train_Dataset(df, transform), drop_last=False, **loader_options)

def get_train_augmentation(img_size, ver):
    """Build the torchvision preprocessing pipeline for a given version.

    Args:
        img_size: images are resized to ``(img_size, img_size)``.
        ver: ``1`` for the deterministic pipeline used on the validation set;
            ``2`` to additionally apply random horizontal/vertical flips
            (p=0.3 each) and a random rotation of up to 90 degrees first.

    Returns:
        A ``transforms.Compose`` ending in ToTensor -> Resize -> Normalize
        (ImageNet mean/std).

    Raises:
        ValueError: if ``ver`` is neither 1 nor 2. Previously this surfaced
            as an ``UnboundLocalError`` on the return statement.
    """
    if ver not in (1, 2):
        raise ValueError(f'Unsupported augmentation version: {ver!r} (expected 1 or 2)')

    if ver == 1:  # for validset
        random_ops = []
    else:
        random_ops = [
            transforms.RandomHorizontalFlip(p=0.3),
            transforms.RandomVerticalFlip(p=0.3),
            transforms.RandomRotation(90),
        ]

    # Steps shared by every version.
    common_ops = [
        transforms.ToTensor(),
        transforms.Resize((img_size, img_size)),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(random_ops + common_ops)

In [13]:
class Network(nn.Module):
    """Classifier built on a timm backbone with ImageNet pre-trained weights.

    Args:
        args: configuration providing ``model_name`` and ``drop_path_rate``.
            The head size is taken from ``args.class_num`` so it follows the
            data; it falls back to 88 when the attribute is missing.
    """

    def __init__(self, args):
        super().__init__()
        self.model_ft = timm.create_model(  # load a timm ImageNet pre-trained model
            args.model_name,
            pretrained=True,
            num_classes=getattr(args, 'class_num', 88),
            drop_path_rate=args.drop_path_rate
        )

    def forward(self, x):
        """Return the backbone's class logits for ``x``."""
        out = self.model_ft(x)
        return out

class Network_test(nn.Module):
    """Inference-time twin of ``Network``.

    NOTE(review): ``encoder_name`` is accepted but never used; the backbone
    is chosen from the module-level ``args.model_name``. Kept as-is because
    the callers are not visible here — confirm before wiring it through.

    The head size follows the module-level ``args.class_num`` (falling back
    to 88 when missing) so it stays in step with the training network.
    """

    def __init__(self, encoder_name):
        super().__init__()
        self.model_ft = timm.create_model(  # load a timm ImageNet pre-trained model
            args.model_name,
            pretrained=True,
            num_classes=getattr(args, 'class_num', 88),
            drop_path_rate=args.drop_path_rate
        )

    def forward(self, x):
        """Return the backbone's class logits for ``x``."""
        out = self.model_ft(x)
        return out

In [14]:
# Weight helper for a class-weighted cross-entropy loss.
def get_class_weight():
    """Return ``1 / count`` per class, ordered by ascending ``label2`` value.

    Reads the module-level ``train_df``.
    """
    counts_by_label = train_df['label2'].value_counts().sort_index()
    return 1 / counts_by_label.values

class_weight = get_class_weight()



In [15]:
from collections import defaultdict
from itertools import chain
from torch.optim import Optimizer
import torch
import warnings

class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped ("fast") optimizer updates the parameters on every step. On
    the first step and then every ``k`` steps, a separate copy of the weights
    ("slow" weights) is moved towards the fast weights by a factor ``alpha``
    and the parameters are reset to that slow copy.

    Note that ``Optimizer.__init__`` is not called; ``param_groups`` and
    ``state`` are set up by hand from the wrapped optimizer.

    Args:
        optimizer: the inner optimizer whose ``step`` performs the fast update.
        k: number of fast steps between two slow-weight synchronisations.
        alpha: interpolation factor for the slow-weight update.
    """

    def __init__(self, optimizer, k=5, alpha=0.5):
        self.optimizer = optimizer
        self.k = k
        self.alpha = alpha
        # Share the wrapped optimizer's param groups (same list object).
        self.param_groups = self.optimizer.param_groups
        # `state` holds the slow weights; `fast_state` aliases the inner state.
        self.state = defaultdict(dict)
        self.fast_state = self.optimizer.state
        for group in self.param_groups:
            group["counter"] = 0
    
    def update(self, group):
        """Pull the slow weights of ``group`` towards the fast weights and copy them back."""
        for fast in group["params"]:
            param_state = self.state[fast]
            if "slow_param" not in param_state:
                # First visit: the slow copy starts as a clone of the fast weights.
                param_state["slow_param"] = torch.zeros_like(fast.data)
                param_state["slow_param"].copy_(fast.data)
            slow = param_state["slow_param"]
            slow += (fast.data - slow) * self.alpha
            fast.data.copy_(slow)
    
    def update_lookahead(self):
        """Synchronise slow and fast weights for every param group immediately."""
        for group in self.param_groups:
            self.update(group)

    def step(self, closure=None):
        """Run one inner optimizer step, synchronising when a group's counter is 0."""
        loss = self.optimizer.step(closure)
        for group in self.param_groups:
            if group["counter"] == 0:
                self.update(group)
            group["counter"] += 1
            # Wrap around so the next synchronisation happens k steps later.
            if group["counter"] >= self.k:
                group["counter"] = 0
        return loss

    def state_dict(self):
        """Return fast state, slow state and param groups in a single dict."""
        fast_state_dict = self.optimizer.state_dict()
        # Tensor keys are replaced by their id() so the mapping has plain keys.
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict["state"]
        param_groups = fast_state_dict["param_groups"]
        return {
            "fast_state": fast_state,
            "slow_state": slow_state,
            "param_groups": param_groups,
        }

    def load_state_dict(self, state_dict):
        """Restore slow state via the base class and fast state via the inner optimizer."""
        slow_state_dict = {
            "state": state_dict["slow_state"],
            "param_groups": state_dict["param_groups"],
        }
        fast_state_dict = {
            "state": state_dict["fast_state"],
            "param_groups": state_dict["param_groups"],
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.optimizer.load_state_dict(fast_state_dict)
        self.fast_state = self.optimizer.state

    def add_param_group(self, param_group):
        """Add a param group to the inner optimizer with its step counter at 0."""
        param_group["counter"] = 0
        self.optimizer.add_param_group(param_group)

In [16]:
class Trainer():
    """Train and validate one cross-validation fold.

    All work happens in the constructor: it builds the data loaders for fold
    ``args.fold``, the network, optimizer and schedulers, then runs the
    epoch loop with early stopping and saves the checkpoint with the lowest
    validation loss to ``<save_path>/best_model.pth``.
    """

    def __init__(self, args, save_path):
        '''
        args: arguments
        save_path: directory where the model weights and the log are saved
        '''
        super(Trainer, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Logging
        log_file = os.path.join(save_path, 'log.log')
        self.logger = get_root_logger(logger_name='IR', log_level=logging.INFO, log_file=log_file)
        self.logger.info(args)

        # Train, Valid Set load
        ############################################################################
        if args.step == 0:
            df_train = pd.read_csv(opj(args.data_path, 'train_df2.csv'))
        else:
            df_train = pd.read_csv(opj(args.data_path, f'train_{args.step}step.csv'))

        # Tag every row with the fold in which it is a validation sample.
        kf = StratifiedKFold(n_splits=args.Kfold, shuffle=True, random_state=args.seed)
        for fold, (_, val_idx) in enumerate(kf.split(range(len(df_train)), y=df_train['label2'])):
            df_train.loc[val_idx, 'fold'] = fold

        df_val = df_train[df_train['fold'] == args.fold].reset_index(drop=True)
        df_train = df_train[df_train['fold'] != args.fold].reset_index(drop=True)

        # Augmentation
        self.train_transform = get_train_augmentation(img_size=args.img_size, ver=args.aug_ver)
        self.test_transform = get_train_augmentation(img_size=args.img_size, ver=1)

        # TrainLoader
        self.train_loader = get_loader(df_train, phase='train', batch_size=args.batch_size, shuffle=True,
                                       num_workers=args.num_workers, transform=self.train_transform)
        self.val_loader = get_loader(df_val, phase='train', batch_size=args.batch_size, shuffle=False,
                                       num_workers=args.num_workers, transform=self.test_transform)

        # Network
        self.model = Network(args).to(self.device)

        # Loss
        self.criterion = nn.CrossEntropyLoss()

        # Optimizer & Scheduler
        self.optimizer = optim.Lamb(self.model.parameters(), lr=args.initial_lr, weight_decay=args.weight_decay)

        iter_per_epoch = len(self.train_loader)
        self.warmup_scheduler = WarmUpLR(self.optimizer, iter_per_epoch * args.warm_epoch)

        if args.scheduler == 'step':
            self.scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, milestones=args.milestone, gamma=args.lr_factor, verbose=True)
        elif args.scheduler == 'cos':
            tmax = args.tmax # half-cycle 
            self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max = tmax, eta_min=args.min_lr, verbose=True)
        elif args.scheduler == 'cycle':
            self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=args.max_lr, steps_per_epoch=iter_per_epoch, epochs=args.epochs)
        else:
            # Fail here rather than with an AttributeError at the first checkpoint save.
            raise ValueError(f"Unknown scheduler: {args.scheduler!r} (expected 'step', 'cos' or 'cycle')")

        if args.multi_gpu:
            self.model = nn.DataParallel(self.model).to(self.device)

        # One gradient scaler for the whole run, so its loss-scale state
        # carries over between epochs instead of being reset every epoch.
        self.scaler = grad_scaler.GradScaler()

        # Train / Validate
        best_loss = np.inf
        best_acc = 0
        best_f1 = 0  # defined up front: the summary log below reads it even if no epoch improves
        best_epoch = 0
        early_stopping = 0
        start = time.time()
        for epoch in range(1, args.epochs+1):
            self.epoch = epoch

            if args.scheduler == 'cos':
                if epoch > args.warm_epoch:
                    self.scheduler.step()

            # Training
            train_loss, train_acc, train_f1 = self.training(args)

            # Model weight in Multi_GPU or Single GPU
            state_dict= self.model.module.state_dict() if args.multi_gpu else self.model.state_dict()

            # Validation
            val_loss, val_acc, val_f1 = self.validate(args, phase='val')

            # Save models
            if val_loss < best_loss:
                early_stopping = 0
                best_epoch = epoch
                best_loss = val_loss
                best_acc = val_acc
                best_f1 = val_f1

                torch.save({'epoch':epoch,
                            'state_dict':state_dict,
                            'optimizer': self.optimizer.state_dict(),
                            'scheduler': self.scheduler.state_dict(),
                    }, os.path.join(save_path, 'best_model.pth'))
                self.logger.info(f'-----------------SAVE:{best_epoch}epoch----------------')
            else:
                early_stopping += 1

            # Early Stopping
            if early_stopping == args.patience:
                break

        self.logger.info(f'\nBest Val Epoch:{best_epoch} | Val Loss:{best_loss:.4f} | Val Acc:{best_acc:.4f} | Val F1:{best_f1:.4f}')
        end = time.time()
        self.logger.info(f'Total Process time:{(end - start) / 60:.3f}Minute')

    # Training
    def training(self, args):
        """Run one training epoch and return ``(avg_loss, accuracy, macro_f1)``."""
        self.model.train()
        train_loss = AvgMeter()
        train_acc = 0
        preds_list = []
        targets_list = []

        scaler = self.scaler
        for i, (images, targets) in enumerate(tqdm(self.train_loader)):
            # Batches are already tensors; move/cast them instead of
            # re-wrapping with torch.tensor (which copies and warns).
            images = images.to(self.device, dtype=torch.float32)
            targets = targets.to(self.device, dtype=torch.long)

            # Per-batch linear warm-up during the first `warm_epoch` epochs.
            if self.epoch <= args.warm_epoch:
                self.warmup_scheduler.step()

            self.model.zero_grad(set_to_none=True)
            if args.amp:
                with autocast():
                    preds = self.model(images)
                    loss = self.criterion(preds, targets)
                scaler.scale(loss).backward()

                scaler.step(self.optimizer)
                scaler.update()

            else:
                preds = self.model(images)
                loss = self.criterion(preds, targets)
                loss.backward()
                # `clipping` defaults to None, which clip_grad_norm_ cannot handle.
                if args.clipping is not None:
                    nn.utils.clip_grad_norm_(self.model.parameters(), args.clipping)
                self.optimizer.step()

            # OneCycleLR is stepped per batch, but only after the warm-up epochs.
            if args.scheduler == 'cycle':
                if self.epoch > args.warm_epoch:
                    self.scheduler.step()

            # Metric
            train_acc += (preds.argmax(dim=1) == targets).sum().item()
            preds_list.extend(preds.argmax(dim=1).cpu().detach().numpy())
            targets_list.extend(targets.cpu().detach().numpy())
            # log
            train_loss.update(loss.item(), n=images.size(0))

        train_acc /= len(self.train_loader.dataset)
        train_f1 = f1_score(np.array(targets_list), np.array(preds_list), average='macro')

        self.logger.info(f'Epoch:[{self.epoch:03d}/{args.epochs:03d}]')
        self.logger.info(f'Train Loss:{train_loss.avg:.3f} | Acc:{train_acc:.4f} | F1:{train_f1:.4f}')
        return train_loss.avg, train_acc, train_f1

    # Validation or Dev
    def validate(self, args, phase='val'):
        """Evaluate on the validation loader and return ``(avg_loss, accuracy, macro_f1)``."""
        self.model.eval()
        with torch.no_grad():
            val_loss = AvgMeter()
            val_acc = 0
            preds_list = []
            targets_list = []

            for i, (images, targets) in enumerate(self.val_loader):
                images = images.to(self.device, dtype=torch.float32)
                targets = targets.to(self.device, dtype=torch.long)

                preds = self.model(images)
                loss = self.criterion(preds, targets)

                # Metric
                val_acc += (preds.argmax(dim=1) == targets).sum().item()
                preds_list.extend(preds.argmax(dim=1).cpu().detach().numpy())
                targets_list.extend(targets.cpu().detach().numpy())

                # log
                val_loss.update(loss.item(), n=images.size(0))
            val_acc /= len(self.val_loader.dataset)
            val_f1 = f1_score(np.array(targets_list), np.array(preds_list), average='macro')

            self.logger.info(f'{phase} Loss:{val_loss.avg:.3f} | Acc:{val_acc:.4f} | F1:{val_f1:.4f}')
        return val_loss.avg, val_acc, val_f1

In [17]:
def _seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def main(args):
    """Seed the RNGs, create the experiment directory and run one ``Trainer``.

    Args:
        args: Configuration object. This function reads ``args.seed``,
            ``args.model_path`` and ``args.exp_num``; the whole object is
            forwarded to ``Trainer``.

    Returns:
        str: ``<args.model_path>/<exp_num zero-padded to 3 characters>``, the
        directory that was created and handed to ``Trainer``.
    """
    print('<---- Training Params ---->')

    # Random Seed
    _seed_everything(args.seed)
    # NOTE(review): cudnn.benchmark lets cuDNN auto-tune its kernels, which can
    # make runs differ despite the seeding above. Kept as in the original.
    torch.backends.cudnn.benchmark = True

    # str() so an integer exp_num (e.g. a raw fold index) works as well as a string.
    save_path = os.path.join(args.model_path, str(args.exp_num).zfill(3))

    # Create model directory
    os.makedirs(save_path, exist_ok=True)
    Trainer(args, save_path)

    return save_path

In [18]:
# Pin GPU enumeration order and visibility for this process.
# NOTE(review): presumably these must be set before CUDA is first initialised
# in the kernel to have any effect; confirm on a fresh Restart & Run All.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Enumerate GPUs by PCI bus ID so the indices below are stable
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1"  # Expose only GPUs 0 and 1 to this process

In [19]:
# Pick the compute device and report what PyTorch can see.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
# current_device() raises when no CUDA runtime is usable, so only query it
# when the CUDA branch was actually selected above.
if device.type == 'cuda':
    print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Device: cuda
Current cuda device: 0
Count of using GPUs: 2


In [20]:
# Resolve the compute device again and read the three CSVs used below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sub, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './open/sample_submission.csv',
        './open/train_df2.csv',
        './open/test_df.csv',
    )
)

In [None]:
# Train one model per cross-validation fold and remember each output directory.
args.step = 0
models_path = []
# Drive the loop from the configured fold count (Kfold is 5 in the logged
# config) so it cannot drift away from the split used for training.
for s_fold in range(args.Kfold):
    args.fold = s_fold
    args.exp_num = str(s_fold)  # the fold index doubles as the experiment number
    save_path = main(args)
    models_path.append(save_path)

2022-04-27 17:22:35,501 INFO: {'exp_num': '0', 'data_path': './open', 'Kfold': 5, 'model_path': 'label_results/', 'image_type': 'train_1024', 'class_num': 88, 'model_name': 'regnety_040', 'drop_path_rate': 0.2, 'img_size': 288, 'batch_size': 16, 'epochs': 100, 'optimizer': 'Lamb', 'initial_lr': 1e-05, 'weight_decay': 0.001, 'aug_ver': 2, 'scheduler': 'cycle', 'warm_epoch': 3, 'max_lr': 0.001, 'min_lr': 5e-05, 'tmax': 145, 'patience': 7, 'clipping': None, 'amp': True, 'multi_gpu': True, 'logging': False, 'num_workers': 4, 'seed': 42, 'step': 0, 'fold': 0}


<---- Training Params ---->
Dataset size:3421
Dataset size:856


2022-04-27 17:22:35,834 INFO: Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth)
100%|██████████| 214/214 [01:13<00:00,  2.93it/s]
2022-04-27 17:23:52,786 INFO: Epoch:[001/100]
2022-04-27 17:23:52,787 INFO: Train Loss:4.498 | Acc:0.0064 | F1:0.0033
2022-04-27 17:24:10,043 INFO: val Loss:4.450 | Acc:0.0257 | F1:0.0024
2022-04-27 17:24:10,507 INFO: -----------------SAVE:1epoch----------------
100%|██████████| 214/214 [01:08<00:00,  3.13it/s]
2022-04-27 17:25:18,868 INFO: Epoch:[002/100]
2022-04-27 17:25:18,869 INFO: Train Loss:4.470 | Acc:0.0082 | F1:0.0022
2022-04-27 17:25:35,898 INFO: val Loss:4.401 | Acc:0.0339 | F1:0.0040
2022-04-27 17:25:36,523 INFO: -----------------SAVE:2epoch----------------
100%|██████████| 214/214 [01:08<00:00,  3.14it/s]
2022-04-27 17:26:44,590 INFO: Epoch:[003/100]
2022-04-27 17:26:44,591 INFO: Train Loss:4.407 | Acc:0.0251 | F1:0.0068
2022-04-27 17:27:01,421 INFO: val

100%|██████████| 214/214 [01:30<00:00,  2.37it/s]
2022-04-27 18:12:26,349 INFO: Epoch:[028/100]
2022-04-27 18:12:26,350 INFO: Train Loss:0.486 | Acc:0.8720 | F1:0.3123
2022-04-27 18:12:45,817 INFO: val Loss:0.361 | Acc:0.8879 | F1:0.3308
2022-04-27 18:12:46,540 INFO: -----------------SAVE:28epoch----------------
100%|██████████| 214/214 [01:31<00:00,  2.34it/s]
2022-04-27 18:14:17,876 INFO: Epoch:[029/100]
2022-04-27 18:14:17,877 INFO: Train Loss:0.464 | Acc:0.8690 | F1:0.3087
2022-04-27 18:14:36,686 INFO: val Loss:0.389 | Acc:0.8867 | F1:0.3165
100%|██████████| 214/214 [01:32<00:00,  2.30it/s]
2022-04-27 18:16:09,693 INFO: Epoch:[030/100]
2022-04-27 18:16:09,693 INFO: Train Loss:0.450 | Acc:0.8772 | F1:0.3498
2022-04-27 18:16:28,747 INFO: val Loss:0.349 | Acc:0.8995 | F1:0.3972
2022-04-27 18:16:29,441 INFO: -----------------SAVE:30epoch----------------
100%|██████████| 214/214 [01:29<00:00,  2.40it/s]
2022-04-27 18:17:58,789 INFO: Epoch:[031/100]
2022-04-27 18:17:58,790 INFO: Train Lo

2022-04-27 19:07:57,710 INFO: val Loss:0.176 | Acc:0.9486 | F1:0.7331
2022-04-27 19:07:58,395 INFO: -----------------SAVE:58epoch----------------
100%|██████████| 214/214 [01:29<00:00,  2.40it/s]
2022-04-27 19:09:27,648 INFO: Epoch:[059/100]
2022-04-27 19:09:27,649 INFO: Train Loss:0.158 | Acc:0.9503 | F1:0.7761
2022-04-27 19:09:46,879 INFO: val Loss:0.220 | Acc:0.9533 | F1:0.7506
100%|██████████| 214/214 [01:30<00:00,  2.38it/s]
2022-04-27 19:11:16,908 INFO: Epoch:[060/100]
2022-04-27 19:11:16,909 INFO: Train Loss:0.143 | Acc:0.9538 | F1:0.7763
2022-04-27 19:11:36,100 INFO: val Loss:0.214 | Acc:0.9451 | F1:0.7138
100%|██████████| 214/214 [01:29<00:00,  2.39it/s]
2022-04-27 19:13:05,499 INFO: Epoch:[061/100]
2022-04-27 19:13:05,500 INFO: Train Loss:0.148 | Acc:0.9582 | F1:0.8160
2022-04-27 19:13:25,370 INFO: val Loss:0.194 | Acc:0.9533 | F1:0.7609
100%|██████████| 214/214 [01:29<00:00,  2.38it/s]
2022-04-27 19:14:55,212 INFO: Epoch:[062/100]
2022-04-27 19:14:55,213 INFO: Train Loss:0.1

<---- Training Params ---->
Dataset size:3421
Dataset size:856


2022-04-27 19:42:46,842 INFO: Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth)
100%|██████████| 214/214 [01:30<00:00,  2.38it/s]
2022-04-27 19:44:17,119 INFO: Epoch:[001/100]
2022-04-27 19:44:17,120 INFO: Train Loss:4.496 | Acc:0.0050 | F1:0.0014
2022-04-27 19:44:36,854 INFO: val Loss:4.453 | Acc:0.0339 | F1:0.0032
2022-04-27 19:44:37,392 INFO: -----------------SAVE:1epoch----------------
100%|██████████| 214/214 [01:31<00:00,  2.34it/s]
2022-04-27 19:46:08,671 INFO: Epoch:[002/100]
2022-04-27 19:46:08,671 INFO: Train Loss:4.475 | Acc:0.0094 | F1:0.0044
2022-04-27 19:46:28,556 INFO: val Loss:4.398 | Acc:0.0467 | F1:0.0046
2022-04-27 19:46:29,249 INFO: -----------------SAVE:2epoch----------------
100%|██████████| 214/214 [01:30<00:00,  2.35it/s]
2022-04-27 19:48:00,250 INFO: Epoch:[003/100]
2022-04-27 19:48:00,251 INFO: Train Loss:4.414 | Acc:0.0222 | F1:0.0071
2022-04-27 19:48:20,001 INFO: val

100%|██████████| 214/214 [01:31<00:00,  2.35it/s]
2022-04-27 20:32:54,389 INFO: Epoch:[027/100]
2022-04-27 20:32:54,390 INFO: Train Loss:0.515 | Acc:0.8690 | F1:0.3036
2022-04-27 20:33:14,438 INFO: val Loss:0.342 | Acc:0.8902 | F1:0.3395
2022-04-27 20:33:15,117 INFO: -----------------SAVE:27epoch----------------
100%|██████████| 214/214 [01:31<00:00,  2.35it/s]
2022-04-27 20:34:46,212 INFO: Epoch:[028/100]
2022-04-27 20:34:46,213 INFO: Train Loss:0.471 | Acc:0.8714 | F1:0.3126
2022-04-27 20:35:06,466 INFO: val Loss:0.366 | Acc:0.8925 | F1:0.3826
100%|██████████| 214/214 [01:32<00:00,  2.32it/s]
2022-04-27 20:36:38,663 INFO: Epoch:[029/100]
2022-04-27 20:36:38,664 INFO: Train Loss:0.444 | Acc:0.8766 | F1:0.3458
2022-04-27 20:36:58,530 INFO: val Loss:0.340 | Acc:0.8984 | F1:0.4018
2022-04-27 20:36:59,394 INFO: -----------------SAVE:29epoch----------------
100%|██████████| 214/214 [01:32<00:00,  2.31it/s]
2022-04-27 20:38:32,068 INFO: Epoch:[030/100]
2022-04-27 20:38:32,069 INFO: Train Lo

<---- Training Params ---->
Dataset size:3422
Dataset size:855


2022-04-27 21:02:59,542 INFO: Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth)
100%|██████████| 214/214 [01:30<00:00,  2.35it/s]
2022-04-27 21:04:30,694 INFO: Epoch:[001/100]
2022-04-27 21:04:30,695 INFO: Train Loss:4.501 | Acc:0.0061 | F1:0.0030
2022-04-27 21:04:50,416 INFO: val Loss:4.449 | Acc:0.0281 | F1:0.0029
2022-04-27 21:04:50,898 INFO: -----------------SAVE:1epoch----------------
100%|██████████| 214/214 [01:32<00:00,  2.32it/s]
2022-04-27 21:06:23,242 INFO: Epoch:[002/100]
2022-04-27 21:06:23,243 INFO: Train Loss:4.472 | Acc:0.0082 | F1:0.0022
2022-04-27 21:06:42,611 INFO: val Loss:4.404 | Acc:0.0421 | F1:0.0041
2022-04-27 21:06:43,332 INFO: -----------------SAVE:2epoch----------------
100%|██████████| 214/214 [01:32<00:00,  2.31it/s]
2022-04-27 21:08:15,898 INFO: Epoch:[003/100]
2022-04-27 21:08:15,898 INFO: Train Loss:4.409 | Acc:0.0207 | F1:0.0069
2022-04-27 21:08:35,414 INFO: val

2022-04-27 21:53:00,053 INFO: Train Loss:0.522 | Acc:0.8694 | F1:0.3090
2022-04-27 21:53:19,579 INFO: val Loss:0.361 | Acc:0.8877 | F1:0.3335
2022-04-27 21:53:20,272 INFO: -----------------SAVE:27epoch----------------
100%|██████████| 214/214 [01:28<00:00,  2.41it/s]
2022-04-27 21:54:48,928 INFO: Epoch:[028/100]
2022-04-27 21:54:48,929 INFO: Train Loss:0.490 | Acc:0.8720 | F1:0.3272
2022-04-27 21:55:09,036 INFO: val Loss:0.343 | Acc:0.9029 | F1:0.4088
2022-04-27 21:55:09,756 INFO: -----------------SAVE:28epoch----------------
100%|██████████| 214/214 [01:29<00:00,  2.39it/s]
2022-04-27 21:56:39,461 INFO: Epoch:[029/100]
2022-04-27 21:56:39,462 INFO: Train Loss:0.452 | Acc:0.8741 | F1:0.3400
2022-04-27 21:56:59,391 INFO: val Loss:0.399 | Acc:0.8912 | F1:0.3658
100%|██████████| 214/214 [01:30<00:00,  2.37it/s]
2022-04-27 21:58:29,595 INFO: Epoch:[030/100]
2022-04-27 21:58:29,595 INFO: Train Loss:0.451 | Acc:0.8793 | F1:0.3693
2022-04-27 21:58:49,056 INFO: val Loss:0.394 | Acc:0.8877 | F1

100%|██████████| 214/214 [01:30<00:00,  2.37it/s]
2022-04-27 22:48:19,399 INFO: Epoch:[057/100]
2022-04-27 22:48:19,400 INFO: Train Loss:0.175 | Acc:0.9524 | F1:0.7843
2022-04-27 22:48:39,247 INFO: val Loss:0.141 | Acc:0.9614 | F1:0.7808
100%|██████████| 214/214 [01:31<00:00,  2.34it/s]
2022-04-27 22:50:10,708 INFO: Epoch:[058/100]
2022-04-27 22:50:10,709 INFO: Train Loss:0.166 | Acc:0.9509 | F1:0.7655
2022-04-27 22:50:30,037 INFO: val Loss:0.195 | Acc:0.9626 | F1:0.7848
100%|██████████| 214/214 [01:32<00:00,  2.31it/s]
2022-04-27 22:52:02,693 INFO: Epoch:[059/100]
2022-04-27 22:52:02,694 INFO: Train Loss:0.148 | Acc:0.9573 | F1:0.7971
2022-04-27 22:52:22,108 INFO: val Loss:0.209 | Acc:0.9485 | F1:0.7110
100%|██████████| 214/214 [01:31<00:00,  2.33it/s]
2022-04-27 22:53:53,921 INFO: Epoch:[060/100]
2022-04-27 22:53:53,922 INFO: Train Loss:0.156 | Acc:0.9541 | F1:0.7810
2022-04-27 22:54:13,379 INFO: val Loss:0.191 | Acc:0.9532 | F1:0.7456
100%|██████████| 214/214 [01:29<00:00,  2.39it/s

<---- Training Params ---->
Dataset size:3422
Dataset size:855


2022-04-27 22:57:52,200 INFO: Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_040-f0d569f9.pth)
100%|██████████| 214/214 [01:29<00:00,  2.40it/s]
2022-04-27 22:59:21,676 INFO: Epoch:[001/100]
2022-04-27 22:59:21,676 INFO: Train Loss:4.498 | Acc:0.0061 | F1:0.0021
2022-04-27 22:59:41,022 INFO: val Loss:4.442 | Acc:0.0339 | F1:0.0032
2022-04-27 22:59:41,547 INFO: -----------------SAVE:1epoch----------------
100%|██████████| 214/214 [01:29<00:00,  2.40it/s]
2022-04-27 23:01:10,870 INFO: Epoch:[002/100]
2022-04-27 23:01:10,871 INFO: Train Loss:4.465 | Acc:0.0082 | F1:0.0039
2022-04-27 23:01:30,345 INFO: val Loss:4.409 | Acc:0.0409 | F1:0.0044
2022-04-27 23:01:31,008 INFO: -----------------SAVE:2epoch----------------
100%|██████████| 214/214 [01:30<00:00,  2.36it/s]
2022-04-27 23:03:01,544 INFO: Epoch:[003/100]
2022-04-27 23:03:01,545 INFO: Train Loss:4.412 | Acc:0.0184 | F1:0.0044
2022-04-27 23:03:20,820 INFO: val

In [None]:
# Test-time input pipeline.
# NOTE(review): this reuses the training augmentation factory with ver=1,
# presumably the deterministic variant; confirm before relying on it.
img_size = 288

test_transform = get_train_augmentation(ver=1, img_size=img_size)
test_dataset = Test_dataset(df_test, test_transform)
# shuffle=False so batches follow the dataset's own order.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=64,
    num_workers=0,
    shuffle=False,
)

In [None]:
# models_path = ['./class_results/000', './class_results/001', './class_results/002', './class_results/003', './class_results/004']

In [None]:
def predict(encoder_name, test_loader, device, model_path):
    """Run one saved checkpoint over ``test_loader`` and return softmax scores.

    Args:
        encoder_name: Passed to ``Network_test`` to build the model.
        test_loader: Iterable yielding batches of images.
        device: Device the model and every batch are placed on.
        model_path: Directory containing ``best_model.pth``. The file must
            hold a mapping with a ``'state_dict'`` entry.

    Returns:
        np.ndarray: Softmax (over dim 1) of the model outputs, with the rows
        of all batches stacked in iteration order.
    """
    model = Network_test(encoder_name).to(device)
    # map_location lets a checkpoint written on a GPU load when `device` is the
    # CPU (or a different GPU) instead of failing while deserialising.
    checkpoint = torch.load(opj(model_path, 'best_model.pth'), map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    preds_list = []
    with torch.no_grad():
        for images in tqdm(test_loader):
            images = torch.as_tensor(images, device=device, dtype=torch.float32)
            probs = torch.softmax(model(images), dim=1)
            preds_list.extend(probs.cpu().tolist())

    return np.array(preds_list)

def ensemble_5fold(model_path_list, test_loader, device, encoder_name='regnety_040'):
    """Average the softmax predictions of every checkpoint in ``model_path_list``.

    Despite the name, any positive number of checkpoints is accepted; the
    result is the element-wise mean of the per-checkpoint ``predict`` outputs.

    Args:
        model_path_list: Checkpoint directories, one per trained model.
        test_loader: Loader passed unchanged to ``predict`` for each model.
        device: Device passed unchanged to ``predict``.
        encoder_name: Backbone name passed to ``predict``
            (default ``'regnety_040'``, the value previously hard-coded).

    Returns:
        np.ndarray: Mean of the per-model prediction arrays.

    Raises:
        ValueError: If ``model_path_list`` is empty.
    """
    model_path_list = list(model_path_list)
    if not model_path_list:
        raise ValueError('model_path_list must contain at least one checkpoint directory')

    predict_list = [
        predict(encoder_name=encoder_name, test_loader=test_loader,
                device=device, model_path=model_path)
        for model_path in model_path_list
    ]
    # Mean over the model axis: numerator and denominator always cover the
    # same set of models, whatever the list length.
    return np.mean(predict_list, axis=0)

In [None]:
# Average the per-model softmax outputs over the test set, one checkpoint
# directory per entry in models_path.
ensemble = ensemble_5fold(models_path, test_loader, device)

In [None]:
# Predicted class index per test row: arg-max over the averaged scores.
f_pred = ensemble.argmax(axis=1).tolist()
# Preview only; displaying the whole list floods the cell output.
f_pred[:10]

In [None]:
# Rebuild the label-name -> class-index mapping from the original training CSV.
# NOTE(review): presumably the same encoding as the `label2` column the models
# were trained on; confirm.
train_y = pd.read_csv("./open/train_df.csv")
train_labels = train_y["label"]

# Distinct label names, sorted, are numbered 0..K-1 in that order.
label_unique = {
    name: idx for idx, name in enumerate(sorted(np.unique(train_labels)))
}


In [None]:
# Invert the name -> index mapping, then translate each predicted index back
# to its label name.
label_decoder = {idx: name for name, idx in label_unique.items()}

f_result = list(map(label_decoder.__getitem__, f_pred))

In [None]:
# Preview only; displaying the whole list floods the cell output.
f_result[:10]

In [None]:
# Load the submission template and fill its `label` column with the decoded predictions.
submission = pd.read_csv("./open/sample_submission.csv").assign(label=f_result)

submission

In [None]:
# Write the final submission file; index = False leaves the DataFrame index out of the CSV.
submission.to_csv("label_result.csv", index = False)