In [1]:
import os
import pandas as pd
import numpy as np
import time, gc
import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pretrainedmodels
from argparse import Namespace
from sklearn.utils import shuffle
from apex import amp
from sklearn.model_selection import StratifiedKFold
from efficientnet_pytorch import EfficientNet
from cvcore.data.auto_augment import RandAugment
from PIL import Image
from utils import bn_update, moving_average, copy_model

In [2]:
!ls /home/chec/data/bengali

class_map.csv		       train.csv
sample_submission.csv	       train.csv.zip
test.csv		       train_image_data_0.parquet
test_image_data_0.parquet      train_image_data_0.parquet.zip
test_image_data_0.parquet.zip  train_image_data_1.parquet
test_image_data_1.parquet      train_image_data_1.parquet.zip
test_image_data_1.parquet.zip  train_image_data_2.parquet
test_image_data_2.parquet      train_image_data_2.parquet.zip
test_image_data_2.parquet.zip  train_image_data_3.parquet
test_image_data_3.parquet      train_image_data_3.parquet.zip
test_image_data_3.parquet.zip


In [3]:
#!ls /home/chec/data/bengali

In [4]:
# Root directory of the Bengali dataset files (train.csv, parquet shards, ...).
# Set the BENGALI_DATA_DIR environment variable to point elsewhere; when it is
# unset the original location is used.
DATA_DIR = os.environ.get('BENGALI_DATA_DIR', '/home/chec/data/bengali')

In [5]:
# Load the CSV metadata tables that live directly under DATA_DIR.
# NOTE: get_train_val_loaders re-reads train.csv itself and does not use this
# module-level train_df.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
class_map_df = pd.read_csv(f'{DATA_DIR}/class_map.csv')
sample_sub_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [6]:
train_df.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_0,15,9,5,ক্ট্রো
1,Train_1,159,0,0,হ
2,Train_2,22,3,5,খ্রী
3,Train_3,53,2,2,র্টি
4,Train_4,71,9,5,থ্রো


In [7]:
# Image dimensions used by BengaliDataset.get_img to reshape each flattened
# pixel row into a 2-D array (137 * 236 = 32332 values per image).
HEIGHT = 137
WIDTH = 236

In [8]:
'''
import albumentations as albu

def get_train_augs(p=1.):
    return albu.Compose([
        #albu.HorizontalFlip(.5),
        albu.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=10, p=0.5 ),
        albu.Blur(blur_limit=3, p=0.3),
        albu.OpticalDistortion(p=0.3),
        albu.GaussNoise(p=0.3)
        #albu.GridDistortion(p=.33),
        #albu.HueSaturationValue(p=.33) # not for grey scale
    ], p=p)
'''

'\nimport albumentations as albu\n\ndef get_train_augs(p=1.):\n    return albu.Compose([\n        #albu.HorizontalFlip(.5),\n        albu.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=10, p=0.5 ),\n        albu.Blur(blur_limit=3, p=0.3),\n        albu.OpticalDistortion(p=0.3),\n        albu.GaussNoise(p=0.3)\n        #albu.GridDistortion(p=.33),\n        #albu.HueSaturationValue(p=.33) # not for grey scale\n    ], p=p)\n'

In [9]:
def get_train_augs():
    """Return a fresh RandAugment transform configured with n=2 and m=27."""
    rand_augment_kwargs = dict(n=2, m=27)
    return RandAugment(**rand_augment_kwargs)

In [10]:
#plt.imshow(x)

In [11]:
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms


class BengaliDataset(Dataset):
    """Dataset yielding single-channel image tensors and, unless in test mode, labels.

    Args:
        df: table with one row per sample; ``image_id`` is always read, and the
            label columns (``grapheme_root``, ``vowel_diacritic``,
            ``consonant_diacritic``, ``word_label``) are read outside test mode.
        img_df: table indexed by image id whose row values are the flattened
            HEIGHT * WIDTH pixels of the image.
        train_mode: when true, the image is passed through ``get_train_augs()``
            and the un-augmented image is returned alongside it.
        test_mode: when true, only the image tensor is returned.
    """

    def __init__(self, df, img_df, train_mode=True, test_mode=False):
        self.df, self.img_df = df, img_df
        self.train_mode, self.test_mode = train_mode, test_mode

    def __len__(self):
        return len(self.df)

    def get_img(self, img_id):
        """Return the inverted (255 - pixel) uint8 image of shape (HEIGHT, WIDTH)."""
        flat_pixels = self.img_df.loc[img_id].values
        return 255 - flat_pixels.reshape(HEIGHT, WIDTH).astype(np.uint8)

    def __getitem__(self, idx):
        """Return ``img`` (test), ``(img, orig_img, labels)`` (train) or ``(img, labels)``."""
        row = self.df.iloc[idx]
        raw = self.get_img(row.image_id)
        untouched = raw.copy()

        # Only training samples are augmented; a new transform is built per item.
        augmented = raw
        if self.train_mode:
            augmented = np.asarray(get_train_augs()(Image.fromarray(raw)))

        # Add a trailing channel axis (H, W) -> (H, W, 1) before tensor conversion.
        img = transforms.functional.to_tensor(np.expand_dims(augmented, axis=-1))
        orig_img = transforms.functional.to_tensor(np.expand_dims(untouched, axis=-1))

        # Test mode returns before any label column is touched.
        if self.test_mode:
            return img

        labels = torch.tensor(
            [row.grapheme_root, row.vowel_diacritic, row.consonant_diacritic, row.word_label])
        if self.train_mode:
            return img, orig_img, labels
        return img, labels
    
def get_train_val_loaders(batch_size=4, val_batch_size=4, ifold=0, dev_mode=False):
    """Build the training and validation DataLoaders for one stratified fold.

    Reads ``train.csv`` and the image parquet shards from ``DATA_DIR``, adds a
    ``word_label`` column (an integer id per distinct ``grapheme`` string) and
    splits the rows with a 5-fold ``StratifiedKFold`` on ``grapheme_root``.

    Args:
        batch_size: batch size of the training loader (last partial batch dropped).
        val_batch_size: batch size of the validation loader.
        ifold: index (0-4) of the fold used for validation.
        dev_mode: when true, load only parquet shard 0 and keep at most 1000
            rows whose images are present in that shard.

    Returns:
        ``(train_loader, val_loader)``; each loader carries an extra ``num``
        attribute holding the number of samples in its dataset.
    """
    train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

    train_df = shuffle(train_df, random_state=1234)

    # Word ids are derived from the full table so they do not depend on dev_mode.
    grapheme_words = np.unique(train_df.grapheme.values)
    grapheme_words_dict = {grapheme: i for i, grapheme in enumerate(grapheme_words)}
    train_df['word_label'] = train_df['grapheme'].map(grapheme_words_dict)

    print(train_df.shape)

    if dev_mode:
        img_df = pd.read_parquet(f'{DATA_DIR}/train_image_data_0.parquet').set_index('image_id')
        # The table was shuffled above, so its first rows reference images from
        # every shard. Keep only rows whose image is actually loaded; otherwise
        # BengaliDataset.get_img raises KeyError for the missing ids.
        train_df = train_df[train_df['image_id'].isin(img_df.index)].iloc[:1000]
    else:
        img_dfs = [pd.read_parquet(f'{DATA_DIR}/train_image_data_{i}.parquet') for i in range(4)]
        img_df = pd.concat(img_dfs, axis=0).set_index('image_id')
    print(img_df.shape)

    kf = StratifiedKFold(5, random_state=1234, shuffle=True)
    for i, (train_idx, val_idx) in enumerate(kf.split(train_df, train_df['grapheme_root'].values)):
        if i == ifold:
            train = train_df.iloc[train_idx]
            val = train_df.iloc[val_idx]
            break
    # Fails when ifold does not name one of the generated folds.
    assert i == ifold
    print(train.shape, val.shape)

    train_ds = BengaliDataset(train, img_df, True, False)
    val_ds = BengaliDataset(val, img_df, False, False)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=8, drop_last=True)
    train_loader.num = len(train_ds)

    val_loader = DataLoader(val_ds, batch_size=val_batch_size, shuffle=False, num_workers=8, drop_last=False)
    val_loader.num = len(val_ds)

    return train_loader, val_loader

In [12]:
#train_loader, val_loader = get_train_val_loaders(dev_mode=True)

# model

In [13]:
#import pretrainedmodels

In [14]:
print(pretrainedmodels.model_names)

['fbresnet152', 'bninception', 'resnext101_32x4d', 'resnext101_64x4d', 'inceptionv4', 'inceptionresnetv2', 'alexnet', 'densenet121', 'densenet169', 'densenet201', 'densenet161', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'inceptionv3', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19_bn', 'vgg19', 'nasnetamobile', 'nasnetalarge', 'dpn68', 'dpn68b', 'dpn92', 'dpn98', 'dpn131', 'dpn107', 'xception', 'senet154', 'se_resnet50', 'se_resnet101', 'se_resnet152', 'se_resnext50_32x4d', 'se_resnext101_32x4d', 'cafferesnet101', 'pnasnet5large', 'polynet']


In [15]:
#model_name = 'resnet50' # could be fbresnet152 or inceptionresnetv2
#model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet').cuda()
#model.eval()

In [16]:
#model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained=False).cuda()


In [17]:
#model.features(torch.randn((2, 3, 137, 236)).cuda()).size()

In [18]:
#model.last_linear.in_features

In [19]:
# Per-channel normalisation statistics applied to the single input channel
# inside BengaliNet.forward.
MEAN = [ 0.06922848809290576 ]
STD = [ 0.20515700083327537 ]

class BengaliNet(nn.Module):
    """Pretrained backbone followed by one linear layer producing all class logits.

    The output has ``168 + 11 + 7 = 186`` columns: grapheme-root logits, then
    vowel-diacritic logits, then consonant-diacritic logits.

    Args:
        backbone_name: names starting with ``'efficient'`` are loaded through
            ``EfficientNet.from_pretrained``; any other name is looked up in
            ``pretrainedmodels`` with ImageNet weights.
    """

    def __init__(self, backbone_name):
        super(BengaliNet, self).__init__()
        self.n_grapheme = 168
        self.n_vowel = 11
        self.n_consonant = 7
        self.backbone_name = backbone_name

        self.num_classes = self.n_grapheme + self.n_vowel + self.n_consonant

        if self.backbone_name.startswith('efficient'):
            self.backbone = EfficientNet.from_pretrained(self.backbone_name)
            self.fc = nn.Linear(self.backbone._fc.in_features, self.num_classes)
        else:
            self.backbone = pretrainedmodels.__dict__[self.backbone_name](num_classes=1000, pretrained='imagenet')
            self.fc = nn.Linear(self.backbone.last_linear.in_features, self.num_classes)

        self.avg_pool = nn.AdaptiveAvgPool2d(1)

        # fix_input_layer() is intentionally not called: forward() replicates
        # the single channel three times instead of shrinking the first conv.
        #self.fix_input_layer()

    def fix_input_layer(self):
        """Swap ``backbone.layer0.conv1`` for a 1-input-channel conv.

        The new layer reuses the first input-channel slice of the existing
        weights. Only runs for the backbone names listed below.
        NOTE(review): not every listed name necessarily exposes
        ``layer0.conv1`` — confirm before enabling for a new backbone.
        """
        if self.backbone_name in ['se_resnext50_32x4d', 'se_resnext101_32x4d', 'se_resnet50', 'senet154', 'se_resnet152', 'nasnetmobile', 'mobilenet', 'nasnetalarge']:
            w = self.backbone.layer0.conv1.weight.data
            self.backbone.layer0.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
            self.backbone.layer0.conv1.weight = torch.nn.Parameter(w[:, 0, :, :].unsqueeze(1))

    def logits(self, x):
        """Global-average-pool a feature map and apply the linear classifier."""
        x = self.avg_pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

    def forward(self, x):
        """Return the (N, 186) logits for a batch of single-channel images.

        The input is resized to 256x512, normalised with MEAN/STD and repeated
        to three channels before it reaches the backbone.
        """
        x = F.interpolate(x, size=(256, 512), mode='bilinear', align_corners=False)
        # Normalise the whole batch in one broadcasted operation instead of
        # calling a per-sample transform in a Python loop.
        mean = x.new_tensor(MEAN).view(1, -1, 1, 1)
        std = x.new_tensor(STD).view(1, -1, 1, 1)
        x = (x - mean) / std
        x = torch.cat([x, x, x], 1)
        if self.backbone_name.startswith('efficient'):
            x = self.backbone.extract_features(x)
        else:
            x = self.backbone.features(x)
        x = self.logits(x)

        return x

In [20]:
#F.interpolate(torch.randn(2,1,224,224), size=(256, 512), mode='bilinear', align_corners=False).size()

In [21]:
# Checkpoints are stored under MODEL_DIR/<backbone>/<ckp_name>.
MODEL_DIR = './new-model3-ckps'
def create_model(args):
    """Create a BengaliNet and restore its checkpoint when one exists.

    Args:
        args: namespace providing ``backbone`` (backbone name) and
            ``ckp_name`` (checkpoint file name).

    Returns:
        ``(model, model_file)``: the model (not yet moved to a device) and the
        checkpoint path. The checkpoint's parent directory is created if needed.
    """
    model = BengaliNet(backbone_name=args.backbone)
    model_file = os.path.join(MODEL_DIR, args.backbone, args.ckp_name)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)

    checkpoint_exists = os.path.exists(model_file)
    print('model file: {}, exist: {}'.format(model_file, checkpoint_exists))

    if checkpoint_exists:
        print('loading {}...'.format(model_file))
        # Load onto the CPU so restoring does not depend on the device the
        # checkpoint was saved from; the caller moves the model afterwards.
        model.load_state_dict(torch.load(model_file, map_location='cpu'))

    return model, model_file

In [22]:
#bnet = BengaliNet('se_resnext50_32x4d').cuda()

In [23]:
#bnet(torch.randn((2, 1, 137, 236)).cuda()).size()

# train

In [24]:
round(1/9, 6)

0.111111

In [25]:
import numpy as np
import sklearn.metrics
import torch


def calc_metrics(preds0, preds1, preds2, y):
    """Compute recall and accuracy metrics for the three prediction heads.

    Args:
        preds0, preds1, preds2: predicted class ids for grapheme root, vowel
            diacritic and consonant diacritic.
        y: 2-D label array whose columns 0, 1, 2 hold the matching true ids.

    Returns:
        dict with the 2:1:1 weighted macro recall (``recall``), per-head macro
        recalls and accuracies, plus ``old_recall`` / ``old_recall_grapheme``,
        which are the same recalls computed with labels and predictions swapped.
        All values are rounded to 6 decimals.
    """
    n_samples = len(y)
    assert n_samples == len(preds0) == len(preds1) == len(preds2)

    head_names = ('grapheme', 'vowel', 'consonant')
    head_preds = (preds0, preds1, preds2)
    head_weights = [2, 1, 1]

    def macro_recall(first, second):
        return sklearn.metrics.recall_score(first, second, average='macro')

    recalls = [macro_recall(y[:, col], pred) for col, pred in enumerate(head_preds)]
    # "old" variant: predictions are passed in the y_true position.
    swapped_recalls = [macro_recall(pred, y[:, col]) for col, pred in enumerate(head_preds)]

    metrics = {'recall': round(np.average(recalls, weights=head_weights), 6)}
    for name, value in zip(head_names, recalls):
        metrics['recall_' + name] = round(value, 6)
    for col, (name, pred) in enumerate(zip(head_names, head_preds)):
        metrics['acc_' + name] = round((pred == y[:, col]).sum() / n_samples, 6)

    metrics['old_recall'] = round(np.average(swapped_recalls, weights=head_weights), 6)
    metrics['old_recall_grapheme'] = round(swapped_recalls[0], 6)
    return metrics

In [26]:
def criterion(outputs, y_true):
    """Sum of the mean cross-entropy losses of the three prediction heads.

    Args:
        outputs: logits of shape (N, 186), split column-wise into 168, 11 and 7.
        y_true: integer labels; columns 0, 1 and 2 are the targets of the
            three heads in the same order (further columns are ignored).
    """
    head_logits = torch.split(outputs, [168, 11, 7], dim=1)
    head_losses = [
        F.cross_entropy(logits, y_true[:, col], reduction='mean')
        for col, logits in enumerate(head_logits)
    ]
    return head_losses[0] + head_losses[1] + head_losses[2]

In [27]:
def validate(model, val_loader):
    """Evaluate ``model`` on ``val_loader`` and return a metrics dict.

    Puts the model in eval mode (it is not switched back here), runs it on
    CUDA without gradients and collects per-head argmax predictions and summed
    cross-entropy losses.

    Returns:
        the dict from ``calc_metrics`` extended with ``loss_grapheme``,
        ``loss_vowel`` and ``loss_consonant``: the summed losses divided by
        ``val_loader.num``, rounded to 6 decimals.
    """
    head_sizes = [168, 11, 7]
    loss_keys = ('loss_grapheme', 'loss_vowel', 'loss_consonant')

    model.eval()
    summed_losses = [0., 0., 0.]
    pred_batches = [[], [], []]
    label_batches = []

    with torch.no_grad():
        for x, y in val_loader:
            # Keep the labels as loaded before moving the batch to the GPU.
            label_batches.append(y)
            x, y = x.cuda(), y.cuda()
            head_logits = torch.split(model(x), head_sizes, dim=1)

            for col, logits in enumerate(head_logits):
                _, top_idx = torch.max(logits, dim=1)
                pred_batches[col].append(top_idx)
                summed_losses[col] += F.cross_entropy(logits, y[:, col], reduction='sum').item()

    preds0, preds1, preds2 = [torch.cat(batches, 0).cpu().numpy() for batches in pred_batches]
    y_true = torch.cat(label_batches, 0).numpy()

    metrics = calc_metrics(preds0, preds1, preds2, y_true)
    for key, total in zip(loss_keys, summed_losses):
        metrics[key] = round(total / val_loader.num, 6)

    return metrics
            

In [28]:
def get_lrs(optimizer):
    """Return the learning rate of every param group as a 6-decimal string.

    Reads ``optimizer.param_groups`` directly rather than building a complete
    ``state_dict()`` on each call; this function runs once per training batch
    and the optimizer here has one param group per parameter.
    """
    return ['{:.6f}'.format(group['lr']) for group in optimizer.param_groups]

In [29]:
def save_model(model, model_file):
    """Save the model's ``state_dict`` to ``model_file``.

    Missing parent directories are created. A model wrapped in
    ``nn.DataParallel`` is unwrapped first so the saved keys carry no
    ``module.`` prefix.
    """
    parent_dir = os.path.dirname(model_file)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    unwrapped = model.module if isinstance(model, nn.DataParallel) else model
    torch.save(unwrapped.state_dict(), model_file)

In [30]:
def mixup(data, targets, alpha=1):
    """Blend each sample with a randomly chosen partner from the same batch.

    A single mixing weight ``lam`` is drawn from Beta(alpha, alpha) for the
    whole batch.

    Returns:
        ``(mixed_data, (targets, partner_targets, lam))``.
    """
    partner_order = torch.randperm(data.size(0))
    lam = np.random.beta(alpha, alpha)

    blended = data * lam + data[partner_order] * (1 - lam)
    return blended, (targets, targets[partner_order], lam)


def mixup_criterion(outputs, targets):
    """Mixup loss: ``criterion`` on both target sets, weighted by lam and 1 - lam.

    ``targets`` is the ``(targets, partner_targets, lam)`` tuple returned by
    ``mixup``.
    """
    first_targets, second_targets, lam = targets
    first_term = lam * criterion(outputs, first_targets)
    second_term = (1 - lam) * criterion(outputs, second_targets)
    return first_term + second_term

In [31]:
def rand_bbox(size, lam):
    """Sample a random box covering about ``1 - lam`` of the area (for CutMix).

    Args:
        size: 4-element shape; the box is laid out over ``size[2]`` and
            ``size[3]``.
        lam: mixing ratio in [0, 1]; each box side is ``sqrt(1 - lam)`` times
            the corresponding extent before clipping.

    Returns:
        ``(bbx1, bby1, bbx2, bby2)`` where the ``bbx`` pair bounds the
        ``size[2]`` axis and the ``bby`` pair bounds the ``size[3]`` axis, both
        clipped to the valid range.
    """
    # The names follow the original CutMix code: W is the extent of axis 2 and
    # H the extent of axis 3; callers slice those same axes with bbx / bby.
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)

    # Box centre drawn uniformly over the two axes.
    cx = np.random.randint(W)
    cy = np.random.randint(H)

    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)

    return bbx1, bby1, bbx2, bby2

In [32]:
np.random.random()

0.0479826928449274

In [33]:
from over9000.over9000 import Over9000
from over9000.radam import RAdam
from gridmask import GridMask

In [34]:
from cvcore.solver import WarmupCyclicalLR
def make_optimizer(model, base_lr=4e-4, weight_decay=0., weight_decay_bias=0., epsilon=1e-3):
    """
    Create an AdamW optimizer with one param group per trainable parameter.

    Args:
        model: module whose parameters with ``requires_grad`` are optimised.
        base_lr: learning rate assigned to every group.
        weight_decay: weight decay for parameters whose name lacks ``'bias'``.
        weight_decay_bias: weight decay for parameters whose name contains ``'bias'``.
        epsilon: AdamW ``eps``.

    Returns:
        the configured ``torch.optim.AdamW`` instance.
    """
    params = [
        {
            "params": [value],
            "lr": base_lr,
            "weight_decay": weight_decay_bias if 'bias' in key else weight_decay,
        }
        for key, value in model.named_parameters()
        if value.requires_grad
    ]

    # Pass base_lr explicitly instead of relying on a variable leaked from the
    # loop, which is undefined when the model has no trainable parameter.
    return torch.optim.AdamW(params, base_lr, eps=epsilon)

In [35]:
def train_epoch(args, model, train_loader, epoch, optimizer, lr_scheduler, grid):
    """Run one pass over ``train_loader``, updating ``model`` after every batch.

    For each batch a uniform random number ``r`` selects one augmentation:
      * ``r < 0.3``  - CutMix on the augmented images (``img``);
      * ``r > 0.7``  - ``grid`` applied to the augmented images;
      * otherwise    - mixup on the un-augmented images (``orig_img``).

    Args:
        args: namespace; only ``args.beta`` (Beta parameter of the CutMix
            ratio) is read here.
        model: network being trained; called on CUDA tensors.
        train_loader: yields ``(img, orig_img, targets)`` batches and exposes
            a ``num`` attribute used in the progress line.
        epoch: current epoch index, forwarded to ``lr_scheduler`` and printed.
        optimizer: optimizer stepped once per batch.
        lr_scheduler: callable invoked as ``lr_scheduler(optimizer, batch_idx, epoch)``
            before each optimizer step.
        grid: callable applied to the image batch in the grid-mask branch.

    Returns:
        None. A single progress line is rewritten in place with ``\\r``.
    """
    train_loss = 0

    for batch_idx, (img, orig_img, targets) in enumerate(train_loader):
        img, orig_img, targets  = img.cuda(), orig_img.cuda(), targets.cuda()
        batch_size = img.size(0)
        # One draw per batch decides which augmentation branch is used.
        r = np.random.rand()

        if r < 0.3:
            # generate mixed sample
            lam = np.random.beta(args.beta, args.beta)
            rand_index = torch.randperm(img.size()[0]).cuda()
            target_a = targets
            target_b = targets[rand_index]
            # rand_bbox lays bbx over axis 2 and bby over axis 3, matching the slicing below.
            bbx1, bby1, bbx2, bby2 = rand_bbox(img.size(), lam)
            img[:, :, bbx1:bbx2, bby1:bby2] = img[rand_index, :, bbx1:bbx2, bby1:bby2]
            # adjust lambda to exactly match pixel ratio
            lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (img.size()[-1] * img.size()[-2]))
            # compute output
            outputs = model(img)
            loss = criterion(outputs, target_a) * lam + criterion(outputs, target_b) * (1. - lam)
        elif r > 0.7: # grid mask
            img = grid(img)
            outputs = model(img)
            loss = criterion(outputs, targets)
        else:
            # mixup uses the un-augmented images; targets becomes a (targets, partner_targets, lam) tuple
            orig_img, targets = mixup(orig_img, targets)
            outputs = model(orig_img)
            loss = mixup_criterion(outputs, targets)
            #loss = criterion(outputs, targets)
        # The string literal below is a disabled earlier variant with two auxiliary outputs; it has no effect.
        '''
        #if True:
        if r < 0.3:
            # generate mixed sample
            lam = np.random.beta(args.beta, args.beta)
            rand_index = torch.randperm(img.size()[0]).cuda()
            target_a = targets
            target_b = targets[rand_index]
            bbx1, bby1, bbx2, bby2 = rand_bbox(img.size(), lam)
            #img[:, :, bby1:bby2, bbx1:bbx2] = img[rand_index, :, bby1:bby2, bbx1:bbx2] #for new cutmix
            img[:, :, bbx1:bbx2, bby1:bby2] = img[rand_index, :, bbx1:bbx2, bby1:bby2]
            
            # adjust lambda to exactly match pixel ratio
            lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (img.size()[-1] * img.size()[-2]))
            # compute output
            outputs, outputs_aux1, outputs_aux2 = model(img)
            loss_primary = criterion(outputs, target_a) * lam + criterion(outputs, target_b) * (1. - lam)
            loss_aux1 = criterion(outputs_aux1, target_a) * lam + criterion(outputs_aux1, target_b) * (1. - lam)
            loss_aux2 = criterion(outputs_aux2, target_a) * lam + criterion(outputs_aux2, target_b) * (1. - lam)
            loss = loss_primary + (loss_aux1 + loss_aux2)*0.8
        elif r > 0.7:
            img = grid(img)
            outputs, outputs_aux1, outputs_aux2 = model(img)
            loss_primary = criterion(outputs, targets)
            loss_aux1 = criterion(outputs_aux1, targets)
            loss_aux2 = criterion(outputs_aux2, targets)
            loss = loss_primary + (loss_aux1 + loss_aux2)*0.8
        else:
            orig_img, targets = mixup(orig_img, targets)
            outputs, outputs_aux1, outputs_aux2 = model(orig_img)
            loss_primary = mixup_criterion(outputs, targets)
            loss_aux1 = mixup_criterion(outputs_aux1, targets)
            loss_aux2 = mixup_criterion(outputs_aux2, targets)
            loss = loss_primary + (loss_aux1 + loss_aux2)*0.8
            #loss = criterion(outputs, targets)
        '''

        optimizer.zero_grad()
        # Backward pass goes through apex amp's loss scaling.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        #loss.backward()
        # Clip the global gradient norm to 0.1 before stepping.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        lr_scheduler(optimizer, batch_idx, epoch)
        optimizer.step()            
        
        current_lr = get_lrs(optimizer)

        train_loss += loss.item()
        # Columns: epoch | lr of first group | samples seen/total | batch loss | running mean loss
        print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format(
            epoch, float(current_lr[0]), batch_size*(batch_idx+1), train_loader.num, 
            loss.item(), train_loss/(batch_idx+1)), end='')


In [36]:
import copy
# Best validation 'recall' seen so far for the regular model and for the SWA
# model. They are module-level, so they persist across train() calls in the
# same kernel.
best_metrics = 0.
best_metrics_swa = 0.


def _validate_and_track_best(model, model_file, val_loader, save, best_so_far):
    """Validate ``model``, print the metrics and return the updated best recall.

    The model is saved to ``model_file`` only when ``save`` is true and the
    recall beats ``best_so_far``. The model is put back in train mode afterwards.
    """
    best_key = 'recall'
    val_metrics = validate(model, val_loader)
    print('\nval:', val_metrics)

    score = val_metrics[best_key]
    improved = score > best_so_far
    if improved and save:
        save_model(model, model_file)
        print('###>>>>> saved', model_file)
    model.train()
    return score if improved else best_so_far


def validate_and_save(model, model_file, val_loader, save=False):
    """Validate the regular model and update the global ``best_metrics``.

    With ``save=False`` the best score is still updated, which lets a first
    call establish the baseline of an already-loaded checkpoint.
    """
    global best_metrics
    best_metrics = _validate_and_track_best(model, model_file, val_loader, save, best_metrics)


def validate_and_save_swa(model, model_file, val_loader, save=False):
    """Same as ``validate_and_save`` but tracks the global ``best_metrics_swa``."""
    global best_metrics_swa
    best_metrics_swa = _validate_and_track_best(model, model_file, val_loader, save, best_metrics_swa)


def train(args):
    """Train the model for ``args.num_cycles`` cycles while maintaining an SWA copy.

    Uses the module-level ``train_loader`` and ``val_loader``, which must be
    created before this function is called.

    Args:
        args: namespace providing ``backbone``, ``ckp_name``, ``base_lr``,
            ``num_epochs``, ``start_epoch``, ``warmup_epochs``, ``num_cycles``,
            ``st_epochs``, ``swa_start``, ``swa_freq``, ``swa_n`` and ``beta``.

    Each cycle runs epochs ``start_epoch .. num_epochs - 1``. After every epoch
    the model is validated and saved if it improved. Once ``epoch + 1`` reaches
    ``swa_start``, every ``swa_freq``-th epoch folds the current weights into
    the SWA model, refreshes its batch-norm statistics and validates it.
    """
    model, model_file = create_model(args)
    model = model.cuda()

    # The SWA model uses the same architecture and a '<ckp_name>_swa' checkpoint.
    swa_args = copy.deepcopy(args)
    swa_args.ckp_name = args.ckp_name + '_swa'
    swa_model, swa_model_file = create_model(swa_args)
    swa_model = swa_model.cuda()

    optimizer = make_optimizer(model)
    lr_scheduler = WarmupCyclicalLR(
        "cos", args.base_lr, args.num_epochs, iters_per_epoch=len(train_loader), warmup_epochs=args.warmup_epochs)

    [model, swa_model], optimizer = amp.initialize([model, swa_model], optimizer, opt_level="O1",verbosity=0)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
        swa_model = nn.DataParallel(swa_model)

    # Establish the baseline scores of whatever checkpoints were loaded.
    validate_and_save(model, model_file, val_loader, save=False)

    swa_model_loaded = os.path.exists(swa_model_file)
    if swa_model_loaded:
        validate_and_save_swa(swa_model, swa_model_file, val_loader, save=False)

    # Number of models already averaged into swa_model; set on the first epoch
    # that reaches the SWA phase. Initialising lazily (instead of only when
    # epoch + 1 == swa_start) keeps runs with start_epoch >= swa_start from
    # failing on an undefined swa_n.
    swa_n = None

    for cycle in range(1, args.num_cycles+1):
        print('CYCLE:', cycle)
        grid = GridMask(64, 128, rotate=15, ratio=0.6, mode=1, prob=1.)

        for epoch in range(args.start_epoch, args.num_epochs):
            grid.set_prob(epoch, args.st_epochs)
            train_epoch(args, model, train_loader, epoch, optimizer, lr_scheduler, grid)
            validate_and_save(model, model_file, val_loader, save=True)

            in_swa_phase = (epoch+1) >= args.swa_start
            if in_swa_phase and swa_n is None:
                # Without a loaded SWA checkpoint, start from the current weights.
                if not swa_model_loaded:
                    copy_model(swa_model, model)
                swa_n = args.swa_n
            if in_swa_phase and (epoch+1) % args.swa_freq == 0:
                print('SWA>>>:')
                moving_average(swa_model, model, 1.0 / (swa_n + 1))
                swa_n += 1
                bn_update(train_loader, swa_model)
                validate_and_save_swa(swa_model, swa_model_file, val_loader, save=True)

        # reset scheduler at each cycle
        lr_scheduler = WarmupCyclicalLR(
            "cos", args.base_lr, args.num_epochs, iters_per_epoch=len(train_loader), warmup_epochs=args.warmup_epochs)

In [37]:
# Run configuration consumed by create_model, train and train_epoch.
args = Namespace()
# Backbone name given to BengaliNet; also the checkpoint sub-directory.
args.backbone = 'se_resnext50_32x4d'
# Checkpoint file name under MODEL_DIR/<backbone>; train() appends '_swa' for the SWA copy.
args.ckp_name = 'model3_res50_fold4_256_512.pth'

# base_lr, num_epochs and warmup_epochs are passed to WarmupCyclicalLR.
args.base_lr = 8e-5
args.num_epochs = 60
# Each cycle iterates epochs start_epoch .. num_epochs - 1.
args.start_epoch = 10
args.warmup_epochs = 5

args.num_cycles = 100
args.batch_size = 280
args.val_batch_size = 512
# Passed to grid.set_prob(epoch, st_epochs) at the start of every epoch.
args.st_epochs = 5

# SWA updates happen once epoch + 1 >= swa_start, every swa_freq epochs;
# swa_n is the initial count of models assumed to be in the SWA average.
args.swa_start = 15
args.swa_freq = 3
args.swa_n = 5

# Beta(beta, beta) parameter for the CutMix ratio in train_epoch.
args.beta = 1.0
# NOTE: cutmix_prob is not read by the code in this notebook; train_epoch
# uses fixed thresholds (r < 0.3 CutMix, r > 0.7 grid mask).
args.cutmix_prob = 0.5

In [38]:
# Build the loaders for validation fold index 4 (args.ckp_name also says 'fold4').
# train() reads these two module-level names, so run this cell before calling it.
train_loader, val_loader = get_train_val_loaders(batch_size=args.batch_size, val_batch_size=args.val_batch_size, ifold=4)

(200840, 6)
(200840, 32332)
(160735, 6) (40105, 6)


In [39]:
train(args)

model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth, exist: True
loading ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth...
model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth_swa, exist: True
loading ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth_swa...

val: {'recall': 0.997497, 'recall_grapheme': 0.996707, 'recall_vowel': 0.998601, 'recall_consonant': 0.997973, 'acc_grapheme': 0.997432, 'acc_vowel': 0.999052, 'acc_consonant': 0.999052, 'old_recall': 0.998326, 'old_recall_grapheme': 0.997742, 'loss_grapheme': 0.019324, 'loss_vowel': 0.015373, 'loss_consonant': 0.011769}

val: {'recall': 0.997863, 'recall_grapheme': 0.997312, 'recall_vowel': 0.998694, 'recall_consonant': 0.998134, 'acc_grapheme': 0.997905, 'acc_vowel': 0.999127, 'acc_consonant': 0.999127, 'old_recall': 0.998566, 'old_recall_grapheme': 0.998129, 'loss_grapheme': 0.009049, 'loss_vowel': 0.004092, 'loss_consonant': 0.003665}


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff06e5cb158>
Traceback (most recent call last):
  File "/home/chec/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 926, in __del__
    self._shutdown_workers()
  File "/home/chec/anaconda3/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 906, in _shutdown_workers
    w.join()
  File "/home/chec/anaconda3/lib/python3.7/multiprocessing/process.py", line 140, in join
    res = self._popen.wait(timeout)
  File "/home/chec/anaconda3/lib/python3.7/multiprocessing/popen_fork.py", line 48, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/home/chec/anaconda3/lib/python3.7/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 


KeyboardInterrupt: 

In [39]:
train(args)

model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth, exist: True
loading ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth...
model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth_swa, exist: True
loading ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth_swa...

val: {'recall': 0.998326, 'recall_grapheme': 0.997742, 'recall_vowel': 0.99917, 'recall_consonant': 0.99865, 'acc_grapheme': 0.997432, 'acc_vowel': 0.999052, 'acc_consonant': 0.999052, 'loss_grapheme': 0.019324, 'loss_vowel': 0.015373, 'loss_consonant': 0.011769}

val: {'recall': 0.998566, 'recall_grapheme': 0.998129, 'recall_vowel': 0.999308, 'recall_consonant': 0.998697, 'acc_grapheme': 0.997905, 'acc_vowel': 0.999127, 'acc_consonant': 0.999127, 'loss_grapheme': 0.009049, 'loss_vowel': 0.004092, 'loss_consonant': 0.003665}
CYCLE: 1
   10 | 0.000074 | 160720/160735 | 1.7604 | 1.3775 |
val: {'recall': 0.997881, 'recall_grapheme': 0.997178,

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.997315, 'recall_grapheme': 0.996745, 'recall_vowel': 0.99874, 'recall_consonant': 0.99703, 'acc_grapheme': 0.996559, 'acc_vowel': 0.998654, 'acc_consonant': 0.998504, 'loss_grapheme': 0.018523, 'loss_vowel': 0.011894, 'loss_consonant': 0.010104}
SWA>>>:


100%|██████████| 574/574 [02:11<00:00,  4.34it/s]



val: {'recall': 0.998404, 'recall_grapheme': 0.997821, 'recall_vowel': 0.999278, 'recall_consonant': 0.998697, 'acc_grapheme': 0.997955, 'acc_vowel': 0.999077, 'acc_consonant': 0.999127, 'loss_grapheme': 0.008967, 'loss_vowel': 0.004131, 'loss_consonant': 0.003661}
   15 | 0.000067 | 160720/160735 | 2.1835 | 1.4455 |
val: {'recall': 0.997394, 'recall_grapheme': 0.996461, 'recall_vowel': 0.998589, 'recall_consonant': 0.998067, 'acc_grapheme': 0.996285, 'acc_vowel': 0.998479, 'acc_consonant': 0.998579, 'loss_grapheme': 0.022371, 'loss_vowel': 0.015577, 'loss_consonant': 0.012056}
   16 | 0.000065 | 160720/160735 | 0.1762 | 1.4121 |
val: {'recall': 0.997679, 'recall_grapheme': 0.99667, 'recall_vowel': 0.998597, 'recall_consonant': 0.998778, 'acc_grapheme': 0.996808, 'acc_vowel': 0.998654, 'acc_consonant': 0.998579, 'loss_grapheme': 0.015938, 'loss_vowel': 0.010189, 'loss_consonant': 0.008488}
   17 | 0.000064 | 160720/160735 | 1.5925 | 1.4009 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.99789, 'recall_grapheme': 0.997011, 'recall_vowel': 0.998952, 'recall_consonant': 0.998585, 'acc_grapheme': 0.996858, 'acc_vowel': 0.998878, 'acc_consonant': 0.998828, 'loss_grapheme': 0.02234, 'loss_vowel': 0.017368, 'loss_consonant': 0.013063}
SWA>>>:


100%|██████████| 574/574 [02:11<00:00,  4.51it/s]



val: {'recall': 0.998428, 'recall_grapheme': 0.997865, 'recall_vowel': 0.999278, 'recall_consonant': 0.998703, 'acc_grapheme': 0.99793, 'acc_vowel': 0.999077, 'acc_consonant': 0.999152, 'loss_grapheme': 0.008893, 'loss_vowel': 0.004094, 'loss_consonant': 0.003636}
   18 | 0.000062 | 160720/160735 | 3.1201 | 1.3766 |
val: {'recall': 0.997281, 'recall_grapheme': 0.996374, 'recall_vowel': 0.998527, 'recall_consonant': 0.99785, 'acc_grapheme': 0.995836, 'acc_vowel': 0.998329, 'acc_consonant': 0.998304, 'loss_grapheme': 0.017309, 'loss_vowel': 0.009962, 'loss_consonant': 0.008406}
   19 | 0.000060 | 160720/160735 | 1.1534 | 1.4050 |
val: {'recall': 0.995825, 'recall_grapheme': 0.995496, 'recall_vowel': 0.998441, 'recall_consonant': 0.993867, 'acc_grapheme': 0.994415, 'acc_vowel': 0.99813, 'acc_consonant': 0.995686, 'loss_grapheme': 0.022521, 'loss_vowel': 0.009307, 'loss_consonant': 0.016431}
   20 | 0.000058 | 160720/160735 | 2.5372 | 1.4057 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.996349, 'recall_grapheme': 0.996183, 'recall_vowel': 0.998538, 'recall_consonant': 0.994491, 'acc_grapheme': 0.995587, 'acc_vowel': 0.99828, 'acc_consonant': 0.996135, 'loss_grapheme': 0.018005, 'loss_vowel': 0.008804, 'loss_consonant': 0.013801}
SWA>>>:


100%|██████████| 574/574 [02:10<00:00,  4.40it/s]



val: {'recall': 0.998434, 'recall_grapheme': 0.997883, 'recall_vowel': 0.999278, 'recall_consonant': 0.998692, 'acc_grapheme': 0.997955, 'acc_vowel': 0.999077, 'acc_consonant': 0.999102, 'loss_grapheme': 0.008915, 'loss_vowel': 0.004129, 'loss_consonant': 0.00362}
   21 | 0.000056 | 160720/160735 | 0.7384 | 1.4176 |
val: {'recall': 0.997564, 'recall_grapheme': 0.996543, 'recall_vowel': 0.999089, 'recall_consonant': 0.998082, 'acc_grapheme': 0.996584, 'acc_vowel': 0.998953, 'acc_consonant': 0.998504, 'loss_grapheme': 0.020586, 'loss_vowel': 0.014321, 'loss_consonant': 0.01136}
   22 | 0.000054 | 160720/160735 | 0.8256 | 1.3718 |
val: {'recall': 0.9951, 'recall_grapheme': 0.995567, 'recall_vowel': 0.998273, 'recall_consonant': 0.99099, 'acc_grapheme': 0.995213, 'acc_vowel': 0.998055, 'acc_consonant': 0.993293, 'loss_grapheme': 0.019454, 'loss_vowel': 0.00987, 'loss_consonant': 0.021837}
   23 | 0.000052 | 160720/160735 | 0.0971 | 1.5003 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.997128, 'recall_grapheme': 0.996122, 'recall_vowel': 0.998384, 'recall_consonant': 0.997885, 'acc_grapheme': 0.99611, 'acc_vowel': 0.998554, 'acc_consonant': 0.998329, 'loss_grapheme': 0.020543, 'loss_vowel': 0.013576, 'loss_consonant': 0.01216}
SWA>>>:


100%|██████████| 574/574 [02:11<00:00,  4.45it/s]



val: {'recall': 0.998433, 'recall_grapheme': 0.99787, 'recall_vowel': 0.999293, 'recall_consonant': 0.998697, 'acc_grapheme': 0.997905, 'acc_vowel': 0.999102, 'acc_consonant': 0.999127, 'loss_grapheme': 0.008952, 'loss_vowel': 0.004144, 'loss_consonant': 0.003633}
   24 | 0.000050 | 160720/160735 | 0.1306 | 1.3305 |
val: {'recall': 0.997895, 'recall_grapheme': 0.997182, 'recall_vowel': 0.998842, 'recall_consonant': 0.998375, 'acc_grapheme': 0.996858, 'acc_vowel': 0.998753, 'acc_consonant': 0.998703, 'loss_grapheme': 0.017344, 'loss_vowel': 0.012411, 'loss_consonant': 0.009205}
   25 | 0.000048 | 160720/160735 | 1.0223 | 1.4663 |
val: {'recall': 0.995599, 'recall_grapheme': 0.996075, 'recall_vowel': 0.998453, 'recall_consonant': 0.991793, 'acc_grapheme': 0.995636, 'acc_vowel': 0.998205, 'acc_consonant': 0.995038, 'loss_grapheme': 0.017952, 'loss_vowel': 0.008977, 'loss_consonant': 0.017277}
   26 | 0.000046 | 160720/160735 | 0.0413 | 1.3641 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.996972, 'recall_grapheme': 0.99565, 'recall_vowel': 0.99845, 'recall_consonant': 0.998139, 'acc_grapheme': 0.995661, 'acc_vowel': 0.998304, 'acc_consonant': 0.998504, 'loss_grapheme': 0.016935, 'loss_vowel': 0.009717, 'loss_consonant': 0.007878}
SWA>>>:


100%|██████████| 574/574 [02:11<00:00,  4.31it/s]



val: {'recall': 0.998418, 'recall_grapheme': 0.997837, 'recall_vowel': 0.999293, 'recall_consonant': 0.998703, 'acc_grapheme': 0.997881, 'acc_vowel': 0.999102, 'acc_consonant': 0.999152, 'loss_grapheme': 0.008959, 'loss_vowel': 0.004119, 'loss_consonant': 0.003574}
   27 | 0.000044 | 160720/160735 | 0.0732 | 1.3413 |
val: {'recall': 0.99736, 'recall_grapheme': 0.996534, 'recall_vowel': 0.998606, 'recall_consonant': 0.997766, 'acc_grapheme': 0.995836, 'acc_vowel': 0.998454, 'acc_consonant': 0.99823, 'loss_grapheme': 0.019473, 'loss_vowel': 0.012077, 'loss_consonant': 0.010996}
   28 | 0.000042 | 160720/160735 | 3.3739 | 1.3072 |
val: {'recall': 0.997646, 'recall_grapheme': 0.996685, 'recall_vowel': 0.998936, 'recall_consonant': 0.998277, 'acc_grapheme': 0.996384, 'acc_vowel': 0.998778, 'acc_consonant': 0.998629, 'loss_grapheme': 0.022842, 'loss_vowel': 0.018021, 'loss_consonant': 0.012886}
   29 | 0.000040 | 160720/160735 | 2.6277 | 1.3237 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.997299, 'recall_grapheme': 0.996195, 'recall_vowel': 0.998489, 'recall_consonant': 0.998315, 'acc_grapheme': 0.995861, 'acc_vowel': 0.998479, 'acc_consonant': 0.998329, 'loss_grapheme': 0.019989, 'loss_vowel': 0.011532, 'loss_consonant': 0.010559}
SWA>>>:


100%|██████████| 574/574 [02:12<00:00,  4.44it/s]



val: {'recall': 0.998446, 'recall_grapheme': 0.997889, 'recall_vowel': 0.999304, 'recall_consonant': 0.998703, 'acc_grapheme': 0.997856, 'acc_vowel': 0.999127, 'acc_consonant': 0.999152, 'loss_grapheme': 0.008947, 'loss_vowel': 0.004103, 'loss_consonant': 0.003559}
   30 | 0.000038 | 160720/160735 | 2.2773 | 1.3233 |
val: {'recall': 0.996512, 'recall_grapheme': 0.995664, 'recall_vowel': 0.998166, 'recall_consonant': 0.996554, 'acc_grapheme': 0.995462, 'acc_vowel': 0.998005, 'acc_consonant': 0.997357, 'loss_grapheme': 0.017756, 'loss_vowel': 0.009567, 'loss_consonant': 0.010789}
   31 | 0.000036 | 160720/160735 | 0.1188 | 1.2947 |
val: {'recall': 0.997565, 'recall_grapheme': 0.996707, 'recall_vowel': 0.998698, 'recall_consonant': 0.998149, 'acc_grapheme': 0.996484, 'acc_vowel': 0.998629, 'acc_consonant': 0.998604, 'loss_grapheme': 0.017034, 'loss_vowel': 0.010864, 'loss_consonant': 0.008605}
   32 | 0.000034 | 160720/160735 | 0.1340 | 1.3819 |

  0%|          | 0/574 [00:00<?, ?it/s]


val: {'recall': 0.997694, 'recall_grapheme': 0.996801, 'recall_vowel': 0.998896, 'recall_consonant': 0.998277, 'acc_grapheme': 0.996783, 'acc_vowel': 0.998753, 'acc_consonant': 0.998554, 'loss_grapheme': 0.016086, 'loss_vowel': 0.010713, 'loss_consonant': 0.008164}
SWA>>>:


100%|██████████| 574/574 [02:11<00:00,  4.19it/s]



val: {'recall': 0.998468, 'recall_grapheme': 0.99792, 'recall_vowel': 0.999322, 'recall_consonant': 0.998709, 'acc_grapheme': 0.99793, 'acc_vowel': 0.999152, 'acc_consonant': 0.999177, 'loss_grapheme': 0.008977, 'loss_vowel': 0.004089, 'loss_consonant': 0.003533}
   33 | 0.000032 | 160720/160735 | 0.5514 | 1.4002 |
val: {'recall': 0.994198, 'recall_grapheme': 0.995598, 'recall_vowel': 0.997433, 'recall_consonant': 0.988165, 'acc_grapheme': 0.995238, 'acc_vowel': 0.997531, 'acc_consonant': 0.990325, 'loss_grapheme': 0.022824, 'loss_vowel': 0.013254, 'loss_consonant': 0.029641}
   34 | 0.000030 | 160720/160735 | 0.0562 | 1.3230 |

KeyboardInterrupt: 

In [39]:
# Launch another training run with the current `args`.
# In the recorded output for this cell the run finds and loads the existing
# fold-4 checkpoint, reports validation metrics for it, and then begins
# "CYCLE: 1"; no "_swa" checkpoint existed at that point.
# NOTE(review): the recorded run ends in a KeyboardInterrupt, i.e. it appears
# to have been stopped manually rather than running to completion — confirm
# the saved checkpoint is the one intended before relying on it.
train(args)

model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth, exist: True
loading ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth...
model file: ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth_swa, exist: False

val: {'recall': 0.993159, 'recall_grapheme': 0.990407, 'recall_vowel': 0.995688, 'recall_consonant': 0.996137, 'acc_grapheme': 0.990849, 'acc_vowel': 0.99626, 'acc_consonant': 0.996434, 'loss_grapheme': 0.07707, 'loss_vowel': 0.065005, 'loss_consonant': 0.044777}
CYCLE: 1
    0 | 0.000195 | 160720/160735 | 1.9432 | 1.6098 |
val: {'recall': 0.995326, 'recall_grapheme': 0.993633, 'recall_vowel': 0.997483, 'recall_consonant': 0.996555, 'acc_grapheme': 0.99237, 'acc_vowel': 0.997182, 'acc_consonant': 0.997307, 'loss_grapheme': 0.050218, 'loss_vowel': 0.039879, 'loss_consonant': 0.032145}
###>>>>> saved ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth
    1 | 0.000362 | 160720/160735 | 1.9828 | 1.5628 |
va

   13 | 0.000082 | 160720/160735 | 0.1083 | 1.4509 |
val: {'recall': 0.997592, 'recall_grapheme': 0.99663, 'recall_vowel': 0.998967, 'recall_consonant': 0.998139, 'acc_grapheme': 0.996285, 'acc_vowel': 0.998629, 'acc_consonant': 0.998354, 'loss_grapheme': 0.022873, 'loss_vowel': 0.016748, 'loss_consonant': 0.01383}
   14 | 0.000079 | 160720/160735 | 0.1230 | 1.4195 |
val: {'recall': 0.997375, 'recall_grapheme': 0.996603, 'recall_vowel': 0.998253, 'recall_consonant': 0.998043, 'acc_grapheme': 0.99616, 'acc_vowel': 0.99813, 'acc_consonant': 0.998404, 'loss_grapheme': 0.018192, 'loss_vowel': 0.011561, 'loss_consonant': 0.009797}
   15 | 0.000077 | 160720/160735 | 0.1504 | 1.3703 |
val: {'recall': 0.997769, 'recall_grapheme': 0.997123, 'recall_vowel': 0.998774, 'recall_consonant': 0.998057, 'acc_grapheme': 0.996833, 'acc_vowel': 0.998728, 'acc_consonant': 0.998554, 'loss_grapheme': 0.024975, 'loss_vowel': 0.018703, 'loss_consonant': 0.014175}
###>>>>> saved ./new-model3-ckps/se_resnext50_3

   39 | 0.000010 | 160720/160735 | 1.5646 | 1.4261 |
val: {'recall': 0.997367, 'recall_grapheme': 0.996963, 'recall_vowel': 0.998625, 'recall_consonant': 0.996917, 'acc_grapheme': 0.996434, 'acc_vowel': 0.998504, 'acc_consonant': 0.998479, 'loss_grapheme': 0.016916, 'loss_vowel': 0.011102, 'loss_consonant': 0.008791}
   40 | 0.000008 | 160720/160735 | 0.1237 | 1.4017 |
val: {'recall': 0.998326, 'recall_grapheme': 0.997742, 'recall_vowel': 0.99917, 'recall_consonant': 0.99865, 'acc_grapheme': 0.997432, 'acc_vowel': 0.999052, 'acc_consonant': 0.999052, 'loss_grapheme': 0.019324, 'loss_vowel': 0.015373, 'loss_consonant': 0.011769}
###>>>>> saved ./new-model3-ckps/se_resnext50_32x4d/model3_res50_fold4_256_512.pth
   41 | 0.000006 | 160720/160735 | 0.0901 | 1.3473 |
val: {'recall': 0.997868, 'recall_grapheme': 0.997142, 'recall_vowel': 0.998767, 'recall_consonant': 0.998421, 'acc_grapheme': 0.996759, 'acc_vowel': 0.998778, 'acc_consonant': 0.998753, 'loss_grapheme': 0.017633, 'loss_vowel': 

KeyboardInterrupt: 