# Shopee Training

Thanks to the authors of the public 2nd place and 11th place solutions:
https://www.kaggle.com/lyakaap/2nd-place-solution and
https://www.kaggle.com/shigemitsutomizawa/shopee-training-bert-11th-place-simple-solution.
Only the CPF loss part is our own original work.

In [1]:
import datetime
import time

# Wall-clock reference used at the end of the notebook to report elapsed time.
start_time = time.time()
# Log the moment the run started.
print(datetime.datetime.now())

In [2]:
### Google Colab
# Environment bootstrap: extends the import path and, when running inside
# Google Colab, mounts Drive, installs packages and unpacks the dataset.
# Either branch defines TRAIN_CSV, the path of the training CSV.

import sys

# Make the pytorch-image-models checkout importable
# (presumably so that `import timm` resolves to this copy — TODO confirm).
sys.path.append('../input/pytorchimagemodels/pytorch-image-models-master')

if 'google.colab' in sys.modules:
    # Print GPU status and the first two lines of /proc/meminfo.
    !nvidia-smi
    !cat /proc/meminfo | head -2
    from google.colab import drive
    mount_dir = '/content/drive'
    drive.mount(mount_dir)
    TRAIN_CSV = './train.csv'
    # Each step below runs only while ./setup_flag does not exist; the flag is
    # created at the end, so re-running the cell skips installs and unzip.
    !test -e './setup_flag' || pip install git+https://github.com/huggingface/transformers.git
    !test -e './setup_flag' || pip install sentencepiece
    #!test -e './setup_flag' || pip install -U git+https://github.com/ildoonet/pytorch-gradual-warmup-lr.git
    !test -e './setup_flag' || unzip -o "/content/drive/MyDrive/Python/kaggle/Shopee/input/shopee-product-matching.zip" -d . > /dev/null
    !touch setup_flag
else:
    # Outside Colab the CSV is read from the ../input dataset directory.
    TRAIN_CSV = '../input/shopee-product-matching/train.csv'

In [3]:
import os
import gc
import math
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.neighbors import NearestNeighbors

import torch
from torch import nn 
import torch.nn.functional as F 
from transformers import AutoTokenizer, AutoModel

import timm

import warnings
warnings.filterwarnings('ignore')

In [4]:
class CFG:
    """Notebook-wide configuration, accessed through class attributes.

    The class body runs once at definition time: it resolves the device,
    prints it, derives the scheduler values and picks ``save_model_path``
    according to the runtime environment.
    """

    train = 1
    compute_cv = 0  # set False to train model for submission

    max_length = 128

    bert_model_name = 'deit'

    max_sample_count = 1
    smoothing = 1e-5

    ### ArcFace
    scale = 50
    margin = 0.8
    fc_dim = 768
    seed = 23689
    classes = 11014

    ### Training
    n_splits = 5  # GroupKFold(n_splits)
    batch_size = 20
    accum_iter = 1  # 1 if use_sam = True
    epochs = 15
    retrain = 0
    min_save_epoch = epochs // 3
    save_interval = 3
    use_sam = 0  # SAM (Sharpness-Aware Minimization for Efficiently Improving Generalization)
    use_amp = 1  # Automatic Mixed Precision
    num_workers = 4  # On Windows, set 0 or export train_fn and TitleDataset as .py files for faster training.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)

    ###
    img_size = 384
    images_dir = '../input/shopee-product-matching/train_images/'

    ### NearestNeighbors
    bert_knn = 50
    bert_knn_threshold = 0.60  # Cosine distance threshold

    ### GradualWarmupSchedulerV2 (lr_start -> lr_max -> lr_min)
    lr_s = 50  # the backbone learning rate is lr_start / lr_s
    scheduler_params = {
        "lr_start": 7.5e-6,
        "lr_max": 1e-3,
        "lr_min": 1e-5,  # 1.5e-5,
    }
    multiplier = scheduler_params['lr_max'] / scheduler_params['lr_start']
    eta_min = scheduler_params['lr_min']  # last minimum learning rate
    freeze_epo = 0
    warmup_epo = 2
    cosine_epo = epochs - freeze_epo - warmup_epo

    ### CPF
    # CPF.__init__ reads `embed_dim` and `n_classes` from the options object
    # it receives, and ShopeeNet hands it this class. Expose both as aliases
    # of the existing settings so that the loss head can be constructed.
    embed_dim = fc_dim
    n_classes = classes
    loss_cpf_tau = 0.25
    loss_cpf_psi = 0.25
    loss_cpf_sp = 1.5
    loss_cpf_sn = 1.0
    loss_cpf_mu = 1.0
    loss_cpf_b = 2

    trained_path = ''
    ### save_model_path
    if 'google.colab' in sys.modules:  # for Google Colab
        save_model_path = f"{mount_dir}/MyDrive/Python/kaggle/Shopee/output/{bert_model_name.rsplit('/', 1)[-1]}_epoch{epochs}-bs{batch_size}x{accum_iter}.pt"
    elif 'kaggle_web_client' in sys.modules:  # for kaggle notebook
        save_model_path = f"./{bert_model_name}.pt"
    else:  # for local PC
        save_model_path = f"../input/shopee-arcface-models/{bert_model_name}.pt"


# When training is switched off, force `retrain` on; the model-creation cell
# uses `not CFG.retrain` to decide whether to download pretrained weights.
if not CFG.train:
    CFG.retrain = True

In [5]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN flags.

    Args:
        seed: integer seed applied to every generator; also exported as the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The seeding calls are independent of each other, so apply them in turn.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = True  # set True to be faster

# Seed every RNG with the configured seed before any model or loader is built.
seed_everything(CFG.seed)

# Classes and Functions

In [6]:
### GradualWarmupScheduler
# https://github.com/ildoonet/pytorch-gradual-warmup-lr

from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau


class GradualWarmupScheduler(_LRScheduler):
    """Gradually warm up (increase) the learning rate of an optimizer.

    Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        multiplier: target learning rate = base lr * multiplier if
            multiplier > 1.0. If multiplier = 1.0, lr starts from 0 and ends
            up with the base_lr. Values below 1.0 raise ``ValueError``.
        total_epoch: the target learning rate is reached gradually at
            total_epoch.
        after_scheduler: scheduler used once total_epoch has been passed
            (e.g. ReduceLROnPlateau).
    """

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        """Store the warm-up settings, then initialise the base scheduler."""
        self.multiplier = multiplier
        if self.multiplier < 1.:
            raise ValueError('multiplier should be greater thant or equal to 1.')
        self.total_epoch = total_epoch
        self.after_scheduler = after_scheduler
        # Becomes True the first time get_lr() hands over to after_scheduler.
        self.finished = False
        super(GradualWarmupScheduler, self).__init__(optimizer)

    def get_lr(self):
        """Return the learning rates for the current ``last_epoch``.

        While ``last_epoch <= total_epoch`` the rates are interpolated
        linearly; afterwards they come from ``after_scheduler`` when one was
        given, otherwise they stay at ``base_lr * multiplier``.
        """
        if self.last_epoch > self.total_epoch:
            if self.after_scheduler:
                if not self.finished:
                    # One-time hand-over: the follow-up scheduler starts from
                    # the warmed-up rates.
                    self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
                    self.finished = True
                return self.after_scheduler.get_last_lr()
            return [base_lr * self.multiplier for base_lr in self.base_lrs]

        if self.multiplier == 1.0:
            # Ramp from 0 up to base_lr.
            return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs]
        else:
            # Ramp from base_lr up to base_lr * multiplier.
            return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]

    def step_ReduceLROnPlateau(self, metrics, epoch=None):
        """Step variant used when ``after_scheduler`` is a ReduceLROnPlateau.

        During warm-up the rates are written directly into the optimizer's
        param groups; afterwards ``metrics`` is forwarded to
        ``after_scheduler.step``.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch if epoch != 0 else 1  # ReduceLROnPlateau is called at the end of epoch, whereas others are called at beginning
        if self.last_epoch <= self.total_epoch:
            warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs]
            for param_group, lr in zip(self.optimizer.param_groups, warmup_lr):
                param_group['lr'] = lr
        else:
            # NOTE(review): `epoch` was already replaced above when it was
            # None, so the first branch below looks unreachable — confirm.
            if epoch is None:
                self.after_scheduler.step(metrics, None)
            else:
                self.after_scheduler.step(metrics, epoch - self.total_epoch)

    def step(self, epoch=None, metrics=None):
        """Advance the schedule by one step.

        For a ReduceLROnPlateau follow-up scheduler the call is routed to
        ``step_ReduceLROnPlateau``. Otherwise, once warm-up has finished, the
        follow-up scheduler is stepped and its last rates are recorded in
        ``_last_lr``; before that the base-class ``step`` is used.
        """
        if type(self.after_scheduler) != ReduceLROnPlateau:
            if self.finished and self.after_scheduler:
                if epoch is None:
                    self.after_scheduler.step(None)
                else:
                    # The follow-up scheduler counts epochs from the end of warm-up.
                    self.after_scheduler.step(epoch - self.total_epoch)
                self._last_lr = self.after_scheduler.get_last_lr()
            else:
                return super(GradualWarmupScheduler, self).step(epoch)
        else:
            self.step_ReduceLROnPlateau(metrics, epoch)

In [7]:
### GradualWarmupSchedulerV2

class GradualWarmupSchedulerV2(GradualWarmupScheduler):
    """Warm-up scheduler whose post-warm-up rates come from ``after_scheduler.get_lr()``."""

    def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None):
        """Forward all arguments unchanged to ``GradualWarmupScheduler``."""
        super().__init__(optimizer, multiplier, total_epoch, after_scheduler)

    def get_lr(self):
        """Return the learning rates for the current ``last_epoch``."""
        # Warm-up phase: interpolate linearly towards the target rate.
        if self.last_epoch <= self.total_epoch:
            if self.multiplier == 1.0:
                fraction = float(self.last_epoch) / self.total_epoch
                return [base_lr * fraction for base_lr in self.base_lrs]
            ramp = (self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.
            return [base_lr * ramp for base_lr in self.base_lrs]

        # Past warm-up: the target rates, or whatever the follow-up scheduler says.
        target_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs]
        if not self.after_scheduler:
            return target_lrs
        if not self.finished:
            # One-time hand-over of the warmed-up rates.
            self.after_scheduler.base_lrs = target_lrs
            self.finished = True
        return self.after_scheduler.get_lr()

In [8]:
from timm.data import create_dataset, create_loader, resolve_data_config, Mixup, FastCollateMixup, AugMixDataset
# Keyword arguments for timm's Mixup; the number of classes comes from CFG.
mixup_args = {
    'mixup_alpha': 0.35,
    'cutmix_alpha': 0.0,
    'cutmix_minmax': None,
    'prob': 1.0,
    'switch_prob': 0.5,
    'mode': 'pair',
    'label_smoothing': 0.00,
    'num_classes': CFG.classes,
}
# Callable applied by train_fn to every (input, target) batch.
mixup_fn = Mixup(**mixup_args)

In [9]:
def _unpack_loss(model_output):
    """Return the loss from a model output.

    Accepts either a ``(logits, loss)`` style pair, in which case the last
    element is the loss, or a bare loss tensor, which is returned unchanged.
    """
    if isinstance(model_output, (tuple, list)):
        return model_output[-1]
    return model_output


def _forward_loss(model, images, targets, use_amp):
    """Run one forward pass (under autocast when ``use_amp`` is truthy) and return the loss."""
    with torch.cuda.amp.autocast(enabled=bool(use_amp)):
        return _unpack_loss(model(images, targets))


def train_fn(model, data_loader, optimizer, scheduler, use_sam, accum_iter, epoch, device, use_amp):
    """Train ``model`` for one epoch and step the scheduler once at the end.

    Every batch is moved to ``device`` and passed through the module-level
    ``mixup_fn`` before the forward pass.

    Args:
        model: module called as ``model(images, targets)``; may return the
            loss directly or a pair whose last element is the loss.
        data_loader: iterable of ``(input, target)`` batches.
        optimizer: optimizer; with ``use_sam`` it must provide
            ``first_step`` / ``second_step``.
        scheduler: learning-rate scheduler, stepped once per call.
        use_sam: truthy to use the two-pass SAM update.
        accum_iter: number of batches to accumulate before an optimizer step
            (only used when ``use_sam`` is falsy).
        epoch: zero-based epoch index, shown in the progress bar as epoch + 1.
        device: device the batches are moved to.
        use_amp: truthy to run the forward pass under autocast and, in the
            non-SAM path, to scale gradients with a GradScaler.

    Returns:
        Tuple ``(model, mean_loss)`` where ``mean_loss`` is the summed batch
        loss divided by ``len(data_loader)``.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler() if use_amp else None
    fin_loss = 0.0
    tk = tqdm(data_loader, desc="Epoch:" + str(epoch+1), ncols=250)

    for t, (images, targets) in enumerate(tk):
        images, targets = images.to(device), targets.to(device)
        images, targets = mixup_fn(images, targets)

        if use_sam:
            # First pass: gradients at the current weights.
            loss = _forward_loss(model, images, targets, use_amp)
            loss.mean().backward()
            optimizer.first_step(zero_grad=True)
            fin_loss += loss.item()
            # Second pass: gradients after first_step, then the real update.
            loss_second = _forward_loss(model, images, targets, use_amp)
            loss_second.mean().backward()
            optimizer.second_step(zero_grad=True)
            optimizer.zero_grad()
        else:
            loss = _forward_loss(model, images, targets, use_amp)
            if use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()
            fin_loss += loss.item()
            # mini-batch accumulation
            if (t + 1) % accum_iter == 0:
                if use_amp:
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    optimizer.step()
                optimizer.zero_grad()

        tk.set_postfix({'loss' : '%.3f' %float(fin_loss/(t+1)), 'LR' : optimizer.param_groups[0]['lr']})

    scheduler.step()
    return model, fin_loss / len(data_loader)

In [10]:
### ArcFace
from timm.loss import SoftTargetCrossEntropy
class ArcMarginProduct(nn.Module):
    """Cosine classifier that applies an additive angular margin to the target entries.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (rows of the weight matrix).
        scale: factor the final logits are multiplied by.
        margin: angular margin (radians) added to the target angle.
        easy_margin: selects which fallback is used where the margin is not applied.
        ls_eps: stored label-smoothing value (not used in ``forward``).
    """

    def __init__(self, in_features, out_features, scale=30.0, margin=0.50, easy_margin=False, ls_eps=0.0):
        super().__init__()
        self.in_features, self.out_features = in_features, out_features
        self.scale, self.margin = scale, margin
        self.ls_eps = ls_eps  # label smoothing

        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        # Constants of cos(theta + margin) and of the fallback branch.
        self.cos_m, self.sin_m = math.cos(margin), math.sin(margin)
        self.th = math.cos(math.pi - margin)
        self.mm = math.sin(math.pi - margin) * margin

        # Targets arrive as soft (mixed) one-hot rows, hence the soft-target loss.
        self.criterion = SoftTargetCrossEntropy()

    def forward(self, input, one_hot):
        """Return ``(scaled_logits, loss)`` for embeddings ``input`` and soft targets ``one_hot``."""
        # cos(theta) between normalised embeddings and normalised class weights.
        similarity = F.linear(F.normalize(input), F.normalize(self.weight))
        if CFG.use_amp:
            similarity = similarity.float()  # if CFG.use_amp
        cosine = similarity.clamp(-1, 1)

        # phi = cos(theta + margin)
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            keep_phi, fallback = cosine > 0, cosine
        else:
            keep_phi, fallback = cosine > self.th, cosine - self.mm
        phi = torch.where(keep_phi, phi, fallback)

        # Margin logits weighted by the target mass, plain cosine elsewhere.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.scale

        return output, self.criterion(output, one_hot)

In [12]:
import numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
import batchminer

"""================================================================================================="""

### This implementation follows the pseudocode provided in the original paper.
class CPF(torch.nn.Module):
    """Loss head holding one learnable weight row per class.

    ``opt`` must provide ``embed_dim``, ``n_classes``, ``device`` and the
    ``loss_cpf_*`` hyper-parameters (tau, psi, sp, sn, mu, b).
    """

    def __init__(self, opt):
        super().__init__()
        self.par = opt

        # Class weight matrix of shape (n_classes, embed_dim).
        self.in_features = opt.embed_dim
        self.out_features = opt.n_classes
        self.weight = nn.Parameter(torch.FloatTensor(self.out_features, self.in_features))
        nn.init.xavier_uniform_(self.weight)

        self.ls_eps = 0  # label smoothing is off unless this is raised

        # Hyper-parameters copied from the options object.
        self.tau = opt.loss_cpf_tau
        self.psi = opt.loss_cpf_psi
        self.sp = opt.loss_cpf_sp
        self.sn = opt.loss_cpf_sn
        self.mu = opt.loss_cpf_mu
        self.b = opt.loss_cpf_b

    def forward(self, batch, one_hot, **kwargs):
        """Return the scalar loss ``1 - tp / (tp + lossp + lossn)``.

        Args:
            batch: embeddings, compared by cosine similarity with the class weights.
            one_hot: per-class target weights with the same shape as the similarity matrix.
            **kwargs: accepted and ignored.
        """
        targets = one_hot.to(self.par.device)
        similarity = F.linear(F.normalize(batch), F.normalize(self.weight))

        # Target-weighted positive similarity plus the constant offset b.
        tp = ((similarity.clamp(min=0.0) * targets) * 2).sum() + self.b

        if self.ls_eps > 0:
            targets = (1 - self.ls_eps) * targets + self.ls_eps / self.out_features

        # Positive term: distance to the target, weighted by a detached exponential.
        gap = 1.0 - similarity
        pos_weight = torch.exp(gap * self.sp).detach()
        lossp = (gap * pos_weight * targets).sum()

        # Negative term: only similarities above tau contribute.
        above_tau = similarity > self.tau
        hard = similarity[above_tau]
        neg_weight = torch.exp((hard - self.mu) * self.sn).detach()
        lossn = ((hard - self.tau) * neg_weight * (1 - targets[above_tau])).sum()

        return 1.0 - tp / (tp + lossp + lossn)

In [13]:
def gem(x, p=3, eps=1e-6):
    """Pool the last two dimensions of ``x`` with a generalized (power) mean.

    Values are clamped to at least ``eps``, raised to ``p``, averaged over the
    whole spatial window and raised to ``1 / p``.
    """
    window = (x.size(-2), x.size(-1))
    powered = x.clamp(min=eps).pow(p)
    return F.avg_pool2d(powered, window).pow(1. / p)

    
class ShopeeNet(nn.Module):
    """Backbone + embedding head + CPF loss head.

    Args:
        backbone: model exposing ``reset_classifier``, ``num_features`` and
            ``forward_features``.
        num_classes: accepted for interface compatibility (the loss head is
            configured from ``CFG``).
        fc_dim: output size of the embedding layer and of the batch norm.
        s, margin: accepted for interface compatibility; not used here.
        p: exponent passed to ``gem`` pooling.
    """

    def __init__(self,
                 backbone,
                 num_classes,
                 fc_dim=512,
                 s=30, margin=0.5, p=3):
        super().__init__()

        backbone.reset_classifier(num_classes=0)  # remove classifier
        self.backbone = backbone

        self.classifier = nn.Linear(self.backbone.num_features, fc_dim)
        self.bn = nn.BatchNorm1d(fc_dim)
        self._init_params()
        self.p = p
        self.final = CPF(CFG)

    def _init_params(self):
        """Initialise the embedding layer (Xavier normal, zero bias) and the batch norm (1, 0)."""
        nn.init.xavier_normal_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def extract_feat(self, x):
        """Return the batch-normalised embedding for a batch of images."""
        n_images = x.shape[0]
        features = self.backbone.forward_features(x)
        if isinstance(features, tuple):
            # Two outputs from the backbone: average them and skip the
            # embedding layer.
            embedding = (features[0] + features[1]) / 2
        else:
            # Feature map: GeM-pool, flatten, then project to fc_dim.
            pooled = gem(features, p=self.p).view(n_images, -1)
            embedding = self.classifier(pooled)
        return self.bn(embedding)

    def forward(self, image, label):
        """Return the output of the CPF head for ``image`` and ``label``."""
        return self.final(self.extract_feat(image), label)

# Dataset

In [14]:
import os
import cv2
import numpy as np 

import torch
from PIL import Image

class ShopeeDataset(torch.utils.data.Dataset):
    """Image dataset backed by a dataframe with ``image`` and ``label_group`` columns.

    Args:
        df: dataframe with one row per sample.
        root_dir: directory the ``image`` file names are relative to.
        transform: optional callable applied to the loaded RGB image.
        training: when True items are ``(image, label)``, otherwise just ``image``.
    """

    def __init__(self, df, root_dir, transform=None, training=True):
        self.df, self.root_dir = df, root_dir
        self.transform, self.training = transform, training

    def __len__(self):
        """Number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk and apply the transform, if any."""
        record = self.df.iloc[idx]
        label = record.label_group

        image = Image.open(os.path.join(self.root_dir, record.image)).convert('RGB')
        if self.transform:
            image = self.transform(image)

        if not self.training:
            return image
        return image, torch.tensor(label).long()

In [15]:
from PIL import Image
from torchvision.io import read_image
from torchvision.transforms import Resize, RandomHorizontalFlip, ColorJitter, Normalize, Compose, RandomResizedCrop, CenterCrop, ToTensor

def get_valid_transforms(img_size=512):
    """Build the deterministic evaluation pipeline for PIL images.

    The image is resized to ``img_size + 32`` with bicubic interpolation,
    centre-cropped to ``img_size`` x ``img_size``, converted to a tensor and
    normalised with the mean/std constants below.

    Args:
        img_size: side length of the square crop fed to the model.

    Returns:
        A ``Compose`` transform.
    """
    return Compose([
        Resize(size=img_size + 32, interpolation=Image.BICUBIC),
        CenterCrop((img_size, img_size)),
        # Normalize operates on tensors, so convert the PIL image first.
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

In [16]:
from timm.data import create_transform

def get_train_transforms(img_size=512):
    """Build the training augmentation pipeline through timm's ``create_transform``.

    Args:
        img_size: value passed as ``input_size``.
    """
    augmentation = {
        'input_size': img_size,
        'scale': (0.6, 1.0),  # Default: (0.08, 1.0)
        'ratio': (1.0, 1.0),  # Default: (3. / 4., 4. / 3.)
        'hflip': 0.5,
        'vflip': 0.,
        'is_training': True,
        'color_jitter': 0.1,
        'auto_augment': 'rand-m3-n1-mstd0.5-inc1',
        're_prob': 0.1,  # RandomErasing probability
        're_mode': 'pixel',  # ['const', 'rand', 'pixel']
        're_count': 1,  # number of erasing blocks per image
    }
    return create_transform(**augmentation)

# Setup

In [17]:
### Create Dataloader

print("Compute CV =", CFG.compute_cv)

df = pd.read_csv(TRAIN_CSV)
df.drop_duplicates(subset=['image'], inplace=True, ignore_index=True)
df['target'] = df.label_group.map(df.groupby('label_group').posting_id.agg('unique').to_dict())

labelencoder= LabelEncoder()
df['label_group'] = labelencoder.fit_transform(df['label_group'])


train_df = df.copy()
valid_df = pd.DataFrame()
print("train_df length =", len(train_df))
print("train_df classes =", len(train_df['label_group'].unique()))

In [18]:
def get_sampler(df):
    """Return a weighted sampler that down-weights rows of large label groups.

    Each row gets the weight ``1 / (group size ** 0.4)`` and the sampler draws
    as many samples as ``df`` has rows.
    """
    group_sizes = df.groupby('label_group').size()
    weight_per_group = 1 / (group_sizes ** 0.4)
    row_weights = weight_per_group.loc[df['label_group']].to_numpy()
    return torch.utils.data.WeightedRandomSampler(row_weights, len(row_weights))

In [19]:
# Training data: augmented images drawn through the weighted sampler.
train_dataset = ShopeeDataset(
    train_df,
    CFG.images_dir,
    transform=get_train_transforms(img_size=CFG.img_size),
)

# A sampler is supplied, so `shuffle` is left unset; `drop_last` keeps its default.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    sampler=get_sampler(train_df),
    batch_size=CFG.batch_size,
    num_workers=CFG.num_workers,
    pin_memory=True,
)

if CFG.compute_cv:
    # Validation data: images only, in order, with five times the batch size.
    valid_dataset = ShopeeDataset(
        valid_df,
        CFG.images_dir,
        transform=get_valid_transforms(img_size=CFG.img_size),
        training=False,
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 5,
        num_workers=CFG.num_workers,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

In [20]:
# Experiment settings; the model-creation cell reads 'backbone' and 'fc_dim'.
params = {
    'ver': 'v45',
    'size': 384,
    'test_size': 384,
    'lr': 0.001,
    'batch_size': 32,
    'optimizer': 'sam',
    'epochs': 18,
    'wd': 0.0,
    'backbone': 'vit_deit_base_distilled_patch16_384',
    'margin': 0.3,
    's': 50,
    'fc_dim': 768,
    'brightness': 0.2,
    'contrast': 0.2,
    'scale_lower': 0.2,
    'scale_upper': 1.0,
    'filter_wd': True,
    'p': 3.0,
    'p_eval': 6.0,
    'loss': 'CurricularFace',
}

In [21]:
# Backbone choice (alternative: "resnet34").
params['backbone'] = "deit_base_distilled_patch16_384"

# Pretrained weights are requested unless CFG.retrain is set.
backbone = timm.create_model(
    model_name=params['backbone'],
    pretrained=not CFG.retrain,
)
model = ShopeeNet(
    backbone,
    num_classes=CFG.classes,
    fc_dim=params['fc_dim'],
    s=CFG.scale,
    margin=CFG.margin,
    p=3,
)

In [22]:
### Create Optimizer

# One parameter group per sub-module. The heads start at lr_start, while the
# backbone starts at lr_start / lr_s.
_lr_start = CFG.scheduler_params['lr_start']
optimizer_grouped_parameters = [
    {'params': model.classifier.parameters(), 'lr': _lr_start},
    {'params': model.backbone.parameters(), 'lr': _lr_start / CFG.lr_s},
    {'params': model.bn.parameters(), 'lr': _lr_start},
    {'params': model.final.parameters(), 'lr': _lr_start},
]

# transformers' AdamW is deprecated and no longer shipped by recent releases,
# so use the PyTorch implementation. eps and weight_decay are set explicitly
# to the defaults the transformers optimizer used (1e-6 and 0.0).
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, eps=1e-6, weight_decay=0.0)

print("lr_start")
print("-" * 30)
for i, param_group in enumerate(optimizer.param_groups):
    print('Parameter Group ' + str(i) + ' :', param_group["lr"])

In [23]:
### Create Scheduler

# Cosine annealing down to eta_min, used once the warm-up has finished.
scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=CFG.cosine_epo - 2,
    eta_min=CFG.eta_min,
    last_epoch=-1,
)
# Linear warm-up over warmup_epo epochs that then hands over to the cosine schedule.
scheduler = GradualWarmupSchedulerV2(
    optimizer,
    multiplier=CFG.multiplier,
    total_epoch=CFG.warmup_epo,
    after_scheduler=scheduler_cosine,
)

# Training and Validation

In [24]:
# Report the configured number of training epochs.
print("Training epochs =", CFG.epochs)

In [None]:
max_f1_valid = 0.

start_epoch = 0
model = model.to(CFG.device)
for epoch in range(start_epoch, start_epoch + CFG.epochs):
    epoch_number = epoch + 1  # one-based, used for logging and file names

    if CFG.train:
        model, avg_loss_train = train_fn(
            model, train_dataloader, optimizer, scheduler,
            CFG.use_sam, CFG.accum_iter, epoch, CFG.device, CFG.use_amp,
        )

    # Write a checkpoint every `save_interval` epochs.
    if epoch_number % CFG.save_interval == 0:
        checkpoint_name = f"{CFG.bert_model_name}-epoch{epoch_number}.pt"
        print(f"loss:{avg_loss_train}, Saving model weights to {checkpoint_name}")
        checkpoint = {
            'epoch': epoch_number,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }
        torch.save(checkpoint, f"./{checkpoint_name}")

    # With training disabled a single pass through the loop is enough.
    if not CFG.train:
        break

In [None]:
# Report the wall-clock time since the first cell recorded start_time.
time_elapsed = time.time() - start_time
elapsed_minutes, elapsed_seconds = divmod(time_elapsed, 60)
print('Elapsed time: {:.0f} min {:.0f} sec'.format(elapsed_minutes, elapsed_seconds))
print(datetime.datetime.now())

End