# Previous kernels 

**Training**<br>
[eca_nfnet_l0 + ArcFace] : https://www.kaggle.com/parthdhameliya77/shopee-pytorch-eca-nfnet-l0-image-training

**Inference**<br>

ResNext50-32x4d (LB >= 0.72) : https://www.kaggle.com/parthdhameliya77/pytorch-resnext50-32x4d-image-tfidf-inference (Adam+relu activation) <br>
EfficientNet B3 (LB >= 0.723) : https://www.kaggle.com/parthdhameliya77/pytorch-efficientnet-b3-image-tfidf-inference (Adam+relu activation) <br>
EfficientNet B5 (LB >= 0.729) : https://www.kaggle.com/parthdhameliya77/pytorch-efficientnet-b3-image-tfidf-inference (Adam+relu activation) <br>
EfficientNet B5 (LB >= 0.729) : (Ranger+mish activation) <br>

# About this kernel 

In this training kernel, I used **'eca_nfnet_l0' (from timm)** + the **CurricularFace** module. 'eca_nfnet_l0' uses the **SiLU()** activation, so I replaced it with the **Mish()** activation. The reason for switching to Mish() is that I am using the **Ranger (RAdam + Lookahead) optimizer** here, and **Mish() + Ranger** gives good results (based on a few experiments; I may be wrong). You can try the same strategy with other models too.

<center><img src="https://www.programmersought.com/images/653/8746a02b316eef34dbd8bd83d10ee625.JPEG" width="440" height="440"/></center>

# Imports

In [1]:
# Extend the module search path with two Kaggle input directories so that
# the imports in the following cell can be resolved from them.
import sys

# NOTE(review): presumably provides custom_scheduler / custom_activation /
# custom_optimizer, which are imported below — confirm against the dataset.
sys.path.append('../input/shopee-competition-utils')
# NOTE(review): presumably an offline copy of timm — confirm.
sys.path.append('../input/timm-pytorch-image-models/pytorch-image-models-master')

In [2]:
import numpy as np 
import pandas as pd 

import torch 
from torch import nn 
from torch.utils.data import Dataset, DataLoader 

import albumentations
from albumentations.pytorch.transforms import ToTensorV2

from custom_scheduler import ShopeeScheduler
from custom_activation import replace_activations, Mish
from custom_optimizer import Ranger

import math 
import cv2
import timm 
import os 

from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm 

# Config

In [3]:
class CFG:
    """Static configuration shared by the dataset, model and training loop."""

    # --- Input locations -------------------------------------------------
    DATA_DIR = '../input/shopee-product-matching/train_images'
    TRAIN_CSV = '../input/shopee-product-matching/train.csv'

    # --- Image preprocessing ---------------------------------------------
    IMG_SIZE = 512                     # images are resized to IMG_SIZE x IMG_SIZE
    MEAN = [0.485, 0.456, 0.406]       # per-channel mean passed to Normalize
    STD = [0.229, 0.224, 0.225]        # per-channel std passed to Normalize

    # --- Training loop ---------------------------------------------------
    EPOCHS = 15
    BATCH_SIZE = 8
    NUM_WORKERS = 4
    DEVICE = 'cuda'

    # --- Margin head -----------------------------------------------------
    CLASSES = 11014                    # default number of output classes
    SCALE = 30                         # passed to the head as ``s``
    MARGIN = 0.5                       # passed to the head as ``m``

    # --- Backbone / embedding --------------------------------------------
    MODEL_NAME = 'eca_nfnet_l1'
    FC_DIM = 512

    # --- Learning-rate schedule (forwarded as keyword arguments) ---------
    SCHEDULER_PARAMS = dict(
        lr_start=1e-5,
        lr_max=1e-5 * 32,
        lr_min=1e-6,
        lr_ramp_ep=5,
        lr_sus_ep=0,
        lr_decay=0.8,
    )

# Augmentations

In [4]:
def get_train_transforms():
    """Build the augmentation pipeline used for training images.

    The pipeline resizes to ``CFG.IMG_SIZE`` x ``CFG.IMG_SIZE``, applies
    random horizontal/vertical flips (p=0.5 each), a random rotation
    (limit=120, p=0.8) and a random brightness change (p=0.5), then
    normalizes with ``CFG.MEAN`` / ``CFG.STD`` and ends with ``ToTensorV2``.

    Returns:
        An ``albumentations.Compose`` object wrapping the steps above.
    """
    side = CFG.IMG_SIZE

    steps = [
        # Geometry: fixed-size resize first, then random flips and rotation.
        albumentations.Resize(side, side, always_apply=True),
        albumentations.HorizontalFlip(p=0.5),
        albumentations.VerticalFlip(p=0.5),
        albumentations.Rotate(limit=120, p=0.8),
        # Photometric jitter.
        albumentations.RandomBrightness(limit=(0.09, 0.6), p=0.5),
        # Normalization and final tensor conversion.
        albumentations.Normalize(mean=CFG.MEAN, std=CFG.STD),
        ToTensorV2(p=1.0),
    ]
    return albumentations.Compose(steps)

# Dataset 

In [5]:
class ShopeeDataset(torch.utils.data.Dataset):
    """Dataset that yields one image and its label per DataFrame row.

    Each row of ``df`` is expected to expose an ``image`` attribute (a file
    name that is joined onto ``CFG.DATA_DIR``) and a ``label_group``
    attribute (a value that ``torch.tensor`` can convert).

    Args:
        df: DataFrame indexed positionally via ``df.iloc``.
        transform: Optional callable invoked as ``transform(image=image)``
            that returns a mapping with an ``'image'`` entry.
    """

    def __init__(self,df, transform = None):
        self.df = df 
        self.root_dir = CFG.DATA_DIR
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self,idx):
        """Load, colour-convert and optionally transform the ``idx``-th sample.

        Returns:
            dict with keys ``'image'`` (the loaded, possibly transformed
            image) and ``'label'`` (an int64 tensor built from
            ``row.label_group``).

        Raises:
            FileNotFoundError: if the image file cannot be read.
        """
        row = self.df.iloc[idx]

        img_path = os.path.join(self.root_dir,row.image)
        image = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than with an opaque
        # OpenCV assertion inside cvtColor.
        if image is None:
            raise FileNotFoundError(f'Could not read image file: {img_path}')
        # OpenCV loads images in BGR channel order; convert to RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        label = row.label_group

        if self.transform:
            augmented = self.transform(image=image)
            image = augmented['image']

        return {
            'image' : image,
            'label' : torch.tensor(label).long()
        }

# Curricular Face + NFNet-L0


- **Curricular Face** : https://arxiv.org/pdf/2004.00288.pdf

<center><img src="https://pbs.twimg.com/media/EVOupMwUcAAaGP7.jpg" width="440" height="440"/></center>

In [6]:
'''
credit : https://github.com/HuangYG123/CurricularFace/blob/8b2f47318117995aa05490c05b455b113489917e/head/metrics.py#L70
'''

def l2_norm(input, axis = 1):
    """Scale ``input`` to unit L2 norm along ``axis``.

    Args:
        input: Tensor to normalize.
        axis: Dimension along which the L2 norm is computed (default 1).

    Returns:
        ``input`` divided by its L2 norm taken along ``axis`` (the norm keeps
        the reduced dimension so the division broadcasts).
    """
    magnitude = input.norm(p=2, dim=axis, keepdim=True)
    return input / magnitude

class CurricularFace(nn.Module):
    """CurricularFace margin head returning scaled logits and their loss.

    The head holds a learnable ``kernel`` of shape
    ``(in_features, out_features)`` and a buffer ``t`` that tracks a running
    average of the target-class cosine. ``t`` is only updated while the
    module is in training mode, so evaluation passes do not disturb it.

    Args:
        in_features: Size of each input embedding.
        out_features: Number of classes (columns of ``kernel``).
        s: Factor the final cosine logits are multiplied by.
        m: Angular margin added to the target-class angle.
    """

    def __init__(self, in_features, out_features, s = 30, m = 0.50):
        super(CurricularFace, self).__init__()

        print('Using Curricular Face')

        self.in_features = in_features
        self.out_features = out_features
        self.m = m
        self.s = s
        # Constants for expanding cos(theta + m) and for the fallback branch
        # used when the target cosine is not above ``threshold``.
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.threshold = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
        self.kernel = nn.Parameter(torch.Tensor(in_features, out_features))
        # Running average of the mean target cosine; stored as a buffer so it
        # is part of the state dict but not a trainable parameter.
        self.register_buffer('t', torch.zeros(1))
        nn.init.normal_(self.kernel, std=0.01)

    def forward(self, embbedings, label):
        """Compute margin-adjusted logits and the cross-entropy loss.

        Args:
            embbedings: 2-D tensor with one embedding per row.
            label: Integer class index per row, used to index the logits.

        Returns:
            Tuple ``(output, loss)``: the logits multiplied by ``s`` and
            ``nn.CrossEntropyLoss`` evaluated on them against ``label``.
        """
        # Cosine between each normalized embedding and each normalized
        # kernel column.
        embbedings = l2_norm(embbedings, axis = 1)
        kernel_norm = l2_norm(self.kernel, axis = 0)
        cos_theta = torch.mm(embbedings, kernel_norm)
        cos_theta = cos_theta.clamp(-1, 1)  # for numerical stability
        target_logit = cos_theta[torch.arange(0, embbedings.size(0)), label].view(-1, 1)

        # NOTE(review): sqrt has an unbounded gradient at 0, which is reached
        # when a target cosine is exactly +/-1 after the clamp — confirm
        # whether an epsilon is wanted here.
        sin_theta = torch.sqrt(1.0 - torch.pow(target_logit, 2))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m #cos(target+margin)
        # Entries whose cosine exceeds the margin-shifted target cosine of
        # their row are treated as hard examples.
        mask = cos_theta > cos_theta_m
        final_target_logit = torch.where(target_logit > self.threshold, cos_theta_m, target_logit - self.mm)

        hard_example = cos_theta[mask]
        # Update the running statistic only during training; an evaluation
        # pass must not change state that later training steps depend on.
        if self.training:
            with torch.no_grad():
                self.t = target_logit.mean() * 0.01 + (1 - 0.01) * self.t
        # Re-weight hard examples, then write the margin-adjusted target
        # logits into their label positions.
        cos_theta[mask] = hard_example * (self.t + hard_example)
        cos_theta.scatter_(1, label.view(-1, 1).long(), final_target_logit)
        output = cos_theta * self.s
        return output, nn.CrossEntropyLoss()(output,label)

In [7]:
class ShopeeModel(nn.Module):
    """timm backbone + optional FC/BatchNorm neck + CurricularFace head.

    Args:
        n_classes: Number of classes passed to the margin head.
        model_name: timm model name; must contain ``'efficientnet'`` or
            ``'nfnet'`` so the classifier layer can be located.
        fc_dim: Output size of the optional linear neck.
        margin: Passed to the head as ``m``.
        scale: Passed to the head as ``s``.
        use_fc: When true, apply dropout -> linear -> batch norm to the
            pooled features before the head.
        pretrained: Forwarded to ``timm.create_model``.

    Raises:
        ValueError: if ``model_name`` matches neither supported family.
    """

    def __init__(
        self,
        n_classes = CFG.CLASSES,
        model_name = CFG.MODEL_NAME,
        fc_dim = CFG.FC_DIM,
        margin = CFG.MARGIN,
        scale = CFG.SCALE,
        use_fc = True,
        pretrained = True):


        super(ShopeeModel,self).__init__()
        print('Building Model Backbone for {} model'.format(model_name))

        self.backbone = timm.create_model(model_name, pretrained=pretrained)

        # Strip the backbone's own classifier and pooling (replaced by
        # Identity) and remember the classifier's input width.
        if 'efficientnet' in model_name:
            final_in_features = self.backbone.classifier.in_features
            self.backbone.classifier = nn.Identity()
            self.backbone.global_pool = nn.Identity()
        
        elif 'nfnet' in model_name:
            final_in_features = self.backbone.head.fc.in_features
            self.backbone.head.fc = nn.Identity()
            self.backbone.head.global_pool = nn.Identity()

        else:
            # Without this branch ``final_in_features`` would be unbound and
            # the failure would surface later as an UnboundLocalError.
            raise ValueError(
                "Unsupported model_name {!r}: expected a name containing "
                "'efficientnet' or 'nfnet'".format(model_name)
            )

        self.pooling =  nn.AdaptiveAvgPool2d(1)

        self.use_fc = use_fc

        if use_fc:
            self.dropout = nn.Dropout(p=0.0)
            self.fc = nn.Linear(final_in_features, fc_dim)
            self.bn = nn.BatchNorm1d(fc_dim)
            self._init_params()
            final_in_features = fc_dim

        self.final = CurricularFace(final_in_features, 
                                           n_classes, 
                                           s=scale, 
                                           m=margin)

    def _init_params(self):
        """Initialize the neck: Xavier-normal FC weight, zero biases, unit BN weight."""
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)
        nn.init.constant_(self.bn.weight, 1)
        nn.init.constant_(self.bn.bias, 0)

    def forward(self, image, label):
        """Return whatever the head returns for the extracted features.

        With the CurricularFace head this is the tuple ``(output, loss)``.
        """
        feature = self.extract_feat(image)
        logits = self.final(feature,label)
        return logits

    def extract_feat(self, x):
        """Run the backbone, pool, flatten and (optionally) apply the neck.

        Returns:
            Tensor reshaped to ``(batch_size, -1)``; after the neck its last
            dimension is ``fc_dim`` when ``use_fc`` is true.
        """
        batch_size = x.shape[0]
        x = self.backbone(x)
        # Pool to 1x1 and flatten to one row per sample.
        x = self.pooling(x).view(batch_size, -1)

        if self.use_fc:
            x = self.dropout(x)
            x = self.fc(x)
            x = self.bn(x)
        return x


# Engine

In [8]:
def train_fn(model, data_loader, optimizer, scheduler, i):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Called as ``model(**batch)``; must return ``(_, loss)``.
        data_loader: Iterable of dict batches whose values support ``.to``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Scheduler stepped once, after the last batch.
        i: Zero-based epoch index, shown 1-based in the progress bar.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0.0
    progress = tqdm(data_loader, desc = f"Epoch [TRAIN] {i + 1}")

    for step, data in enumerate(progress, start = 1):
        # Move every tensor of the batch onto the configured device.
        data.update({key: value.to(CFG.DEVICE) for key, value in data.items()})

        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        progress.set_postfix({
            'loss' : f"{running_loss / step:.6f}",
            'LR' : optimizer.param_groups[0]['lr'],
        })

    # The schedule advances once per call, not once per batch.
    scheduler.step()

    return running_loss / len(data_loader)

def eval_fn(model, data_loader, i):
    """Evaluate ``model`` over ``data_loader`` with gradients disabled.

    Args:
        model: Called as ``model(**batch)``; must return ``(_, loss)``.
        data_loader: Iterable of dict batches whose values support ``.to``.
        i: Zero-based epoch index, shown 1-based in the progress bar.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.eval()
    running_loss = 0.0
    progress = tqdm(data_loader, desc = f"Epoch [VALID] {i + 1}")

    with torch.no_grad():
        for step, data in enumerate(progress, start = 1):
            # Move every tensor of the batch onto the configured device.
            data.update({key: value.to(CFG.DEVICE) for key, value in data.items()})

            _, loss = model(**data)
            running_loss += loss.item()

            progress.set_postfix({'loss' : f"{running_loss / step:.6f}"})

    return running_loss / len(data_loader)

# Training 

In [9]:
def run_training():
    """Train a ShopeeModel on the full training CSV and checkpoint each epoch.

    Reads ``CFG.TRAIN_CSV``, label-encodes ``label_group``, builds the
    training DataLoader, model, Ranger optimizer and ShopeeScheduler, then
    runs ``CFG.EPOCHS`` epochs. The state dict is saved after every epoch to
    the same file name, so the file always holds the most recent epoch.
    """
    df = pd.read_csv(CFG.TRAIN_CSV)

    # Re-encode the raw group ids through LabelEncoder.
    df['label_group'] = LabelEncoder().fit_transform(df['label_group'])

    trainloader = torch.utils.data.DataLoader(
        ShopeeDataset(df, transform = get_train_transforms()),
        batch_size = CFG.BATCH_SIZE,
        shuffle = True,
        drop_last = True,
        num_workers = CFG.NUM_WORKERS,
        pin_memory = True,
    )

    n_classes = df['label_group'].nunique()
    print(n_classes)

    # Build the model, then pass it through replace_activations with
    # SiLU as the activation type and a Mish instance as the replacement.
    model = replace_activations(ShopeeModel(n_classes = n_classes), torch.nn.SiLU, Mish())
    model.to(CFG.DEVICE)

    optimizer = Ranger(model.parameters(), lr = CFG.SCHEDULER_PARAMS['lr_start'])
    scheduler = ShopeeScheduler(optimizer, **CFG.SCHEDULER_PARAMS)

    checkpoint_path = f'curricular_face_{CFG.MODEL_NAME}_mish.pt'
    for epoch in range(CFG.EPOCHS):
        train_fn(model, trainloader, optimizer, scheduler, epoch)
        torch.save(model.state_dict(), checkpoint_path)
        
# Start training as soon as this cell is executed.
run_training()

11014
Building Model Backbone for eca_nfnet_l1 model


Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l1_ra2-7dce93cd.pth" to /root/.cache/torch/hub/checkpoints/ecanfnet_l1_ra2-7dce93cd.pth


Using Curricular Face


Ranger optimizer loaded. 
Gradient Centralization usage = True
GC applied to both conv and fc layers


Epoch [TRAIN] 1:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 2:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 3:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 4:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 5:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 6:   0%|          | 0/4281 [00:00<?, ?it/s]

Epoch [TRAIN] 7:   0%|          | 0/4281 [00:00<?, ?it/s]