In [1]:
# Reload the autoreload extension and use mode 2, so that edited modules are
# re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Label Smoothing

Another regularization technique that's often used is label smoothing. It's designed to make the model a little less certain of its decisions by slightly changing its target: instead of wanting to predict 1 for the correct class and 0 for all the others, we ask it to predict `1-ε+ε/N` for the correct class and `ε/N` for each of the others, with `ε` a (small) positive number and `N` the number of classes. This can be written as:

$$\text{loss} = (1-ε)\, ce(i) + \frac{ε}{N} \sum_{j=1}^{N} ce(j)$$

where `ce(x)` is cross-entropy of `x` (i.e. $-\log(p_{x})$), and `i` is the correct class. This can be coded in a loss function:

In [2]:
#export
from exp.nb_12a import *

__Data__

In [3]:
# Build the Imagenette (160px) data bunch that the learner below trains on.
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160) # downloads the dataset and returns the path to its folder
tfms = [make_rgb, ResizeFixed(128), to_byte_tensor, to_float_tensor] # transforms to be applied to each image (presumably in list order -- confirm)
bs = 128 # batch size
il = ImageList.from_files(path, tfms=tfms) # ImageList built from the files under `path`, carrying the transforms
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name="val")) # split by grandparent folder name; "val" is the validation folder
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcesser()) # label each item by its parent folder
data = ll.to_databunch(bs, c_in=3, c_out=10) # NOTE(review): c_in/c_out presumably are input channels / number of classes -- confirm

In [4]:
#export
class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy loss with label smoothing.

    Implements ``(1-ε) * ce(i) + ε * sum_j ce(j) / N`` where ``ce(x)`` is
    ``-log(p_x)``, ``i`` is the target class and ``N`` the number of classes.

    Args:
        ε: smoothing weight given to the uniform term (0 disables smoothing).
        reduction: forwarded to both `reduce_loss` and `F.nll_loss`
            (e.g. 'mean', 'sum' or 'none').

    `forward(output, target)` expects raw logits in `output` and class
    indices in `target`, with the same layout `F.nll_loss` accepts: the class
    axis is dim 1 for batched input `(N, C)` / `(N, C, d1, ...)`, and dim 0
    for a single unbatched `(C,)` vector.

    NOTE(review): `reduce_loss` and `lin_comb` come from the star import and
    are not visible here; this relies on `lin_comb(a, b, β)` being
    `β*a + (1-β)*b` and `reduce_loss` applying the named reduction -- confirm.
    """
    def __init__(self, ε:float=0.1, reduction='mean'):
        super().__init__()
        self.ε,self.reduction = ε,reduction

    def forward(self, output, target):
        # F.nll_loss always reads classes from dim 1 of batched input, so the
        # softmax and the uniform-term sum must use that same axis; using the
        # last axis only coincides with it for 2-D `(N, C)` logits.
        class_dim = 1 if output.dim() > 1 else 0
        c = output.size(class_dim)
        log_preds = F.log_softmax(output, dim=class_dim)
        # Uniform term: sum of -log p_j over all classes (divided by c below).
        loss = reduce_loss(-log_preds.sum(dim=class_dim), self.reduction)
        # Standard term: -log p_i for the target class.
        nll = F.nll_loss(log_preds, target, reduction=self.reduction)
        return lin_comb(loss/c, nll, self.ε)

In [5]:
# Callback factories handed to get_learner through its cb_funcs argument.
cbfs = [partial(AvgStatsCallback,accuracy),  # stats callback parameterised with the `accuracy` metric
        CudaCallback,  # presumably moves model/batches to the GPU -- confirm
        ProgressCallback,  # presumably renders the per-epoch progress table -- confirm
        partial(BatchTransformXCallback, norm_imagenette)]  # applies `norm_imagenette` via the batch-transform callback

In [7]:
# First positional argument to get_learner; presumably the number of filters
# of each successive conv layer -- confirm against get_learner.
nfs = [32,64,128,256,512]

In [8]:
# Build the learner with label-smoothing cross-entropy (defaults: ε=0.1,
# reduction='mean') as its loss function.
# NOTE(review): 0.4 is a bare magic number, presumably the learning rate --
# confirm against get_learner's signature.
learn = get_learner(nfs, data, 0.4, conv_layer, cb_funcs=cbfs, loss_func=LabelSmoothingCrossEntropy())

In [9]:
# Train for a single epoch first.
learn.fit(1)

Epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.462852,0.287781,2.037728,0.397197,00:52


In [10]:
# Inspect the reduction mode of the loss function attached to the learner.
learn.loss_func.reduction

'mean'

In [11]:
# Keep training the same learner for 10 more epochs. In the recorded run the
# training accuracy approaches 1.0 while validation accuracy stays near 0.58.
learn.fit(10)

Epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.765944,0.487063,1.833645,0.478217,00:15
1,1.462188,0.603337,1.742753,0.524586,00:15
2,1.14892,0.741261,1.74317,0.530446,00:15
3,0.889892,0.870314,1.707937,0.549809,00:15
4,0.694859,0.964093,1.627807,0.569682,00:14
5,0.611935,0.991974,1.584033,0.57707,00:14
6,0.571244,0.998838,1.557704,0.57707,00:15
7,0.554448,0.998838,1.544945,0.577325,00:14
8,0.543389,0.999789,1.533487,0.577834,00:14
9,0.538632,0.999894,1.528113,0.585223,00:14


In [12]:
# NOTE(review): presumably exports the cells tagged `#export` in this notebook
# to a Python module -- confirm against the helper's definition.
nb_auto_export()

<IPython.core.display.Javascript object>