# Sentiment analysis for SST-2 with DistilBERT

## Install HuggingFace dependencies

In [None]:
!pip install -q datasets
!pip install -q transformers

[K     |████████████████████████████████| 325 kB 16.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 65.2 MB/s 
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
[K     |████████████████████████████████| 212 kB 64.0 MB/s 
[K     |████████████████████████████████| 136 kB 74.5 MB/s 
[K     |████████████████████████████████| 127 kB 76.8 MB/s 
[K     |████████████████████████████████| 94 kB 3.4 MB/s 
[K     |████████████████████████████████| 144 kB 76.4 MB/s 
[K     |████████████████████████████████| 271 kB 74.3 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[K     |████████████████████████████████| 3.8 MB 8.3 MB/s 
[K     |████████████████████████████████| 895 kB 61.3 MB/s 
[K     |████████████████████████████████| 596 kB 41.6 MB/

## Create functions to download and preprocess data

In [None]:
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import CanineTokenizer, DataCollatorWithPadding, DistilBertTokenizer

# Used both as the `batch_size` of the `dataset.map` calls and of the DataLoaders in generate_dataset.
bz = 16  # batch size

# Tokenizer loaded from the 'distilbert-base-uncased' checkpoint; used by encode() and by the padding collator.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def encode(samples):
    """Tokenize the ``'sentence'`` entries of a batch with the module-level tokenizer.

    Truncation is enabled and padding is disabled here; the tokenizer output is
    returned unchanged.
    """
    sentences = samples['sentence']
    return tokenizer(sentences, truncation=True, padding=False)

def quantify(samples):
    """Binarize the ``'label'`` values of a batch around 0.5.

    Values less than or equal to 0.5 map to 0, values above 0.5 map to 1.
    Returns a dict with the single key ``'label'``.
    """
    threshold = np.array([.5])
    binary_labels = np.digitize(samples['label'], threshold, right=True)
    return {'label': binary_labels}

# Replacement alphabet: lowercase ASCII letters only.
letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
           'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
def typo(samples, typo_percent=0):
    """Randomly replace alphabetic characters of each sentence with a lowercase letter.

    Every character is selected independently with probability ``typo_percent``;
    selected alphabetic characters are overwritten with a letter drawn uniformly
    from ``letters``. Non-alphabetic characters are never changed, so sentence
    lengths are preserved. The drawn letter may equal the original character.

    Args:
        samples: batch mapping with a ``'sentence'`` list of strings. The list is
            updated in place.
        typo_percent: per-character replacement probability in [0, 1].

    Returns:
        The same ``samples`` object.

    Randomness comes from the global ``np.random`` state, so results are only
    reproducible if the caller seeds it.
    """
    sentences = samples['sentence']
    for i, sentence in enumerate(sentences):
        # Edit one character list per sentence instead of re-building the string
        # for every single replacement.
        chars = list(sentence)
        changed = False
        for j, char in enumerate(sentence):
            # Draw first, test isalpha second: one uniform draw per character,
            # regardless of whether it is alphabetic.
            if np.random.random() < typo_percent and char.isalpha():
                chars[j] = np.random.choice(letters)
                changed = True
        if changed:
            sentences[i] = ''.join(chars)
    return samples

def generate_dataset(typo_percent=0):
    """Load GLUE SST-2, re-split it, optionally inject typos, tokenize, and build DataLoaders.

    The original validation split becomes ``'test'``; 5% of the original training
    split (fixed seed 0) becomes the new ``'validation'`` split. When
    ``typo_percent`` is positive, ``typo`` is mapped over every split before
    tokenization.

    Args:
        typo_percent: per-character replacement probability forwarded to ``typo``.

    Returns:
        ``(dataset, loader)``: the processed dataset dict and a dict with one
        DataLoader per split (only the ``'train'`` loader shuffles).
    """
    splits = load_dataset('glue', 'sst2')

    # Set validation as test
    splits['test'] = splits['validation']
    # Carve a new validation split out of the training data
    held_out = splits['train'].train_test_split(0.05, seed=0)
    splits['train'], splits['validation'] = held_out['train'], held_out['test']

    if typo_percent > 0:
        splits = splits.map(
            typo,
            batched=True,
            batch_size=bz,
            fn_kwargs={'typo_percent': typo_percent},
        )
    splits = splits.map(encode, batched=True, batch_size=bz)

    splits.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # One loader per split; padding happens per batch inside the collator.
    loaders = {
        name: DataLoader(
            splits[name],
            batch_size=bz,
            shuffle=(name == 'train'),
            collate_fn=DataCollatorWithPadding(tokenizer),
        )
        for name in splits
    }

    return splits, loaders

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Create train and eval functions

In [None]:
from tqdm import tqdm
import copy

def train_canine(net, device, optimizer, criterion, lr, lr_scheduler, train_loader, val_loader, epochs, unfreeze=False, unfreeze_ep=5, typo_percent=None):
    """Train ``net`` and keep the weights that reach the lowest validation loss.

    The validation loss is computed every ``len(train_loader)//5`` batches and
    once more at the end of each epoch. Whenever it improves, a deep copy of the
    model is kept. After the last epoch the best copy's state dict is saved under
    ``models/`` and the copy is returned.

    Args:
        net: model called as ``net(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        device: device every batch tensor is moved to.
        optimizer: optimizer, stepped once per batch.
        criterion: not used for training (the model's own ``output.loss`` is
            back-propagated); only forwarded to ``evaluate_loss``.
        lr: learning rate; used here only in the checkpoint file name.
        lr_scheduler: scheduler, stepped once per batch.
        train_loader: iterable of batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        val_loader: loader passed to ``evaluate_loss``.
        epochs: number of passes over ``train_loader``.
        unfreeze: when true, all parameters get ``requires_grad = True`` at the
            end of the epoch whose zero-based index equals ``unfreeze_ep``.
        unfreeze_ep: see ``unfreeze``.
        typo_percent: value written into the checkpoint file name. When ``None``
            a module-level ``typo_percent`` is used if one exists (the behaviour
            earlier callers relied on), otherwise ``'unknown'``.

    Returns:
        ``(train_losses, val_losses, net_copy)``: per-epoch mean training loss,
        per-epoch end-of-epoch validation loss, and the best model copy.
    """
    import os  # local import: only needed to create the checkpoint directory

    if typo_percent is None:
        # Earlier versions read a module-level `typo_percent` directly; keep
        # honouring it so existing callers produce the same file names.
        typo_percent = globals().get('typo_percent', 'unknown')

    best_loss = float('inf')  # `np.Inf` no longer exists in NumPy >= 2.0
    best_ep = 1
    net_copy = None
    loss = None  # keeps the final `del loss` valid when no batch is processed
    train_losses = []
    val_losses = []
    # At least 1, so the modulo below never divides by zero on loaders with
    # fewer than five batches.
    log_interval = max(1, len(train_loader)//5)

    def _keep_if_best(val_loss, ep):
        """Snapshot the model when ``val_loss`` beats the best value seen so far."""
        nonlocal best_loss, best_ep, net_copy
        if val_loss < best_loss:
            print(f"Best validation loss improved from {best_loss:.3f} to {val_loss:.3f}")
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    for ep in range(epochs):
        with tqdm(total=len(train_loader), unit_scale=True, postfix={'Loss':0.0, 'Accuracy':0.0, 'lr':lr_scheduler.get_last_lr()[0]},
                        desc="Epoch : %i/%i" % (ep+1, epochs)) as pbar:
            net.train()
            total_loss = 0.0
            correct = 0
            nb_samples = 0
            for it, batch in enumerate(train_loader):
                # Move the batch to the target device
                batch = {k: v.to(device) for k, v in batch.items()}
                seq, attn_masks, labels = \
                    batch['input_ids'], batch['attention_mask'], batch['labels']
                labels = labels.to(torch.int64)

                # Clear gradients
                optimizer.zero_grad()

                # Forward pass
                output = net(input_ids=seq, attention_mask=attn_masks, labels=labels)

                # Back-propagate the loss computed by the model itself
                loss = output.loss
                loss.backward()
                # Optimization step
                optimizer.step()

                total_loss += loss.item()

                # Running training accuracy
                logits = output.logits
                predicted = logits.argmax(axis=1)
                correct += predicted.eq(labels).sum().item()
                nb_samples += len(predicted)

                pbar.set_postfix({'Loss':total_loss/(it+1), 'Accuracy':100*correct/nb_samples, 'lr':lr_scheduler.get_last_lr()[0]})
                pbar.update(1)

                lr_scheduler.step()

                if it % log_interval == log_interval-1:
                    val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
                    print(f"Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

                    _keep_if_best(val_loss, ep)

                    # evaluate_loss leaves the model in eval mode; switch back so
                    # dropout stays active for the remaining batches of the epoch.
                    net.train()

        val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
        print()
        print(f"Epoch {ep+1} complete! Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

        train_losses.append(total_loss/len(train_loader))
        val_losses.append(val_loss)

        _keep_if_best(val_loss, ep)

        if unfreeze and ep == unfreeze_ep:
            for param in net.parameters():
                param.requires_grad = True

    if net_copy is None:
        # No evaluation ever improved on the initial best loss (e.g. zero epochs
        # or NaN losses): fall back to the current weights.
        net_copy = copy.deepcopy(net)

    # Saving the model
    os.makedirs('models', exist_ok=True)
    path_to_model=f'models/CANINE_lr_{lr}_val_loss_{round(best_loss, 5)}_ep_{best_ep}_typo_{typo_percent}.pt'
    torch.save(net_copy.state_dict(), path_to_model)
    print(f"The model has been saved in {path_to_model}")

    # Drop the last loss tensor before releasing cached GPU memory
    del loss
    torch.cuda.empty_cache()

    return train_losses, val_losses, net_copy

In [None]:
def evaluate_loss(net, device, dataloader, criterion):
    """Compute the mean per-batch loss and the accuracy (in percent) of ``net``.

    The model is switched to eval mode and is left in eval mode on return, so
    callers that continue training must call ``net.train()`` again.

    Args:
        net: model called as ``net(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        device: device every batch tensor is moved to.
        dataloader: iterable of batches with keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        criterion: unused; the loss reported by the model is accumulated.

    Returns:
        ``(mean_loss, accuracy)``: the average of the per-batch losses and
        ``100 * correct / samples_seen``. Both are NaN when ``dataloader``
        yields no samples.
    """
    net.eval()

    total_loss = 0.0
    correct = 0
    nb_batches = 0
    nb_samples = 0

    print('Evaluating...')
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            seq, attn_masks, labels = \
                batch['input_ids'], batch['attention_mask'], batch['labels']
            labels = labels.to(torch.int64)
            # Keyword arguments, matching the call made in the training loop
            output = net(input_ids=seq, attention_mask=attn_masks, labels=labels)
            total_loss += output.loss.item()
            predicted = output.logits.argmax(axis=1)
            correct += predicted.eq(labels).sum().item()
            nb_batches += 1
            # Count the samples actually evaluated rather than trusting
            # len(dataloader.dataset), which differs under drop_last or a
            # subset sampler and is absent on plain iterables.
            nb_samples += len(predicted)

    if nb_samples == 0:
        return float('nan'), float('nan')

    return total_loss/nb_batches, 100*correct/nb_samples

## Train the model with different levels of noise

In [None]:
print("Creation of the models' folder...")
!mkdir models

Creation of the models' folder...


In [None]:
#freeze_canine = False  # if True, freeze the encoder weights and only update the classification layer weights
#maxlen = 2048  # maximum length of the tokenized input pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
lr = 2e-5  # learning rate passed to AdamW (the warmup scheduler ramps up to it, then decays)
epochs = 5  # number of training epochs

In [None]:
from transformers import CanineTokenizer, CanineConfig, CanineForSequenceClassification, get_linear_schedule_with_warmup, DistilBertForSequenceClassification#, AdamW
from torch.optim import AdamW

# Use the first GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One [train, validation, test] entry is appended to each list per typo level.
losses = []
accs = []

# Fine-tune and evaluate a freshly loaded model for every typo level.
# NOTE: train_canine reads this loop variable `typo_percent` as a global when
# it builds the checkpoint file name.
for typo_percent in [0, 0.05, 0.1, 0.2, 0.4]:
    dataset, loader = generate_dataset(typo_percent=typo_percent)

    net = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    net.to(device)

    optimizer = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
    criterion = nn.CrossEntropyLoss()  # only passed through; the model's own loss is what gets used
    t_total = len(loader['train']) * epochs  # Total scheduler steps: one per training batch per epoch
    num_warmup_steps = int(t_total*0.1) # The number of steps for the warmup phase.
    lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

    # `net` is rebound to the best-validation-loss copy returned by train_canine.
    train_losses, val_losses, net = train_canine(net, device, optimizer, criterion, lr, lr_scheduler, loader['train'], loader['validation'], epochs)

    print(f'\n### Typo percentage = {typo_percent} ###')
    losses_aux = []
    accs_aux = []
    # Score the selected model on every split.
    for split in ['train', 'validation', 'test']:
        loss, acc = evaluate_loss(net, device, loader[split], criterion)
        losses_aux.append(loss)
        accs_aux.append(acc)
        print()
        print(split)
        print(f'   loss = {loss}\n   acc = {acc}')

    losses.append(losses_aux)
    accs.append(accs_aux)

Downloading builder script:   0%|          | 0.00/7.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Evaluating...


Epoch : 1/5:  20%|██        | 801/4.00k [00:52<35:24, 1.51it/s, Loss=0.434, Accuracy=78.5, lr=8.01e-6]

Val Loss : 0.260. Val Accuracy 89.76%
Best validation loss improved from inf to 0.260



Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [01:40<02:25, 16.5it/s, Loss=0.339, Accuracy=84.3, lr=1.6e-5]

Evaluating...


Epoch : 1/5:  40%|████      | 1.60k/4.00k [01:44<19:11, 2.08it/s, Loss=0.339, Accuracy=84.3, lr=1.6e-5]

Val Loss : 0.228. Val Accuracy 91.09%
Best validation loss improved from 0.260 to 0.228



Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [02:32<01:32, 17.3it/s, Loss=0.296, Accuracy=86.9, lr=1.96e-5]

Evaluating...


Epoch : 1/5:  60%|██████    | 2.40k/4.00k [02:36<12:50, 2.08it/s, Loss=0.296, Accuracy=86.9, lr=1.96e-5]

Val Loss : 0.204. Val Accuracy 92.90%
Best validation loss improved from 0.228 to 0.204



Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [03:24<00:48, 16.5it/s, Loss=0.268, Accuracy=88.4, lr=1.87e-5]

Evaluating...


Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [03:28<08:50, 1.51it/s, Loss=0.268, Accuracy=88.5, lr=1.87e-5]

Val Loss : 0.182. Val Accuracy 93.44%
Best validation loss improved from 0.204 to 0.182



Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [04:15<00:00, 17.6it/s, Loss=0.247, Accuracy=89.5, lr=1.78e-5]

Evaluating...


Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [04:19<00:01, 1.51it/s, Loss=0.247, Accuracy=89.5, lr=1.78e-5]

Val Loss : 0.158. Val Accuracy 94.24%
Best validation loss improved from 0.182 to 0.158



Epoch : 1/5: 100%|██████████| 4.00k/4.00k [04:20<00:00, 15.4it/s, Loss=0.247, Accuracy=89.5, lr=1.78e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.158. Val Accuracy 94.18%
Best validation loss improved from 0.158 to 0.158



Epoch : 2/5:  20%|█▉        | 799/4.00k [00:48<03:12, 16.7it/s, Loss=0.124, Accuracy=95.5, lr=1.69e-5]

Evaluating...


Epoch : 2/5:  20%|██        | 801/4.00k [00:52<35:14, 1.51it/s, Loss=0.124, Accuracy=95.5, lr=1.69e-5]

Val Loss : 0.175. Val Accuracy 94.27%


Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [01:39<02:21, 17.0it/s, Loss=0.113, Accuracy=96, lr=1.6e-5]

Evaluating...


Epoch : 2/5:  40%|████      | 1.60k/4.00k [01:43<19:08, 2.09it/s, Loss=0.113, Accuracy=96, lr=1.6e-5]

Val Loss : 0.158. Val Accuracy 94.92%


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [02:31<01:32, 17.3it/s, Loss=0.111, Accuracy=96.1, lr=1.51e-5]

Evaluating...


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [02:35<17:32, 1.52it/s, Loss=0.111, Accuracy=96.1, lr=1.51e-5]

Val Loss : 0.159. Val Accuracy 94.51%


Epoch : 2/5:  80%|███████▉  | 3.19k/4.00k [03:23<00:50, 16.1it/s, Loss=0.11, Accuracy=96.2, lr=1.42e-5]

Evaluating...


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [03:27<08:49, 1.51it/s, Loss=0.11, Accuracy=96.2, lr=1.42e-5]

Val Loss : 0.156. Val Accuracy 94.74%
Best validation loss improved from 0.158 to 0.156



Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [04:15<00:00, 16.5it/s, Loss=0.106, Accuracy=96.3, lr=1.33e-5]

Evaluating...


Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [04:19<00:01, 1.52it/s, Loss=0.106, Accuracy=96.3, lr=1.33e-5]

Val Loss : 0.150. Val Accuracy 94.89%
Best validation loss improved from 0.156 to 0.150



Epoch : 2/5: 100%|██████████| 4.00k/4.00k [04:19<00:00, 15.4it/s, Loss=0.106, Accuracy=96.3, lr=1.33e-5]


Evaluating...

Epoch 2 complete! Val Loss : 0.150. Val Accuracy 94.77%
Best validation loss improved from 0.150 to 0.150



Epoch : 3/5:  20%|█▉        | 799/4.00k [00:48<03:09, 16.9it/s, Loss=0.058, Accuracy=98, lr=1.24e-5]

Evaluating...


Epoch : 3/5:  20%|██        | 801/4.00k [00:52<34:54, 1.53it/s, Loss=0.0578, Accuracy=98, lr=1.24e-5]

Val Loss : 0.180. Val Accuracy 94.95%


Epoch : 3/5:  40%|███▉      | 1.60k/4.00k [01:40<02:17, 17.4it/s, Loss=0.0543, Accuracy=98.2, lr=1.16e-5]

Evaluating...


Epoch : 3/5:  40%|████      | 1.60k/4.00k [01:44<26:13, 1.52it/s, Loss=0.0543, Accuracy=98.1, lr=1.16e-5]

Val Loss : 0.170. Val Accuracy 94.98%


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [02:31<01:37, 16.4it/s, Loss=0.054, Accuracy=98.1, lr=1.07e-5]

Evaluating...


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [02:35<17:29, 1.52it/s, Loss=0.0541, Accuracy=98.1, lr=1.07e-5]

Val Loss : 0.160. Val Accuracy 95.19%


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [03:23<00:46, 17.1it/s, Loss=0.0542, Accuracy=98.1, lr=9.78e-6]

Evaluating...


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [03:27<08:45, 1.52it/s, Loss=0.0542, Accuracy=98.1, lr=9.78e-6]

Val Loss : 0.160. Val Accuracy 95.10%


Epoch : 3/5: 100%|█████████▉| 3.99k/4.00k [04:15<00:00, 16.4it/s, Loss=0.0548, Accuracy=98.1, lr=8.89e-6]

Evaluating...


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [04:19<00:00, 2.08it/s, Loss=0.0548, Accuracy=98.1, lr=8.89e-6]

Val Loss : 0.163. Val Accuracy 95.16%


Epoch : 3/5: 100%|██████████| 4.00k/4.00k [04:19<00:00, 15.4it/s, Loss=0.0547, Accuracy=98.1, lr=8.89e-6]


Evaluating...

Epoch 3 complete! Val Loss : 0.165. Val Accuracy 95.07%


Epoch : 4/5:  20%|█▉        | 799/4.00k [00:48<03:16, 16.2it/s, Loss=0.0305, Accuracy=98.9, lr=8e-6]

Evaluating...


Epoch : 4/5:  20%|██        | 801/4.00k [00:52<35:05, 1.52it/s, Loss=0.0306, Accuracy=98.9, lr=8e-6]

Val Loss : 0.190. Val Accuracy 94.98%


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [01:40<02:30, 16.0it/s, Loss=0.0274, Accuracy=99, lr=7.11e-6]

Evaluating...


Epoch : 4/5:  40%|████      | 1.60k/4.00k [01:44<26:17, 1.52it/s, Loss=0.0274, Accuracy=99, lr=7.11e-6]

Val Loss : 0.200. Val Accuracy 94.83%


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [02:32<01:39, 16.2it/s, Loss=0.0267, Accuracy=99, lr=6.23e-6]

Evaluating...


Epoch : 4/5:  60%|██████    | 2.40k/4.00k [02:36<12:49, 2.08it/s, Loss=0.0267, Accuracy=99, lr=6.22e-6]

Val Loss : 0.201. Val Accuracy 95.04%


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [03:24<00:47, 17.0it/s, Loss=0.0263, Accuracy=98.9, lr=5.34e-6]

Evaluating...


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [03:28<08:44, 1.53it/s, Loss=0.0263, Accuracy=98.9, lr=5.33e-6]

Val Loss : 0.209. Val Accuracy 94.95%


Epoch : 4/5: 100%|█████████▉| 3.99k/4.00k [04:16<00:00, 16.4it/s, Loss=0.0263, Accuracy=99, lr=4.45e-6]

Evaluating...


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [04:20<00:00, 2.09it/s, Loss=0.0263, Accuracy=99, lr=4.45e-6]

Val Loss : 0.194. Val Accuracy 94.89%


Epoch : 4/5: 100%|██████████| 4.00k/4.00k [04:20<00:00, 15.3it/s, Loss=0.0263, Accuracy=99, lr=4.45e-6]


Evaluating...

Epoch 4 complete! Val Loss : 0.193. Val Accuracy 94.92%


Epoch : 5/5:  20%|█▉        | 798/4.00k [00:48<03:01, 17.7it/s, Loss=0.0202, Accuracy=99.3, lr=3.56e-6]

Evaluating...


Epoch : 5/5:  20%|██        | 802/4.00k [00:52<25:28, 2.09it/s, Loss=0.0202, Accuracy=99.3, lr=3.55e-6]

Val Loss : 0.238. Val Accuracy 95.04%


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [01:40<02:30, 16.0it/s, Loss=0.0154, Accuracy=99.5, lr=2.67e-6]

Evaluating...


Epoch : 5/5:  40%|████      | 1.60k/4.00k [01:44<19:08, 2.09it/s, Loss=0.0154, Accuracy=99.5, lr=2.67e-6]

Val Loss : 0.246. Val Accuracy 95.37%


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [02:32<01:37, 16.5it/s, Loss=0.0135, Accuracy=99.5, lr=1.78e-6]

Evaluating...


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [02:36<17:27, 1.53it/s, Loss=0.0136, Accuracy=99.5, lr=1.78e-6]

Val Loss : 0.257. Val Accuracy 94.92%


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [03:24<00:51, 15.5it/s, Loss=0.0131, Accuracy=99.5, lr=8.94e-7]

Evaluating...


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [03:28<08:48, 1.51it/s, Loss=0.0132, Accuracy=99.5, lr=8.9e-7] 

Val Loss : 0.246. Val Accuracy 95.25%


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [04:16<00:00, 17.0it/s, Loss=0.0127, Accuracy=99.5, lr=5.56e-9]

Evaluating...


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [04:21<00:01, 1.50it/s, Loss=0.0127, Accuracy=99.5, lr=2.22e-9]

Val Loss : 0.244. Val Accuracy 95.25%


Epoch : 5/5: 100%|██████████| 4.00k/4.00k [04:21<00:00, 15.3it/s, Loss=0.0127, Accuracy=99.5, lr=1.11e-9]


Evaluating...

Epoch 5 complete! Val Loss : 0.244. Val Accuracy 95.25%
The model has been saved in models/CANINE_lr_2e-05_val_loss_0.14968_ep_2_typo_0.pt

### Typo percentage = 0 ###
Evaluating...

train
   loss = 0.048431562569244364
   acc = 98.5839546115253
Evaluating...

validation
   loss = 0.1496804413513691
   acc = 94.77434679334917
Evaluating...

test
   loss = 0.246930263123729
   acc = 90.94036697247707


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-abf39e5d242edc33.arrow and /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b8f06ab981dbf971.arrow


  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Evaluating...


Epoch : 1/5:  20%|██        | 800/4.00k [00:57<38:37, 1.38it/s, Loss=0.535, Accuracy=70.6, lr=8e-6]   

Val Loss : 0.410. Val Accuracy 81.98%
Best validation loss improved from inf to 0.410



Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [01:49<02:43, 14.7it/s, Loss=0.449, Accuracy=77.1, lr=1.6e-5]

Evaluating...


Epoch : 1/5:  40%|████      | 1.60k/4.00k [01:54<29:04, 1.38it/s, Loss=0.449, Accuracy=77.1, lr=1.6e-5]

Val Loss : 0.343. Val Accuracy 84.98%
Best validation loss improved from 0.410 to 0.343



Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [02:46<01:44, 15.3it/s, Loss=0.409, Accuracy=79.8, lr=1.96e-5]

Evaluating...


Epoch : 1/5:  60%|██████    | 2.40k/4.00k [02:51<14:02, 1.90it/s, Loss=0.409, Accuracy=79.8, lr=1.96e-5]

Val Loss : 0.318. Val Accuracy 86.28%
Best validation loss improved from 0.343 to 0.318



Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [03:43<00:53, 15.0it/s, Loss=0.382, Accuracy=81.6, lr=1.87e-5]

Evaluating...


Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [03:47<09:41, 1.38it/s, Loss=0.382, Accuracy=81.7, lr=1.87e-5]

Val Loss : 0.294. Val Accuracy 87.98%
Best validation loss improved from 0.318 to 0.294



Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [04:40<00:00, 15.1it/s, Loss=0.362, Accuracy=82.9, lr=1.78e-5]

Evaluating...


Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [04:44<00:01, 1.36it/s, Loss=0.362, Accuracy=82.9, lr=1.78e-5]

Val Loss : 0.271. Val Accuracy 89.25%
Best validation loss improved from 0.294 to 0.271



Epoch : 1/5: 100%|██████████| 4.00k/4.00k [04:44<00:00, 14.0it/s, Loss=0.362, Accuracy=82.9, lr=1.78e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.272. Val Accuracy 88.95%


Epoch : 2/5:  20%|█▉        | 798/4.00k [00:52<03:32, 15.1it/s, Loss=0.2, Accuracy=91.9, lr=1.69e-5]

Evaluating...


Epoch : 2/5:  20%|██        | 802/4.00k [00:57<27:56, 1.91it/s, Loss=0.2, Accuracy=91.9, lr=1.69e-5]

Val Loss : 0.285. Val Accuracy 89.10%


Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [01:49<02:37, 15.3it/s, Loss=0.188, Accuracy=92.6, lr=1.6e-5]

Evaluating...


Epoch : 2/5:  40%|████      | 1.60k/4.00k [01:54<28:51, 1.39it/s, Loss=0.188, Accuracy=92.6, lr=1.6e-5]

Val Loss : 0.297. Val Accuracy 89.10%


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [02:46<01:46, 15.1it/s, Loss=0.179, Accuracy=93.1, lr=1.51e-5]

Evaluating...


Epoch : 2/5:  60%|██████    | 2.40k/4.00k [02:50<14:12, 1.88it/s, Loss=0.179, Accuracy=93.1, lr=1.51e-5]

Val Loss : 0.282. Val Accuracy 89.52%


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [03:43<00:53, 15.1it/s, Loss=0.177, Accuracy=93.1, lr=1.42e-5]

Evaluating...


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [03:47<09:46, 1.37it/s, Loss=0.177, Accuracy=93.1, lr=1.42e-5]

Val Loss : 0.274. Val Accuracy 89.76%


Epoch : 2/5: 100%|█████████▉| 3.99k/4.00k [04:39<00:00, 15.5it/s, Loss=0.176, Accuracy=93.2, lr=1.33e-5]

Evaluating...


Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [04:44<00:02, 1.38it/s, Loss=0.176, Accuracy=93.2, lr=1.33e-5]

Val Loss : 0.260. Val Accuracy 90.02%
Best validation loss improved from 0.271 to 0.260



Epoch : 2/5: 100%|██████████| 4.00k/4.00k [04:44<00:00, 14.1it/s, Loss=0.176, Accuracy=93.2, lr=1.33e-5]


Evaluating...

Epoch 2 complete! Val Loss : 0.262. Val Accuracy 89.70%


Epoch : 3/5:  20%|█▉        | 798/4.00k [00:52<03:45, 14.2it/s, Loss=0.0805, Accuracy=97.1, lr=1.24e-5]

Evaluating...


Epoch : 3/5:  20%|██        | 802/4.00k [00:57<28:24, 1.88it/s, Loss=0.0806, Accuracy=97.1, lr=1.24e-5]

Val Loss : 0.348. Val Accuracy 89.88%


Epoch : 3/5:  40%|███▉      | 1.60k/4.00k [01:49<02:32, 15.8it/s, Loss=0.069, Accuracy=97.6, lr=1.16e-5]

Evaluating...


Epoch : 3/5:  40%|████      | 1.60k/4.00k [01:53<28:53, 1.38it/s, Loss=0.0691, Accuracy=97.5, lr=1.16e-5]

Val Loss : 0.350. Val Accuracy 89.61%


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [02:46<01:45, 15.1it/s, Loss=0.0658, Accuracy=97.7, lr=1.07e-5]

Evaluating...


Epoch : 3/5:  60%|██████    | 2.40k/4.00k [02:50<14:02, 1.90it/s, Loss=0.0658, Accuracy=97.7, lr=1.07e-5]

Val Loss : 0.337. Val Accuracy 90.35%


Epoch : 3/5:  80%|███████▉  | 3.19k/4.00k [03:43<00:53, 15.1it/s, Loss=0.0655, Accuracy=97.7, lr=9.78e-6]

Evaluating...


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [03:47<09:44, 1.37it/s, Loss=0.0654, Accuracy=97.7, lr=9.78e-6]

Val Loss : 0.338. Val Accuracy 90.14%


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [04:40<00:00, 15.8it/s, Loss=0.065, Accuracy=97.7, lr=8.89e-6]

Evaluating...


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [04:44<00:01, 1.38it/s, Loss=0.065, Accuracy=97.7, lr=8.89e-6] 

Val Loss : 0.332. Val Accuracy 89.46%


Epoch : 3/5: 100%|██████████| 4.00k/4.00k [04:44<00:00, 14.0it/s, Loss=0.065, Accuracy=97.7, lr=8.89e-6]


Evaluating...

Epoch 3 complete! Val Loss : 0.331. Val Accuracy 89.82%


Epoch : 4/5:  20%|█▉        | 798/4.00k [00:52<03:26, 15.5it/s, Loss=0.0445, Accuracy=98.7, lr=8e-6]

Evaluating...


Epoch : 4/5:  20%|██        | 802/4.00k [00:57<28:11, 1.89it/s, Loss=0.0443, Accuracy=98.7, lr=8e-6]

Val Loss : 0.386. Val Accuracy 90.74%


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [01:49<02:53, 13.9it/s, Loss=0.0311, Accuracy=99.1, lr=7.11e-6]

Evaluating...


Epoch : 4/5:  40%|████      | 1.60k/4.00k [01:54<28:58, 1.38it/s, Loss=0.031, Accuracy=99.1, lr=7.11e-6]

Val Loss : 0.417. Val Accuracy 90.35%


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [02:46<01:47, 14.9it/s, Loss=0.0255, Accuracy=99.2, lr=6.23e-6]

Evaluating...


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [02:51<19:28, 1.37it/s, Loss=0.0255, Accuracy=99.2, lr=6.22e-6]

Val Loss : 0.437. Val Accuracy 90.41%


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [03:43<00:53, 14.9it/s, Loss=0.0227, Accuracy=99.3, lr=5.34e-6]

Evaluating...


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [03:48<09:39, 1.38it/s, Loss=0.0227, Accuracy=99.3, lr=5.33e-6]

Val Loss : 0.446. Val Accuracy 90.50%


Epoch : 4/5: 100%|█████████▉| 3.99k/4.00k [04:40<00:00, 15.3it/s, Loss=0.0216, Accuracy=99.4, lr=4.45e-6]

Evaluating...


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [04:45<00:00, 1.90it/s, Loss=0.0216, Accuracy=99.4, lr=4.45e-6]

Val Loss : 0.426. Val Accuracy 90.35%


Epoch : 4/5: 100%|██████████| 4.00k/4.00k [04:45<00:00, 14.0it/s, Loss=0.0216, Accuracy=99.4, lr=4.45e-6]


Evaluating...

Epoch 4 complete! Val Loss : 0.424. Val Accuracy 90.26%


Epoch : 5/5:  20%|█▉        | 798/4.00k [00:52<03:33, 15.0it/s, Loss=0.0221, Accuracy=99.2, lr=3.56e-6]

Evaluating...


Epoch : 5/5:  20%|██        | 802/4.00k [00:57<28:10, 1.89it/s, Loss=0.0221, Accuracy=99.2, lr=3.55e-6]

Val Loss : 0.462. Val Accuracy 90.11%


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [01:50<02:32, 15.8it/s, Loss=0.0135, Accuracy=99.5, lr=2.67e-6]

Evaluating...


Epoch : 5/5:  40%|████      | 1.60k/4.00k [01:54<28:56, 1.38it/s, Loss=0.0136, Accuracy=99.5, lr=2.67e-6]

Val Loss : 0.487. Val Accuracy 90.08%


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [02:46<01:44, 15.3it/s, Loss=0.0112, Accuracy=99.6, lr=1.78e-6]

Evaluating...


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [02:51<19:25, 1.37it/s, Loss=0.0112, Accuracy=99.6, lr=1.78e-6]

Val Loss : 0.506. Val Accuracy 90.23%


Epoch : 5/5:  80%|███████▉  | 3.19k/4.00k [03:43<00:50, 15.8it/s, Loss=0.00983, Accuracy=99.7, lr=8.94e-7]

Evaluating...


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [03:48<07:00, 1.90it/s, Loss=0.00982, Accuracy=99.7, lr=8.9e-7]

Val Loss : 0.499. Val Accuracy 90.32%


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [04:41<00:00, 15.1it/s, Loss=0.00902, Accuracy=99.7, lr=5.56e-9]

Evaluating...


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [04:45<00:01, 1.37it/s, Loss=0.00901, Accuracy=99.7, lr=2.22e-9]

Val Loss : 0.500. Val Accuracy 90.23%


Epoch : 5/5: 100%|██████████| 4.00k/4.00k [04:45<00:00, 14.0it/s, Loss=0.00901, Accuracy=99.7, lr=1.11e-9]


Evaluating...

Epoch 5 complete! Val Loss : 0.500. Val Accuracy 90.23%
The model has been saved in models/CANINE_lr_2e-05_val_loss_0.26029_ep_2_typo_0.05.pt

### Typo percentage = 0.05 ###
Evaluating...

train
   loss = 0.08076710514416842
   acc = 97.94157640549538
Evaluating...

validation
   loss = 0.2602902161337047
   acc = 90.02375296912113
Evaluating...

test
   loss = 0.3376750673760067
   acc = 86.69724770642202


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-abf39e5d242edc33.arrow and /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b8f06ab981dbf971.arrow


  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Evaluating...


Epoch : 1/5:  20%|██        | 800/4.00k [01:01<41:45, 1.28it/s, Loss=0.591, Accuracy=66.6, lr=8e-6]  

Val Loss : 0.479. Val Accuracy 76.72%
Best validation loss improved from inf to 0.479



Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [01:56<02:47, 14.3it/s, Loss=0.519, Accuracy=72.7, lr=1.6e-5]

Evaluating...


Epoch : 1/5:  40%|████      | 1.60k/4.00k [02:01<31:17, 1.28it/s, Loss=0.519, Accuracy=72.7, lr=1.6e-5]

Val Loss : 0.418. Val Accuracy 80.64%
Best validation loss improved from 0.479 to 0.418



Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [02:57<01:48, 14.7it/s, Loss=0.484, Accuracy=75.3, lr=1.96e-5]

Evaluating...


Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [03:02<20:49, 1.28it/s, Loss=0.484, Accuracy=75.3, lr=1.96e-5]

Val Loss : 0.398. Val Accuracy 82.21%
Best validation loss improved from 0.418 to 0.398



Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [03:58<00:59, 13.4it/s, Loss=0.461, Accuracy=77, lr=1.87e-5]

Evaluating...


Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [04:03<10:32, 1.27it/s, Loss=0.461, Accuracy=77, lr=1.87e-5]

Val Loss : 0.384. Val Accuracy 82.21%
Best validation loss improved from 0.398 to 0.384



Epoch : 1/5: 100%|█████████▉| 3.99k/4.00k [04:59<00:00, 14.0it/s, Loss=0.442, Accuracy=78.2, lr=1.78e-5]

Evaluating...


Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [05:04<00:02, 1.28it/s, Loss=0.442, Accuracy=78.2, lr=1.78e-5]

Val Loss : 0.365. Val Accuracy 83.37%
Best validation loss improved from 0.384 to 0.365



Epoch : 1/5: 100%|██████████| 4.00k/4.00k [05:04<00:00, 13.1it/s, Loss=0.442, Accuracy=78.2, lr=1.78e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.361. Val Accuracy 83.85%
Best validation loss improved from 0.365 to 0.361



Epoch : 2/5:  20%|█▉        | 798/4.00k [00:56<04:01, 13.3it/s, Loss=0.275, Accuracy=88.3, lr=1.69e-5]

Evaluating...


Epoch : 2/5:  20%|██        | 800/4.00k [01:01<41:59, 1.27it/s, Loss=0.275, Accuracy=88.4, lr=1.69e-5]

Val Loss : 0.382. Val Accuracy 83.55%


Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [01:57<02:52, 13.9it/s, Loss=0.261, Accuracy=89.1, lr=1.6e-5]

Evaluating...


Epoch : 2/5:  40%|████      | 1.60k/4.00k [02:02<31:16, 1.28it/s, Loss=0.261, Accuracy=89.1, lr=1.6e-5]

Val Loss : 0.387. Val Accuracy 83.61%


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [02:58<01:53, 14.1it/s, Loss=0.257, Accuracy=89.1, lr=1.51e-5]

Evaluating...


Epoch : 2/5:  60%|██████    | 2.40k/4.00k [03:03<15:04, 1.77it/s, Loss=0.257, Accuracy=89.1, lr=1.51e-5]

Val Loss : 0.369. Val Accuracy 84.53%


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [03:59<00:56, 14.3it/s, Loss=0.251, Accuracy=89.4, lr=1.42e-5]

Evaluating...


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [04:04<10:25, 1.28it/s, Loss=0.251, Accuracy=89.4, lr=1.42e-5]

Val Loss : 0.373. Val Accuracy 84.26%


Epoch : 2/5: 100%|█████████▉| 3.99k/4.00k [04:59<00:00, 14.3it/s, Loss=0.249, Accuracy=89.6, lr=1.33e-5]

Evaluating...


Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [05:04<00:02, 1.27it/s, Loss=0.249, Accuracy=89.6, lr=1.33e-5]

Val Loss : 0.355. Val Accuracy 84.62%
Best validation loss improved from 0.361 to 0.355



Epoch : 2/5: 100%|██████████| 4.00k/4.00k [05:04<00:00, 13.1it/s, Loss=0.249, Accuracy=89.6, lr=1.33e-5]


Evaluating...

Epoch 2 complete! Val Loss : 0.358. Val Accuracy 84.80%


Epoch : 3/5:  20%|█▉        | 798/4.00k [00:56<04:00, 13.3it/s, Loss=0.128, Accuracy=94.9, lr=1.24e-5]

Evaluating...


Epoch : 3/5:  20%|██        | 801/4.00k [01:01<41:43, 1.28it/s, Loss=0.128, Accuracy=94.9, lr=1.24e-5]

Val Loss : 0.487. Val Accuracy 83.76%


Epoch : 3/5:  40%|███▉      | 1.60k/4.00k [01:57<02:49, 14.2it/s, Loss=0.108, Accuracy=95.8, lr=1.16e-5]

Evaluating...


Epoch : 3/5:  40%|████      | 1.60k/4.00k [02:02<31:18, 1.28it/s, Loss=0.108, Accuracy=95.8, lr=1.16e-5]

Val Loss : 0.557. Val Accuracy 83.82%


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [02:58<01:57, 13.6it/s, Loss=0.102, Accuracy=96.1, lr=1.07e-5]

Evaluating...


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [03:03<20:48, 1.28it/s, Loss=0.102, Accuracy=96.1, lr=1.07e-5]

Val Loss : 0.479. Val Accuracy 83.94%


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [03:58<00:55, 14.4it/s, Loss=0.0988, Accuracy=96.3, lr=9.78e-6]

Evaluating...


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [04:03<10:25, 1.28it/s, Loss=0.0989, Accuracy=96.3, lr=9.78e-6]

Val Loss : 0.472. Val Accuracy 84.59%


Epoch : 3/5: 100%|█████████▉| 3.99k/4.00k [04:59<00:00, 14.4it/s, Loss=0.0974, Accuracy=96.3, lr=8.89e-6]

Evaluating...


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [05:04<00:02, 1.28it/s, Loss=0.0974, Accuracy=96.3, lr=8.89e-6]

Val Loss : 0.502. Val Accuracy 84.53%


Epoch : 3/5: 100%|██████████| 4.00k/4.00k [05:04<00:00, 13.1it/s, Loss=0.0974, Accuracy=96.3, lr=8.89e-6]


Evaluating...

Epoch 3 complete! Val Loss : 0.506. Val Accuracy 84.59%


Epoch : 4/5:  20%|█▉        | 798/4.00k [00:56<04:00, 13.3it/s, Loss=0.0621, Accuracy=97.7, lr=8e-6]

Evaluating...


Epoch : 4/5:  20%|██        | 800/4.00k [01:01<42:00, 1.27it/s, Loss=0.062, Accuracy=97.7, lr=8e-6] 

Val Loss : 0.622. Val Accuracy 83.88%


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [01:57<02:43, 14.7it/s, Loss=0.0445, Accuracy=98.4, lr=7.11e-6]

Evaluating...


Epoch : 4/5:  40%|████      | 1.60k/4.00k [02:02<32:07, 1.24it/s, Loss=0.0445, Accuracy=98.4, lr=7.11e-6]

Val Loss : 0.679. Val Accuracy 84.23%


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [02:58<01:57, 13.6it/s, Loss=0.0346, Accuracy=98.8, lr=6.23e-6]

Evaluating...


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [03:03<20:47, 1.28it/s, Loss=0.0346, Accuracy=98.8, lr=6.22e-6]

Val Loss : 0.789. Val Accuracy 84.77%


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [03:59<00:54, 14.6it/s, Loss=0.0316, Accuracy=98.9, lr=5.34e-6]

Evaluating...


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [04:03<10:26, 1.28it/s, Loss=0.0316, Accuracy=98.9, lr=5.34e-6]

Val Loss : 0.749. Val Accuracy 83.85%


Epoch : 4/5: 100%|█████████▉| 3.99k/4.00k [04:59<00:00, 15.4it/s, Loss=0.0298, Accuracy=99, lr=4.45e-6]

Evaluating...


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [05:04<00:02, 1.29it/s, Loss=0.0298, Accuracy=99, lr=4.45e-6]

Val Loss : 0.739. Val Accuracy 84.26%


Epoch : 4/5: 100%|██████████| 4.00k/4.00k [05:04<00:00, 13.1it/s, Loss=0.0298, Accuracy=99, lr=4.45e-6]


Evaluating...

Epoch 4 complete! Val Loss : 0.738. Val Accuracy 84.09%


Epoch : 5/5:  20%|█▉        | 798/4.00k [00:56<03:42, 14.4it/s, Loss=0.0433, Accuracy=98.6, lr=3.56e-6]

Evaluating...


Epoch : 5/5:  20%|██        | 800/4.00k [01:01<41:20, 1.29it/s, Loss=0.0432, Accuracy=98.6, lr=3.56e-6]

Val Loss : 0.680. Val Accuracy 83.61%


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [01:57<02:49, 14.1it/s, Loss=0.0243, Accuracy=99.2, lr=2.67e-6]

Evaluating...


Epoch : 5/5:  40%|████      | 1.60k/4.00k [02:01<31:02, 1.29it/s, Loss=0.0242, Accuracy=99.2, lr=2.67e-6]

Val Loss : 0.787. Val Accuracy 83.88%


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [02:58<01:45, 15.2it/s, Loss=0.0183, Accuracy=99.4, lr=1.78e-6]

Evaluating...


Epoch : 5/5:  60%|██████    | 2.40k/4.00k [03:02<15:00, 1.77it/s, Loss=0.0183, Accuracy=99.4, lr=1.78e-6]

Val Loss : 0.814. Val Accuracy 83.85%


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [03:58<00:54, 14.7it/s, Loss=0.0154, Accuracy=99.5, lr=8.94e-7]

Evaluating...


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [04:03<10:19, 1.29it/s, Loss=0.0154, Accuracy=99.5, lr=8.91e-7]

Val Loss : 0.822. Val Accuracy 83.94%


Epoch : 5/5: 100%|█████████▉| 3.99k/4.00k [05:00<00:00, 13.5it/s, Loss=0.0132, Accuracy=99.6, lr=5.56e-9]

Evaluating...


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [05:04<00:02, 1.29it/s, Loss=0.0132, Accuracy=99.6, lr=3.33e-9]

Val Loss : 0.832. Val Accuracy 84.20%


Epoch : 5/5: 100%|██████████| 4.00k/4.00k [05:05<00:00, 13.1it/s, Loss=0.0132, Accuracy=99.6, lr=1.11e-9]


Evaluating...

Epoch 5 complete! Val Loss : 0.832. Val Accuracy 84.20%
The model has been saved in models/CANINE_lr_2e-05_val_loss_0.35491_ep_2_typo_0.1.pt

### Typo percentage = 0.1 ###
Evaluating...

train
   loss = 0.11642390772165596
   acc = 96.41924946468482
Evaluating...

validation
   loss = 0.35491415725853204
   acc = 84.61995249406176
Evaluating...

test
   loss = 0.4227986291050911
   acc = 82.45412844036697


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-abf39e5d242edc33.arrow and /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b8f06ab981dbf971.arrow


  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Evaluating...


Epoch : 1/5:  20%|██        | 800/4.00k [01:07<46:25, 1.15it/s, Loss=0.664, Accuracy=58.5, lr=8e-6]   

Val Loss : 0.591. Val Accuracy 67.40%
Best validation loss improved from inf to 0.591



Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [02:09<03:04, 13.0it/s, Loss=0.615, Accuracy=64.1, lr=1.6e-5]

Evaluating...


Epoch : 1/5:  40%|████      | 1.60k/4.00k [02:15<34:19, 1.16it/s, Loss=0.615, Accuracy=64.1, lr=1.6e-5]

Val Loss : 0.562. Val Accuracy 69.74%
Best validation loss improved from 0.591 to 0.562



Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [03:17<02:04, 12.9it/s, Loss=0.588, Accuracy=67, lr=1.96e-5]

Evaluating...


Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [03:22<22:52, 1.17it/s, Loss=0.588, Accuracy=67, lr=1.96e-5]

Val Loss : 0.521. Val Accuracy 73.07%
Best validation loss improved from 0.562 to 0.521



Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [04:24<01:03, 12.6it/s, Loss=0.569, Accuracy=68.7, lr=1.87e-5]

Evaluating...


Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [04:29<11:28, 1.16it/s, Loss=0.569, Accuracy=68.7, lr=1.87e-5]

Val Loss : 0.507. Val Accuracy 74.02%
Best validation loss improved from 0.521 to 0.507



Epoch : 1/5: 100%|█████████▉| 3.99k/4.00k [05:32<00:00, 13.7it/s, Loss=0.555, Accuracy=70, lr=1.78e-5]

Evaluating...


Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [05:37<00:02, 1.16it/s, Loss=0.555, Accuracy=70, lr=1.78e-5]

Val Loss : 0.502. Val Accuracy 74.97%
Best validation loss improved from 0.507 to 0.502



Epoch : 1/5: 100%|██████████| 4.00k/4.00k [05:37<00:00, 11.8it/s, Loss=0.555, Accuracy=70, lr=1.78e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.509. Val Accuracy 74.47%


Epoch : 2/5:  20%|█▉        | 798/4.00k [01:02<04:10, 12.8it/s, Loss=0.41, Accuracy=80.9, lr=1.69e-5]

Evaluating...


Epoch : 2/5:  20%|██        | 800/4.00k [01:07<46:07, 1.16it/s, Loss=0.41, Accuracy=80.9, lr=1.69e-5]

Val Loss : 0.500. Val Accuracy 75.92%
Best validation loss improved from 0.502 to 0.500



Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [02:10<02:54, 13.7it/s, Loss=0.394, Accuracy=81.7, lr=1.6e-5]

Evaluating...


Epoch : 2/5:  40%|████      | 1.60k/4.00k [02:15<34:20, 1.16it/s, Loss=0.394, Accuracy=81.7, lr=1.6e-5]

Val Loss : 0.498. Val Accuracy 76.34%
Best validation loss improved from 0.500 to 0.498



Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [03:17<02:00, 13.3it/s, Loss=0.389, Accuracy=82, lr=1.51e-5]

Evaluating...


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [03:22<22:50, 1.17it/s, Loss=0.389, Accuracy=82, lr=1.51e-5]

Val Loss : 0.503. Val Accuracy 76.25%


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [04:24<01:04, 12.5it/s, Loss=0.384, Accuracy=82.2, lr=1.42e-5]

Evaluating...


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [04:30<11:35, 1.15it/s, Loss=0.384, Accuracy=82.2, lr=1.42e-5]

Val Loss : 0.517. Val Accuracy 76.34%


Epoch : 2/5: 100%|█████████▉| 3.99k/4.00k [05:32<00:00, 13.2it/s, Loss=0.379, Accuracy=82.4, lr=1.33e-5]

Evaluating...


Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [05:37<00:02, 1.16it/s, Loss=0.379, Accuracy=82.4, lr=1.33e-5]

Val Loss : 0.489. Val Accuracy 77.08%
Best validation loss improved from 0.498 to 0.489



Epoch : 2/5: 100%|██████████| 4.00k/4.00k [05:37<00:00, 11.8it/s, Loss=0.379, Accuracy=82.4, lr=1.33e-5]


Evaluating...

Epoch 2 complete! Val Loss : 0.486. Val Accuracy 77.11%
Best validation loss improved from 0.489 to 0.486



Epoch : 3/5:  20%|█▉        | 798/4.00k [01:02<03:46, 14.2it/s, Loss=0.229, Accuracy=90.3, lr=1.24e-5]

Evaluating...


Epoch : 3/5:  20%|██        | 800/4.00k [01:08<45:39, 1.17it/s, Loss=0.229, Accuracy=90.4, lr=1.24e-5]

Val Loss : 0.621. Val Accuracy 76.45%


Epoch : 3/5:  40%|███▉      | 1.60k/4.00k [02:09<03:04, 13.0it/s, Loss=0.197, Accuracy=91.6, lr=1.16e-5]

Evaluating...


Epoch : 3/5:  40%|████      | 1.60k/4.00k [02:15<34:21, 1.16it/s, Loss=0.197, Accuracy=91.7, lr=1.16e-5]

Val Loss : 0.689. Val Accuracy 76.90%


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [03:17<02:16, 11.7it/s, Loss=0.188, Accuracy=92, lr=1.07e-5]

Evaluating...


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [03:22<22:59, 1.16it/s, Loss=0.188, Accuracy=92, lr=1.07e-5]

Val Loss : 0.704. Val Accuracy 75.95%


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [04:24<01:00, 13.3it/s, Loss=0.183, Accuracy=92.3, lr=9.78e-6]

Evaluating...


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [04:29<11:27, 1.16it/s, Loss=0.183, Accuracy=92.3, lr=9.78e-6]

Val Loss : 0.659. Val Accuracy 77.14%


Epoch : 3/5: 100%|█████████▉| 3.99k/4.00k [05:32<00:00, 12.6it/s, Loss=0.178, Accuracy=92.4, lr=8.89e-6]

Evaluating...


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [05:37<00:01, 1.16it/s, Loss=0.178, Accuracy=92.4, lr=8.89e-6]

Val Loss : 0.684. Val Accuracy 77.97%


Epoch : 3/5: 100%|██████████| 4.00k/4.00k [05:37<00:00, 11.8it/s, Loss=0.178, Accuracy=92.4, lr=8.89e-6]


Evaluating...

Epoch 3 complete! Val Loss : 0.684. Val Accuracy 77.67%


Epoch : 4/5:  20%|█▉        | 798/4.00k [01:03<03:59, 13.4it/s, Loss=0.124, Accuracy=95, lr=8e-6]

Evaluating...


Epoch : 4/5:  20%|██        | 800/4.00k [01:08<45:44, 1.17it/s, Loss=0.124, Accuracy=95, lr=8e-6]

Val Loss : 0.799. Val Accuracy 76.66%


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [02:10<03:02, 13.2it/s, Loss=0.0849, Accuracy=96.6, lr=7.11e-6]

Evaluating...


Epoch : 4/5:  40%|████      | 1.60k/4.00k [02:15<34:03, 1.17it/s, Loss=0.0848, Accuracy=96.6, lr=7.11e-6]

Val Loss : 0.961. Val Accuracy 77.26%


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [03:17<02:05, 12.8it/s, Loss=0.0719, Accuracy=97.2, lr=6.23e-6]

Evaluating...


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [03:22<22:58, 1.16it/s, Loss=0.0718, Accuracy=97.2, lr=6.22e-6]

Val Loss : 0.974. Val Accuracy 76.22%


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [04:25<01:06, 12.1it/s, Loss=0.0645, Accuracy=97.5, lr=5.34e-6]

Evaluating...


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [04:30<11:29, 1.16it/s, Loss=0.0645, Accuracy=97.5, lr=5.34e-6]

Val Loss : 1.011. Val Accuracy 77.05%


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [05:32<00:00, 12.5it/s, Loss=0.0606, Accuracy=97.7, lr=4.45e-6]

Evaluating...


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [05:38<00:01, 1.16it/s, Loss=0.0606, Accuracy=97.7, lr=4.45e-6]

Val Loss : 1.027. Val Accuracy 76.60%


Epoch : 4/5: 100%|██████████| 4.00k/4.00k [05:38<00:00, 11.8it/s, Loss=0.0606, Accuracy=97.7, lr=4.45e-6]


Evaluating...

Epoch 4 complete! Val Loss : 1.029. Val Accuracy 76.54%


Epoch : 5/5:  20%|█▉        | 798/4.00k [01:02<04:08, 12.9it/s, Loss=0.0845, Accuracy=96.9, lr=3.56e-6]

Evaluating...


Epoch : 5/5:  20%|██        | 801/4.00k [01:08<38:31, 1.38it/s, Loss=0.0844, Accuracy=96.9, lr=3.56e-6]

Val Loss : 0.954. Val Accuracy 77.14%


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [02:10<03:09, 12.7it/s, Loss=0.0497, Accuracy=98.2, lr=2.67e-6]

Evaluating...


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [02:15<34:18, 1.17it/s, Loss=0.0497, Accuracy=98.2, lr=2.67e-6]

Val Loss : 1.080. Val Accuracy 76.99%


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [03:17<02:05, 12.8it/s, Loss=0.0385, Accuracy=98.6, lr=1.78e-6]

Evaluating...


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [03:22<22:52, 1.17it/s, Loss=0.0385, Accuracy=98.6, lr=1.78e-6]

Val Loss : 1.105. Val Accuracy 77.17%


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [04:25<01:02, 12.8it/s, Loss=0.0312, Accuracy=98.9, lr=8.94e-7]

Evaluating...


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [04:30<11:27, 1.16it/s, Loss=0.0312, Accuracy=98.9, lr=8.91e-7]

Val Loss : 1.160. Val Accuracy 77.08%


Epoch : 5/5: 100%|█████████▉| 3.99k/4.00k [05:33<00:00, 12.2it/s, Loss=0.0271, Accuracy=99.1, lr=5.56e-9]

Evaluating...


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [05:38<00:02, 1.15it/s, Loss=0.0271, Accuracy=99.1, lr=3.33e-9]

Val Loss : 1.167. Val Accuracy 77.05%


Epoch : 5/5: 100%|██████████| 4.00k/4.00k [05:38<00:00, 11.8it/s, Loss=0.0271, Accuracy=99.1, lr=1.11e-9]


Evaluating...

Epoch 5 complete! Val Loss : 1.167. Val Accuracy 77.05%
The model has been saved in models/CANINE_lr_2e-05_val_loss_0.48588_ep_2_typo_0.2.pt

### Typo percentage = 0.2 ###
Evaluating...

train
   loss = 0.22296247542813646
   acc = 92.43994310811021
Evaluating...

validation
   loss = 0.4858833174287425
   acc = 77.10807600950119
Evaluating...

test
   loss = 0.5322308911518617
   acc = 74.54128440366972


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-abf39e5d242edc33.arrow and /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-b8f06ab981dbf971.arrow


  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

  0%|          | 0/3999 [00:00<?, ?ba/s]

  0%|          | 0/211 [00:00<?, ?ba/s]

  0%|          | 0/55 [00:00<?, ?ba/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Evaluating...


Epoch : 1/5:  20%|██        | 800/4.00k [01:15<51:21, 1.04it/s, Loss=0.687, Accuracy=55, lr=8e-6]   

Val Loss : 0.677. Val Accuracy 58.22%
Best validation loss improved from inf to 0.677



Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [02:24<03:28, 11.5it/s, Loss=0.678, Accuracy=57, lr=1.6e-5]

Evaluating...


Epoch : 1/5:  40%|███▉      | 1.60k/4.00k [02:30<38:48, 1.03it/s, Loss=0.678, Accuracy=57, lr=1.6e-5]

Val Loss : 0.646. Val Accuracy 62.05%
Best validation loss improved from 0.677 to 0.646



Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [03:38<02:32, 10.5it/s, Loss=0.667, Accuracy=58.7, lr=1.96e-5]

Evaluating...


Epoch : 1/5:  60%|█████▉    | 2.40k/4.00k [03:44<25:56, 1.03it/s, Loss=0.667, Accuracy=58.7, lr=1.96e-5]

Val Loss : 0.638. Val Accuracy 63.60%
Best validation loss improved from 0.646 to 0.638



Epoch : 1/5:  80%|███████▉  | 3.19k/4.00k [04:54<01:08, 11.8it/s, Loss=0.658, Accuracy=59.9, lr=1.87e-5]

Evaluating...


Epoch : 1/5:  80%|███████▉  | 3.20k/4.00k [05:00<10:52, 1.23it/s, Loss=0.658, Accuracy=59.9, lr=1.87e-5]

Val Loss : 0.629. Val Accuracy 64.79%
Best validation loss improved from 0.638 to 0.629



Epoch : 1/5: 100%|█████████▉| 3.99k/4.00k [06:09<00:00, 11.9it/s, Loss=0.652, Accuracy=60.8, lr=1.78e-5]

Evaluating...


Epoch : 1/5: 100%|█████████▉| 4.00k/4.00k [06:15<00:02, 1.03it/s, Loss=0.652, Accuracy=60.8, lr=1.78e-5]

Val Loss : 0.623. Val Accuracy 64.19%
Best validation loss improved from 0.629 to 0.623



Epoch : 1/5: 100%|██████████| 4.00k/4.00k [06:15<00:00, 10.6it/s, Loss=0.652, Accuracy=60.8, lr=1.78e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.622. Val Accuracy 64.16%
Best validation loss improved from 0.623 to 0.622



Epoch : 2/5:  20%|█▉        | 798/4.00k [01:09<04:37, 11.5it/s, Loss=0.577, Accuracy=69.3, lr=1.69e-5]

Evaluating...


Epoch : 2/5:  20%|██        | 801/4.00k [01:15<43:29, 1.23it/s, Loss=0.577, Accuracy=69.3, lr=1.69e-5]

Val Loss : 0.631. Val Accuracy 66.15%


Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [02:24<03:15, 12.3it/s, Loss=0.565, Accuracy=70.2, lr=1.6e-5]

Evaluating...


Epoch : 2/5:  40%|███▉      | 1.60k/4.00k [02:30<38:31, 1.04it/s, Loss=0.565, Accuracy=70.2, lr=1.6e-5]

Val Loss : 0.642. Val Accuracy 65.97%


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [03:39<02:27, 10.8it/s, Loss=0.561, Accuracy=70.4, lr=1.51e-5]

Evaluating...


Epoch : 2/5:  60%|█████▉    | 2.40k/4.00k [03:45<25:47, 1.03it/s, Loss=0.561, Accuracy=70.4, lr=1.51e-5]

Val Loss : 0.626. Val Accuracy 65.88%


Epoch : 2/5:  80%|███████▉  | 3.19k/4.00k [04:54<01:10, 11.5it/s, Loss=0.56, Accuracy=70.5, lr=1.42e-5] 

Evaluating...


Epoch : 2/5:  80%|███████▉  | 3.20k/4.00k [05:00<12:54, 1.03it/s, Loss=0.56, Accuracy=70.5, lr=1.42e-5]

Val Loss : 0.611. Val Accuracy 66.42%
Best validation loss improved from 0.622 to 0.611



Epoch : 2/5: 100%|█████████▉| 3.99k/4.00k [06:09<00:00, 11.6it/s, Loss=0.556, Accuracy=70.7, lr=1.33e-5]

Evaluating...


Epoch : 2/5: 100%|█████████▉| 4.00k/4.00k [06:15<00:02, 1.03it/s, Loss=0.556, Accuracy=70.7, lr=1.33e-5]

Val Loss : 0.619. Val Accuracy 65.47%


Epoch : 2/5: 100%|██████████| 4.00k/4.00k [06:15<00:00, 10.6it/s, Loss=0.556, Accuracy=70.7, lr=1.33e-5]


Evaluating...

Epoch 2 complete! Val Loss : 0.619. Val Accuracy 65.38%


Epoch : 3/5:  20%|█▉        | 798/4.00k [01:09<04:35, 11.6it/s, Loss=0.421, Accuracy=79.9, lr=1.24e-5]

Evaluating...


Epoch : 3/5:  20%|██        | 801/4.00k [01:15<43:26, 1.23it/s, Loss=0.421, Accuracy=80, lr=1.24e-5]

Val Loss : 0.738. Val Accuracy 64.55%


Epoch : 3/5:  40%|███▉      | 1.60k/4.00k [02:24<03:23, 11.8it/s, Loss=0.384, Accuracy=82.1, lr=1.16e-5]

Evaluating...


Epoch : 3/5:  40%|████      | 1.60k/4.00k [02:30<38:37, 1.04it/s, Loss=0.384, Accuracy=82.1, lr=1.16e-5]

Val Loss : 0.757. Val Accuracy 64.93%


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [03:39<02:19, 11.5it/s, Loss=0.375, Accuracy=82.6, lr=1.07e-5]

Evaluating...


Epoch : 3/5:  60%|█████▉    | 2.40k/4.00k [03:46<25:49, 1.03it/s, Loss=0.375, Accuracy=82.6, lr=1.07e-5]

Val Loss : 0.750. Val Accuracy 65.68%


Epoch : 3/5:  80%|███████▉  | 3.19k/4.00k [04:54<01:10, 11.3it/s, Loss=0.365, Accuracy=83.3, lr=9.78e-6]

Evaluating...


Epoch : 3/5:  80%|███████▉  | 3.20k/4.00k [05:00<12:55, 1.03it/s, Loss=0.365, Accuracy=83.3, lr=9.78e-6]

Val Loss : 0.788. Val Accuracy 65.68%


Epoch : 3/5: 100%|█████████▉| 3.99k/4.00k [06:09<00:00, 11.3it/s, Loss=0.36, Accuracy=83.6, lr=8.89e-6]

Evaluating...


Epoch : 3/5: 100%|█████████▉| 4.00k/4.00k [06:15<00:02, 1.03it/s, Loss=0.36, Accuracy=83.6, lr=8.89e-6]

Val Loss : 0.767. Val Accuracy 65.35%


Epoch : 3/5: 100%|██████████| 4.00k/4.00k [06:15<00:00, 10.7it/s, Loss=0.36, Accuracy=83.6, lr=8.89e-6]


Evaluating...

Epoch 3 complete! Val Loss : 0.766. Val Accuracy 65.35%


Epoch : 4/5:  20%|█▉        | 798/4.00k [01:09<04:44, 11.3it/s, Loss=0.249, Accuracy=89.9, lr=8e-6]

Evaluating...


Epoch : 4/5:  20%|██        | 801/4.00k [01:15<43:15, 1.23it/s, Loss=0.248, Accuracy=89.9, lr=8e-6]

Val Loss : 0.976. Val Accuracy 65.47%


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [02:24<03:39, 10.9it/s, Loss=0.182, Accuracy=92.9, lr=7.11e-6]

Evaluating...


Epoch : 4/5:  40%|███▉      | 1.60k/4.00k [02:30<38:54, 1.03it/s, Loss=0.181, Accuracy=92.9, lr=7.11e-6]

Val Loss : 1.227. Val Accuracy 65.14%


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [03:39<02:20, 11.4it/s, Loss=0.159, Accuracy=93.8, lr=6.23e-6]

Evaluating...


Epoch : 4/5:  60%|█████▉    | 2.40k/4.00k [03:45<25:51, 1.03it/s, Loss=0.159, Accuracy=93.8, lr=6.22e-6]

Val Loss : 1.238. Val Accuracy 65.41%


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [04:54<01:05, 12.3it/s, Loss=0.147, Accuracy=94.4, lr=5.34e-6]

Evaluating...


Epoch : 4/5:  80%|███████▉  | 3.20k/4.00k [05:00<12:54, 1.03it/s, Loss=0.146, Accuracy=94.4, lr=5.34e-6]

Val Loss : 1.230. Val Accuracy 65.74%


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [06:09<00:00, 11.4it/s, Loss=0.14, Accuracy=94.6, lr=4.45e-6]

Evaluating...


Epoch : 4/5: 100%|█████████▉| 4.00k/4.00k [06:15<00:01, 1.04it/s, Loss=0.14, Accuracy=94.6, lr=4.45e-6]

Val Loss : 1.191. Val Accuracy 64.99%


Epoch : 4/5: 100%|██████████| 4.00k/4.00k [06:15<00:00, 10.6it/s, Loss=0.14, Accuracy=94.6, lr=4.45e-6]


Evaluating...

Epoch 4 complete! Val Loss : 1.190. Val Accuracy 65.08%


Epoch : 5/5:  20%|█▉        | 798/4.00k [01:09<04:41, 11.4it/s, Loss=0.165, Accuracy=93.5, lr=3.56e-6]

Evaluating...


Epoch : 5/5:  20%|██        | 801/4.00k [01:15<43:39, 1.22it/s, Loss=0.164, Accuracy=93.6, lr=3.56e-6]

Val Loss : 1.210. Val Accuracy 65.14%


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [02:24<03:32, 11.3it/s, Loss=0.099, Accuracy=96.4, lr=2.67e-6] 

Evaluating...


Epoch : 5/5:  40%|███▉      | 1.60k/4.00k [02:30<38:34, 1.04it/s, Loss=0.0989, Accuracy=96.4, lr=2.67e-6]

Val Loss : 1.473. Val Accuracy 65.44%


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [03:39<02:15, 11.9it/s, Loss=0.0755, Accuracy=97.3, lr=1.78e-6]

Evaluating...


Epoch : 5/5:  60%|█████▉    | 2.40k/4.00k [03:45<25:43, 1.04it/s, Loss=0.0757, Accuracy=97.3, lr=1.78e-6]

Val Loss : 1.548. Val Accuracy 65.50%


Epoch : 5/5:  80%|███████▉  | 3.19k/4.00k [04:54<01:11, 11.3it/s, Loss=0.064, Accuracy=97.8, lr=8.94e-7]

Evaluating...


Epoch : 5/5:  80%|███████▉  | 3.20k/4.00k [05:00<13:00, 1.03it/s, Loss=0.0639, Accuracy=97.8, lr=8.91e-7]

Val Loss : 1.572. Val Accuracy 65.71%


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [06:10<00:00, 11.7it/s, Loss=0.056, Accuracy=98.1, lr=5.56e-9]

Evaluating...


Epoch : 5/5: 100%|█████████▉| 4.00k/4.00k [06:16<00:01, 1.03it/s, Loss=0.056, Accuracy=98.1, lr=3.33e-9]

Val Loss : 1.587. Val Accuracy 65.47%


Epoch : 5/5: 100%|██████████| 4.00k/4.00k [06:16<00:00, 10.6it/s, Loss=0.056, Accuracy=98.1, lr=1.11e-9]


Evaluating...

Epoch 5 complete! Val Loss : 1.587. Val Accuracy 65.47%
The model has been saved in models/CANINE_lr_2e-05_val_loss_0.61115_ep_2_typo_0.4.pt

### Typo percentage = 0.4 ###
Evaluating...

train
   loss = 0.44711435142607353
   acc = 81.08188368421875
Evaluating...

validation
   loss = 0.6111504476499784
   acc = 66.41923990498812
Evaluating...

test
   loss = 0.6025622877207669
   acc = 67.88990825688073


## Print final results

In [None]:
from tabulate import tabulate

# Typo (noise) levels to label the result rows with.
# NOTE(review): assumes `accs` and `losses` each hold one [train, validation, test]
# row per noise level, in this same order — confirm against the cell that fills them.
noises = [0, 0.05, 0.1, 0.2, 0.4]
table_headers = ['Noise', 'Train', 'Validation', 'Test']

# Fail loudly if the collected results do not line up with the noise levels;
# otherwise rows would be mislabelled or the cell would die with a bare IndexError.
if not (len(accs) == len(losses) == len(noises)):
    raise ValueError(
        f'Expected {len(noises)} result rows (one per noise level), '
        f'got {len(accs)} accuracy rows and {len(losses)} loss rows'
    )

# Build fresh rows instead of prepending to `accs` / `losses` in place, so that
# re-running this cell does not add the noise column a second time.
acc_rows = [[noise, *row] for noise, row in zip(noises, accs)]
loss_rows = [[noise, *row] for noise, row in zip(noises, losses)]

# Passing the column names through `headers=` (rather than as the first data row)
# lets tabulate draw a header rule and treat the metric columns as numbers.
print('Accuracy table')
print(tabulate(acc_rows, headers=table_headers))

print('\n\nLosses table')
print(tabulate(loss_rows, headers=table_headers))