# Sentiment analysis for Sentiment140 with DistilBERT

## Install HuggingFace dependencies

In [None]:
!pip install -q datasets
!pip install -q transformers

[K     |████████████████████████████████| 325 kB 5.4 MB/s 
[K     |████████████████████████████████| 67 kB 3.2 MB/s 
[K     |████████████████████████████████| 1.1 MB 33.5 MB/s 
[K     |████████████████████████████████| 136 kB 12.4 MB/s 
[K     |████████████████████████████████| 212 kB 35.8 MB/s 
[K     |████████████████████████████████| 127 kB 35.7 MB/s 
[K     |████████████████████████████████| 271 kB 38.2 MB/s 
[K     |████████████████████████████████| 94 kB 1.1 MB/s 
[K     |████████████████████████████████| 144 kB 19.0 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[K     |████████████████████████████████| 3.8 MB 5.4 MB/s 
[K     |████████████████████████████████| 895 kB 33.1 MB/s 
[K     |████████████████████████████████| 6.5 MB 30.3 MB/s

## Create functions to download and preprocess data

In [None]:
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import CanineTokenizer, DataCollatorWithPadding, DistilBertTokenizer

bz = 64  # batch size, used for the dataset.map() batches and for every DataLoader

# Tokenizer shared by encode() and by the padding collator built in generate_dataset().
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def encode(samples):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    The tokenizer is called with ``truncation=True`` and ``padding=False``,
    so no padding is added at this stage.
    """
    texts = samples['text']
    return tokenizer(texts, truncation=True, padding=False)

def quantify(samples):
    """Turn the raw ``sentiment`` scores of a batch into binary class ids.

    Scores less than or equal to 2 map to 0, scores greater than 2 map to 1.
    Returns a dict with a single ``'label'`` key holding a NumPy integer array.
    """
    threshold = np.array([2])
    # side='left' puts scores equal to the threshold into the lower class
    labels = np.searchsorted(threshold, samples['sentiment'], side='left')
    return {'label': labels}

# Replacement alphabet: the 26 lowercase ASCII letters.
letters = list('abcdefghijklmnopqrstuvwxyz')

def typo(samples, typo_percent=0):
    """Inject random character substitutions into ``samples['text']`` in place.

    One random number is drawn for every character of every text. When the
    draw is below ``typo_percent`` and the character is alphabetic, the
    character is replaced by a letter picked from ``letters`` (the pick may
    coincide with the original character). Non-alphabetic characters and the
    text length are never changed.

    Args:
        samples: batch mapping whose ``'text'`` entry is a mutable list of str.
        typo_percent: per-character substitution probability in [0, 1].

    Returns:
        The same ``samples`` object, with the modified texts written back
        into its ``'text'`` list.
    """
    texts = samples['text']
    for i, sample in enumerate(texts):
        # Character buffer, created only once a substitution actually happens
        # so that untouched samples cost nothing extra.
        chars = None
        for j, x in enumerate(sample):
            # The random draw stays first in the condition so the sequence of
            # RNG calls is the same whether or not the character is a letter.
            if np.random.random() < typo_percent and x.isalpha():
                if chars is None:
                    chars = list(sample)
                chars[j] = np.random.choice(letters)
        if chars is not None:
            # Join once per sample instead of once per substituted character.
            texts[i] = ''.join(chars)
    return samples

def generate_dataset(typo_percent=0):
    """Load Sentiment140 and build tokenized datasets plus one DataLoader per split.

    Steps, in order:
      1. drop the test rows whose sentiment equals 2 (neutral);
      2. split the original training data with ``train_test_split(0.01, seed=0)``;
         the resulting 'test' part becomes the 'validation' split;
      3. optionally corrupt the texts with ``typo`` (only when ``typo_percent > 0``);
      4. tokenize with ``encode`` and add binary labels with ``quantify``.

    Args:
        typo_percent: per-character substitution probability forwarded to ``typo``.

    Returns:
        ``(dataset, loader)`` where ``loader`` maps each split name to a
        DataLoader; only the 'train' loader shuffles.
    """
    dataset = load_dataset('sentiment140')

    # Test split: keep only the rows that are not neutral (sentiment != 2)
    test_sentiment = np.array(dataset['test']['sentiment'])
    keep_idx = np.where(test_sentiment != 2)[0]
    dataset['test'] = dataset['test'].select(keep_idx)

    # Carve the validation split out of the training data (fixed seed)
    holdout = dataset['train'].train_test_split(0.01, seed=0)
    dataset['train'], dataset['validation'] = holdout['train'], holdout['test']

    # Preprocessing pipeline as (function, extra map() arguments), applied in order
    steps = []
    if typo_percent > 0:
        steps.append((typo, {'fn_kwargs': {'typo_percent': typo_percent}}))
    steps.append((encode, {}))
    steps.append((quantify, {}))
    for fn, extra in steps:
        dataset = dataset.map(fn, batched=True, batch_size=bz, **extra)

    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    # One loader per split; batches are padded by the collator at load time
    loader = {
        split: DataLoader(
            dataset[split],
            batch_size=bz,
            shuffle=(split == 'train'),
            collate_fn=DataCollatorWithPadding(tokenizer),
        )
        for split in dataset
    }

    return dataset, loader

# Build the datasets and loaders used by the cells below; with typo_percent=0
# the typo-injection step of generate_dataset() is skipped.
dataset, loader = generate_dataset(typo_percent=0)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/739 [00:00<?, ?B/s]

Downloading and preparing dataset sentiment140/sentiment140 (download: 77.59 MiB, generated: 215.36 MiB, post-processed: Unknown size, total: 292.95 MiB) to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997...


Downloading data:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/498 [00:00<?, ? examples/s]

Dataset sentiment140 downloaded and prepared to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/24750 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

  0%|          | 0/24750 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

## Create train and eval functions

In [None]:
from tqdm import tqdm
import copy
import math

def train_canine(net, device, optimizer, criterion, lr, lr_scheduler, train_loader, val_loader, epochs, unfreeze=False, unfreeze_ep=5):
    """Fine-tune ``net`` and return the snapshot with the lowest validation loss.

    The model is validated with ``evaluate_loss`` every ``log_interval``
    batches and once more at the end of each epoch. Whenever the validation
    loss improves, a deep copy of the model is kept. Mid-epoch improvements
    are also written to ``model.pt`` right away, and the best copy is written
    to ``model.pt`` again when training finishes.

    Args:
        net: model called as ``net(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        device: device every batch tensor is moved to.
        optimizer: optimizer, stepped once per batch.
        criterion: only forwarded to ``evaluate_loss``; the training loss is
            read from ``output.loss``.
        lr: not read by this function; kept so existing callers keep working.
        lr_scheduler: scheduler stepped once per batch; must provide
            ``get_last_lr()``.
        train_loader: sized iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        val_loader: loader handed to ``evaluate_loss``.
        epochs: number of passes over ``train_loader``.
        unfreeze: when true, set ``requires_grad = True`` on every parameter
            at the end of the epoch whose zero-based index equals
            ``unfreeze_ep``.
        unfreeze_ep: zero-based epoch index used by ``unfreeze``.

    Returns:
        ``(train_losses, val_losses, net_copy)``: the mean training loss of
        each epoch, the end-of-epoch validation loss of each epoch, and the
        best model snapshot.
    """
    path_to_model = 'model.pt'
    best_loss = np.inf  # np.Inf was removed in NumPy 2.0
    net_copy = None  # best snapshot so far; stays None until a validation improves
    loss = None  # bound up front so the final `del loss` is always valid
    train_losses = []
    val_losses = []
    log_interval = math.ceil(len(train_loader)/20)+1

    for ep in range(epochs):
        with tqdm(total=len(train_loader), unit_scale=True, postfix={'Loss':0.0, 'Accuracy':0.0, 'lr':lr_scheduler.get_last_lr()[0]},
                        desc="Epoch : %i/%i" % (ep+1, epochs)) as pbar:
            net.train()
            epoch_loss = 0.0      # summed over the whole epoch, never reset
            window_loss = 0.0     # summed since the last mid-epoch validation
            window_batches = 0
            correct = 0
            nb_samples = 0
            for it, batch in enumerate(train_loader):
                # Move the batch tensors to the target device
                batch = {k: v.to(device) for k, v in batch.items()}
                seq, attn_masks, labels = \
                    batch['input_ids'], batch['attention_mask'], batch['labels']
                labels = labels.to(torch.int64)

                # Clear gradients
                optimizer.zero_grad()

                # Forward pass
                output = net(input_ids=seq, attention_mask=attn_masks, labels=labels)

                # Backpropagating the gradients
                loss = output.loss
                loss.backward()
                # Optimization step
                optimizer.step()

                batch_loss = loss.item()
                epoch_loss += batch_loss
                window_loss += batch_loss
                window_batches += 1

                predicted = output.logits.argmax(axis=1)
                correct += predicted.eq(labels).sum().item()
                nb_samples += len(predicted)

                # Average over the batches of the current window only, so the
                # displayed loss is not diluted by batches from earlier windows.
                pbar.set_postfix({'Loss':window_loss/window_batches, 'Accuracy':100*correct/nb_samples, 'lr':lr_scheduler.get_last_lr()[0]})
                pbar.update(1)

                lr_scheduler.step()

                if it % log_interval == log_interval-1:
                    window_loss = 0.0
                    window_batches = 0
                    correct = 0
                    nb_samples = 0
                    val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
                    print(f"Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

                    if val_loss < best_loss:
                        print(f"Best validation loss improved from {best_loss:.3f} to {val_loss:.3f}")
                        print()
                        net_copy = copy.deepcopy(net)  # save a copy of the model
                        best_loss = val_loss
                        torch.save(net_copy.state_dict(), path_to_model)

                    # evaluate_loss() switched the model to eval mode; go back
                    # to train mode so the rest of the epoch trains normally.
                    net.train()

        val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
        print()
        print(f"Epoch {ep+1} complete! Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

        # Mean over all batches of the epoch (guarded against an empty loader)
        train_losses.append(epoch_loss/max(len(train_loader), 1))
        val_losses.append(val_loss)

        if val_loss < best_loss:
            print(f"Best validation loss improved from {best_loss:.3f} to {val_loss:.3f}")
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss

        if unfreeze and ep == unfreeze_ep:
            for param in net.parameters():
                param.requires_grad = True

    # Saving the model
    if net_copy is None:
        # No validation ever improved on the initial best (e.g. epochs == 0 or
        # NaN losses): fall back to the current weights instead of failing.
        net_copy = copy.deepcopy(net)
    torch.save(net_copy.state_dict(), path_to_model)
    print(f"The model has been saved in {path_to_model}")

    del loss
    torch.cuda.empty_cache()

    return train_losses, val_losses, net_copy

In [None]:
from tqdm import tqdm

def evaluate_loss(net, device, dataloader, criterion, disable_tqdm=True):
    """Compute the mean batch loss and the accuracy of ``net`` on ``dataloader``.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        net: model called as ``net(input_ids, attention_mask, labels=labels)``
            whose output exposes ``.loss`` and ``.logits``.
        device: device every batch tensor is moved to.
        dataloader: iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        criterion: not read by this function (the loss comes from
            ``output.loss``); kept so existing callers keep working.
        disable_tqdm: when true, no progress bar is shown.

    Returns:
        ``(mean_loss, accuracy_percent)``: the average of the per-batch losses
        and the percentage of correct predictions among the samples seen.
        Both are NaN when the dataloader yields no batch.
    """
    net.eval()

    total_loss = 0
    correct = 0
    nb_samples = 0
    nb_batches = 0

    print('Evaluating...')
    with torch.no_grad():
        for batch in tqdm(dataloader, disable=disable_tqdm):
            batch = {k: v.to(device) for k, v in batch.items()}
            seq, attn_masks, labels = \
                batch['input_ids'], batch['attention_mask'], batch['labels']
            labels = labels.to(torch.int64)
            output = net(seq, attn_masks, labels=labels)
            total_loss += output.loss.item()
            predicted = output.logits.argmax(axis=1)
            correct += predicted.eq(labels).sum().item()
            # Count what was actually evaluated rather than trusting
            # len(dataloader) / len(dataloader.dataset).
            nb_samples += len(predicted)
            nb_batches += 1

    if nb_batches == 0:
        # Nothing to average: report NaN instead of raising ZeroDivisionError
        return float('nan'), float('nan')

    return total_loss/nb_batches, 100*correct/nb_samples

## Train the model

In [None]:
print("Creation of the models' folder...")
!mkdir models

Creation of the models' folder...


In [None]:
# Disabled settings kept for reference; neither name is read anywhere in the visible notebook.
#freeze_canine = False  # if True, freeze the encoder weights and only update the classification layer weights
#maxlen = 2048  # maximum length of the tokenized input pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
lr = 2e-5  # learning rate given to the AdamW optimizer
epochs = 2  # number of training epochs

In [None]:
from transformers import CanineTokenizer, CanineConfig, CanineForSequenceClassification, get_linear_schedule_with_warmup, DistilBertForSequenceClassification#, AdamW
from torch.optim import AdamW

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One entry per run; each entry holds the [train, validation, test] metrics.
losses = []
accs = []

# Only used for the header printed below; the data itself comes from the
# earlier generate_dataset(...) call.
typo_percent = 0

net = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

net.to(device)

optimizer = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()
t_total = len(loader['train']) * epochs  # total number of scheduler steps: one per training batch, for every epoch
num_warmup_steps = int(t_total*0.1) # warmup phase: the first 10% of the steps
lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

# `net` is rebound to the model returned by train_canine (its best snapshot).
train_losses, val_losses, net = train_canine(net, device, optimizer, criterion, lr, lr_scheduler, loader['train'], loader['validation'], epochs)

print(f'\n### Typo percentage = {typo_percent} ###')
# Final loss/accuracy of the returned model on every split.
losses_aux = []
accs_aux = []
for split in ['train', 'validation', 'test']:
    loss, acc = evaluate_loss(net, device, loader[split], criterion)
    losses_aux.append(loss)
    accs_aux.append(acc)
    print()
    print(split)
    print(f'   loss = {loss}\n   acc = {acc}')

losses.append(losses_aux)
accs.append(accs_aux)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Evaluating...
Val Loss : 0.411. Val Accuracy 81.54%
Best validation loss improved from inf to 0.411



Epoch : 1/2:  10%|█         | 2.48k/24.8k [08:42<1:11:17, 5.21it/s, Loss=0.193, Accuracy=82.8, lr=1e-5]

Evaluating...
Val Loss : 0.376. Val Accuracy 83.17%
Best validation loss improved from 0.411 to 0.376



Epoch : 1/2:  15%|█▌        | 3.72k/24.8k [13:08<1:08:47, 5.10it/s, Loss=0.123, Accuracy=83.7, lr=1.5e-5]

Evaluating...
Val Loss : 0.372. Val Accuracy 83.28%
Best validation loss improved from 0.376 to 0.372



Epoch : 1/2:  20%|██        | 4.96k/24.8k [17:36<1:01:59, 5.32it/s, Loss=0.0894, Accuracy=84.2, lr=2e-5]

Evaluating...
Val Loss : 0.355. Val Accuracy 84.10%
Best validation loss improved from 0.372 to 0.355



Epoch : 1/2:  25%|██▌       | 6.20k/24.8k [22:03<1:09:08, 4.47it/s, Loss=0.0706, Accuracy=84.4, lr=1.94e-5]

Evaluating...
Val Loss : 0.348. Val Accuracy 85.03%
Best validation loss improved from 0.355 to 0.348



Epoch : 1/2:  30%|███       | 7.43k/24.8k [26:31<56:39, 5.09it/s, Loss=0.0568, Accuracy=85, lr=1.89e-5]

Evaluating...
Val Loss : 0.334. Val Accuracy 85.09%
Best validation loss improved from 0.348 to 0.334



Epoch : 1/2:  35%|███▌      | 8.67k/24.8k [31:00<55:14, 4.85it/s, Loss=0.0478, Accuracy=85.4, lr=1.83e-5]

Evaluating...
Val Loss : 0.332. Val Accuracy 85.21%
Best validation loss improved from 0.334 to 0.332



Epoch : 1/2:  40%|████      | 9.91k/24.8k [35:30<46:46, 5.29it/s, Loss=0.0414, Accuracy=85.6, lr=1.78e-5]

Evaluating...
Val Loss : 0.328. Val Accuracy 85.76%
Best validation loss improved from 0.332 to 0.328



Epoch : 1/2:  45%|████▌     | 11.2k/24.8k [39:59<45:41, 4.96it/s, Loss=0.0363, Accuracy=85.9, lr=1.72e-5]

Evaluating...
Val Loss : 0.323. Val Accuracy 85.90%
Best validation loss improved from 0.328 to 0.323



Epoch : 1/2:  50%|█████     | 12.4k/24.8k [44:27<43:12, 4.77it/s, Loss=0.0325, Accuracy=85.9, lr=1.67e-5]

Evaluating...
Val Loss : 0.320. Val Accuracy 86.11%
Best validation loss improved from 0.323 to 0.320



Epoch : 1/2:  55%|█████▌    | 13.6k/24.8k [48:55<36:29, 5.08it/s, Loss=0.0292, Accuracy=85.9, lr=1.61e-5]

Evaluating...
Val Loss : 0.316. Val Accuracy 86.38%
Best validation loss improved from 0.320 to 0.316



Epoch : 1/2:  60%|██████    | 14.9k/24.8k [53:22<32:51, 5.01it/s, Loss=0.0264, Accuracy=86.3, lr=1.55e-5]

Evaluating...
Val Loss : 0.315. Val Accuracy 86.28%
Best validation loss improved from 0.316 to 0.315



Epoch : 1/2:  65%|██████▌   | 16.1k/24.8k [57:52<27:57, 5.15it/s, Loss=0.0241, Accuracy=86.5, lr=1.5e-5]

Evaluating...


Epoch : 1/2:  65%|██████▌   | 16.1k/24.8k [58:13<15:08:37, 6.31s/it, Loss=2.15e-5, Accuracy=85.9, lr=1.5e-5]

Val Loss : 0.317. Val Accuracy 86.24%


Epoch : 1/2:  70%|███████   | 17.3k/24.8k [1:02:20<25:08, 4.91it/s, Loss=0.0226, Accuracy=86.3, lr=1.44e-5]

Evaluating...
Val Loss : 0.313. Val Accuracy 86.46%
Best validation loss improved from 0.315 to 0.313



Epoch : 1/2:  75%|███████▌  | 18.6k/24.8k [1:06:51<21:40, 4.74it/s, Loss=0.021, Accuracy=86.5, lr=1.39e-5]

Evaluating...
Val Loss : 0.312. Val Accuracy 86.42%
Best validation loss improved from 0.313 to 0.312



Epoch : 1/2:  80%|████████  | 19.8k/24.8k [1:11:20<17:19, 4.74it/s, Loss=0.0196, Accuracy=86.4, lr=1.33e-5]

Evaluating...
Val Loss : 0.306. Val Accuracy 86.70%
Best validation loss improved from 0.312 to 0.306



Epoch : 1/2:  85%|████████▌ | 21.1k/24.8k [1:15:49<11:43, 5.24it/s, Loss=0.0183, Accuracy=86.5, lr=1.28e-5]

Evaluating...
Val Loss : 0.306. Val Accuracy 86.78%


Epoch : 1/2:  90%|█████████ | 22.3k/24.8k [1:20:16<08:05, 5.04it/s, Loss=0.017, Accuracy=86.7, lr=1.22e-5]

Evaluating...


Epoch : 1/2:  90%|█████████ | 22.3k/24.8k [1:20:36<4:15:56, 6.28s/it, Loss=1.47e-5, Accuracy=89.1, lr=1.22e-5]

Val Loss : 0.308. Val Accuracy 86.36%


Epoch : 1/2:  95%|█████████▌| 23.5k/24.8k [1:24:45<03:57, 5.09it/s, Loss=0.0162, Accuracy=86.7, lr=1.17e-5]

Evaluating...
Val Loss : 0.303. Val Accuracy 86.84%
Best validation loss improved from 0.306 to 0.303



Epoch : 1/2: 100%|██████████| 24.8k/24.8k [1:29:09<00:00, 4.63it/s, Loss=0.0151, Accuracy=86.7, lr=1.11e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.304. Val Accuracy 86.98%


Epoch : 2/2:   5%|▌         | 1.24k/24.8k [04:08<1:15:28, 5.19it/s, Loss=0.271, Accuracy=88.6, lr=1.06e-5]

Evaluating...


Epoch : 2/2:   5%|▌         | 1.24k/24.8k [04:29<41:09:05, 6.30s/it, Loss=0.00021, Accuracy=89.1, lr=1.06e-5]

Val Loss : 0.312. Val Accuracy 86.69%


Epoch : 2/2:  10%|█         | 2.48k/24.8k [08:37<1:12:08, 5.15it/s, Loss=0.131, Accuracy=89.1, lr=1e-5]

Evaluating...


Epoch : 2/2:  10%|█         | 2.48k/24.8k [08:57<39:05:27, 6.32s/it, Loss=9.19e-5, Accuracy=90.6, lr=1e-5]

Val Loss : 0.306. Val Accuracy 86.65%


Epoch : 2/2:  15%|█▌        | 3.72k/24.8k [13:04<1:06:38, 5.26it/s, Loss=0.0851, Accuracy=89.4, lr=9.44e-6]

Evaluating...


Epoch : 2/2:  15%|█▌        | 3.72k/24.8k [13:25<36:50:16, 6.31s/it, Loss=0.000104, Accuracy=82.8, lr=9.44e-6]

Val Loss : 0.308. Val Accuracy 86.62%


Epoch : 2/2:  20%|██        | 4.96k/24.8k [17:33<1:02:55, 5.24it/s, Loss=0.0638, Accuracy=89.5, lr=8.89e-6]

Evaluating...
Val Loss : 0.309. Val Accuracy 86.71%


Epoch : 2/2:  25%|██▌       | 6.20k/24.8k [22:00<59:07, 5.23it/s, Loss=0.0506, Accuracy=89.4, lr=8.33e-6]

Evaluating...


Epoch : 2/2:  25%|██▌       | 6.20k/24.8k [22:21<32:26:38, 6.30s/it, Loss=4.1e-5, Accuracy=89.1, lr=8.33e-6]

Val Loss : 0.308. Val Accuracy 86.74%


Epoch : 2/2:  30%|███       | 7.43k/24.8k [26:29<56:05, 5.14it/s, Loss=0.0422, Accuracy=89.5, lr=7.77e-6]

Evaluating...


Epoch : 2/2:  30%|███       | 7.43k/24.8k [26:49<30:14:47, 6.29s/it, Loss=2.72e-5, Accuracy=93.8, lr=7.77e-6]

Val Loss : 0.305. Val Accuracy 86.69%


Epoch : 2/2:  35%|███▌      | 8.67k/24.8k [30:56<53:01, 5.05it/s, Loss=0.0368, Accuracy=89.3, lr=7.22e-6]

Evaluating...


Epoch : 2/2:  35%|███▌      | 8.67k/24.8k [31:16<27:58:02, 6.26s/it, Loss=3.6e-5, Accuracy=82.8, lr=7.22e-6]

Val Loss : 0.304. Val Accuracy 86.79%


Epoch : 2/2:  40%|████      | 9.91k/24.8k [35:22<48:13, 5.13it/s, Loss=0.0316, Accuracy=89.5, lr=6.66e-6]

Evaluating...


Epoch : 2/2:  40%|████      | 9.91k/24.8k [35:43<26:05:42, 6.33s/it, Loss=3.64e-5, Accuracy=89.1, lr=6.66e-6]

Val Loss : 0.305. Val Accuracy 86.89%


Epoch : 2/2:  45%|████▌     | 11.2k/24.8k [39:50<44:02, 5.15it/s, Loss=0.0282, Accuracy=89.4, lr=6.11e-6]

Evaluating...
Val Loss : 0.303. Val Accuracy 86.97%
Best validation loss improved from 0.303 to 0.303



Epoch : 2/2:  50%|█████     | 12.4k/24.8k [44:17<39:41, 5.19it/s, Loss=0.0248, Accuracy=89.7, lr=5.55e-6]

Evaluating...


Epoch : 2/2:  50%|█████     | 12.4k/24.8k [44:38<21:41:36, 6.32s/it, Loss=2.19e-5, Accuracy=85.9, lr=5.55e-6]

Val Loss : 0.306. Val Accuracy 86.94%


Epoch : 2/2:  55%|█████▌    | 13.6k/24.8k [48:45<35:00, 5.29it/s, Loss=0.0227, Accuracy=89.7, lr=4.99e-6]

Evaluating...


Epoch : 2/2:  55%|█████▌    | 13.6k/24.8k [49:06<35:00, 5.29it/s, Loss=2.63e-5, Accuracy=82.8, lr=4.99e-6]

Val Loss : 0.304. Val Accuracy 86.81%


Epoch : 2/2:  60%|██████    | 14.9k/24.8k [53:14<32:53, 5.01it/s, Loss=0.021, Accuracy=89.4, lr=4.44e-6]

Evaluating...
Val Loss : 0.300. Val Accuracy 86.99%
Best validation loss improved from 0.303 to 0.300



Epoch : 2/2:  65%|██████▌   | 16.1k/24.8k [57:43<28:57, 4.97it/s, Loss=0.0191, Accuracy=89.5, lr=3.88e-6]

Evaluating...


Epoch : 2/2:  65%|██████▌   | 16.1k/24.8k [58:03<28:57, 4.97it/s, Loss=1.33e-5, Accuracy=90.6, lr=3.88e-6]

Val Loss : 0.302. Val Accuracy 87.19%


Epoch : 2/2:  70%|███████   | 17.3k/24.8k [1:02:09<23:14, 5.31it/s, Loss=0.0178, Accuracy=89.7, lr=3.32e-6]

Evaluating...


Epoch : 2/2:  70%|███████   | 17.3k/24.8k [1:02:30<12:55:19, 6.28s/it, Loss=1.44e-5, Accuracy=92.2, lr=3.32e-6]

Val Loss : 0.301. Val Accuracy 87.02%


Epoch : 2/2:  75%|███████▌  | 18.6k/24.8k [1:06:39<20:20, 5.05it/s, Loss=0.0166, Accuracy=89.7, lr=2.77e-6]

Evaluating...


Epoch : 2/2:  75%|███████▌  | 18.6k/24.8k [1:06:59<10:46:34, 6.29s/it, Loss=1.38e-5, Accuracy=89.1, lr=2.77e-6]

Val Loss : 0.301. Val Accuracy 87.09%


Epoch : 2/2:  80%|████████  | 19.8k/24.8k [1:11:05<15:44, 5.21it/s, Loss=0.0156, Accuracy=89.7, lr=2.21e-6]

Evaluating...
Val Loss : 0.300. Val Accuracy 86.97%
Best validation loss improved from 0.300 to 0.300



Epoch : 2/2:  85%|████████▌ | 21.1k/24.8k [1:15:35<13:00, 4.73it/s, Loss=0.0145, Accuracy=89.7, lr=1.66e-6]

Evaluating...


Epoch : 2/2:  85%|████████▌ | 21.1k/24.8k [1:15:56<6:27:26, 6.31s/it, Loss=1.66e-5, Accuracy=87.5, lr=1.66e-6]

Val Loss : 0.301. Val Accuracy 87.06%


Epoch : 2/2:  90%|█████████ | 22.3k/24.8k [1:20:02<07:46, 5.25it/s, Loss=0.0139, Accuracy=89.7, lr=1.1e-6]

Evaluating...
Val Loss : 0.300. Val Accuracy 87.08%
Best validation loss improved from 0.300 to 0.300



Epoch : 2/2:  95%|█████████▌| 23.5k/24.8k [1:24:30<04:10, 4.82it/s, Loss=0.0129, Accuracy=89.9, lr=5.43e-7]

Evaluating...


Epoch : 2/2:  95%|█████████▌| 23.5k/24.8k [1:24:50<2:07:00, 6.31s/it, Loss=8.24e-6, Accuracy=96.9, lr=5.43e-7]

Val Loss : 0.301. Val Accuracy 87.12%


Epoch : 2/2: 100%|██████████| 24.8k/24.8k [1:28:53<00:00, 4.64it/s, Loss=0.012, Accuracy=89.8, lr=4.49e-10]


Evaluating...

Epoch 2 complete! Val Loss : 0.300. Val Accuracy 87.17%
The model has been saved in /content/drive/MyDrive/ENS/NLP/BERT.pt

### Typo percentage = 0 ###
Evaluating...

train
   loss = 0.20778374118365422
   acc = 91.9469696969697
Evaluating...

validation
   loss = 0.2997824672460556
   acc = 87.08125
Evaluating...

test
   loss = 0.3210481156905492
   acc = 84.67966573816156


## Evaluate model

In [None]:
from transformers import DistilBertForSequenceClassification

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Fresh model with the same architecture; its weights are overwritten below.
net = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
net.to(device)

path_to_model='model.pt'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run can also be loaded on a CPU-only machine.
net.load_state_dict(torch.load(path_to_model, map_location=device))

criterion = nn.CrossEntropyLoss()

# Loss and accuracy of the reloaded model on every split.
losses = []
accs = []
for split in ['train', 'validation', 'test']:
    loss, acc = evaluate_loss(net, device, loader[split], criterion, disable_tqdm=False)
    losses.append(loss)
    accs.append(acc)
    print()
    print(split)
    print(f'   loss = {loss}\n   acc = {acc}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

Evaluating...


100%|██████████| 6/6 [00:00<00:00, 13.68it/s]


test
   loss = 0.3281664202610652
   acc = 85.51532033426184



