# Sentiment analysis for Sentiment140 with CANINE

## Install HuggingFace dependencies

In [None]:
!pip install -q datasets
!pip install -q transformers

[?25l[K     |█                               | 10 kB 32.9 MB/s eta 0:00:01[K     |██                              | 20 kB 40.2 MB/s eta 0:00:01[K     |███                             | 30 kB 40.2 MB/s eta 0:00:01[K     |████                            | 40 kB 33.3 MB/s eta 0:00:01[K     |█████                           | 51 kB 36.8 MB/s eta 0:00:01[K     |██████                          | 61 kB 41.1 MB/s eta 0:00:01[K     |███████                         | 71 kB 28.3 MB/s eta 0:00:01[K     |████████                        | 81 kB 28.9 MB/s eta 0:00:01[K     |█████████                       | 92 kB 31.0 MB/s eta 0:00:01[K     |██████████                      | 102 kB 30.5 MB/s eta 0:00:01[K     |███████████                     | 112 kB 30.5 MB/s eta 0:00:01[K     |████████████                    | 122 kB 30.5 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 30.5 MB/s eta 0:00:01[K     |██████████████                  | 143 kB 30.5 MB/s eta 0:

## Create functions to download and preprocess data

In [None]:
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import load_dataset
from transformers import CanineTokenizer, DataCollatorWithPadding

# Batch size used both for the batched `Dataset.map` calls and for the DataLoaders
# created in `generate_dataset`.
bz = 64  # batch size

# Tokenizer loaded from the "google/canine-s" checkpoint; it is used by `encode`
# and by the padding collator in `generate_dataset`.
tokenizer = CanineTokenizer.from_pretrained("google/canine-s")

def encode(samples):
    """Tokenize the ``'text'`` entries of a batch with the module-level tokenizer.

    Padding is disabled here (batches are padded later by the collator used in
    the DataLoaders); truncation is enabled.
    """
    texts = samples['text']
    encoded = tokenizer(texts, padding=False, truncation=True)
    return encoded

def quantify(samples):
    """Turn the ``'sentiment'`` scores of a batch into a binary ``'label'`` array.

    Scores less than or equal to 2 become 0 and scores greater than 2 become 1.
    """
    threshold = np.array([2])
    scores = samples['sentiment']
    labels = np.digitize(scores, threshold, right=True)
    return {'label': labels}

# Pool of replacement characters used when injecting typos.
letters = list('abcdefghijklmnopqrstuvwxyz')
def typo(samples, typo_percent=0):
    """Randomly corrupt alphabetic characters of ``samples['text']`` in place.

    For every character a uniform random number is drawn; when it falls below
    `typo_percent` and the character is alphabetic, the character is replaced
    by a letter drawn at random from `letters`. Non-alphabetic characters and
    text lengths are never changed. The (mutated) `samples` object is returned.
    """
    for idx, original in enumerate(samples['text']):
        chars = list(original)
        changed = False
        for pos, ch in enumerate(chars):
            # The random draw always happens first so the RNG stream is consumed
            # once per character, whether or not the character is a letter.
            if not (np.random.random() < typo_percent and ch.isalpha()):
                continue
            chars[pos] = np.random.choice(letters)
            changed = True
        if changed:
            samples['text'][idx] = ''.join(chars)
    return samples

def generate_dataset(typo_percent=0):
    """Load Sentiment140, preprocess every split and wrap each one in a DataLoader.

    Steps:
      1. drop the test rows whose sentiment equals 2,
      2. carve 1% of the train split off as a validation split (seed 0),
      3. optionally inject typos into every split when `typo_percent` > 0,
      4. tokenize with `encode` and binarize the labels with `quantify`,
      5. expose the model inputs and label as torch tensors.

    Returns:
        (dataset, loader): the processed dataset dict and a dict mapping each
        split name to its DataLoader. Only the 'train' loader is shuffled.
    """
    dataset = load_dataset('sentiment140')

    # Keep only the test rows whose sentiment is not 2
    test_sentiments = np.array(dataset['test']['sentiment'])
    keep_ids = np.where(test_sentiments != 2)[0]
    dataset['test'] = dataset['test'].select(keep_ids)

    # Hold out 1% of the training data for validation
    parts = dataset['train'].train_test_split(0.01, seed=0)
    dataset['train'] = parts['train']
    dataset['validation'] = parts['test']

    if typo_percent > 0:
        dataset = dataset.map(
            typo,
            batched=True,
            batch_size=bz,
            fn_kwargs={'typo_percent': typo_percent},
        )
    for transform in (encode, quantify):
        dataset = dataset.map(transform, batched=True, batch_size=bz)

    tensor_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']
    dataset.set_format(type='torch', columns=tensor_columns)

    # Batches are padded dynamically by the collator
    loader = {
        split: DataLoader(
            dataset[split],
            batch_size=bz,
            shuffle=(split == 'train'),
            collate_fn=DataCollatorWithPadding(tokenizer),
        )
        for split in dataset
    }

    return dataset, loader

# Build the typo-free splits and their DataLoaders; the training and evaluation
# cells read `loader['train']`, `loader['validation']` and `loader['test']`.
dataset, loader = generate_dataset(typo_percent=0)

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892 [00:00<?, ?B/s]

Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.
Using unk_token, but it is not set yet.


Downloading builder script:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/739 [00:00<?, ?B/s]

Downloading and preparing dataset sentiment140/sentiment140 (download: 77.59 MiB, generated: 215.36 MiB, post-processed: Unknown size, total: 292.95 MiB) to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997...


Downloading data:   0%|          | 0.00/81.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/498 [00:00<?, ? examples/s]

Dataset sentiment140 downloaded and prepared to /root/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/f81c014152931b776735658d8ae493b181927de002e706c4d5244ecb26376997. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/24750 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

  0%|          | 0/24750 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

## Create train and eval functions

In [None]:
from tqdm import tqdm
import copy
import math

def _checkpoint_best(net, path_to_model):
    """Deep-copy `net`, write the copy's state_dict to `path_to_model` and return the copy."""
    snapshot = copy.deepcopy(net)
    torch.save(snapshot.state_dict(), path_to_model)
    return snapshot


def train_canine(net, device, optimizer, criterion, lr, lr_scheduler, train_loader, val_loader, epochs, unfreeze=False, unfreeze_ep=5):
    """Fine-tune `net`, validating several times per epoch, and keep the best weights.

    The validation loss is computed roughly every 1/20th of an epoch and once
    more at the end of each epoch. Whenever it improves, a deep copy of the
    model is kept and its state_dict is written to ``model.pt``.

    Args:
        net: model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose ``.loss``
            and ``.logits``.
        device: device every batch tensor is moved to.
        optimizer: optimizer stepped once per batch.
        criterion: not used for training (the loss comes from the model
            output); only forwarded to `evaluate_loss`.
        lr: unused; kept for interface compatibility (the displayed learning
            rate is read from `lr_scheduler`).
        lr_scheduler: scheduler stepped once per batch.
        train_loader: sized iterable of dict batches with keys 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        val_loader: loader handed to `evaluate_loss`.
        epochs: number of passes over `train_loader`.
        unfreeze: when True, every parameter gets ``requires_grad = True`` at
            the end of the epoch whose 0-based index equals `unfreeze_ep`.
        unfreeze_ep: 0-based epoch index used by `unfreeze`.

    Returns:
        (train_losses, val_losses, net_copy): mean training loss per epoch,
        end-of-epoch validation loss per epoch, and the copy of the model with
        the lowest validation loss (a copy of the final model if the
        validation loss never improved).
    """
    path_to_model = 'model.pt'
    best_loss = math.inf  # np.Inf no longer exists in NumPy >= 2.0
    net_copy = None
    loss = None
    train_losses = []
    val_losses = []
    log_interval = math.ceil(len(train_loader)/20)+1

    for ep in range(epochs):
        # Sum of batch losses over the whole epoch. Kept separate from the
        # windowed `total_loss`, which is reset at every validation point.
        epoch_loss = 0.0
        with tqdm(total=len(train_loader), unit_scale=True, postfix={'Loss':0.0, 'Accuracy':0.0, 'lr':lr_scheduler.get_last_lr()[0]},
                        desc="Epoch : %i/%i" % (ep+1, epochs)) as pbar:
            net.train()
            total_loss = 0.0
            correct = 0
            nb_samples = 0
            for it, batch in enumerate(train_loader):
                # Move the batch to the target device
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch['labels'].to(torch.int64)

                # Clear gradients
                optimizer.zero_grad()

                # Forward pass (the model computes the loss itself from `labels`)
                output = net(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                             token_type_ids=batch['token_type_ids'], labels=labels)

                # Backpropagate and update the weights
                loss = output.loss
                loss.backward()
                optimizer.step()

                batch_loss = loss.item()
                total_loss += batch_loss
                epoch_loss += batch_loss

                predicted = output.logits.argmax(dim=1)
                correct += predicted.eq(labels).sum().item()
                nb_samples += len(predicted)

                # Loss/accuracy shown in the bar are averages over the current logging window
                pbar.set_postfix({'Loss':total_loss/(it%log_interval+1), 'Accuracy':100*correct/nb_samples, 'lr':lr_scheduler.get_last_lr()[0]})
                pbar.update(1)

                lr_scheduler.step()

                if it % log_interval == log_interval-1:
                    total_loss = 0.0
                    correct = 0
                    nb_samples = 0
                    val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
                    print(f"Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

                    if val_loss < best_loss:
                        print(f"Best validation loss improved from {best_loss:.3f} to {val_loss:.3f}")
                        print()
                        net_copy = _checkpoint_best(net, path_to_model)
                        best_loss = val_loss

                    # evaluate_loss leaves the model in eval mode; switch back so
                    # the remaining batches of the epoch train with dropout enabled.
                    net.train()

        val_loss, val_acc = evaluate_loss(net, device, val_loader, criterion)  # Compute validation loss
        print()
        print(f"Epoch {ep+1} complete! Val Loss : {val_loss:.3f}. Val Accuracy {val_acc:.2f}%")

        train_losses.append(epoch_loss/max(len(train_loader), 1))
        val_losses.append(val_loss)

        if val_loss < best_loss:
            print(f"Best validation loss improved from {best_loss:.3f} to {val_loss:.3f}")
            print()
            # Save immediately so the best weights survive an interrupted run
            net_copy = _checkpoint_best(net, path_to_model)
            best_loss = val_loss

        if unfreeze and ep == unfreeze_ep:
            for param in net.parameters():
                param.requires_grad = True

    # The validation loss never improved (e.g. it was NaN, or epochs == 0):
    # fall back to the model as it stands so there is something to save and return.
    if net_copy is None:
        net_copy = copy.deepcopy(net)

    # Saving the model
    torch.save(net_copy.state_dict(), path_to_model)
    print(f"The model has been saved in {path_to_model}")

    # Drop the reference to the last loss tensor before releasing cached GPU memory
    del loss
    torch.cuda.empty_cache()

    return train_losses, val_losses, net_copy

In [None]:
from tqdm import tqdm

def evaluate_loss(net, device, dataloader, criterion, disable_tqdm=True):
    """Compute the mean loss and the accuracy of `net` over `dataloader`.

    The model is switched to eval mode (and is left in eval mode on return);
    gradients are disabled for the whole pass.

    Args:
        net: model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose ``.loss``
            and ``.logits``.
        device: device every batch tensor is moved to.
        dataloader: sized iterable of dict batches with keys 'input_ids',
            'attention_mask', 'token_type_ids' and 'labels'.
        criterion: unused; kept for interface compatibility (the loss comes
            from the model output).
        disable_tqdm: when True (default) no progress bar is shown.

    Returns:
        (loss, accuracy): the mean of the per-batch losses and the accuracy in
        percent, measured over the samples that were actually evaluated.

    Raises:
        ZeroDivisionError: if `dataloader` yields no batches.
    """
    net.eval()

    total_loss = 0
    correct = 0
    nb_samples = 0

    print('Evaluating...')
    with torch.no_grad():
        for batch in tqdm(dataloader, disable=disable_tqdm):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch['labels'].to(torch.int64)
            # Keyword arguments, as in the training loop, so the call does not
            # depend on the positional order of the model's forward signature.
            output = net(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],
                         token_type_ids=batch['token_type_ids'], labels=labels)
            total_loss += output.loss.item()
            predicted = output.logits.argmax(dim=1)
            correct += predicted.eq(labels).sum().item()
            # Count what was really seen: len(dataloader.dataset) over-counts
            # when the loader skips samples (e.g. drop_last=True or a sampler).
            nb_samples += len(predicted)

    return total_loss/len(dataloader), 100*correct/nb_samples

## Train the model

In [None]:
import os

print("Creation of the models' folder...")
# exist_ok keeps the cell re-runnable: `mkdir models` reports an error once the folder exists.
os.makedirs("models", exist_ok=True)

Creation of the models' folder...


In [None]:
# Training hyper-parameters read by the training cell.
# The two commented-out settings below are not referenced by the visible cells of this notebook.
#freeze_canine = False  # if True, freeze the encoder weights and only update the classification layer weights
#maxlen = 2048  # maximum length of the tokenized input pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
lr = 2e-5  # learning rate
epochs = 2  # number of training epochs

In [None]:
from transformers import CanineTokenizer, CanineConfig, CanineForSequenceClassification, get_linear_schedule_with_warmup, DistilBertForSequenceClassification#, AdamW
from torch.optim import AdamW

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# One entry per experiment; each entry holds the [train, validation, test] values.
losses = []
accs = []

# Only used in the report header below; the data itself was generated by the
# earlier `generate_dataset(typo_percent=0)` call, so keep the two in sync.
typo_percent = 0

# Two-class classification head on top of the pretrained "google/canine-s" encoder.
configuration = CanineConfig(num_labels=2)
net = CanineForSequenceClassification.from_pretrained("google/canine-s", config=configuration)

net.to(device)

optimizer = AdamW(net.parameters(), lr=lr, weight_decay=1e-2)
criterion = nn.CrossEntropyLoss()
t_total = len(loader['train']) * epochs  # total scheduler steps: train_canine steps the scheduler once per batch
num_warmup_steps = int(t_total*0.1) # linear warm-up over the first 10% of the steps
lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total)

# train_canine returns the copy of the model with the best validation loss; it replaces `net` here.
train_losses, val_losses, net = train_canine(net, device, optimizer, criterion, lr, lr_scheduler, loader['train'], loader['validation'], epochs)

# Report loss and accuracy of the returned model on every split.
print(f'\n### Typo percentage = {typo_percent} ###')
losses_aux = []
accs_aux = []
for split in ['train', 'validation', 'test']:
    loss, acc = evaluate_loss(net, device, loader[split], criterion, disable_tqdm=False)
    losses_aux.append(loss)
    accs_aux.append(acc)
    print()
    print(split)
    print(f'   loss = {loss}\n   acc = {acc}')

losses.append(losses_aux)
accs.append(accs_aux)

Downloading:   0%|          | 0.00/504M [00:00<?, ?B/s]

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch : 1/2:   5%|▌         | 1.24k/24.8k [13:52<4:24:37, 1.48it/s, Loss=0.668, Accuracy=57.2, lr=5e-6]

Evaluating...
Val Loss : 0.565. Val Accuracy 70.84%
Best validation loss improved from inf to 0.565



Epoch : 1/2:  10%|█         | 2.48k/24.8k [29:02<4:09:53, 1.49it/s, Loss=0.491, Accuracy=76.3, lr=1e-5]

Evaluating...
Val Loss : 0.455. Val Accuracy 78.60%
Best validation loss improved from 0.565 to 0.455



Epoch : 1/2:  15%|█▌        | 3.72k/24.8k [44:16<3:53:50, 1.50it/s, Loss=0.435, Accuracy=79.8, lr=1.5e-5]

Evaluating...
Val Loss : 0.427. Val Accuracy 80.38%
Best validation loss improved from 0.455 to 0.427



Epoch : 1/2:  20%|██        | 4.96k/24.8k [59:30<3:39:31, 1.50it/s, Loss=0.412, Accuracy=81.3, lr=2e-5]

Evaluating...
Val Loss : 0.399. Val Accuracy 81.90%
Best validation loss improved from 0.427 to 0.399



Epoch : 1/2:  25%|██▌       | 6.20k/24.8k [1:14:44<3:25:09, 1.51it/s, Loss=0.387, Accuracy=82.8, lr=1.94e-5]

Evaluating...
Val Loss : 0.378. Val Accuracy 82.87%
Best validation loss improved from 0.399 to 0.378



Epoch : 1/2:  30%|███       | 7.43k/24.8k [1:29:56<3:15:22, 1.48it/s, Loss=0.371, Accuracy=83.5, lr=1.89e-5]

Evaluating...
Val Loss : 0.357. Val Accuracy 84.24%
Best validation loss improved from 0.378 to 0.357



Epoch : 1/2:  35%|███▌      | 8.67k/24.8k [1:45:09<3:00:21, 1.49it/s, Loss=0.361, Accuracy=84.1, lr=1.83e-5]

Evaluating...
Val Loss : 0.351. Val Accuracy 84.64%
Best validation loss improved from 0.357 to 0.351



Epoch : 1/2:  40%|████      | 9.91k/24.8k [2:00:20<2:39:54, 1.55it/s, Loss=0.355, Accuracy=84.4, lr=1.78e-5]

Evaluating...
Val Loss : 0.349. Val Accuracy 84.61%
Best validation loss improved from 0.351 to 0.349



Epoch : 1/2:  45%|████▌     | 11.2k/24.8k [2:15:29<2:31:48, 1.49it/s, Loss=0.346, Accuracy=84.8, lr=1.72e-5]

Evaluating...
Val Loss : 0.348. Val Accuracy 84.75%
Best validation loss improved from 0.349 to 0.348



Epoch : 1/2:  50%|█████     | 12.4k/24.8k [2:30:39<2:18:08, 1.49it/s, Loss=0.338, Accuracy=85.3, lr=1.67e-5]

Evaluating...
Val Loss : 0.334. Val Accuracy 85.38%
Best validation loss improved from 0.348 to 0.334



Epoch : 1/2:  55%|█████▌    | 13.6k/24.8k [2:45:51<2:02:54, 1.51it/s, Loss=0.34, Accuracy=85.2, lr=1.61e-5]

Evaluating...
Val Loss : 0.331. Val Accuracy 85.61%
Best validation loss improved from 0.334 to 0.331



Epoch : 1/2:  60%|██████    | 14.9k/24.8k [3:01:00<1:50:39, 1.49it/s, Loss=0.335, Accuracy=85.3, lr=1.55e-5]

Evaluating...
Val Loss : 0.327. Val Accuracy 85.72%
Best validation loss improved from 0.331 to 0.327



Epoch : 1/2:  65%|██████▌   | 16.1k/24.8k [3:16:13<1:37:17, 1.48it/s, Loss=0.328, Accuracy=85.8, lr=1.5e-5]

Evaluating...
Val Loss : 0.324. Val Accuracy 86.03%
Best validation loss improved from 0.327 to 0.324



Epoch : 1/2:  70%|███████   | 17.3k/24.8k [3:31:23<1:22:24, 1.50it/s, Loss=0.333, Accuracy=85.6, lr=1.44e-5]

Evaluating...
Val Loss : 0.322. Val Accuracy 86.13%
Best validation loss improved from 0.324 to 0.322



Epoch : 1/2:  75%|███████▌  | 18.6k/24.8k [3:46:35<1:10:10, 1.46it/s, Loss=0.323, Accuracy=86.2, lr=1.39e-5]

Evaluating...
Val Loss : 0.317. Val Accuracy 86.10%
Best validation loss improved from 0.322 to 0.317



Epoch : 1/2:  80%|████████  | 19.8k/24.8k [4:01:45<55:20, 1.48it/s, Loss=0.319, Accuracy=86.3, lr=1.33e-5]

Evaluating...
Val Loss : 0.314. Val Accuracy 86.47%
Best validation loss improved from 0.317 to 0.314



Epoch : 1/2:  85%|████████▌ | 21.1k/24.8k [4:16:56<41:45, 1.47it/s, Loss=0.319, Accuracy=86.2, lr=1.28e-5]

Evaluating...
Val Loss : 0.313. Val Accuracy 86.50%
Best validation loss improved from 0.314 to 0.313



Epoch : 1/2:  90%|█████████ | 22.3k/24.8k [4:32:07<27:22, 1.49it/s, Loss=0.317, Accuracy=86.3, lr=1.22e-5]

Evaluating...
Val Loss : 0.311. Val Accuracy 86.79%
Best validation loss improved from 0.313 to 0.311



Epoch : 1/2:  95%|█████████▌| 23.5k/24.8k [4:47:19<13:27, 1.50it/s, Loss=0.308, Accuracy=86.8, lr=1.17e-5]

Evaluating...
Val Loss : 0.308. Val Accuracy 86.87%
Best validation loss improved from 0.311 to 0.308



Epoch : 1/2: 100%|██████████| 24.8k/24.8k [5:02:11<00:00, 1.37it/s, Loss=0.309, Accuracy=86.7, lr=1.11e-5]


Evaluating...

Epoch 1 complete! Val Loss : 0.307. Val Accuracy 86.98%
Best validation loss improved from 0.308 to 0.307



Epoch : 2/2:   5%|▌         | 1.24k/24.8k [14:07<4:23:34, 1.49it/s, Loss=0.298, Accuracy=87.4, lr=1.06e-5]

Evaluating...
Val Loss : 0.312. Val Accuracy 86.69%


Epoch : 2/2:  10%|█         | 2.48k/24.8k [29:17<4:08:29, 1.49it/s, Loss=0.259, Accuracy=89.3, lr=1e-5]

Evaluating...
Val Loss : 0.307. Val Accuracy 86.89%
Best validation loss improved from 0.307 to 0.307



Epoch : 2/2:  15%|█▌        | 3.72k/24.8k [44:31<3:55:04, 1.49it/s, Loss=0.256, Accuracy=89.4, lr=9.44e-6]

Evaluating...
Val Loss : 0.305. Val Accuracy 87.18%
Best validation loss improved from 0.307 to 0.305



Epoch : 2/2:  20%|██        | 4.96k/24.8k [59:49<3:45:49, 1.46it/s, Loss=0.258, Accuracy=89.6, lr=8.89e-6]

Evaluating...
Val Loss : 0.304. Val Accuracy 87.03%
Best validation loss improved from 0.305 to 0.304



Epoch : 2/2:  25%|██▌       | 6.20k/24.8k [1:15:05<3:29:04, 1.48it/s, Loss=0.257, Accuracy=89.4, lr=8.33e-6]

Evaluating...
Val Loss : 0.305. Val Accuracy 87.36%


Epoch : 2/2:  30%|███       | 7.43k/24.8k [1:30:20<3:15:15, 1.48it/s, Loss=0.258, Accuracy=89.3, lr=7.77e-6]

Evaluating...
Val Loss : 0.302. Val Accuracy 87.22%
Best validation loss improved from 0.304 to 0.302



Epoch : 2/2:  35%|███▌      | 8.67k/24.8k [1:45:35<2:57:50, 1.51it/s, Loss=0.255, Accuracy=89.5, lr=7.22e-6]

Evaluating...
Val Loss : 0.305. Val Accuracy 87.24%


Epoch : 2/2:  40%|████      | 9.91k/24.8k [2:00:47<2:45:23, 1.50it/s, Loss=0.255, Accuracy=89.4, lr=6.66e-6]

Evaluating...
Val Loss : 0.299. Val Accuracy 87.43%
Best validation loss improved from 0.302 to 0.299



Epoch : 2/2:  45%|████▌     | 11.2k/24.8k [2:16:03<2:32:24, 1.49it/s, Loss=0.254, Accuracy=89.5, lr=6.11e-6]

Evaluating...
Val Loss : 0.300. Val Accuracy 87.36%


Epoch : 2/2:  50%|█████     | 12.4k/24.8k [2:31:17<2:19:54, 1.47it/s, Loss=0.25, Accuracy=89.7, lr=5.55e-6]

Evaluating...
Val Loss : 0.300. Val Accuracy 87.38%


Epoch : 2/2:  55%|█████▌    | 13.6k/24.8k [2:46:30<2:03:18, 1.50it/s, Loss=0.247, Accuracy=89.7, lr=4.99e-6]

Evaluating...
Val Loss : 0.298. Val Accuracy 87.41%
Best validation loss improved from 0.299 to 0.298



Epoch : 2/2:  60%|██████    | 14.9k/24.8k [3:01:46<1:53:28, 1.45it/s, Loss=0.249, Accuracy=89.7, lr=4.44e-6]

Evaluating...
Val Loss : 0.299. Val Accuracy 87.22%


Epoch : 2/2:  65%|██████▌   | 16.1k/24.8k [3:16:57<1:37:35, 1.48it/s, Loss=0.247, Accuracy=89.9, lr=3.88e-6]

Evaluating...
Val Loss : 0.302. Val Accuracy 87.27%


Epoch : 2/2:  66%|██████▌   | 16.3k/24.8k [3:20:46<1:35:17, 1.47it/s, Loss=0.243, Accuracy=90.2, lr=3.78e-6]

## Evaluate model

In [None]:
from transformers import CanineConfig, CanineForSequenceClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Rebuild the same architecture that was trained, then overwrite its weights
# with the checkpoint written by train_canine.
configuration = CanineConfig(num_labels=2)
net = CanineForSequenceClassification.from_pretrained("google/canine-s", config=configuration)
net.to(device)

path_to_model='model.pt'
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when this cell falls back to the CPU.
net.load_state_dict(torch.load(path_to_model, map_location=device))

criterion = nn.CrossEntropyLoss()

# Loss and accuracy of the restored model, in the order train / validation / test.
losses = []
accs = []
for split in ['train', 'validation', 'test']:
    loss, acc = evaluate_loss(net, device, loader[split], criterion, disable_tqdm=False)
    losses.append(loss)
    accs.append(acc)
    print()
    print(split)
    print(f'   loss = {loss}\n   acc = {acc}')

Some weights of CanineForSequenceClassification were not initialized from the model checkpoint at google/canine-s and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating...


100%|██████████| 6/6 [00:00<00:00,  6.63it/s]


test
   loss = 0.3904445419708888
   acc = 82.45125348189416



