<a href="https://colab.research.google.com/github/germanjke/Deep_Learning_School_MIPT/blob/master/nlp_homeworks/homework_classification_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Задание 3

## Классификация текстов

В этом задании вам предстоит попробовать несколько методов, используемых в задаче классификации, а также понять насколько хорошо модель понимает смысл слов и какие слова в примере влияют на результат.

In [None]:
import pandas as pd
import numpy as np
import torch

from torchtext import datasets

from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator

from torchtext.vocab import Vectors, GloVe

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
from tqdm.autonotebook import tqdm

В этом задании мы будем использовать библиотеку torchtext. Она довольна проста в использовании и поможет нам сконцентрироваться на задаче, а не на написании Dataloader-а.

In [None]:
TEXT = Field(sequential=True, lower=True, include_lengths=True)  # Поле текста
LABEL = LabelField(dtype=torch.float)  # Поле метки

In [None]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

Датасет на котором мы будем проводить эксперементы это комментарии к фильмам из сайта IMDB.

In [None]:
train, test = datasets.IMDB.splits(TEXT, LABEL)  # загрузим датасет
train, valid = train.split(random_state=random.seed(SEED))  # разобьем на части

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:04<00:00, 20.8MB/s]


In [None]:
TEXT.build_vocab(train)
LABEL.build_vocab(train)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train, valid, test), 
    batch_size = 64,
    sort_within_batch = True,
    device = device)

## RNN

Для начала попробуем использовать рекурентные нейронные сети. На семинаре вы познакомились с GRU, вы можете также попробовать LSTM. Можно использовать для классификации как hidden_state, так и output последнего токена.

In [None]:
class RNNBaseline(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, dropout=dropout,
                                batch_first=True, bias=True, bidirectional=True)  
        
        self.fc = nn.Linear(hidden_dim*2, output_dim)  
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, text, text_lengths):
        
        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        
        # cell arg for LSTM, remove for GRU
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        #unpack sequence
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)  

        #output = [sent len, batch size, hid dim * num directions]
        #output over padding tokens are zero tensors
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = torch.cat([hidden[-1, :, :], hidden[-2, :, :]], 1)  
        hidden = self.dropout(hidden)        
        #hidden = [batch size, hid dim * num directions] or [batch_size, hid dim * num directions]
            
        return self.fc(hidden)

Поиграйтесь с гиперпараметрами

In [None]:
vocab_size = len(TEXT.vocab)
emb_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 3
bidirectional = True
dropout = 0.1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
patience=5

In [None]:
model = RNNBaseline(
    vocab_size=vocab_size,
    embedding_dim=emb_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    n_layers=n_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=PAD_IDX
)

In [None]:
model = model.to(device)

In [None]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
max_epochs = 5

Обучите сетку! Используйте любые вам удобные инструменты, Catalyst, PyTorch Lightning или свои велосипеды.

In [None]:
import numpy as np

min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 
  
          input = batch.text
          label = batch.label.unsqueeze(1) 

          optimizer.zero_grad()
          prediction = model(input[0], input[1])
          loss = criterion(prediction, label)
          train_loss += loss

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:

          input = batch.text
          label = batch.label.unsqueeze(1)

          optimizer.zero_grad()
          prediction = model(input[0], input[1])
          loss = criterion(prediction, label)
          val_loss += loss

          val_loss += loss

    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6334003806114197, Validation Loss: 1.181138038635254


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 2, Training Loss: 0.5657919645309448, Validation Loss: 1.0632634162902832


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 3, Training Loss: 0.41631370782852173, Validation Loss: 1.1139955520629883


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 4, Training Loss: 0.2839461863040924, Validation Loss: 0.8928310871124268


HBox(children=(FloatProgress(value=0.0, max=274.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=118.0), HTML(value='')))

Epoch: 5, Training Loss: 0.22327104210853577, Validation Loss: 0.8887850046157837


<All keys matched successfully>

Посчитайте f1-score вашего классификатора на тестовом датасете.

**Ответ**:

In [None]:
from sklearn.metrics import f1_score

preds = []
targets = []

model.eval()
for batch in tqdm(test_iter):
      
      input = batch.text
      label = batch.label 

      predictions = model(input[0], input[1])
      predictions = (torch.sigmoid(predictions) > 0.5).cpu().numpy()

      preds.extend(predictions)
      targets.extend(label.tolist())

print(f1_score(targets, preds, average='binary'))


HBox(children=(FloatProgress(value=0.0, max=391.0), HTML(value='')))


0.8315030504714365


## CNN

![](https://www.researchgate.net/publication/333752473/figure/fig1/AS:769346934673412@1560438011375/Standard-CNN-on-text-classification.png)

Для классификации текстов также часто используют сверточные нейронные сети. Идея в том, что как правило сентимент содержат словосочетания из двух-трех слов, например "очент хороший фильм" или "невероятноя скука". Проходясь сверткой по этим словам мы получим какой-то большой скор и выхватим его с помощью MaxPool. Далее идет обычная полносвязная сетка. Важный момент: свертки применяются не последовательно, а параллельно. Давайте попробуем!

In [None]:
TEXT = Field(sequential=True, lower=True, batch_first=True)  # batch_first тк мы используем conv  
LABEL = LabelField(batch_first=True, dtype=torch.float)

train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

TEXT.build_vocab(trn)
LABEL.build_vocab(trn)

device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

Вы можете использовать Conv2d с `in_channels=1, kernel_size=(kernel_sizes[0], emb_dim))` или Conv1d c `in_channels=emb_dim, kernel_size=kernel_size[0]`. Но хорошенько подумайте над shape в обоих случаях.

In [None]:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout=0.5,
    ):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv2d(1, out_channels, kernel_size=(kernel_sizes[0], dim), stride=(1, 1)) 

        self.conv_1 = nn.Conv2d(1, out_channels, kernel_size=(kernel_sizes[1], dim), stride=(1, 1))  
        
        self.conv_2 = nn.Conv2d(1, out_channels, kernel_size=(kernel_sizes[2], dim), stride=(1, 1))  
        
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, text):
        
        embedded = self.embedding(text)
        
        embedded = embedded.unsqueeze(1)  # may be reshape here

        conved_0 = F.relu(self.conv_0(embedded)).squeeze(3)  # may be reshape here
        conved_1 = F.relu(self.conv_1(embedded)).squeeze(3)  # may be reshape here
        conved_2 = F.relu(self.conv_2(embedded)).squeeze(3)  # may be reshape here

        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)

        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
            
        return self.fc(cat)

In [None]:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels=64
dropout = 0.5
dim = 300

model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)

In [None]:
model.to(device)

CNN(
  (embedding): Embedding(201944, 300)
  (conv_0): Conv2d(1, 64, kernel_size=(3, 300), stride=(1, 1))
  (conv_1): Conv2d(1, 64, kernel_size=(4, 300), stride=(1, 1))
  (conv_2): Conv2d(1, 64, kernel_size=(5, 300), stride=(1, 1))
  (fc): Linear(in_features=192, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
max_epochs = 5

Обучите!

In [None]:
import numpy as np


min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 

          input = batch.text 
          label = batch.label 

          prediction = torch.squeeze(model(input), 1)
          loss = criterion(prediction, label)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          
          train_loss += loss

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:

          input = batch.text 
          label = batch.label 

          prediction = torch.squeeze(model(input), 1)
          loss = criterion(prediction, label)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          
          val_loss += loss

    val_loss /= len(val_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 1, Training Loss: 0.6510807871818542, Validation Loss: 0.47967782616615295


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 2, Training Loss: 0.49915748834609985, Validation Loss: 0.3961154818534851


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 3, Training Loss: 0.42093732953071594, Validation Loss: 0.32855305075645447


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 4, Training Loss: 0.3597782254219055, Validation Loss: 0.28558439016342163


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 5, Training Loss: 0.2959457039833069, Validation Loss: 0.22147701680660248


<All keys matched successfully>

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [None]:
from sklearn.metrics import f1_score

preds = []
targets = []

model.eval()
for batch in tqdm(test_iter):
      
      input_embeds = batch.text
      labels = batch.label 

      predictions = model(input_embeds)
      predictions = (torch.sigmoid(predictions) > 0.5).cpu().numpy()

      preds.extend(predictions)
      targets.extend(labels.tolist())

print(f1_score(targets, preds, average='binary'))

HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))


0.8697573425774257


## Интерпретируемость

Посмотрим, куда смотрит наша модель. Достаточно запустить код ниже.

In [None]:
!pip install -q captum

[?25l[K     |▎                               | 10kB 27.2MB/s eta 0:00:01[K     |▌                               | 20kB 24.9MB/s eta 0:00:01[K     |▊                               | 30kB 15.7MB/s eta 0:00:01[K     |█                               | 40kB 12.6MB/s eta 0:00:01[K     |█▏                              | 51kB 11.3MB/s eta 0:00:01[K     |█▍                              | 61kB 11.2MB/s eta 0:00:01[K     |█▊                              | 71kB 11.7MB/s eta 0:00:01[K     |██                              | 81kB 11.0MB/s eta 0:00:01[K     |██▏                             | 92kB 10.4MB/s eta 0:00:01[K     |██▍                             | 102kB 10.6MB/s eta 0:00:01[K     |██▋                             | 112kB 10.6MB/s eta 0:00:01[K     |██▉                             | 122kB 10.6MB/s eta 0:00:01[K     |███                             | 133kB 10.6MB/s eta 0:00:01[K     |███▍                            | 143kB 10.6MB/s eta 0:00:01[K     |███▋         

In [None]:
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization

PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)

In [None]:
def forward_with_softmax(inp):
    logits = model(inp)
    return torch.softmax(logits, 0)[0][1]

def forward_with_sigmoid(input):
    return torch.sigmoid(model(input))


# accumalate couple samples in this array for visualization purposes
vis_data_records_ig = []

def interpret_sentence(model, sentence, min_len = 7, label = 0):
    model.eval()
    text = [tok for tok in TEXT.tokenize(sentence)]
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    indexed = [TEXT.vocab.stoi[t] for t in text]

    model.zero_grad()

    input_indices = torch.tensor(indexed, device=device)
    input_indices = input_indices.unsqueeze(0)
    
    # input_indices dim: [sequence_length]
    seq_length = min_len

    # predict
    pred = forward_with_sigmoid(input_indices).item()
    pred_ind = round(pred)

    # generate reference indices for each sample
    reference_indices = token_reference.generate_reference(seq_length, device=device).unsqueeze(0)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, reference_indices, \
                                           n_steps=5000, return_convergence_delta=True)

    print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta))

    add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig)
    
def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records):
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()

    # storing couple samples in an array for visualization purposes
    vis_data_records.append(visualization.VisualizationDataRecord(
                            attributions,
                            pred,
                            LABEL.vocab.itos[pred_ind],
                            LABEL.vocab.itos[label],
                            LABEL.vocab.itos[1],
                            attributions.sum(),       
                            text,
                            delta))

In [None]:
interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)
interpret_sentence(model, 'Omg this movie so bad', label=0)
interpret_sentence(model, 'I have orgasm by this movie!', label=1)


pred:  pos ( 1.00 ) , delta:  tensor([9.5114e-06], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.89 ) , delta:  tensor([1.9634e-05], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.97 ) , delta:  tensor([3.2942e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.13 ) , delta:  tensor([8.1663e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.05 ) , delta:  tensor([8.9405e-06], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.81 ) , delta:  tensor([7.4002e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.01 ) , delta:  tensor([0.0002], device='cuda:0', dtype=torch.float64)
pred:  pos ( 0.67 ) , delta:  tensor([2.3089e-06], device='cuda:0', dtype=torch.float64)


Попробуйте добавить свои примеры!

In [None]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (1.00),pos,1.35,It was a fantastic performance ! pad
,,,,
pos,pos (0.89),pos,1.04,Best film ever pad pad pad pad
,,,,
pos,pos (0.97),pos,0.79,Such a great show! pad pad pad
,,,,
neg,neg (0.13),pos,-0.99,It was a horrible movie pad pad
,,,,
neg,neg (0.05),pos,-1.23,I've never watched something as bad pad
,,,,


## Эмбэдинги слов

Вы ведь не забыли, как мы можем применить знания о word2vec и GloVe. Давайте попробуем!

In [None]:
TEXT.build_vocab(trn, vectors=GloVe('6B')) #yes its GloVe
LABEL.build_vocab(trn)

word_embeddings = TEXT.vocab.vectors

kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m



.vector_cache/glove.6B.zip:  14%|█▍        | 119M/862M [00:34<19:10, 646kB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  14%|█▍        | 120M/862M [00:34<13:43, 901kB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  14%|█▍        | 123M/862M [00:36<12:27, 989kB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  14%|█▍        | 123M/862M [00:36<10:00, 1.23MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  14%|█▍        | 125M/862M [00:36<07:19, 1.68MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  15%|█▍        | 127M/862M [00:38<08:00, 1.53MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  15%|█▍        | 127M/862M [00:38<08:36, 1.42MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  15%|█▍        | 128M/862M [00:38<06:48, 1.80MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  15%|█▌        | 130M/862M [00:38<04:56, 2.47MB/s][A[A[A[A[A




.vector_cache/glove.6B.zip:  15%|█▌ 

In [49]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split(random_state=random.seed(SEED))

device = "cuda" if torch.cuda.is_available() else "cpu"

train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(128, 256, 256),
        sort=False,
        sort_key= lambda x: len(x.src),
        sort_within_batch=False,
        device=device,
        repeat=False,
)

In [50]:
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=64,
            kernel_sizes=kernel_sizes, dropout=dropout)

word_embeddings = TEXT.vocab.vectors 

prev_shape = model.embedding.weight.shape

model.embedding.weight = nn.Parameter(word_embeddings) # инициализируйте эмбэдинги

assert prev_shape == model.embedding.weight.shape
model.to(device)

optimizer = torch.optim.Adam(model.parameters())

Вы знаете, что делать.

In [51]:
min_loss = np.inf

cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar: 

          input = batch.text 
          label = batch.label

          prediction = torch.squeeze(model(input), 1)
          loss = criterion(prediction, label)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          
          train_loss += loss

    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(val_iter), total=len(val_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
      
          input = batch.text 
          label = batch.label

          prediction = torch.squeeze(model(input), 1)
          loss = criterion(prediction, label)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          
          val_loss += loss

    val_loss /= len(val_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)

HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 1, Training Loss: 0.5166822671890259, Validation Loss: 0.35615038871765137


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 2, Training Loss: 0.30161938071250916, Validation Loss: 0.24077804386615753


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 3, Training Loss: 0.1676194965839386, Validation Loss: 0.14228162169456482


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 4, Training Loss: 0.06776633113622665, Validation Loss: 0.06697939336299896


HBox(children=(FloatProgress(value=0.0, max=137.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

Epoch: 5, Training Loss: 0.025425061583518982, Validation Loss: 0.02677890472114086


<All keys matched successfully>

Посчитайте f1-score вашего классификатора.

**Ответ**:

In [52]:
from sklearn.metrics import f1_score

preds = []
targets = []

model.eval()
for batch in tqdm(test_iter):
      
      input_embeds = batch.text
      labels = batch.label 

      predictions = model(input_embeds)
      predictions = (torch.sigmoid(predictions) > 0.5).cpu().numpy()

      preds.extend(predictions)
      targets.extend(labels.tolist())

print(f1_score(targets, preds, average='binary'))

HBox(children=(FloatProgress(value=0.0, max=98.0), HTML(value='')))


0.8607394439581508


Проверим насколько все хорошо!

In [53]:
PAD_IND = TEXT.vocab.stoi['pad']

token_reference = TokenReferenceBase(reference_token_idx=PAD_IND)
lig = LayerIntegratedGradients(model, model.embedding)
vis_data_records_ig = []

interpret_sentence(model, 'It was a fantastic performance !', label=1)
interpret_sentence(model, 'Best film ever', label=1)
interpret_sentence(model, 'Such a great show!', label=1)
interpret_sentence(model, 'It was a horrible movie', label=0)
interpret_sentence(model, 'I\'ve never watched something as bad', label=0)
interpret_sentence(model, 'It is a disgusting movie!', label=0)

pred:  pos ( 0.93 ) , delta:  tensor([1.2565e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.01 ) , delta:  tensor([3.3948e-05], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.19 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([3.2758e-06], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.33 ) , delta:  tensor([0.0001], device='cuda:0', dtype=torch.float64)
pred:  neg ( 0.00 ) , delta:  tensor([2.5240e-05], device='cuda:0', dtype=torch.float64)


In [54]:
print('Visualize attributions based on Integrated Gradients')
visualization.visualize_text(vis_data_records_ig)

Visualize attributions based on Integrated Gradients


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
pos,pos (0.93),pos,1.86,It was a fantastic performance ! pad
,,,,
pos,neg (0.01),pos,0.63,Best film ever pad pad pad pad
,,,,
pos,neg (0.19),pos,1.57,Such a great show! pad pad pad
,,,,
neg,neg (0.00),pos,-0.52,It was a horrible movie pad pad
,,,,
neg,neg (0.33),pos,1.86,I've never watched something as bad pad
,,,,
