# Практическия реализация NLP

In this assignment you will perform sentiment analysis of the IMDBs reviews by using RNN.

In [1]:
!pip install torch==1.6.0
!pip install torchtext==0.7
!pip install numpy
!pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.6.0
  Downloading torch-1.6.0-cp37-cp37m-manylinux1_x86_64.whl (748.8 MB)
[K     |████████████████████████████████| 748.8 MB 17 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.0+cu113
    Uninstalling torch-1.12.0+cu113:
      Successfully uninstalled torch-1.12.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.0+cu113 requires torch==1.12.0, but you have torch 1.6.0 which is incompatible.
torchtext 0.13.0 requires torch==1.12.0, but you have torch 1.6.0 which is incompatible.
torchaudio 0.12.0+cu113 requires torch==1.12.0, but you have torch 1.6.0 which is incompatible.
fastai 2.7.7 requires torch<1.13,>=1.7, but you have torch 1.6.0 which is inco

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import torch

from torchtext import datasets

from torchtext.data import Field, LabelField
from torchtext.data import BucketIterator

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### Preparing Data

In [3]:
TEXT = Field(sequential=True, lower=True)
LABEL = LabelField()

In [4]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split()

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 9.82MB/s]


In [5]:
%%time
TEXT.build_vocab(trn)

CPU times: user 1.18 s, sys: 35.7 ms, total: 1.22 s
Wall time: 1.22 s


In [6]:
LABEL.build_vocab(trn)

In [7]:
TEXT.vocab.freqs.most_common(10)

[('the', 225754),
 ('a', 112289),
 ('and', 111408),
 ('of', 101336),
 ('to', 93885),
 ('is', 73019),
 ('in', 63516),
 ('i', 49152),
 ('this', 48709),
 ('that', 46310)]

### Creating the Iterator

During training, we'll be using a special kind of Iterator, the **BucketIterator**. 

All the neural networks require to have inputs of the same shape and size. So the data saples should be padded to the same length before gathering them into batches:

e.g.
\[ 
\[3, 15, 2, 7\],
\[4, 1\], 
\[5, 5, 6, 8, 1\] 
\] -> \[ 
\[3, 15, 2, 7, **0**\],
\[4, 1, **0**, **0**, **0**\], 
\[5, 5, 6, 8, 1\] 
\] 

If the sequences of one batch differ greatly in length, the padding will consume a lot of wasteful memory and time. The BucketIterator groups sequences of similar lengths together for each batch to minimize padding.

The **BucketIterator** usage:

In [8]:
train_iter, val_iter, test_iter = BucketIterator.splits(
        (trn, vld, tst),
        batch_sizes=(64, 64, 64),
        sort=True,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device='cuda',
        repeat=False
)

Let's take a look at the output of the BucketIterator

In [9]:
batch = next(train_iter.__iter__()); batch.text

tensor([[   10,     9,  1245,  ...,   117,     9,    10],
        [   20,   509,   140,  ...,   208,   376,    20],
        [    7,   875,  2186,  ...,   117,     2,     7],
        ...,
        [    1,     1,     1,  ...,   769,   226, 12889],
        [    1,     1,     1,  ...,    12,   508,    13],
        [    1,     1,     1,  ..., 10845,   112,   728]], device='cuda:0')

The batch contains all the fields we passed to the Dataset object that can be accessed as attributes with the corresponding names.

In [10]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'fields', 'input_fields', 'target_fields', 'text', 'label'])

### Define the RNN-based text classification model

Let's start with the simple architecture. Implement the model according to the scheme below.  
![alt text](https://miro.medium.com/max/1396/1*v-tLYQCsni550A-hznS0mw.jpeg)


In [11]:
class RNNBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim, vocab_dim):
        super(RNNBaseline, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.emb_dim = emb_dim
        self.vocab_dim = vocab_dim        
        layer_dim = 4
        
        self.embedding = nn.Embedding(vocab_dim, emb_dim)
        
        self.encoder = nn.GRU(emb_dim, hidden_dim, layer_dim)
        
        self.decoder = nn.Linear(2*hidden_dim, 64)
            
    def forward(self, seq):
        
        emb = self.embedding(seq)
        
        outputs, _ = self.encoder(emb)
        
        encoding = torch.cat((outputs[0], outputs[1]), -1)
       
       
        preds = self.decoder(encoding)
        
        return preds

In [None]:
em_sz = 200
nh = 300
v_size = len(TEXT.vocab)
model = RNNBaseline(nh, em_sz, v_size)
model

RNNBaseline(
  (embedding): Embedding(201052, 200)
  (encoder): GRU(200, 300, num_layers=4)
  (decoder): Linear(in_features=600, out_features=64, bias=True)
)

*If* you're using GPU, remember to call model.cuda() to move your model to the GPU.

In [None]:
model.cuda()

RNNBaseline(
  (embedding): Embedding(201052, 200)
  (encoder): GRU(200, 300, num_layers=4)
  (decoder): Linear(in_features=600, out_features=64, bias=True)
)

### Training loop

Define the optimizer and the loss function

In [None]:
opt = torch.optim.Adam(model.parameters())
loss_func = torch.nn.CrossEntropyLoss()

Set the number of training epochs

In [None]:
epochs = 10

In [None]:
def save_model(model, iter):
  path = f'/content/drive/My Drive/Model/_iter_{iter}'
  print(f'Saving {iter} model...')
  torch.save(model, path)
  print(f'{iter} saved successfully.')

Finally, run the training loop

In [None]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter: 
        
        x = batch.text
        y = batch.label
        
        opt.zero_grad()
        preds = model(x)   
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        running_loss += loss.item()

    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    for batch in val_iter:
        
        x = batch.text
        y = batch.label
        
        preds = model(x) 
        loss = loss_func(preds, y)
        val_loss += loss.item()
        
    val_loss /= len(vld)
    print(f'Epoch: {epoch}, Training Loss: {epoch_loss}, Validation Loss: {val_loss}')
    save_model(model, epoch)



Epoch: 1, Training Loss: 0.01099149864401136, Validation Loss: 0.010946952231725057
Saving 1 model...
1 saved successfully.
Epoch: 2, Training Loss: 0.01083477886063712, Validation Loss: 0.011152411619822184
Saving 2 model...
2 saved successfully.
Epoch: 3, Training Loss: 0.010438943416731699, Validation Loss: 0.011444394063949585
Saving 3 model...
3 saved successfully.
Epoch: 4, Training Loss: 0.009872654577663967, Validation Loss: 0.011823897043863931
Saving 4 model...
4 saved successfully.
Epoch: 5, Training Loss: 0.009281155111108507, Validation Loss: 0.012725892392794291
Saving 5 model...
5 saved successfully.
Epoch: 6, Training Loss: 0.008788975258384433, Validation Loss: 0.013775794672966003
Saving 6 model...
6 saved successfully.
Epoch: 7, Training Loss: 0.008367549686772483, Validation Loss: 0.013699033331871033
Saving 7 model...
7 saved successfully.
Epoch: 8, Training Loss: 0.007973204700435912, Validation Loss: 0.014367634884516398
Saving 8 model...
8 saved successfully.
Ep

### Calculate performance of the trained model (10 points)

In [None]:

for epoch in range(1, epochs + 1):
  acc = 0.0
  pre = 0.0
  rec = 0.0
  f1 = 0.0
  model.eval()
  for batch in test_iter:
    x = batch.text
    y = batch.label
    l = y.detach().cpu().numpy()
    preds = model(x).detach().cpu().numpy().argmax(axis=1)
    acc += accuracy_score(l, preds)
    pre += precision_score(l, preds)
    rec += recall_score(l, preds)
    f1 += f1_score(l, preds)
    i = len(test_iter)
     
  print(f'Epoch: {epoch}, Accuracy: {acc/i}, Precision: {pre/i}, Recall: {rec/i}, F1: {f1/i}')

Epoch: 1, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 2, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 3, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 4, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 5, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 6, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 7, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 8, Accuracy: 0.5431345907928389, Precision: 0.5628120546200523, Recall: 0.30321270477920037, F1: 0.37626781544729987
Epoch: 9

Report the calculated performance below

#### Accuracy: 0.543
#### Precision: 0.563
#### Recall: 0.303
#### F1: 0.376

### Experiments

Feel free to experiment with the model to improve performance scores. You can find advices [here](https://arxiv.org/abs/1801.06146). 

Below describe, please, 
 - your improvements and challenges you faced
 - provide your experiments' implementation details
 - explain your choice of architecture/training method/regularization techniques etc.

### 1. Проведённые исследования по изменению алгоритма оптимизации показал, что наилучшие результаты по сравнению с базовой моделью Adam были получены c использованием алгоритма RMSprop и скоростью обучения 0.001 и 0.0001.
* Эксперимент по подбору других гиперпараметров и моделированием функции потерь требует большего времени и, к сожалению, на Google Colab не хватило времени на прогон по всем параметрам. Лучших результатов можно добиться на стационарном "железе".
* Также желательно использовать более новые библиотеки. В частности torch.text не поддерживается в последних версиях pytorch.
### 2. Доработана базовая программа обучения модели. В циклах осуществлен перебор и обучение с нуля для каждого варианта.
### 3. Изменение функции оптимизации и ее гиперпараметров является одним из ключевых способов ручного управления процессом улучшения модели.
* Также необходимо провести эксперименты по подбору функции потерь, может быть попытаться группировать потери с регулиризацией.

#### Попробуем поэкспериментировать с изменением алгоритма оптимизации Adam, RMSprop, SGD и гиперпараметром  (скорость обучения lr).

In [12]:
from torch.optim import Adam, RMSprop, SGD
loss_func =  torch.nn.CrossEntropyLoss()
optim_dict = {'Adam': Adam, 'RMSprop': RMSprop, 'SGD': SGD}
    

# loss_dict = {'CELoss': torch.nn.CrossEntropyLoss(), 
            #  'BCELoss': torch.nn.BCELoss(), 
            #  'BCEWLoss': torch.nn.BCEWithLogitsLoss()}
# eps_dict = {'1e-8': 1e-8, '0.1': 0.1, '1.0': 1.0}
lr_dict = {'0.001': 0.001,  '0.01': 0.01, '0.0001': 0.0001}

In [None]:
def save_model2(model, iter, loss_key, eps_key, lr_key):
  path = f'/content/drive/My Drive/Model/_iter_{iter}' + loss_key+lr_key +eps_key
  print(f'Saving {iter} model...')
  torch.save(model, path)
  print(f'{iter} saved successfully.')

In [13]:
epochs = 7

In [16]:
# def train_model(model, epochs, loss_func, opt, loss_key, eps_key, lr_key):
def train_model(model, epochs, loss_func, opt):
  %%time
  for epoch in range(1, epochs + 1):
      running_loss = 0.0
      running_corrects = 0
      model.train() 
      for batch in train_iter: 
          
          x = batch.text
          y = batch.label
          
          opt.zero_grad()
          preds = model(x)   
          loss = loss_func(preds, y)
          loss.backward()
          opt.step()
          running_loss += loss.item()

      epoch_loss = running_loss / len(trn)
      
      val_loss = 0.0
      model.eval()
      for batch in val_iter:
          
        x = batch.text
        y = batch.label
        
        preds = model(x) 
        loss = loss_func(preds, y)
        val_loss += loss.item()
          
      val_loss /= len(vld)
      print(f'Epoch: {epoch}, Training Loss: {epoch_loss}, Validation Loss: {val_loss}')
      # save_model2(model, epoch, loss_key, eps_key, lr_key)

In [17]:
for optim_key, optim_func in optim_dict.items():
  # for opt_key, opt in opt_dict.items():
    for lr_key, lr in lr_dict.items():
    # for eps_key, eps in eps_dict.items():
      em_sz = 200
      nh = 300
      v_size = len(TEXT.vocab)
      model = RNNBaseline(nh, em_sz, v_size)
      model.cuda()      
      opt = optim_func(model.parameters(), lr = lr)
        
      print(optim_key, lr_key)
      train_model(model, epochs, loss_func, opt) #, loss_key, eps_key, lr_key)
      for epoch in range(1, epochs + 1):
        acc = 0.0
        pre = 0.0
        rec = 0.0
        f1 = 0.0
        model.eval()
        for batch in test_iter:
          x = batch.text
          y = batch.label
          l = y.detach().cpu().numpy()
          preds = model(x).detach().cpu().numpy().argmax(axis=1)
          acc += accuracy_score(l, preds)
          pre += precision_score(l, preds, average = 'macro')
          rec += recall_score(l, preds, average = 'macro')
          f1 += f1_score(l, preds, average = 'macro')
      i = len(test_iter)
      print(f'Accuracy: {acc/i}, Precision: {pre/i}, Recall: {rec/i}, F1: {f1/i}')


Adam 0.001
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs
Epoch: 1, Training Loss: 0.01249655064514705, Validation Loss: 0.010912781151135762
Epoch: 2, Training Loss: 0.010905010523114886, Validation Loss: 0.010864249404271444
Epoch: 3, Training Loss: 0.010692979982921055, Validation Loss: 0.010876572998364767
Epoch: 4, Training Loss: 0.010402227316583907, Validation Loss: 0.010953509783744812
Epoch: 5, Training Loss: 0.009981107558522906, Validation Loss: 0.011258458876609803
Epoch: 6, Training Loss: 0.009486561390331813, Validation Loss: 0.012215459394454956
Epoch: 7, Training Loss: 0.009009210654667445, Validation Loss: 0.013245729919274647
Accuracy: 0.5451726342710997, Precision: 0.5399968135540874, Recall: 0.529135998201002, F1: 0.5072153215771382
Adam 0.01
CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs
Epoch: 1, Training Loss: 0.011893706846237183, Validation Loss: 0.010997549804051716
Epoch: 2, Training Loss: 0.010944170662334987, Validation Los