## Assignment 2.4: Text classification via CNN (20 points)

In this assignment you should perform sentiment analysis of the IMDB reviews based on CNN architecture. Read carefully [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/pdf/1408.5882.pdf) by Yoon Kim.

In [1]:
import numpy as np
import torch

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext import datasets
from torchtext.data import Field, LabelField
from torchtext.data import Iterator

### Preparing Data

In [2]:
TEXT = Field(sequential=True, lower=True, batch_first=True)
LABEL = LabelField(batch_first=True)

In [3]:
train, tst = datasets.IMDB.splits(TEXT, LABEL)
trn, vld = train.split()

In [4]:
# %%time
TEXT.build_vocab(trn)

In [5]:
LABEL.build_vocab(trn)

### Creating the Iterator (2 points)

Define an iterator here

In [6]:
if torch.cuda.is_available():
    device = 'cuda'
    torch.cuda.set_device(3)
else:
    device = 'cpu'


print("device: ", device)

device:  cuda


In [7]:
train_iter, val_iter, test_iter = Iterator.splits((trn, vld, tst), 
                                                  batch_size = 64, 
                                                  device = device)

### Define CNN-based text classification model (8 points)

In [8]:
class CNN(nn.Module):
    def __init__(self, V, D, kernel_sizes, dropout=0.5):
        super(CNN, self).__init__()
        self.n_filters = 100
        self.embedding = nn.Embedding(V, D)

        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1, 
                                              out_channels = self.n_filters, 
                                              kernel_size = (k, D)) 
                                    for k in kernel_sizes])
        
        self.linear = nn.Linear(len(kernel_sizes) * self.n_filters, 1)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.Sigmoid()
        
    def forward(self, x):
        emb = self.embedding(x).unsqueeze(1)
        cnv = [F.relu(conv(emb)).squeeze(3) for conv in self.convs]  
        pool = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in cnv]
        cat = self.dropout(torch.cat(pool, dim = 1))
        outputs = self.linear(cat)
        logit = self.act(outputs)
        return logit

In [9]:
kernel_sizes = [3,4,5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300

model = CNN(vocab_size, dim, kernel_sizes, dropout)

In [10]:
model.cuda()

CNN(
  (embedding): Embedding(202379, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
  (act): Sigmoid()
)

### The training loop (3 points)

Define the optimization function and the loss functions.

In [11]:
opt = optim.Adam(model.parameters())
loss_func = nn.BCEWithLogitsLoss()

Think carefully about the stopping criteria. 

In [12]:
epochs = 15

In [13]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        x = batch.text
        y = batch.label
        
        opt.zero_grad()
        preds = model(x).squeeze()
        loss = loss_func(preds, y.float())
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        x = batch.text
        y = batch.label
        
        preds = model(x).squeeze()
        loss = loss_func(preds, y.float())
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: 0.010644211176463535, Validation Loss: 0.009809689891338349
Epoch: 2, Training Loss: 0.009914062336512975, Validation Loss: 0.00955055730342865
Epoch: 3, Training Loss: 0.009621580059187753, Validation Loss: 0.00938572502930959
Epoch: 4, Training Loss: 0.00942767163344792, Validation Loss: 0.009257817216714223
Epoch: 5, Training Loss: 0.009226666627611433, Validation Loss: 0.009182151993115744
Epoch: 6, Training Loss: 0.009088774820736476, Validation Loss: 0.009084957993030548
Epoch: 7, Training Loss: 0.008979467001983097, Validation Loss: 0.009051678717136384
Epoch: 8, Training Loss: 0.00881090454033443, Validation Loss: 0.008938081240653992
Epoch: 9, Training Loss: 0.008730058087621416, Validation Loss: 0.008911931836605071
Epoch: 10, Training Loss: 0.00860604135308947, Validation Loss: 0.008885545110702515
Epoch: 11, Training Loss: 0.00853726339680808, Validation Loss: 0.008890131962299347
Epoch: 12, Training Loss: 0.008464037997382028, Validation Loss: 0.00

### Calculate performance of the trained model (2 points)

In [14]:
accuracies = []
recalls = []
precisions = []
for batch in test_iter:
    x = batch.text
    y = batch.label
    predictions = model(x).squeeze()
    rounded_preds = torch.round(predictions)
    correct = (rounded_preds == y.float()).float() 
    accuracy = correct.sum()/len(correct)
    correct_true_amount =  (correct * y.float()).sum()
    recall = correct_true_amount / y.float().sum()
    precision = correct_true_amount / predictions.sum()
    accuracies.append(accuracy)
    recalls.append(recall)
    precisions.append(precision)

In [15]:
eye = [1]
for i, r in enumerate(recalls):
    if  torch.isnan(r):
        recalls[i] = torch.Tensor(eye).cuda().squeeze()

In [26]:
accuracy = torch.mean(torch.stack(accuracies))
print("accuracy: ", accuracy)
precision = torch.mean(torch.stack(precisions))
print("precision: ", precision)
recall = torch.mean(torch.stack(recalls))
print("recall: ", recall)
f1 = 2*precision*recall/(precision+recall)
print("f1: ", f1)

accuracy:  tensor(0.8635, device='cuda:3')
precision:  tensor(0.8232, device='cuda:3', grad_fn=<MeanBackward0>)
recall:  tensor(0.8567, device='cuda:3')
f1:  tensor(0.8396, device='cuda:3', grad_fn=<DivBackward0>)


Write down the calculated performance

### Accuracy: 0.8635
### Precision: 0.8232
### Recall: 0.8567
### F1: 0.8396

### Experiments (5 points)

Experiment with the model and achieve better results. Implement and describe your experiments in details, mention what was helpful.

## 1. 
### dropout = 0.5, change Adam optimizer to SGD momentum

In [71]:
class CNN(nn.Module):
    def __init__(self, V, D, kernel_sizes, dropout=0.3):
        super(CNN, self).__init__()
        self.n_filters = 100
        self.embedding = nn.Embedding(V, D)

        self.convs = nn.ModuleList([nn.Conv2d(in_channels = 1, 
                                              out_channels = self.n_filters, 
                                              kernel_size = (k, D)) 
                                    for k in kernel_sizes])
        
        self.linear = nn.Linear(len(kernel_sizes) * self.n_filters, 1)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.Sigmoid()
        
    def forward(self, x):
        emb = self.embedding(x).unsqueeze(1)
        cnv = [F.relu(conv(emb)).squeeze(3) for conv in self.convs]  
        pool = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in cnv]
        cat = self.dropout(torch.cat(pool, dim = 1))
        outputs = self.linear(cat)
        logit = self.act(outputs)
        return logit

In [27]:
kernel_sizes = [3,4,5]
vocab_size = len(TEXT.vocab)
dropout = 0.5
dim = 300
epochs = 10

model = CNN(vocab_size, dim, kernel_sizes, dropout)

In [28]:
model.cuda()

CNN(
  (embedding): Embedding(201784, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (linear): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5)
  (act): Sigmoid()
)

In [29]:
opt = opt = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss_func = nn.BCEWithLogitsLoss()

In [30]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() 
    for batch in train_iter:         
        
        x = batch.text
        y = batch.label
        
        opt.zero_grad()
        preds = model(x).squeeze()
        loss = loss_func(preds, y.float())
        loss.backward()
        opt.step()
        running_loss += loss.item()
        
    epoch_loss = running_loss / len(trn)
    
    val_loss = 0.0
    model.eval()
    correct = 0
    total = 0 
    for batch in val_iter:
        
        x = batch.text
        y = batch.label
        
        preds = model(x).squeeze()
        loss = loss_func(preds, y.float())
        val_loss += loss.item()
        
    val_loss /= len(vld)
    
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, epoch_loss, val_loss))

Epoch: 1, Training Loss: 0.010855203005245754, Validation Loss: 0.010905149841308593
Epoch: 2, Training Loss: 0.010852082306998117, Validation Loss: 0.01090480485757192
Epoch: 3, Training Loss: 0.010850114720208304, Validation Loss: 0.01090240683555603
Epoch: 4, Training Loss: 0.010846595283917019, Validation Loss: 0.01088865509033203
Epoch: 5, Training Loss: 0.010823195392744882, Validation Loss: 0.010823581592241923
Epoch: 6, Training Loss: 0.010669763101850237, Validation Loss: 0.010673945093154908
Epoch: 7, Training Loss: 0.010468876664979117, Validation Loss: 0.01014719382127126
Epoch: 8, Training Loss: 0.010272592316355024, Validation Loss: 0.009920679012934367
Epoch: 9, Training Loss: 0.01008944388798305, Validation Loss: 0.009886492236455281
Epoch: 10, Training Loss: 0.009969590612820217, Validation Loss: 0.009703847845395406
CPU times: user 2min 17s, sys: 50.7 s, total: 3min 8s
Wall time: 3min 7s


### It works worse than previos model.

## I have tried to change some other params but the result was terrible.