## Let's make a **convolution** work on MOVIE REVIEW CLASSIFICATION and utilize TORCHTEXT

In [2]:
import torchtext as TT
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [3]:
from torchsummary import summary

In [4]:
with open('./data/reviews.txt','r') as f:
    data = f.readlines()
with open('./data/labels.txt', 'r') as f:
    labels = f.readlines()

In [5]:
i = np.random.randint(low=1,high=len(data))
print(data[i], labels[i])

i am    and i still like most of the scooby doo movies and the old episodes . i love the     s movies  and recently we were treated to one of the better direct to dvd scooby doo outings of this decade  scooby doo and the goblin king  which i wasn  t expecting to be as good as it was . anyway  back to get a clue  i watched some episodes  expecting something very good  but from what i saw of it  i wasn  t impressed at all . first of all  i hated the animation . it was flat  deflated and very saturday  morning  cartoon  standard  easily the worst aspect of the series . even some shows i really hate had slightly better animation . even worse  shaggy and scooby looked like aliens  and i really missed fred  velma and daphne  as they added a lot to the old episodes  when scooby doo was positively good . i also hated the character changes  because it seemed like instead of solving mysteries  shaggy and scooby were now playing superhero  something they would  ve never had done in the movies or 

In [7]:
len(data)

25000

In [6]:
%config Completer.use_jedi = False

### Install spacy
```sh
sudo su - 
pip install spacy
python -m spacy download en_core_web_sm
```

In [7]:
# train, test, validation split
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(data)*split_frac) + 1
test_val_idx = split_idx + int((len(data) - split_idx)//2)
split_idx, test_val_idx

(20001, 22500)

###  Make a vocabulary

In [8]:
%%time
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = TT.data.utils.get_tokenizer('spacy', 'en_core_web_sm')
train_iter = zip(labels[:split_idx], data[:split_idx])
counter = Counter()

for (label, line) in train_iter:
    counter.update(tokenizer(line))
    
vocab = TT.vocab.Vocab(counter,min_freq=1)

CPU times: user 22.7 s, sys: 259 ms, total: 23 s
Wall time: 23.2 s


In [9]:
tokenizer(line)[:10]

['i', 'really', 'liked', 'tom', 'barman', ' ', 's', 'awtwb', '.', 'you']

### Load pre-trained word embeddings from GloVe

In [10]:
vocab.load_vectors('glove.6B.50d')

In [11]:
vocab.vectors.shape # these are the vectors that we can set as weights for embeddings

torch.Size([67359, 50])

In [12]:
vocab.vectors[vocab.stoi['apple']]

tensor([ 0.5204, -0.8314,  0.4996,  1.2893,  0.1151,  0.0575, -1.3753, -0.9731,
         0.1835,  0.4767, -0.1511,  0.3553,  0.2591, -0.7786,  0.5218,  0.4769,
        -1.4251,  0.8580,  0.5982, -1.0903,  0.3357, -0.6089,  0.4174,  0.2157,
        -0.0742, -0.5822, -0.4502,  0.1725,  0.1645, -0.3841,  2.3283, -0.6668,
        -0.5818,  0.7439,  0.0950, -0.4787, -0.8459,  0.3870,  0.2369, -1.5523,
         0.6480, -0.1652, -1.4719, -0.1622,  0.7986,  0.9739,  0.4003, -0.2191,
        -0.3094,  0.2658])

#### Check if load_vectors gives the same result as file with embedding

In [13]:
def get_word_embedding(word):
    with open('glove.6B.50d.txt','r') as f:
        while True:
            line = f.readline().split()
            if line[0] == word: return(torch.from_numpy(np.array(line[1:], dtype=np.float32)))

In [14]:
assert get_word_embedding('apple').equal(vocab.vectors[vocab.stoi['apple']])

In [15]:
# change the UNK vector with the mean of all known words
vocab.vectors[0] = vocab.vectors.mean(axis=0)

In [17]:
# show 10 most frequent tokens along with the frequency
sorted(vocab.freqs.items(), key=lambda x: x[-1], reverse=True)[:10]

[(' ', 496874),
 ('the', 268219),
 ('.', 261097),
 ('and', 131323),
 ('a', 129967),
 ('of', 116991),
 ('to', 108242),
 ('is', 85587),
 ('br', 81038),
 ('it', 77070)]

In [20]:
# vocabulary : map integer to a string
vocab.itos[:10]

['<unk>', '<pad>', ' ', 'the', '.', 'and', 'a', 'of', 'to', 'is']

In [21]:
# 10 words in a zero sentence
[s for s in data[0].split()[:10]] 

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at']

In [149]:
# 10 fifst codes of words in a zero sentence
[vocab[s] for s in data[0].split()[:10]]

[18777, 318, 9, 6, 1025, 217, 4, 11, 2194, 37]

In [22]:
# function that converts a line of text into a line of vocab indexes
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

In [23]:
data[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \n'

In [24]:
print(text_pipeline(data[0]))

[18777, 318, 9, 6, 1025, 217, 4, 11, 2194, 37, 3, 177, 63, 20, 56, 86, 5487, 50, 396, 119, 2, 144, 20, 2, 4896, 2, 4, 66, 13, 159, 12, 3, 4956, 5940, 497, 77, 8, 268, 16, 18777, 318, 2, 17, 2108, 9, 79, 2497, 8, 602, 80, 9, 2, 4896, 2, 4, 3, 25309, 8, 2111, 9960, 2, 3, 5679, 1501, 41, 55, 72, 212, 152, 74, 1202, 4896, 2, 18267, 2, 3, 39298, 7, 3, 231, 894, 2, 36, 3177, 77, 7, 3, 6291, 14, 689, 5, 74, 1501, 4, 60, 14, 221, 3, 372, 12, 68, 6, 1420, 4059, 817, 8, 3647, 186, 3, 396, 2, 14, 1198, 15590, 4, 4, 4, 4, 4, 4, 4, 4, 4, 37, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 318, 4, 6, 360, 351, 2742, 14, 2, 148, 134, 8, 7578, 35, 7, 136, 4896, 4, 1420, 2374, 8, 18777, 318, 4, 14, 529, 16, 117, 1496, 7, 66, 558, 108, 16, 18777, 318, 9, 237, 4233, 4, 54, 6, 2216, 16, 11, 227, 2, 28, 214]


In [25]:
vocab.stoi['  \n'], vocab.itos[214]

(214, '  \n')

In [27]:
text_pipeline('scary succumb')

[613, 16520]

In [28]:
# function to convert text label to a number label 0|1
label_pipeline = lambda x: 1 if x == 'positive\n' else 0

In [29]:
label_pipeline(labels[2])

1

In [30]:
class TextData(torch.utils.data.Dataset):
    '''
    very basic dataset for processing text data
    holds text and label
    implements len and getitem methods
    '''
    def __init__(self, labels, text):
        super(TextData, self).__init__()
        self.labels = labels
        self.text = text
        
    def __getitem__(self, index):
        return self.text[index], self.labels[index] 
    
    def __len__(self):
        return len(self.labels)

In [39]:
one, two = torch.ones(3).unsqueeze(dim=0), torch.zeros(3).unsqueeze(dim=0)
print(one)
print(two)

tensor([[1., 1., 1.]])
tensor([[0., 0., 0.]])


In [41]:
torch.cat([one, two], dim=0)

tensor([[1., 1., 1.],
        [0., 0., 0.]])

In [45]:
def tokenize_batch(batch, max_len=200):
    '''
    tokenizer to use in DataLoader
    takes a text batch of text dataset and produces a tensor batch, converting text and labels though tokenizer, labeler
    tokenizer is a global function text_pipeline
    labeler is a global function label_pipeline
    max_len is a fixed len size, if text is less than max_len it is padded with ones
    if text is larger that max_len it is truncated but from the end of the string
    '''
    labels_list, text_list = [], []
    for _text, _label in batch:
        labels_list.append(label_pipeline(_label))
        text_holder = torch.ones(max_len, dtype=torch.int32) # fixed size tensor of max_len
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int32)
        pos = min(200, len(processed_text))
        text_holder[-pos:] = processed_text[-pos:]
        text_list.append(text_holder.unsqueeze(dim=0))
    return torch.cat(text_list, dim=0), torch.FloatTensor(labels_list)

In [46]:
train_dataset = TextData(labels[:split_idx], data[:split_idx])

In [47]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False, collate_fn=tokenize_batch)

In [48]:
train_loader.dataset.text[0]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \n'

In [49]:
vocab.stoi['bromwell']

18777

In [50]:
txt, lbl = iter(train_loader).next()

In [51]:
lbl, txt

(tensor([1., 0.]),
 tensor([[    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1, 18777,   318,
              9,     6,  1025,   217,     4,    11,  2194,    37,     3,   177,
             63,    20,    56,    86,  5487,    50,   396,   119,     2,   144,
             20,     2,  4896,     2,     4,    66,    13,   159,    12,     3,
           4956,  5940,   497,    77,     8,   268,    16, 18777,   318,     2,
             17,  2108,     9,    79,  2497,     8,   602,    80,     9,     2,
           4896,     2,     4,     3, 25309,     8,  2111,  9960,     2,     3,
           5679,  1501,    41,    55,    72,   212,   152,    74,  1202,  4896,
              2, 18267,     2,     3, 39298,     7,     3,   231,   894,     2,
             36,  3177,    77,     7,     3,  6291,    14,   689,     5,    74,
           1501,     4,    60,    14,   221,     3,   372,    12,    68,     6,
           1420,  405

In [53]:
lbl.shape, txt.shape

(torch.Size([2]), torch.Size([2, 200]))

In [54]:
batch_size = 50
train_dataset = TextData(labels[:split_idx-1], data[:split_idx-1])
valid_dataset = TextData(labels[split_idx-1:test_val_idx], data[split_idx-1:test_val_idx])
test_dataset = TextData(labels[test_val_idx:], data[test_val_idx:])

In [55]:
len(test_dataset), len(valid_dataset), len(train_dataset)

(2500, 2500, 20000)

In [56]:
train_loader =  DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=tokenize_batch)
valid_loader =  DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, collate_fn=tokenize_batch)
test_loader =  DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=tokenize_batch)

## Make a model

In [57]:
import torch.nn as nn
import torch.nn.functional as F

#         Input shape     torch.Size([2, 200])
#         Embedding shape torch.Size([2, 200, 32]) 50
#         Conv1d shape    torch.Size([2, 32, 30]) 48
#         MaxPool shape   torch.Size([2, 32, 15]) 24
#         LSTM shape      torch.Size([2, 32, 64])
#         Dense shape     torch.Size([2, 32, 1])
#         Sigmoid shape   torch.Size([2])

class SentimentConvNN(nn.Module):
    def __init__(self, vocab_size, output_size=1, embedding_dim=32, hidden_dim=64, out_channels=32, drop_prob=0.5, vocab_vectors=None):
        super(SentimentConvNN, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.drop_prob = drop_prob
        self.out_channels = out_channels
        self.n_layers = 1
        
        # if we provide vocab_vectors then initialize weights
        if vocab_vectors is not None:
            self.embed = nn.Embedding.from_pretrained(vocab_vectors, freeze=True)
        else:
            self.embed = nn.Embedding(self.vocab_size, self.embedding_dim)
            
        self.conv1d = nn.Conv1d(in_channels=200, out_channels=self.out_channels, kernel_size=3, bias=False, padding=False)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        
        lstm_input = int((self.embed.weight.shape[-1]-2)/2) # repoduce the logic of conv1d resulting dimention
        
        self.lstm = nn.LSTM(input_size=lstm_input,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            dropout=0)
        self.dense = nn.Linear(hidden_dim, 1)
        self.drop = nn.Dropout(p=drop_prob)
        
        self.bn_embedding = nn.BatchNorm1d(num_features=200)
        self.bn_conv1d = nn.BatchNorm1d(num_features=self.out_channels)
        self.bn_lstm = nn.BatchNorm1d(num_features=self.out_channels)
    
    def num_parameters(self):
        '''
        get the number of parameters in a network
        '''

        # return sum((list(map(lambda x: torch.as_tensor(x.flatten().size()).sum().item(), self.parameters()))))
        s=""
        for k, v in self.named_parameters():
            s+=f'{k:20} {v.shape}\n'
        s+=f'Total number of parameters = {sum(list(map(lambda x: x.numel(), self.parameters()))):,}'
        return s
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        hidden = (weight.new_zeros(self.n_layers, batch_size, self.hidden_dim),
                  weight.new_zeros(self.n_layers, batch_size, self.hidden_dim))

        return hidden
    
    def forward(self, x, hidden):
        '''
        Input shape 	torch.Size([2, 200])
        Embedding shape torch.Size([2, 200, 32])
        Conv1d shape 	torch.Size([2, 32, 30])
        MaxPool shape 	torch.Size([2, 32, 15])
        LSTM shape 		torch.Size([2, 32, 64])
        Dense shape 	torch.Size([2, 32, 1])
        Sigmoid shape 	torch.Size([2])
        '''
        #print(x.dtype)
        embed_out = self.embed(x)
        embed_out = self.bn_embedding(embed_out)
        embed_out = self.drop(embed_out)
        
        conv_out = self.conv1d(embed_out)
        conv_out = self.bn_conv1d(conv_out)
        conv_out_relu = F.relu(conv_out)
        maxpool_out = self.maxpool(conv_out_relu)
        
        lstm_out, _ = self.lstm(maxpool_out, hidden)
        lstm_out = self.bn_lstm(lstm_out)
        lstm_out = self.drop(lstm_out)
        
        out_dense = self.dense(lstm_out)
        out = nn.Sigmoid()(out_dense[:,-1,:]).view(out_dense.shape[0])
        
        return out, 1

In [58]:
# Instantiate the model with hyperparams
vocab_size = len(vocab) 
output_size = 1 # not needed
embedding_dim = 32
hidden_dim = 64
n_layers = 1

In [97]:
convRNN = SentimentConvNN(vocab_size=vocab_size)
print(convRNN.num_parameters())

embed.weight         torch.Size([67359, 32])
conv1d.weight        torch.Size([32, 200, 3])
lstm.weight_ih_l0    torch.Size([256, 15])
lstm.weight_hh_l0    torch.Size([256, 64])
lstm.bias_ih_l0      torch.Size([256])
lstm.bias_hh_l0      torch.Size([256])
dense.weight         torch.Size([1, 64])
dense.bias           torch.Size([1])
bn_embedding.weight  torch.Size([200])
bn_embedding.bias    torch.Size([200])
bn_conv1d.weight     torch.Size([32])
bn_conv1d.bias       torch.Size([32])
bn_lstm.weight       torch.Size([32])
bn_lstm.bias         torch.Size([32])
Total number of parameters = 2,196,017


In [60]:
h0 = convRNN.init_hidden(batch_size=2)

In [61]:
with torch.no_grad():
    out, _ = convRNN.forward(txt, h0)
out, out.shape

(tensor([0.1950, 0.5091]), torch.Size([2]))

In [98]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
convRNN.to(device)

SentimentConvNN(
  (embed): Embedding(67359, 32)
  (conv1d): Conv1d(200, 32, kernel_size=(3,), stride=(1,), bias=False)
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(15, 64, batch_first=True)
  (dense): Linear(in_features=64, out_features=1, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (bn_embedding): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_conv1d): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_lstm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

### Training conv RNN

In [99]:
def val_score(net, val_loader, criterion):
    '''
    calculates validation loss
    does not put a net into eval mode - have to do this manually before val_score call
    
    '''
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
    loss = []
    # that is the number of objects in loader, not batches
    number_of_objects = len(val_loader.dataset)
    
    # make array of zeros with the shape of response
    pred_y = np.zeros(number_of_objects)
    true_y = np.zeros_like(pred_y)

    # store a batch size 
    batch_size = val_loader.batch_size
    
    with torch.no_grad():
        for ii, (test_x, test_y) in enumerate(val_loader):
            h = net.init_hidden(test_y.shape[0])
            test_x, test_y = test_x.to(device), test_y.to(device)
            out, _ = net.forward(test_x, h)
            batch_loss = criterion(out, test_y)
            
            # store predictions and true labels
            pred_y[ii*batch_size:ii*batch_size + len(test_y)] = out.to('cpu').numpy()
            true_y[ii*batch_size:ii*batch_size + len(test_y)] = test_y.to('cpu').numpy()
            
            loss.append(batch_loss.item())
    
    precision = precision_score(true_y, np.round(pred_y))
    recall = recall_score(true_y, np.round(pred_y))
    accuracy = accuracy_score(true_y, np.round(pred_y))
    fscore = f1_score(true_y, np.round(pred_y))
    
    metrics = {'precision':precision, 'recall':recall, 'accuracy':accuracy, 'fscore':fscore}
    
    return np.mean(loss), metrics

In [100]:
def trainer_SWA(net, criterion, optimizer, train_loader, valid_loader, clip_value=5, epochs=10, print_every=200, max_fscore=-np.inf):
    '''
    Train the network
        net - network to trian
        criterion - loss function 
        optimizer - your optimiser of choice 
        train_loader - loader for training data
        vlid_loader - lodaer for validation/test data
        clip_value - upper limit for gradient 
        epochs - number of epochs to train the net
        print_every - prin stats every number of batches
        max_fscore - best fscore on validation set - used in mutiple runs of training
    '''
    from tqdm.notebook import tqdm, trange
    
    steps = 0

    net.train()

    # run over epochs of training
    for e in trange(epochs):
        
        # array to keep value of losses over current epoch
        train_loss = []

        # run one pass through training samples = one epoch
        for train_x, train_y in train_loader:
            steps +=1

            # zero out the grads 
            net.zero_grad()
            # optimizer.zero_grad()

            # send data to device
            train_x, train_y = train_x.to(device), train_y.to(device)

            # initialize hidden state
            h = net.init_hidden(len(train_x))

            # calculate the output of the network
            out, _ = net(train_x, h)

            # compute the loss
            loss = criterion(out, train_y)
            # backprop grads of the loss wrt to net parameters
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip_value)

            # upadate parameters of network
            optimizer.step()

            # append current batch loss (loss per object in current batch)
            train_loss.append(loss.item())

            # test loss calc every 
            if steps%print_every == 0:
                # calculate loss on test set
                if max_fscore > 0.75:
                    optimizer.update_swa()
                    optimizer.swap_swa_sgd() # use SWA weights for the calc of validation loss
                    for g in optimizer.param_groups:
                        g['lr'] = 0.0005
                net.eval()
                test_loss, metrics = val_score(net, valid_loader, criterion)
                if metrics['fscore'] > max_fscore:
                    max_fscore = metrics['fscore']
                    message = '=)'
                    check_point = {'vocab_size':net.vocab_size, 
                                   'embedding_dim': net.embedding_dim, 
                                   'hidden_dim':net.hidden_dim, 
                                   'n_layers':net.n_layers, 
                                   'net_params':net.state_dict()}
                    torch.save(check_point, f"spam_model_fscore_{metrics['fscore']:.3f}.pt")
                else:
                    message = ';('
                if max_fscore > 0.75:
                    optimizer.swap_swa_sgd() # swap back normal weights and continue training
                net.train()
                print(f"Step {steps} epoch {e+1}. {message}\nTest loss is {test_loss:.4f}. Train loss is {np.mean(train_loss):.4f}.\
                F1 Score={metrics['fscore']:.2%} Precision={metrics['precision']:.2%} Recall={metrics['recall']:.2%} Accuracy={metrics['accuracy']:.2%}\n")
    return max_fscore

In [101]:
max_fscore = 0.5

In [44]:
help(SWA)

Help on class SWA in module torchcontrib.optim.swa:

class SWA(torch.optim.optimizer.Optimizer)
 |  Base class for all optimizers.
 |  
 |      Parameters need to be specified as collections that have a deterministic
 |      ordering that is consistent between runs. Examples of objects that don't
 |      satisfy those properties are sets and iterators over values of dictionaries.
 |  
 |  Args:
 |      params (iterable): an iterable of :class:`torch.Tensor` s or
 |          :class:`dict` s. Specifies what Tensors should be optimized.
 |      defaults: (dict): a dict containing default values of optimization
 |          options (used when a parameter group doesn't specify them).
 |  
 |  Method resolution order:
 |      SWA
 |      torch.optim.optimizer.Optimizer
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, optimizer, swa_start=None, swa_freq=None, swa_lr=None)
 |      Implements Stochastic Weight Averaging (SWA).
 |      
 |      Stochastic Weight Aver

In [69]:
model = torch.load('spam_model_fscore_0.843.pt')
model.keys()

dict_keys(['vocab_size', 'embedding_dim', 'hidden_dim', 'n_layers', 'net_params'])

In [70]:
convRNN.load_state_dict(model['net_params'])

<All keys matched successfully>

In [65]:
next(convRNN.parameters()).is_cuda

True

In [None]:
# base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
# opt = torchcontrib.optim.SWA(
#                 base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
# for _ in range(100):
#     opt.zero_grad()
#     loss_fn(model(input), target).backward()
#     opt.step()
# opt.swap_swa_sgd()

In [103]:
# loss and optimization functions
criterion = nn.BCELoss(reduction='mean')
base_opt = torch.optim.Adam(convRNN.parameters(), lr=lr, weight_decay=0.005)
#optimizer = torch.optim.SGD(convRNN.parameters(), lr=lr, momentum=0.9, nesterov=True)
#optimizer = torch.optim.RMSprop(convRNN.parameters(), lr=lr, momentum=0.9)
lr = 0.001
#base_opt = torch.optim.SGD(convRNN.parameters(), lr=lr)
optimizer = SWA(base_opt)

In [104]:
for g in optimizer.param_groups:
    print(g['lr'])

0.001


In [106]:
max_fscore = trainer_SWA(convRNN, criterion, optimizer, train_loader, valid_loader, clip_value=10, epochs=50, print_every=200, max_fscore=max_fscore)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Step 200 epoch 1. ;(
Test loss is 0.8857. Train loss is 0.1112.                F1 Score=84.88% Precision=86.99% Recall=82.88% Accuracy=85.24%

Step 400 epoch 1. ;(
Test loss is 0.9245. Train loss is 0.1272.                F1 Score=84.68% Precision=87.27% Recall=82.24% Accuracy=85.12%

Step 600 epoch 2. ;(
Test loss is 0.9929. Train loss is 0.1085.                F1 Score=84.89% Precision=86.73% Recall=83.12% Accuracy=85.20%

Step 800 epoch 2. ;(
Test loss is 1.0517. Train loss is 0.1230.                F1 Score=84.57% Precision=87.04% Recall=82.24% Accuracy=85.00%

Step 1000 epoch 3. ;(
Test loss is 1.1424. Train loss is 0.1166.                F1 Score=84.91% Precision=86.61% Recall=83.28% Accuracy=85.20%

Step 1200 epoch 3. ;(
Test loss is 1.3974. Train loss is 0.1272.                F1 Score=84.97% Precision=86.56% Recall=83.44% Accuracy=85.24%

Step 1400 epoch 4. ;(
Test loss is 2.1828. Train loss is 0.1135.                F1 Score=85.09% Precision=86.46% Recall=83.76% Accuracy=85.3

## SWA result - you can not train a ntework with SGD... SWA does not give the result here
# BUT with Adam in our case it works almost on par with vanilla attention

## Make convRNN with pretrained embedding from GloVe

In [85]:
# Instantiate the model with hyperparams
vocab_size = len(vocab) 
output_size = 1 # not needed
embedding_dim = 50
hidden_dim = 64
n_layers = 1

In [86]:
# vocab_size, output_size=1, embedding_dim=32, hidden_dim=64, out_channels=32, drop_prob=0.5, vocab_vectors=None
convRNN = SentimentConvNN(vocab_size=vocab_size, embedding_dim=embedding_dim, vocab_vectors=vocab.vectors)
print(convRNN.num_parameters())

embed.weight         torch.Size([67359, 50])
conv1d.weight        torch.Size([32, 200, 3])
lstm.weight_ih_l0    torch.Size([256, 24])
lstm.weight_hh_l0    torch.Size([256, 64])
lstm.bias_ih_l0      torch.Size([256])
lstm.bias_hh_l0      torch.Size([256])
dense.weight         torch.Size([1, 64])
dense.bias           torch.Size([1])
bn_embedding.weight  torch.Size([200])
bn_embedding.bias    torch.Size([200])
bn_conv1d.weight     torch.Size([32])
bn_conv1d.bias       torch.Size([32])
bn_lstm.weight       torch.Size([32])
bn_lstm.bias         torch.Size([32])
Total number of parameters = 3,410,783


In [87]:
# if you want to make a grad
convRNN.embed.weight.requires_grad_(True)

Parameter containing:
tensor([[ 0.0460, -0.1792, -0.1163,  ..., -0.0013, -0.0254, -0.0351],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       requires_grad=True)

In [88]:
convRNN.embed.weight.requires_grad, convRNN.dense.weight.requires_grad

(True, True)

In [89]:
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
convRNN.to(device)

SentimentConvNN(
  (embed): Embedding(67359, 50)
  (conv1d): Conv1d(200, 32, kernel_size=(3,), stride=(1,), bias=False)
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(24, 64, batch_first=True)
  (dense): Linear(in_features=64, out_features=1, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (bn_embedding): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_conv1d): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_lstm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [90]:
def trainer(net, criterion, optimizer, train_loader, valid_loader, clip_value=5, epochs=10, print_every=200, max_fscore=-np.inf):
    '''
    Train the network
        net - network to trian
        criterion - loss function 
        optimizer - your optimiser of choice 
        train_loader - loader for training data
        vlid_loader - lodaer for validation/test data
        clip_value - upper limit for gradient 
        epochs - number of epochs to train the net
        print_every - prin stats every number of batches
        max_fscore - best fscore on validation set - used in mutiple runs of training
    '''
    from tqdm.notebook import tqdm, trange
    
    steps = 0

    net.train()

    # run over epochs of training
    for e in trange(epochs):
        
        # array to keep value of losses over current epoch
        train_loss = []

        # run one pass through training samples = one epoch
        for train_x, train_y in train_loader:
            steps +=1

            # zero out the grads 
            net.zero_grad()
            # optimioptimizer.zero_grad()

            # send data to device
            train_x, train_y = train_x.to(device), train_y.to(device)

            # initialize hidden state
            h = net.init_hidden(len(train_x))

            # calculate the output of the network
            out, _ = net(train_x, h)

            # compute the loss
            loss = criterion(out, train_y)
            # backprop grads of the loss wrt to net parameters
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip_value)

            # upadate parameters of network
            optimizer.step()

            # append current batch loss (loss per object in current batch)
            train_loss.append(loss.item())

            # test loss calc every 
            if steps%print_every == 0:
                # calculate loss on test set
                test_loss, metrics = val_score(net, valid_loader, criterion)
                if metrics['fscore'] > max_fscore:
                    max_fscore = metrics['fscore']
                    message = '=)'
                    check_point = {'vocab_size':net.vocab_size, 
                                   'embedding_dim': net.embedding_dim, 
                                   'hidden_dim':net.hidden_dim, 
                                   'n_layers':net.n_layers, 
                                   'net_params':net.state_dict()}
                    torch.save(check_point, f"spam_model_fscore_{metrics['fscore']:.3f}.pt")
                else:
                    message = ';('
                net.train()
                print(f"Step {steps} epoch {e+1}. {message}\nTest loss is {test_loss:.4f}. Train loss is {np.mean(train_loss):.4f}.\
                F1 Score={metrics['fscore']:.2%} Precision={metrics['precision']:.2%} Recall={metrics['recall']:.2%} Accuracy={metrics['accuracy']:.2%}\n")
    return max_fscore

In [91]:
batch_size

50

In [92]:
# loss and optimization functions
lr = 0.001
criterion = nn.BCELoss(reduction='mean')
optimizer = torch.optim.Adam(convRNN.parameters(), lr=lr, weight_decay=1e-2)
#optimizer = torch.optim.SGD(convRNN.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-2)
#optimizer = torch.optim.RMSprop(convRNN.parameters(), lr=lr, momentum=0.9)

In [93]:
max_fscore

0.8638963019443385

In [311]:
max_fscore = trainer(convRNN, criterion, optimizer, train_loader, valid_loader, clip_value=10, epochs=50, print_every=200, max_fscore=max_fscore)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Step 200 epoch 1. ;(
Test loss is 0.6891. Train loss is 0.6993.                F1 Score=64.30% Precision=51.78% Recall=84.80% Accuracy=52.92%

Step 400 epoch 1. ;(
Test loss is 0.6878. Train loss is 0.6941.                F1 Score=34.81% Precision=60.24% Recall=24.48% Accuracy=54.16%

Step 600 epoch 2. ;(
Test loss is 0.6638. Train loss is 0.6787.                F1 Score=64.02% Precision=58.37% Recall=70.88% Accuracy=60.16%

Step 800 epoch 2. ;(
Test loss is 0.6365. Train loss is 0.6688.                F1 Score=61.05% Precision=67.21% Recall=55.92% Accuracy=64.32%

Step 1000 epoch 3. ;(
Test loss is 0.5935. Train loss is 0.6214.                F1 Score=65.12% Precision=73.35% Recall=58.56% Accuracy=68.64%

Step 1200 epoch 3. ;(
Test loss is 0.5561. Train loss is 0.6003.                F1 Score=69.35% Precision=73.65% Recall=65.52% Accuracy=71.04%

Step 1400 epoch 4. ;(
Test loss is 0.5144. Train loss is 0.5418.                F1 Score=74.96% Precision=76.46% Recall=73.52% Accuracy=75.4

## CONV RNN with pretrained embedding gives no gain in metrics - you must make embedding trainable otherwise the results are horrible

---
### Manual conv network assembly
---
```python
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length,input_length=max_review_length)) 
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2)) 
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
```


In [426]:
embed = nn.Embedding(vocab_size, embedding_dim)
conv1d_sentiment = nn.Conv1d(in_channels=200, out_channels=32, kernel_size=3, bias=False, padding=False)
maxpool_sentiment = nn.MaxPool1d(kernel_size=2)
lstm_sentiment = nn.LSTM(input_size=15,
                         hidden_size=hidden_dim,
                         num_layers=1,
                         batch_first=True,
                         dropout=0)
lstm_h0 = (torch.zeros(1,2,hidden_dim), torch.zeros(1,2,hidden_dim))
dense_sentiment = nn.Linear(hidden_dim, 1)
drop_sentiment = nn.Dropout(p=0.5)

In [427]:
conv1d_sentiment.weight.shape

torch.Size([32, 200, 3])

In [429]:
with torch.no_grad():
    embed_out = embed(txt)
    embed_out = drop_sentiment(embed_out)
    conv_out = conv1d_sentiment(embed_out)
    conv_out_relu = F.relu(conv_out)
    maxpool_out = maxpool_sentiment(conv_out_relu)
    lstm_out, _ = lstm_sentiment(maxpool_out, lstm_h0)
    lstm_out = drop_sentiment(lstm_out)
    out_dense = dense_sentiment(lstm_out)
    out = nn.Sigmoid()(out_dense[:,-1,:]).view(out_dense.shape[0])
# Input:  (N, Cin,  Lin)
# Output: (N, Cout, Lout)
print(f'Input shape \t{txt.shape}\nEmbedding shape {embed_out.shape}\nConv1d shape \t{conv_out.shape}\
\nMaxPool shape \t{maxpool_out.shape}\nLSTM shape \t{lstm_out.shape}\nDense shape \t{out_dense.shape}\nSigmoid shape \t{out.shape}')


Input shape 	torch.Size([2, 200])
Embedding shape torch.Size([2, 200, 32])
Conv1d shape 	torch.Size([2, 32, 30])
MaxPool shape 	torch.Size([2, 32, 15])
LSTM shape 	torch.Size([2, 32, 64])
Dense shape 	torch.Size([2, 32, 1])
Sigmoid shape 	torch.Size([2])


---
### END:Manual conv network assembly
---

# Conv1d
in the simplest case, the output value of the layer with input size (N,Cin,L) and output (N,Cout,Lout)

In [338]:
conv_1d_input_temp = torch.arange(18).to(torch.float).view(3,2,-1)
conv_1d_input_temp, conv_1d_input_temp.shape

(tensor([[[ 0.,  1.,  2.],
          [ 3.,  4.,  5.]],
 
         [[ 6.,  7.,  8.],
          [ 9., 10., 11.]],
 
         [[12., 13., 14.],
          [15., 16., 17.]]]),
 torch.Size([3, 2, 3]))

In [340]:
extra = torch.arange(6).view((3,2,1))
extra, extra.shape

(tensor([[[0],
          [1]],
 
         [[2],
          [3]],
 
         [[4],
          [5]]]),
 torch.Size([3, 2, 1]))

In [341]:
conv_1d_input = torch.cat((conv_1d_input_temp, extra), dim = 2)
conv_1d_input, conv_1d_input.shape

(tensor([[[ 0.,  1.,  2.,  0.],
          [ 3.,  4.,  5.,  1.]],
 
         [[ 6.,  7.,  8.,  2.],
          [ 9., 10., 11.,  3.]],
 
         [[12., 13., 14.,  4.],
          [15., 16., 17.,  5.]]]),
 torch.Size([3, 2, 4]))

In [349]:
def init_weight(m):
    from functools import reduce
    l = reduce(lambda x,y: x*y, m.weight.data.shape)
    if type(m) == nn.Conv1d:
        m.weight.data = torch.ones(l).to(torch.float).reshape(m.weight.data.shape)
        m.weight.data[1] += m.weight.data[1]

layer_conv1d = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, bias=False)
layer_conv1d.apply(init_weight)

layer_conv1d.weight.data, layer_conv1d.weight.data.shape

(tensor([[[1., 1.],
          [1., 1.]],
 
         [[2., 2.],
          [2., 2.]],
 
         [[1., 1.],
          [1., 1.]],
 
         [[1., 1.],
          [1., 1.]]]),
 torch.Size([4, 2, 2]))

In [350]:
conv_1d_out = layer_conv1d(conv_1d_input)

conv_1d_out

tensor([[[  8.,  12.,   8.],
         [ 16.,  24.,  16.],
         [  8.,  12.,   8.],
         [  8.,  12.,   8.]],

        [[ 32.,  36.,  24.],
         [ 64.,  72.,  48.],
         [ 32.,  36.,  24.],
         [ 32.,  36.,  24.]],

        [[ 56.,  60.,  40.],
         [112., 120.,  80.],
         [ 56.,  60.,  40.],
         [ 56.,  60.,  40.]]], grad_fn=<SqueezeBackward1>)

##  Convolution of kernel size 1
it is equal to linear layer for point-wise transformation in transformer

In [354]:
layer_conv1d = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=1, bias=False)
layer_conv1d.apply(init_weight)

layer_conv1d.weight.data, layer_conv1d.weight.data.shape

(tensor([[[1.],
          [1.],
          [1.]],
 
         [[2.],
          [2.],
          [2.]],
 
         [[1.],
          [1.],
          [1.]],
 
         [[1.],
          [1.],
          [1.]]]),
 torch.Size([4, 3, 1]))

In [364]:
kernel1input = torch.arange(3).to(torch.float).view(1,3,-1)
kernel1input, kernel1input.shape

(tensor([[[0.],
          [1.],
          [2.]]]),
 torch.Size([1, 3, 1]))

In [365]:
layer_conv1d(kernel1input)

tensor([[[3.],
         [6.],
         [3.],
         [3.]]], grad_fn=<SqueezeBackward1>)