## Let's make a **convolution** work on MOVIE REVIEW CLASSIFICATION and utilize TORCHTEXT

In [13]:
import torchtext as TT
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
import transformers
from pandas import DataFrame

In [2]:
from torchsummary import summary

In [110]:
# Read both text files into lists of lines (trailing newlines are kept).
with open('./data/reviews.txt','r') as reviews_file:
    data = list(reviews_file)
with open('./data/labels.txt', 'r') as labels_file:
    labels = list(labels_file)

In [78]:
# optional - remove punctuation
from string import punctuation
print(punctuation)

# translation table that deletes every punctuation character in one pass
strip_punctuation = str.maketrans('', '', punctuation)
all_data = [review.translate(strip_punctuation) for review in data]
data = all_data

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [111]:
# pick any review uniformly at random; low=0 so that the first review can be drawn too
i = np.random.randint(low=0,high=len(data))
print(data[i], labels[i])

this film is a spicy little piece of film  making from sam fuller which gives richard widmark the chance to show of some of his best  most edgy acting in the role of skip mccoy  a small  time thief who stumbles onto a military secret while picking beautiful candy  s  jean peters  pocket on a crowded bus . it turns out candy was doing a favor for her  ex   boyfriend  who  s working for the  commies  .  br    br   superficially  there  s a mystery here regarding candy  s motives and skip spends much of the film determining her motives . actually he seems to just initially assume that she  s a  commie   going so far as to pour beer in her face in a callous gesture . but the real question is  what  s going on with skip  what are his motives  and why does candy like him so much  why do we  the audience  want to like him so much  basically what the film  makers have done here is create a very striking  male fatale  in widmark  s character and his performance . just as the male audience tends

In [112]:
len(data)

25000

In [113]:
# number of whitespace-separated tokens per review, as a single-column frame
review_length = DataFrame(list(map(lambda review: len(review.split()), data)))

In [114]:
review_length.describe()

Unnamed: 0,0
count,25000.0
mean,253.89552
std,186.927241
min,11.0
25%,139.0
50%,190.0
75%,309.0
max,2633.0


In [115]:
%config Completer.use_jedi = False

###  Define Dataset and DataLoader along with Tokenization

In [117]:
tokenizer = transformers.AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
tokenizer.encode_plus

In [156]:
class TextData(torch.utils.data.Dataset):
    '''
    Very basic map-style dataset for text classification.

    Holds the raw texts and (optionally) their labels, tokenizes one text per
    access and implements the ``__len__`` / ``__getitem__`` protocol.
    '''
    def __init__(
        self, 
        labels,
        text, 
        label_dict = None,
        max_seq_length = 200, 
        model_name="distilbert-base-uncased"
    ):
        
        """
        Constructor for the text dataset.

        Args:
            labels - list with labels, or None for unlabeled data
            text - a list with texts to classify
            label_dict - a dictionary to map a label into a class id; when it is
                omitted and labels are given, it is built from the sorted unique labels
            max_seq_length - max length of a single text in tokens, text is truncated
                or padded to equal max_seq_length
            model_name - specific model name from transformers library

        Raises:
            AssertionError - if labels are given and their number differs from the number of texts
        """ 
        if labels is not None:
            assert len(text) == len(labels)
        super(TextData, self).__init__()
        self.labels = labels
        self.text = text
        self.label_dict = label_dict
        self.max_seq_length = max_seq_length
        self.model_name = model_name
        
        if self.label_dict is None and labels is not None:
            # auto-encode labels to 0,1,2,... label ids; sorting keeps the mapping deterministic
            self.label_dict = {label: idx for idx, label in enumerate(sorted(set(labels)))}
        
        # init the tokenizer; padding tokens are placed on the left of the sequence
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.padding_side = 'left'
        
    def __getitem__(self, index):
        """
        Tokenize the text at `index`.

        Returns a mapping with `input_ids` and `attention_mask` (the leading batch
        dimension added by the tokenizer is removed) and, when labels are available,
        `targets` - a 0-dim long tensor with the class id (-1 for a label missing
        from `label_dict`).
        """
        txt = self.text[index]
        
        # a dictionary with `input_ids` and `attention_mask` as keys
        output_dict = self.tokenizer.encode_plus( 
            txt, 
            add_special_tokens=True, 
            padding='max_length',
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True
        )
        # drop only the leading batch dimension, so the sequence axis survives
        # even when max_seq_length == 1
        output_dict['input_ids'] = output_dict['input_ids'].squeeze(0)
        output_dict['attention_mask'] = output_dict['attention_mask'].squeeze(0)
        
        # target
        if self.labels is not None:
            y = self.labels[index]
            output_dict["targets"] = torch.tensor(self.label_dict.get(y, -1), dtype=torch.long)
        
        return output_dict
    
    def __len__(self):
        # sized by the texts: labels may legitimately be None for unlabeled data
        return len(self.text)

#### Try out the class/object 

In [157]:
txt_data = TextData(labels, data)

In [158]:
# show the tokenizer settings, one comma-separated field of its repr per line
print(*repr(txt_data.tokenizer).split(","), sep="\n")

PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased'
 vocab_size=30522
 model_max_len=512
 is_fast=True
 padding_side='left'
 special_tokens={'unk_token': '[UNK]'
 'sep_token': '[SEP]'
 'pad_token': '[PAD]'
 'cls_token': '[CLS]'
 'mask_token': '[MASK]'})


#### Take a look at a sample text - how it gets encoded

In [159]:
# draw a random review index in [0, len(data)) and show the raw text followed by its label
rnd_review = np.random.randint(len(data))
print(data[rnd_review], labels[rnd_review], sep="\n")

contrary to my principles  let me first come up with a conclusion  because i have just seen this piece of  art   and still am under strong impressions . the reader is asked to excuse my stronger vocabulary .  br    br   well  this movie is absolutely horrible  and i would never bother to write a single word about it  if it were not for the fact that    minutes  made me sick to death  which rarely happens to me . the fact that i paid for that does not exactly makes me feel better  as well as the fact the movie deserved the high user rating here .  br    br   so what is wrong with the movie  it has a fashionable title     minutes  . one first thinks about    minutes   which is by the way a much better movie  but still bad in my book  and indeed the two can be compared to some extent . but  as luck would have it  the things they share are their worst characteristics . they both feature mr . oleg taktarov  who with his strong russian accent obviously meets the popular expectations and prej

In [160]:
txt_data[rnd_review]

{'input_ids': tensor([  101, 10043,  2000,  2026,  6481,  2292,  2033,  2034,  2272,  2039,
         2007,  1037,  7091,  2138,  1045,  2031,  2074,  2464,  2023,  3538,
         1997,  2396,  1998,  2145,  2572,  2104,  2844, 19221,  1012,  1996,
         8068,  2003,  2356,  2000,  8016,  2026,  6428, 16188,  1012,  7987,
         7987,  2092,  2023,  3185,  2003,  7078,  9202,  1998,  1045,  2052,
         2196,  8572,  2000,  4339,  1037,  2309,  2773,  2055,  2009,  2065,
         2009,  2020,  2025,  2005,  1996,  2755,  2008,  2781,  2081,  2033,
         5305,  2000,  2331,  2029,  6524,  6433,  2000,  2033,  1012,  1996,
         2755,  2008,  1045,  3825,  2005,  2008,  2515,  2025,  3599,  3084,
         2033,  2514,  2488,  2004,  2092,  2004,  1996,  2755,  1996,  3185,
        10849,  1996,  2152,  5310,  5790,  2182,  1012,  7987,  7987,  2061,
         2054,  2003,  3308,  2007,  1996,  3185,  2009,  2038,  1037, 19964,
         2516,  2781,  1012,  2028,  2034,  6732, 

#### Vocabulary from tokenizer 

In [128]:
vocab = txt_data.tokenizer.get_vocab()
# id -> token lookup, inserted in ascending id order
reverse_vocab = {
    token_id: token
    for token, token_id in sorted(vocab.items(), key=lambda pair: (pair[1], pair[0]))
}

In [129]:
vocab

{'dried': 9550,
 '##icus': 14239,
 '##escu': 19434,
 'dylan': 7758,
 'need': 2342,
 'thanking': 28638,
 'prizes': 11580,
 'orson': 25026,
 '##llar': 17305,
 'cara': 14418,
 'fathers': 11397,
 'propelled': 15801,
 'frescoes': 23360,
 'breathless': 16701,
 'turns': 4332,
 'pay': 3477,
 'awe': 15180,
 '1721': 27689,
 'colonel': 4327,
 'baggage': 20220,
 'crosby': 14282,
 'decreased': 10548,
 'jaya': 24120,
 'prevailed': 19914,
 'appointment': 6098,
 'regulations': 7040,
 'includes': 2950,
 'drawer': 13065,
 'famed': 15607,
 'gable': 13733,
 'danzig': 26669,
 '##arus': 29133,
 'consisting': 5398,
 '[unused356]': 361,
 'located': 2284,
 'every': 2296,
 'stadium': 3346,
 'arrived': 3369,
 'gazed': 11114,
 '##cise': 18380,
 'fashionable': 19964,
 'ornate': 18099,
 'custer': 28888,
 'covent': 29456,
 'revolves': 19223,
 '扌': 1859,
 'once': 2320,
 'denis': 11064,
 'vulture': 27588,
 'churches': 5231,
 'ultimately': 4821,
 'songwriting': 14029,
 'singer': 3220,
 'annoying': 15703,
 'vase': 18781

In [130]:
reverse_vocab

{0: '[PAD]',
 1: '[unused0]',
 2: '[unused1]',
 3: '[unused2]',
 4: '[unused3]',
 5: '[unused4]',
 6: '[unused5]',
 7: '[unused6]',
 8: '[unused7]',
 9: '[unused8]',
 10: '[unused9]',
 11: '[unused10]',
 12: '[unused11]',
 13: '[unused12]',
 14: '[unused13]',
 15: '[unused14]',
 16: '[unused15]',
 17: '[unused16]',
 18: '[unused17]',
 19: '[unused18]',
 20: '[unused19]',
 21: '[unused20]',
 22: '[unused21]',
 23: '[unused22]',
 24: '[unused23]',
 25: '[unused24]',
 26: '[unused25]',
 27: '[unused26]',
 28: '[unused27]',
 29: '[unused28]',
 30: '[unused29]',
 31: '[unused30]',
 32: '[unused31]',
 33: '[unused32]',
 34: '[unused33]',
 35: '[unused34]',
 36: '[unused35]',
 37: '[unused36]',
 38: '[unused37]',
 39: '[unused38]',
 40: '[unused39]',
 41: '[unused40]',
 42: '[unused41]',
 43: '[unused42]',
 44: '[unused43]',
 45: '[unused44]',
 46: '[unused45]',
 47: '[unused46]',
 48: '[unused47]',
 49: '[unused48]',
 50: '[unused49]',
 51: '[unused50]',
 52: '[unused51]',
 53: '[unused52]',

In [141]:
# `input_ids` is already 1-D (TextData.__getitem__ squeezes the batch axis), so decode
# every id directly; indexing with [0] would leave a single scalar that cannot be iterated
print(*[reverse_vocab[token_id] for token_id in txt_data[rnd_review]['input_ids'].tolist()])

[CLS] this movie can be labeled as a study case . it s not just the fact that it denotes an un ##hea ##lth ##y and non artistic lust for anything that might be termed as ca ##co imagery . the author lives with the impression that his san ##ct ##imo ##nio ##us revolt against some generic and childish ##ly termed social ill ##s mold ##avia is the most pau ##per region of europe i don t believe one io ##ta in the birds flu romanian people steal because they are poor europeans steal because they are thieves are more or less close to a responsible moral and artistic attitude but he is sore ##ly off target br br what dane ##li ##uc doesn t know is that it s not enough to pose as a righteous person you also need a mod ##icum of professional ##ism talent and intelligence to trans ##pose this stance into an artistic product . fate ##fully the foreign legion shows as much ac ##umen as a family video with uncle go ##gu drunken ##ly wet ##ting himself in front of the guests . the script is chaotic

In [144]:
print(data[rnd_review])

this movie can be labeled as a study case . it  s not just the fact that it denotes an unhealthy and non  artistic lust for anything that might be termed as caco  imagery . the author lives with the impression that his sanctimonious revolt against some generic and childishly termed social ills   moldavia is the most pauper region of europe    i don  t believe one iota in the birds flu    romanian people steal because they are poor europeans steal because they are thieves   are more or less close to a responsible moral and artistic attitude  but he is sorely off  target   br    br   what daneliuc doesn  t know  is that it  s not enough to pose as a righteous person  you also need a modicum of professionalism  talent and intelligence to transpose this stance into an artistic product . fatefully   the foreign legion  shows as much acumen as a family video with uncle gogu drunkenly wetting himself in front of the guests . the script is chaotic and incoherent  randomly bustling together sun

#### DataLoader and train/val/test split 

In [161]:
# train, test, validation split
split_frac = 0.8
BATCH_SIZE = 2

## slice boundaries: [0, split_idx) is used for training, [split_idx, test_val_idx)
## for validation and the remainder for testing

split_idx = 1 + int(split_frac * len(data))
test_val_idx = split_idx + (len(data) - split_idx) // 2
split_idx, test_val_idx

(20001, 22500)

In [162]:
# one dataset per contiguous slice of the corpus: train / validation / test
train_dataset, val_dataset, test_dataset = (
    TextData(labels[part], data[part])
    for part in (slice(None, split_idx), slice(split_idx, test_val_idx), slice(test_val_idx, None))
)

In [148]:
!ls -lth /home/testuser/.cache/huggingface/transformers/

итого 708K
-rw-r--r-- 1 testuser testuser   28 июн 15 16:03 8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
-rw-r--r-- 1 testuser testuser  148 июн 15 16:03 8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.json
-rwxr-xr-x 1 testuser testuser    0 июн 15 16:03 8c8624b8ac8aa99c60c912161f8332de003484428c47906d7ff7eb7f73eecdbb.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
-rw-r--r-- 1 testuser testuser 456K июн 15 16:03 75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4
-rw-r--r-- 1 testuser testuser  141 июн 15 16:03 75abb59d7a06f4f640158a9bfcde005264e59e8d566781ab1415b139d2e4c603.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.json
-rwxr-xr-x 1 testuser testuser    0 июн 15 16:03 75abb59d7a06f4f640158a9bfcde00

In [164]:
# a shuffled loader for every split, keyed by split name
data_loader = {
    split_name: DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    for split_name, dataset in (('train', train_dataset), ('valid', val_dataset), ('test', test_dataset))
}

In [165]:
ttt = next(iter(data_loader['train']))

In [166]:
ttt

{'input_ids': tensor([[  101,  2023,  2003,  1037,  2200,  3697,  3185,  1998,  2009,  1055,
           2471,  5263,  2000,  2131,  1037,  5047,  2006,  2054,  1055,  2183,
           2006,  1012,  2012,  2034,  2009,  3849,  2000,  2022,  1037,  2738,
          14662,  3185,  2055,  1037,  3124, 29461, 13687, 21983,  2040,  3791,
           2019,  4545,  1998,  2738, 13675, 12054,  2135, 18675,  2370,  2046,
           2028,  2043,  1996,  2783, 16713,  1037,  2450, 27791,  5920,  1012,
           2059,  1996, 21438,  1998,  4332,  2707,  1012,  2024,  1996, 10638,
           2667,  2000,  3102,  2032,  1998,  2339,  2024,  1996,  2757, 16713,
           1055,  4253,  3810,  2039,  1999,  1996,  4545,  2028, 16278,  2633,
           2065, 29461, 13687,  4492,  5705,  2100,  2003,  1996,  3188, 16713,
           1012,  7987,  7987, 27594,  2121, 27594,  2121, 27594,  2121, 27594,
           2121, 27594,  2121, 27594,  2121,  7987,  7987,  2028,  1997,  1996,
          12225, 14955,  69

In [173]:
len(train_dataset), len(val_dataset), len(test_dataset)

(20001, 2499, 2500)

## Define the model 

In [54]:
batch_size = 50
# the train/validation boundary is moved back by one compared with the earlier split,
# which yields the sizes reported in the next cell
train_dataset, valid_dataset, test_dataset = (
    TextData(labels[part], data[part])
    for part in (slice(None, split_idx-1), slice(split_idx-1, test_val_idx), slice(test_val_idx, None))
)

In [55]:
len(test_dataset), len(valid_dataset), len(train_dataset)

(2500, 2500, 20000)

In [56]:
# shuffled loaders for the three splits, all sharing the same collate function
# NOTE(review): `tokenize_batch` is not defined in the visible part of this notebook -
# confirm where it comes from before running on a fresh kernel.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=tokenize_batch)
    for dataset in (train_dataset, valid_dataset, test_dataset)
)

## Make a model

In [57]:
import torch.nn as nn
import torch.nn.functional as F

#         Input shape     torch.Size([2, 200])
#         Embedding shape torch.Size([2, 200, 32]) 50
#         Conv1d shape    torch.Size([2, 32, 30]) 48
#         MaxPool shape   torch.Size([2, 32, 15]) 24
#         LSTM shape      torch.Size([2, 32, 64])
#         Dense shape     torch.Size([2, 32, 1])
#         Sigmoid shape   torch.Size([2])

class SentimentConvNN(nn.Module):
    '''
    Embedding -> Conv1d -> MaxPool1d -> LSTM -> Linear -> sigmoid binary classifier.

    The convolution is built with ``in_channels=seq_len``: token positions act as
    channels and the kernel slides over the embedding dimension. The LSTM
    (``batch_first=True``) then iterates over the ``out_channels`` axis.

    Shapes for batch size 2 with the default arguments:
        Input      torch.Size([2, 200])
        Embedding  torch.Size([2, 200, 32])
        Conv1d     torch.Size([2, 32, 30])
        MaxPool    torch.Size([2, 32, 15])
        LSTM       torch.Size([2, 32, 64])
        Dense      torch.Size([2, 32, 1])
        Sigmoid    torch.Size([2])
    '''
    def __init__(self, vocab_size, output_size=1, embedding_dim=32, hidden_dim=64, out_channels=32, drop_prob=0.5, vocab_vectors=None, seq_len=200):
        '''
        Args:
            vocab_size - number of rows of the embedding table (ignored when vocab_vectors is given)
            output_size - stored on the instance only; the dense layer always has one output
            embedding_dim - embedding width (ignored when vocab_vectors is given)
            hidden_dim - LSTM hidden size
            out_channels - number of convolution filters
            drop_prob - dropout probability applied after the embedding and after the LSTM
            vocab_vectors - optional 2-D tensor of pretrained embeddings, loaded frozen
            seq_len - length (in tokens) of every input sequence; the default 200 keeps
                the layer shapes of existing checkpoints
        '''
        super(SentimentConvNN, self).__init__()
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.drop_prob = drop_prob
        self.out_channels = out_channels
        self.seq_len = seq_len
        self.n_layers = 1
        
        # if we provide vocab_vectors then initialize weights
        if vocab_vectors is not None:
            self.embed = nn.Embedding.from_pretrained(vocab_vectors, freeze=True)
        else:
            self.embed = nn.Embedding(vocab_size, embedding_dim)
        # take the sizes from the actual table so they stay correct with pretrained vectors
        self.vocab_size = self.embed.num_embeddings
        self.embedding_dim = self.embed.embedding_dim
            
        # padding is an int (0 = no padding), not a boolean
        self.conv1d = nn.Conv1d(in_channels=seq_len, out_channels=self.out_channels, kernel_size=3, bias=False, padding=0)
        self.maxpool = nn.MaxPool1d(kernel_size=2)
        
        # reproduce the conv1d + maxpool arithmetic: the kernel of 3 without padding
        # removes 2 positions, the pooling halves what is left
        lstm_input = (self.embedding_dim - 2) // 2
        
        self.lstm = nn.LSTM(input_size=lstm_input,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            batch_first=True,
                            dropout=0)
        self.dense = nn.Linear(hidden_dim, 1)
        self.drop = nn.Dropout(p=drop_prob)
        
        self.bn_embedding = nn.BatchNorm1d(num_features=seq_len)
        self.bn_conv1d = nn.BatchNorm1d(num_features=self.out_channels)
        self.bn_lstm = nn.BatchNorm1d(num_features=self.out_channels)
    
    def num_parameters(self):
        '''
        Return a printable summary: one line with name and shape per parameter,
        followed by the total number of parameters in the network.
        '''
        lines = [f'{name:20} {param.shape}\n' for name, param in self.named_parameters()]
        total = sum(param.numel() for param in self.parameters())
        return "".join(lines) + f'Total number of parameters = {total:,}'
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM.
        # new_zeros copies dtype and device from an existing parameter.
        weight = next(self.parameters())
        hidden = (weight.new_zeros(self.n_layers, batch_size, self.hidden_dim),
                  weight.new_zeros(self.n_layers, batch_size, self.hidden_dim))

        return hidden
    
    def forward(self, x, hidden):
        '''
        Args:
            x - integer token ids of shape (batch, seq_len)
            hidden - initial (h0, c0) pair for the LSTM, e.g. from init_hidden

        Returns:
            (out, 1) - out holds one probability per sample, shape (batch,);
            the second element is a constant placeholder kept for callers that
            unpack two values.
        '''
        embed_out = self.embed(x)
        embed_out = self.bn_embedding(embed_out)
        embed_out = self.drop(embed_out)
        
        conv_out = self.conv1d(embed_out)
        conv_out = self.bn_conv1d(conv_out)
        conv_out_relu = F.relu(conv_out)
        maxpool_out = self.maxpool(conv_out_relu)
        
        lstm_out, _ = self.lstm(maxpool_out, hidden)
        lstm_out = self.bn_lstm(lstm_out)
        lstm_out = self.drop(lstm_out)
        
        out_dense = self.dense(lstm_out)
        # only the last LSTM step is used for the prediction
        out = torch.sigmoid(out_dense[:,-1,:]).view(out_dense.shape[0])
        
        return out, 1

In [58]:
# Hyper-parameters for the model with a randomly initialised embedding
n_layers = 1
hidden_dim = 64
embedding_dim = 32
output_size = 1 # not needed
vocab_size = len(vocab)

In [97]:
convRNN = SentimentConvNN(vocab_size=vocab_size)
print(convRNN.num_parameters())

embed.weight         torch.Size([67359, 32])
conv1d.weight        torch.Size([32, 200, 3])
lstm.weight_ih_l0    torch.Size([256, 15])
lstm.weight_hh_l0    torch.Size([256, 64])
lstm.bias_ih_l0      torch.Size([256])
lstm.bias_hh_l0      torch.Size([256])
dense.weight         torch.Size([1, 64])
dense.bias           torch.Size([1])
bn_embedding.weight  torch.Size([200])
bn_embedding.bias    torch.Size([200])
bn_conv1d.weight     torch.Size([32])
bn_conv1d.bias       torch.Size([32])
bn_lstm.weight       torch.Size([32])
bn_lstm.bias         torch.Size([32])
Total number of parameters = 2,196,017


In [60]:
h0 = convRNN.init_hidden(batch_size=2)

In [61]:
# smoke test: a single forward pass without gradient tracking
# NOTE(review): `txt` is not defined in the visible cells - presumably a batch of two
# token-id sequences matching h0; confirm before running on a fresh kernel.
with torch.no_grad():
    out, _ = convRNN(txt, h0)
out, out.shape

(tensor([0.1950, 0.5091]), torch.Size([2]))

In [98]:
# `device` is a torch.device on both the GPU and the CPU path (never a bare string)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
convRNN.to(device)

SentimentConvNN(
  (embed): Embedding(67359, 32)
  (conv1d): Conv1d(200, 32, kernel_size=(3,), stride=(1,), bias=False)
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(15, 64, batch_first=True)
  (dense): Linear(in_features=64, out_features=1, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (bn_embedding): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_conv1d): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_lstm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

### Training conv RNN

In [99]:
def val_score(net, val_loader, criterion):
    '''
    Calculate the validation loss and binary classification metrics.

    Does not put the net into eval mode - do this manually before the call.

    Args:
        net - network exposing init_hidden(batch_size) and returning (output, _) when called
        val_loader - loader yielding (inputs, targets) batches
        criterion - loss function applied to (output, targets)

    Returns:
        (mean_loss, metrics) - the mean of the per-batch losses and a dict with the
        keys 'precision', 'recall', 'accuracy' and 'fscore'; predictions are rounded
        to the nearest integer before the metrics are computed.
    '''
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
    
    # evaluate on the device the network lives on instead of relying on a notebook global
    device = next(net.parameters()).device
    
    loss = []
    # that is the number of objects in loader, not batches
    number_of_objects = len(val_loader.dataset)
    
    # make array of zeros with the shape of response
    pred_y = np.zeros(number_of_objects)
    true_y = np.zeros_like(pred_y)

    # running write position; independent of val_loader.batch_size, which is None
    # for loaders built from a batch sampler
    offset = 0
    
    with torch.no_grad():
        for test_x, test_y in val_loader:
            n_batch = len(test_y)
            h = net.init_hidden(n_batch)
            test_x, test_y = test_x.to(device), test_y.to(device)
            out, _ = net(test_x, h)
            batch_loss = criterion(out, test_y)
            
            # store predictions and true labels
            pred_y[offset:offset + n_batch] = out.to('cpu').numpy()
            true_y[offset:offset + n_batch] = test_y.to('cpu').numpy()
            offset += n_batch
            
            loss.append(batch_loss.item())
    
    # keep only the rows that were filled (matters when the loader drops samples)
    true_y = true_y[:offset]
    pred_labels = np.round(pred_y[:offset])
    
    metrics = {'precision': precision_score(true_y, pred_labels),
               'recall': recall_score(true_y, pred_labels),
               'accuracy': accuracy_score(true_y, pred_labels),
               'fscore': f1_score(true_y, pred_labels)}
    
    return np.mean(loss), metrics

In [100]:
def trainer_SWA(net, criterion, optimizer, train_loader, valid_loader, clip_value=5, epochs=10, print_every=200, max_fscore=-np.inf, swa_threshold=0.75, swa_lr=0.0005):
    '''
    Train the network, switching to stochastic weight averaging once it is good enough.

        net - network to train
        criterion - loss function 
        optimizer - SWA-style optimizer exposing update_swa() and swap_swa_sgd()
        train_loader - loader for training data
        valid_loader - loader for validation/test data
        clip_value - upper limit for the gradient norm
        epochs - number of epochs to train the net
        print_every - validate and print stats every this number of batches
        max_fscore - best fscore on validation set - used in multiple runs of training
        swa_threshold - best fscore above which the averaged weights are updated
            and used for validation
        swa_lr - learning rate written into every param group while averaging is active

    Every time the validation fscore improves, a checkpoint with the network sizes
    and state dict is saved to the working directory.

    Returns the best validation fscore seen so far.
    '''
    from tqdm.notebook import trange
    
    # train on the device the network lives on instead of relying on a notebook global
    device = next(net.parameters()).device
    
    steps = 0

    net.train()

    # run over epochs of training
    for e in trange(epochs):
        
        # array to keep value of losses over current epoch
        train_loss = []

        # run one pass through training samples = one epoch
        for train_x, train_y in train_loader:
            steps +=1

            # zero out the grads 
            net.zero_grad()

            # send data to device
            train_x, train_y = train_x.to(device), train_y.to(device)

            # initialize hidden state
            h = net.init_hidden(len(train_x))

            # calculate the output of the network
            out, _ = net(train_x, h)

            # compute the loss
            loss = criterion(out, train_y)
            # backprop grads of the loss wrt to net parameters
            loss.backward()
            nn.utils.clip_grad_norm_(net.parameters(), clip_value)

            # update parameters of network
            optimizer.step()

            # append current batch loss (loss per object in current batch)
            train_loss.append(loss.item())

            # test loss calc every print_every steps
            if steps%print_every == 0:
                # decide once per validation round: max_fscore may change below, and the
                # swap back must happen exactly when the swap in happened
                swa_active = max_fscore > swa_threshold
                if swa_active:
                    optimizer.update_swa()
                    optimizer.swap_swa_sgd() # use SWA weights for the calc of validation loss
                    for g in optimizer.param_groups:
                        g['lr'] = swa_lr
                net.eval()
                test_loss, metrics = val_score(net, valid_loader, criterion)
                if metrics['fscore'] > max_fscore:
                    max_fscore = metrics['fscore']
                    message = '=)'
                    check_point = {'vocab_size':net.vocab_size, 
                                   'embedding_dim': net.embedding_dim, 
                                   'hidden_dim':net.hidden_dim, 
                                   'n_layers':net.n_layers, 
                                   'net_params':net.state_dict()}
                    torch.save(check_point, f"spam_model_fscore_{metrics['fscore']:.3f}.pt")
                else:
                    message = ';('
                if swa_active:
                    optimizer.swap_swa_sgd() # swap back normal weights and continue training
                net.train()
                print(f"Step {steps} epoch {e+1}. {message}\nTest loss is {test_loss:.4f}. Train loss is {np.mean(train_loss):.4f}.\
                F1 Score={metrics['fscore']:.2%} Precision={metrics['precision']:.2%} Recall={metrics['recall']:.2%} Accuracy={metrics['accuracy']:.2%}\n")
    return max_fscore

In [101]:
max_fscore = 0.5

In [44]:
help(SWA)

Help on class SWA in module torchcontrib.optim.swa:

class SWA(torch.optim.optimizer.Optimizer)
 |  Base class for all optimizers.
 |  
 |      Parameters need to be specified as collections that have a deterministic
 |      ordering that is consistent between runs. Examples of objects that don't
 |      satisfy those properties are sets and iterators over values of dictionaries.
 |  
 |  Args:
 |      params (iterable): an iterable of :class:`torch.Tensor` s or
 |          :class:`dict` s. Specifies what Tensors should be optimized.
 |      defaults: (dict): a dict containing default values of optimization
 |          options (used when a parameter group doesn't specify them).
 |  
 |  Method resolution order:
 |      SWA
 |      torch.optim.optimizer.Optimizer
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, optimizer, swa_start=None, swa_freq=None, swa_lr=None)
 |      Implements Stochastic Weight Averaging (SWA).
 |      
 |      Stochastic Weight Aver

In [69]:
# Despite its name, `model` holds the checkpoint dictionary (see the keys printed below),
# not a network object.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model = torch.load('spam_model_fscore_0.843.pt')
model.keys()

dict_keys(['vocab_size', 'embedding_dim', 'hidden_dim', 'n_layers', 'net_params'])

In [70]:
convRNN.load_state_dict(model['net_params'])

<All keys matched successfully>

In [65]:
next(convRNN.parameters()).is_cuda

True

In [None]:
# base_opt = torch.optim.SGD(model.parameters(), lr=0.1)
# opt = torchcontrib.optim.SWA(
#                 base_opt, swa_start=10, swa_freq=5, swa_lr=0.05)
# for _ in range(100):
#     opt.zero_grad()
#     loss_fn(model(input), target).backward()
#     opt.step()
# opt.swap_swa_sgd()

In [103]:
# loss and optimization functions
# the learning rate has to exist before the optimizer that consumes it is built
lr = 0.001
criterion = nn.BCELoss(reduction='mean')
base_opt = torch.optim.Adam(convRNN.parameters(), lr=lr, weight_decay=0.005)
#optimizer = torch.optim.SGD(convRNN.parameters(), lr=lr, momentum=0.9, nesterov=True)
#optimizer = torch.optim.RMSprop(convRNN.parameters(), lr=lr, momentum=0.9)
#base_opt = torch.optim.SGD(convRNN.parameters(), lr=lr)
# NOTE(review): `SWA` is not imported in the visible cells - the help() output in this
# notebook reports it as torchcontrib.optim.swa.SWA; confirm the import on a fresh kernel.
optimizer = SWA(base_opt)

In [104]:
for g in optimizer.param_groups:
    print(g['lr'])

0.001


In [106]:
max_fscore = trainer_SWA(convRNN, criterion, optimizer, train_loader, valid_loader, clip_value=10, epochs=50, print_every=200, max_fscore=max_fscore)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Step 200 epoch 1. ;(
Test loss is 0.8857. Train loss is 0.1112.                F1 Score=84.88% Precision=86.99% Recall=82.88% Accuracy=85.24%

Step 400 epoch 1. ;(
Test loss is 0.9245. Train loss is 0.1272.                F1 Score=84.68% Precision=87.27% Recall=82.24% Accuracy=85.12%

Step 600 epoch 2. ;(
Test loss is 0.9929. Train loss is 0.1085.                F1 Score=84.89% Precision=86.73% Recall=83.12% Accuracy=85.20%

Step 800 epoch 2. ;(
Test loss is 1.0517. Train loss is 0.1230.                F1 Score=84.57% Precision=87.04% Recall=82.24% Accuracy=85.00%

Step 1000 epoch 3. ;(
Test loss is 1.1424. Train loss is 0.1166.                F1 Score=84.91% Precision=86.61% Recall=83.28% Accuracy=85.20%

Step 1200 epoch 3. ;(
Test loss is 1.3974. Train loss is 0.1272.                F1 Score=84.97% Precision=86.56% Recall=83.44% Accuracy=85.24%

Step 1400 epoch 4. ;(
Test loss is 2.1828. Train loss is 0.1135.                F1 Score=85.09% Precision=86.46% Recall=83.76% Accuracy=85.3

## SWA result - you cannot train the network with SGD... SWA does not give the result here
## BUT with Adam in our case it works almost on par with vanilla attention

## Make convRNN with pretrained embedding from GloVe

In [85]:
# Hyper-parameters for the model with a pretrained embedding
n_layers = 1
hidden_dim = 64
embedding_dim = 50
output_size = 1 # not needed
vocab_size = len(vocab)

In [86]:
# Constructor signature for reference:
# vocab_size, output_size=1, embedding_dim=32, hidden_dim=64, out_channels=32, drop_prob=0.5, vocab_vectors=None
# NOTE(review): the `vocab` built earlier with tokenizer.get_vocab() is displayed as a plain
# dict, which has no `.vectors` attribute - presumably this cell expects a different vocab
# object carrying pretrained vectors; confirm which one before running on a fresh kernel.
convRNN = SentimentConvNN(vocab_size=vocab_size, embedding_dim=embedding_dim, vocab_vectors=vocab.vectors)
print(convRNN.num_parameters())

embed.weight         torch.Size([67359, 50])
conv1d.weight        torch.Size([32, 200, 3])
lstm.weight_ih_l0    torch.Size([256, 24])
lstm.weight_hh_l0    torch.Size([256, 64])
lstm.bias_ih_l0      torch.Size([256])
lstm.bias_hh_l0      torch.Size([256])
dense.weight         torch.Size([1, 64])
dense.bias           torch.Size([1])
bn_embedding.weight  torch.Size([200])
bn_embedding.bias    torch.Size([200])
bn_conv1d.weight     torch.Size([32])
bn_conv1d.bias       torch.Size([32])
bn_lstm.weight       torch.Size([32])
bn_lstm.bias         torch.Size([32])
Total number of parameters = 3,410,783


In [87]:
# if you want to make a grad
convRNN.embed.weight.requires_grad_(True)

Parameter containing:
tensor([[ 0.0460, -0.1792, -0.1163,  ..., -0.0013, -0.0254, -0.0351],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],
       requires_grad=True)

In [88]:
convRNN.embed.weight.requires_grad, convRNN.dense.weight.requires_grad

(True, True)

In [89]:
# `device` is a torch.device on both the GPU and the CPU path (never a bare string)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
convRNN.to(device)

SentimentConvNN(
  (embed): Embedding(67359, 50)
  (conv1d): Conv1d(200, 32, kernel_size=(3,), stride=(1,), bias=False)
  (maxpool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(24, 64, batch_first=True)
  (dense): Linear(in_features=64, out_features=1, bias=True)
  (drop): Dropout(p=0.5, inplace=False)
  (bn_embedding): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_conv1d): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_lstm): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [90]:
def trainer(net, criterion, optimizer, train_loader, valid_loader, clip_value=5, epochs=10, print_every=200, max_fscore=-np.inf):
    '''
    Train the network and save a checkpoint whenever the validation F-score improves.

        net - network to train; it must provide init_hidden(batch_size) and the attributes
              vocab_size, embedding_dim, hidden_dim and n_layers that go into the checkpoint
        criterion - loss function
        optimizer - your optimizer of choice
        train_loader - loader for training data
        valid_loader - loader for validation/test data
        clip_value - maximum total norm of the gradients (passed to clip_grad_norm_)
        epochs - number of epochs to train the net
        print_every - evaluate on valid_loader and print stats every this many batches
        max_fscore - best F-score on the validation set so far - used in multiple runs of training

    Returns the best validation F-score: the incoming max_fscore or a new best reached here.

    Relies on the notebook-level names `device`, `val_score` and `nn`.
    '''
    from tqdm.notebook import trange

    # global batch counter; it keeps counting across epochs
    steps = 0

    net.train()

    # run over epochs of training
    for e in trange(epochs):

        # losses of the batches seen so far in the current epoch
        train_loss = []

        # run one pass through training samples = one epoch
        for train_x, train_y in train_loader:
            steps += 1

            # zero out the grads accumulated by the previous batch
            net.zero_grad()

            # send data to device
            train_x, train_y = train_x.to(device), train_y.to(device)

            # initialize hidden state; len(train_x) is the actual batch size,
            # so a smaller last batch is handled as well
            h = net.init_hidden(len(train_x))

            # calculate the output of the network
            out, _ = net(train_x, h)

            # compute the loss
            loss = criterion(out, train_y)
            # backprop grads of the loss wrt net parameters
            loss.backward()
            # rescale the gradients so that their total norm does not exceed clip_value
            nn.utils.clip_grad_norm_(net.parameters(), clip_value)

            # update parameters of the network
            optimizer.step()

            # append current batch loss (loss per object in current batch)
            train_loss.append(loss.item())

            # evaluate on the validation data every print_every batches
            if steps % print_every == 0:
                test_loss, metrics = val_score(net, valid_loader, criterion)
                if metrics['fscore'] > max_fscore:
                    # new best model: remember the score and store a checkpoint
                    max_fscore = metrics['fscore']
                    message = '=)'
                    check_point = {'vocab_size': net.vocab_size,
                                   'embedding_dim': net.embedding_dim,
                                   'hidden_dim': net.hidden_dim,
                                   'n_layers': net.n_layers,
                                   'net_params': net.state_dict()}
                    # NOTE(review): the file name says "spam" although this notebook classifies
                    # movie reviews; kept unchanged so existing checkpoints keep their names
                    torch.save(check_point, f"spam_model_fscore_{metrics['fscore']:.3f}.pt")
                else:
                    message = ';('
                # back to training mode (val_score presumably switches the net to eval mode)
                net.train()
                # adjacent f-strings instead of a backslash continuation inside one literal:
                # the continuation used to leak the source indentation into the printed line
                print(f"Step {steps} epoch {e+1}. {message}\n"
                      f"Test loss is {test_loss:.4f}. Train loss is {np.mean(train_loss):.4f}. "
                      f"F1 Score={metrics['fscore']:.2%} Precision={metrics['precision']:.2%} "
                      f"Recall={metrics['recall']:.2%} Accuracy={metrics['accuracy']:.2%}\n")
    return max_fscore

In [91]:
# display the batch size (defined in an earlier cell)
batch_size

50

In [92]:
# loss and optimization functions
lr = 0.001
# binary cross-entropy, averaged over the objects of a batch
criterion = nn.BCELoss(reduction='mean')
# Adam over all parameters of convRNN, with weight decay of 1e-2
optimizer = torch.optim.Adam(convRNN.parameters(), lr=lr, weight_decay=1e-2)
# alternative optimizers, kept for reference:
#optimizer = torch.optim.SGD(convRNN.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=1e-2)
#optimizer = torch.optim.RMSprop(convRNN.parameters(), lr=lr, momentum=0.9)

In [93]:
# best validation F-score so far (comes from an earlier cell / training run)
max_fscore

0.8638963019443385

In [311]:
# continue training from the best F-score recorded so far and keep the updated value
max_fscore = trainer(
    convRNN,
    criterion,
    optimizer,
    train_loader,
    valid_loader,
    clip_value=10,
    epochs=50,
    print_every=200,
    max_fscore=max_fscore,
)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

Step 200 epoch 1. ;(
Test loss is 0.6891. Train loss is 0.6993.                F1 Score=64.30% Precision=51.78% Recall=84.80% Accuracy=52.92%

Step 400 epoch 1. ;(
Test loss is 0.6878. Train loss is 0.6941.                F1 Score=34.81% Precision=60.24% Recall=24.48% Accuracy=54.16%

Step 600 epoch 2. ;(
Test loss is 0.6638. Train loss is 0.6787.                F1 Score=64.02% Precision=58.37% Recall=70.88% Accuracy=60.16%

Step 800 epoch 2. ;(
Test loss is 0.6365. Train loss is 0.6688.                F1 Score=61.05% Precision=67.21% Recall=55.92% Accuracy=64.32%

Step 1000 epoch 3. ;(
Test loss is 0.5935. Train loss is 0.6214.                F1 Score=65.12% Precision=73.35% Recall=58.56% Accuracy=68.64%

Step 1200 epoch 3. ;(
Test loss is 0.5561. Train loss is 0.6003.                F1 Score=69.35% Precision=73.65% Recall=65.52% Accuracy=71.04%

Step 1400 epoch 4. ;(
Test loss is 0.5144. Train loss is 0.5418.                F1 Score=74.96% Precision=76.46% Recall=73.52% Accuracy=75.4

## Conv-RNN with a pretrained embedding gives no gain in metrics — you must make the embedding trainable, otherwise the results are horrible

---
### Manual conv network assembly
---
```python
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length,input_length=max_review_length)) 
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2)) 
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
```


In [426]:
# Re-create the layers of the network one by one so that intermediate shapes can be inspected.
embed = nn.Embedding(vocab_size, embedding_dim)
# in_channels=200 has to match dim 1 of the embedding output (200 tokens per review).
# padding=0 is the explicit "no padding"; the bool False used before only worked because False == 0.
conv1d_sentiment = nn.Conv1d(in_channels=200, out_channels=32, kernel_size=3, bias=False, padding=0)
# halves the last dimension
maxpool_sentiment = nn.MaxPool1d(kernel_size=2)
# input_size=15 is hard-coded to the size of the last dimension after conv + max-pooling
lstm_sentiment = nn.LSTM(input_size=15,
                         hidden_size=hidden_dim,
                         num_layers=1,
                         batch_first=True,
                         dropout=0)
# initial (h0, c0) of shape (num_layers, batch, hidden_dim); the batch size is hard-coded to 2
lstm_h0 = (torch.zeros(1,2,hidden_dim), torch.zeros(1,2,hidden_dim))
dense_sentiment = nn.Linear(hidden_dim, 1)
drop_sentiment = nn.Dropout(p=0.5)

In [427]:
# Conv1d weights are laid out as (out_channels, in_channels, kernel_size)
conv1d_sentiment.weight.shape

torch.Size([32, 200, 3])

In [429]:
# Push the batch of token ids `txt` through the hand-assembled layers, keeping every
# intermediate result so that its shape can be printed at the end.
with torch.no_grad():
    # token ids -> embedding vectors (a new trailing dimension is added)
    embed_out = embed(txt)
    # NOTE(review): no .eval() call is visible for drop_sentiment, so dropout is presumably active here - confirm
    embed_out = drop_sentiment(embed_out)
    # Conv1d treats dim 1 as channels and slides the kernel along the last dimension
    conv_out = conv1d_sentiment(embed_out)
    conv_out_relu = F.relu(conv_out)
    # max-pooling with kernel_size=2 halves the last dimension
    maxpool_out = maxpool_sentiment(conv_out_relu)
    # batch_first LSTM: dim 1 is consumed as the sequence axis, the last dim as features
    lstm_out, _ = lstm_sentiment(maxpool_out, lstm_h0)
    lstm_out = drop_sentiment(lstm_out)
    out_dense = dense_sentiment(lstm_out)
    # keep only the last step along dim 1 and flatten to one value per sample
    out = nn.Sigmoid()(out_dense[:,-1,:]).view(out_dense.shape[0])
# Conv1d shape convention:
# Input:  (N, Cin,  Lin)
# Output: (N, Cout, Lout)
print(f'Input shape \t{txt.shape}\nEmbedding shape {embed_out.shape}\nConv1d shape \t{conv_out.shape}\
\nMaxPool shape \t{maxpool_out.shape}\nLSTM shape \t{lstm_out.shape}\nDense shape \t{out_dense.shape}\nSigmoid shape \t{out.shape}')


Input shape 	torch.Size([2, 200])
Embedding shape torch.Size([2, 200, 32])
Conv1d shape 	torch.Size([2, 32, 30])
MaxPool shape 	torch.Size([2, 32, 15])
LSTM shape 	torch.Size([2, 32, 64])
Dense shape 	torch.Size([2, 32, 1])
Sigmoid shape 	torch.Size([2])


---
### END:Manual conv network assembly
---

# Conv1d
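In the simplest case, the output value of the layer with input size $(N, C_{in}, L)$ and output $(N, C_{out}, L_{out})$ can be precisely described as

$$\text{out}(N_i, C_{out_j}) = \text{bias}(C_{out_j}) + \sum_{k=0}^{C_{in}-1} \text{weight}(C_{out_j}, k) \star \text{input}(N_i, k)$$

where $\star$ is the valid cross-correlation operator, $N$ is the batch size, $C$ denotes the number of channels and $L$ is the length of the signal sequence.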

In [338]:
# toy Conv1d input: the values 0..17 as floats, laid out as (batch=3, channels=2, length=3)
conv_1d_input_temp = torch.arange(18, dtype=torch.float).view(3, 2, -1)
conv_1d_input_temp, conv_1d_input_temp.shape

(tensor([[[ 0.,  1.,  2.],
          [ 3.,  4.,  5.]],
 
         [[ 6.,  7.,  8.],
          [ 9., 10., 11.]],
 
         [[12., 13., 14.],
          [15., 16., 17.]]]),
 torch.Size([3, 2, 3]))

In [340]:
# one extra integer value per (batch, channel) pair, shaped (3, 2, 1)
extra = torch.arange(6).reshape(3, 2, 1)
extra, extra.shape

(tensor([[[0],
          [1]],
 
         [[2],
          [3]],
 
         [[4],
          [5]]]),
 torch.Size([3, 2, 1]))

In [341]:
# append the extra column along the last (length) axis: (3, 2, 3) + (3, 2, 1) -> (3, 2, 4)
conv_1d_input = torch.cat([conv_1d_input_temp, extra], dim=-1)
conv_1d_input, conv_1d_input.shape

(tensor([[[ 0.,  1.,  2.,  0.],
          [ 3.,  4.,  5.,  1.]],
 
         [[ 6.,  7.,  8.,  2.],
          [ 9., 10., 11.,  3.]],
 
         [[12., 13., 14.,  4.],
          [15., 16., 17.,  5.]]]),
 torch.Size([3, 2, 4]))

In [349]:
def init_weight(m):
    '''
    Deterministic initialiser for the Conv1d demos: set every weight to 1 and double filter 1.

    Meant to be used as `module.apply(init_weight)`. Only modules whose type is exactly
    nn.Conv1d are modified; every other module (including containers and layers
    without a `weight`) is left untouched.

        m - module handed in by `apply`
    '''
    # check the type first: `apply` also visits modules that have no `.weight` at all
    if type(m) is not nn.Conv1d:
        return
    with torch.no_grad():
        # fill in place so that the weight keeps its dtype and device
        m.weight.fill_(1.0)
        # double output filter 1 to make it distinguishable from the other filters
        # (nothing to double when the layer has a single output channel)
        if m.weight.shape[0] > 1:
            m.weight[1].mul_(2.0)

# demo layer: 2 input channels -> 4 filters, each looking at 2 neighbouring positions, no bias
layer_conv1d = nn.Conv1d(in_channels=2, out_channels=4, kernel_size=2, bias=False)
# replace the random initial weights with the deterministic ones from init_weight
layer_conv1d.apply(init_weight)

# weights are laid out as (out_channels, in_channels, kernel_size)
layer_conv1d.weight.data, layer_conv1d.weight.data.shape

(tensor([[[1., 1.],
          [1., 1.]],
 
         [[2., 2.],
          [2., 2.]],
 
         [[1., 1.],
          [1., 1.]],
 
         [[1., 1.],
          [1., 1.]]]),
 torch.Size([4, 2, 2]))

In [350]:
# input length 4 with kernel_size=2 gives output length 4 - 2 + 1 = 3;
# with all-ones weights each output value is the sum of a 2-wide window over both channels
conv_1d_out = layer_conv1d(conv_1d_input)

conv_1d_out

tensor([[[  8.,  12.,   8.],
         [ 16.,  24.,  16.],
         [  8.,  12.,   8.],
         [  8.,  12.,   8.]],

        [[ 32.,  36.,  24.],
         [ 64.,  72.,  48.],
         [ 32.,  36.,  24.],
         [ 32.,  36.,  24.]],

        [[ 56.,  60.,  40.],
         [112., 120.,  80.],
         [ 56.,  60.,  40.],
         [ 56.,  60.,  40.]]], grad_fn=<SqueezeBackward1>)

##  Convolution of kernel size 1
A convolution with kernel size 1 is equivalent to a linear layer applied independently at every position — the position-wise transformation used in the Transformer.

In [354]:
# kernel_size=1: every filter holds exactly one weight per input channel
# (note: this rebinds the name layer_conv1d used by the previous demo)
layer_conv1d = nn.Conv1d(in_channels=3, out_channels=4, kernel_size=1, bias=False)
# same deterministic initialisation as in the previous demo
layer_conv1d.apply(init_weight)

layer_conv1d.weight.data, layer_conv1d.weight.data.shape

(tensor([[[1.],
          [1.],
          [1.]],
 
         [[2.],
          [2.],
          [2.]],
 
         [[1.],
          [1.],
          [1.]],
 
         [[1.],
          [1.],
          [1.]]]),
 torch.Size([4, 3, 1]))

In [364]:
# a single sample with 3 channels and length 1: shape (1, 3, 1)
kernel1input = torch.arange(3, dtype=torch.float).view(1, 3, -1)
kernel1input, kernel1input.shape

(tensor([[[0.],
          [1.],
          [2.]]]),
 torch.Size([1, 3, 1]))

In [365]:
# with kernel_size=1 every output channel is a weighted sum over the input channels at that position
layer_conv1d(kernel1input)

tensor([[[3.],
         [6.],
         [3.],
         [3.]]], grad_fn=<SqueezeBackward1>)