# 1. Github Repository Creation

# 2. Clone to the local repo

# 3. Create virtual environment

# 4. Add Virtual Environment Kernel to Jupyter

# Self Supervised Learning (A Feasible Approach)

![Self Supervised Learning (A Feasible Approach)](src/BERT-language-modeling-masked-lm.png)

# Transfer Learning (A Data Centric Approach)

![Transfer Learning (A Data Centric Approach)](src/transfer_learning_general.png)

# 4. Initialize and Fix the Randomness

In [1]:
import torch

import random
import numpy as np

# Single seed value shared by every random number generator seeded below.
seed = 10

# Seed Python's `random`, NumPy and PyTorch with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Request deterministic cuDNN behaviour (only matters when running on a CUDA GPU).
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


# Transfer Learning (A Data Centric Approach)

![Transfer Learning (A Data Centric Approach)](src/transfer_learning_general.png "Transfer Learning")

# 5. Model Selection

In [2]:
from transformers import AutoTokenizer

# Checkpoint name; reused later in the notebook to load the matching encoder weights.
base_model = 'bert-base-uncased'

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [3]:
len(tokenizer.vocab)

30522

In [4]:
tokens = tokenizer.tokenize('Welcome to MLDevOps Workshop')

print(tokens)

['welcome', 'to', 'ml', '##dev', '##ops', 'workshop']


In [5]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[6160, 2000, 19875, 24844, 11923, 8395]


In [6]:
# Special tokens as defined by the tokenizer: the classification token is used as the
# sequence-start marker and the separator token as the sequence-end marker.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [7]:
# Resolve the vocabulary ids of the four special tokens with one batched lookup.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = tokenizer.convert_tokens_to_ids(
    [init_token, eos_token, pad_token, unk_token]
)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [8]:
# Maximum number of tokens the loaded tokenizer/model accepts per sequence.
# Read from the tokenizer instance itself: the per-checkpoint `max_model_input_sizes`
# map is a static lookup that newer transformers releases deprecate.
max_input_length = tokenizer.model_max_length

print(max_input_length)

512


In [9]:
# Override the model limit with a much shorter length. The value counts the start and
# end tokens too, so `tokenize_and_cut` keeps max_input_length - 2 word pieces.
max_input_length = 10

In [10]:
def tokenize_and_cut(sentence):
    """Tokenize `sentence` and keep only the leading tokens that fit the length budget.

    Two positions of the global `max_input_length` are reserved for the start and end
    tokens, so at most `max_input_length - 2` tokens are returned.
    """
    budget = max_input_length - 2
    return tokenizer.tokenize(sentence)[:budget]

# 6. Data Collation

In [11]:
from torchtext import data

# Text field: sentences are split by `tokenize_and_cut`, then mapped to ids by the
# tokenizer itself (use_vocab = False, so torchtext builds no vocabulary of its own).
# The start/end/pad/unknown entries are given as ids rather than as token strings.
text = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Label field; float dtype so labels can be fed directly to a float-valued loss.
label = data.LabelField(dtype = torch.float)



In [12]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(text, label)

# `random.seed()` returns None, so passing its result as `random_state` hands the
# splitter nothing. Seed first, then pass the resulting generator state explicitly.
random.seed(seed)
train_data, valid_data = train_data.split(random_state = random.getstate())

Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors


In [13]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [14]:
print(vars(train_data.examples[10]))

{'text': [2009, 3849, 10358, 2013, 2023, 6789, 2008, 2002], 'label': 'neg'}


In [15]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[10])['text'])

print(tokens)

['it', 'seems', 'evident', 'from', 'this', 'adaptation', 'that', 'he']


In [16]:
label.build_vocab(train_data)

In [17]:
print(label.vocab.stoi)

defaultdict(None, {'neg': 0, 'pos': 1})


In [18]:
# Number of examples per mini-batch.
batch_size= 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all sharing the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = batch_size, 
    device = device)



# 7. Build the model

In [19]:
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained encoder for the same checkpoint name the tokenizer was loaded from.
b_model = AutoModel.from_pretrained(base_model)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
import torch.nn as nn

class SentimentAnalyzer(nn.Module):
    """Sentiment classifier that feeds a pre-trained encoder's token embeddings to a GRU.

    The encoder (``b_model``) is always run under ``torch.no_grad()``, so no gradient
    flows into it through this module; the GRU and the linear head are the parts that
    learn.

    Args:
        b_model: Encoder module. It must expose ``config.to_dict()['hidden_size']``
            and, when called on a batch of token ids, return a sequence whose first
            element holds the per-token hidden states.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU reads the sequence in both directions.
        dropout: Dropout probability applied to the final hidden state and, when
            ``n_layers >= 2``, between the GRU layers.
    """

    def __init__(self,
                 b_model,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.b_model = b_model

        embedding_dim = b_model.config.to_dict()['hidden_size']

        # Id of the padding token, used to build the attention mask in forward().
        # Stays None (no masking) when the encoder config does not declare one.
        self.pad_token_id = getattr(b_model.config, 'pad_token_id', None)

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          # inter-layer dropout only exists with at least two layers
                          dropout = 0 if n_layers < 2 else dropout)

        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return one raw (pre-sigmoid) score vector per sequence.

        Args:
            text: Token ids, ``[batch size, sent len]``.

        Returns:
            Tensor of shape ``[batch size, out dim]``.
        """

        #text = [batch size, sent len]

        # Tell the encoder which positions are padding so it does not attend to them.
        attention_mask = None
        if self.pad_token_id is not None:
            attention_mask = (text != self.pad_token_id).long()

        with torch.no_grad():
            embedded = self.b_model(text, attention_mask = attention_mask)[0]

        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # last layer's forward and backward final states, concatenated
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        #hidden = [batch size, hid dim * n directions]

        output = self.out(hidden)

        #output = [batch size, out dim]

        return output

# 8. Configuring Hyperparameters

In [21]:
# Hyperparameters of the trainable part of the network (GRU + linear head).
hidden_dim = 256      # GRU hidden state size
op_dim = 1            # a single output score per example
n_layers = 2          # stacked GRU layers
bidirectional = True  # run the GRU in both directions
dropout = 0.25        # dropout probability

# Wrap the pre-trained encoder with the GRU classifier.
model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [22]:
def count_parameters(model):
    """Return the total element count of all parameters that still require gradients."""
    trainable_sizes = [weights.numel() for weights in model.parameters() if weights.requires_grad]
    return sum(trainable_sizes)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [23]:
# Freeze the pre-trained encoder: every parameter whose name begins with 'b_model'
# stops requiring gradients; all other parameters are left untouched.
for param_name, weights in model.named_parameters():
    if not param_name.startswith('b_model'):
        continue
    weights.requires_grad = False

In [24]:
# Re-count after freezing the encoder. `count_parameters` is already defined in an
# earlier cell, so it is reused here instead of being defined a second time.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [25]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


# 9. Model Training

In [26]:
import torch.optim as optim

# Adam with its default settings. Every parameter is handed over, including the
# encoder's, which were set to requires_grad = False above.
optimizer = optim.Adam(model.parameters())

In [27]:
# Binary cross-entropy that takes raw logits (the sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()

In [28]:
# Place the model and the loss on the same device the iterators put their batches on.
model = model.to(device)
criterion = criterion.to(device)

In [29]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, NOT 8).

    `preds` are raw logits; they are passed through a sigmoid and rounded to the
    nearest integer before being compared element-wise with the targets `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # float so the sum can be divided
    return hits.sum() / len(hits)

In [30]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the gradients are reset, the model scores `batch.text`, the loss
    against `batch.label` is back-propagated and the optimizer takes one step.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """
    total_loss, total_acc = 0, 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        targets = batch.label

        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [31]:
def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without computing gradients or updating weights.

    The model is switched to evaluation mode and each batch is scored under
    `torch.no_grad()`.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

In [32]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [33]:
# Number of full passes over the training iterator.
epochs = 3

# Lowest validation loss seen so far; starts at +inf so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(epochs):
    
    start_time = time.time()
    
    # One optimisation pass, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Checkpoint only when the validation loss improves, so 'dev-model.pt' always
    # holds the weights of the best epoch so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'dev-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')



Epoch: 01 | Epoch Time: 0m 15s
	Train Loss: 0.652 | Train Acc: 60.47%
	 Val. Loss: 0.613 |  Val. Acc: 64.06%
Epoch: 02 | Epoch Time: 0m 15s
	Train Loss: 0.609 | Train Acc: 64.72%
	 Val. Loss: 0.600 |  Val. Acc: 66.17%
Epoch: 03 | Epoch Time: 0m 16s
	Train Loss: 0.588 | Train Acc: 67.08%
	 Val. Loss: 0.596 |  Val. Acc: 65.41%


In [34]:


# Report test metrics for the checkpoint with the best validation loss (saved during
# training), not for whatever weights the final epoch left in memory.
model.load_state_dict(torch.load('dev-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.603 | Test Acc: 65.60%


# 10. Inference

In [35]:
def predict_sentiment(model, tokenizer, sentence):
    """Score a single sentence with the trained classifier.

    The sentence is tokenized, truncated to leave room for the start and end tokens
    (global `max_input_length`), wrapped in those tokens' ids and run through the
    model as a batch of one.

    Args:
        model: Classifier that returns one raw logit for a `[1, sent len]` id tensor.
        tokenizer: Tokenizer providing `tokenize` and `convert_tokens_to_ids`.
        sentence: Raw input text.

    Returns:
        The sigmoid of the model's output as a Python float in [0, 1].
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    # Add the batch dimension the model expects: [sent len] -> [1, sent len].
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [36]:
# Build a fresh, untrained classifier for inference. The hyperparameters defined
# earlier for training (hidden_dim, op_dim, n_layers, bidirectional, dropout) are
# reused rather than declared a second time, so this architecture cannot drift from
# the one whose weights were saved.
model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [39]:
# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('dev-model.pt', map_location = device))
model.to(device)

SentimentAnalyzer(
  (b_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [40]:
predict_sentiment(model, tokenizer, "This workshop is awesome")

0.9093654751777649

In [41]:
predict_sentiment(model, tokenizer, "This workshop is boring")

0.06864137202501297

# 11. Commit your code

# 12. Structuring code with PyCharm

# 13. Commit your code

# 14. Serving SentimentAnalyser through REST APIs

# 15. Make requirements.txt

# 16. Commit your code