In [2]:
def create_model_inputs(sentence, tokenizer, T=20):
    """Turn a raw sentence into batched BERT-style input tensors.

    Args:
        sentence: text to encode; it may contain literal '[MASK]' markers.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.
        T: length to pad up to. Sequences that are already longer than T are
            NOT truncated; they are returned at their full length.

    Returns:
        ``(token_ids, attn_mask, seg_ids, padded_tokens)`` where the first
        three are tensors of shape ``[1, len(padded_tokens)]`` and
        ``padded_tokens`` is the list of token strings, padding included.
    """
    # Word pieces wrapped in the sentence delimiters.
    wrapped = ['[CLS]', *tokenizer.tokenize(sentence), '[SEP]']

    # Right-pad with [PAD]; a non-positive count simply adds nothing.
    padded_tokens = wrapped + ['[PAD]'] * (T - len(wrapped))

    # 1 for every non-padding token, 0 for padding.
    attn_mask = [int(tok != '[PAD]') for tok in padded_tokens]

    # Single-segment input, so every position belongs to segment 0.
    seg_ids = [0] * len(padded_tokens)

    # Vocabulary index of every token.
    token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

    def as_batch(values):
        # Add the leading batch dimension: [len] -> [1, len].
        return torch.tensor(values).unsqueeze(0)

    return as_batch(token_ids), as_batch(attn_mask), as_batch(seg_ids), padded_tokens

def predict_masks(padded_tokens, hidden_reps, tokenizer, top_k=5):
    """Print and return the highest-scoring vocabulary tokens for each [MASK].

    Args:
        padded_tokens: sequence of token strings fed to the model.
        hidden_reps: tensor indexed as ``hidden_reps[0, position]`` that yields
            one score per vocabulary entry for that position.
        tokenizer: object exposing ``convert_ids_to_tokens``.
        top_k: number of candidates to keep per mask (defaults to 5).

    Returns:
        A list with one entry per '[MASK]' occurrence, in order of appearance;
        each entry is the list of ``top_k`` candidate tokens, best first.
        The list is empty when the input contains no '[MASK]'.
    """
    predicted_tokens = []
    # Plain Python scan: no numpy needed to locate the mask positions.
    mask_positions = [pos for pos, tok in enumerate(padded_tokens) if tok == '[MASK]']
    for i, midx in enumerate(mask_positions):
        ranked_ids = torch.argsort(hidden_reps[0, midx], descending=True)
        # Hand the tokenizer ordinary Python ints rather than a tensor slice.
        predicted_token = tokenizer.convert_ids_to_tokens(ranked_ids[:top_k].tolist())
        print(f'MASK {i}:', predicted_token)
        predicted_tokens.append(predicted_token)
    return predicted_tokens

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

class GPT2:
    """Thin wrapper around the pretrained GPT-2 language model for next-token exploration."""

    def __init__(self, device=None):
        """Load the 'gpt2' tokenizer and LM-head model.

        Args:
            device: device string/object to run on. When omitted, 'cuda' is
                used if available and 'cpu' otherwise.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.model = GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
        self.model.eval()

    def create_model_inputs(self, sentence):
        """Encode ``sentence`` and return its ids as a ``[1, n_tokens]`` tensor on ``self.device``."""
        token_sentence = self.tokenizer.encode(sentence, add_special_tokens=True)
        return torch.tensor(token_sentence).unsqueeze(0).to(self.device)  # Batch size 1

    def predict_tokens(self, sentence, width=5):
        """Return the ``width`` most likely next tokens after ``sentence``, best first."""
        input_ids = self.create_model_inputs(sentence)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            logits = self.model(input_ids)[0]
        # Rank the vocabulary by the score at the last position.
        ranked_ids = torch.argsort(logits[0, -1], descending=True)
        raw_tokens = self.tokenizer.convert_ids_to_tokens(ranked_ids[:width].tolist())
        # Drop the 'Ġ' (leading-space) marker only when it is actually present,
        # so tokens without it keep their first character.
        return [tok[1:] if tok.startswith('Ġ') else tok for tok in raw_tokens]

    def expand_predictions(self, sentence, sentences_tree=None, width=3, length=3):
        """Recursively extend ``sentence`` with the top ``width`` tokens at every step.

        Args:
            sentence: prompt to extend.
            sentences_tree: optional list that collects the finished sentences;
                a fresh list is created for each call when omitted.
            width: number of candidate tokens explored per step.
            length: number of additional recursion levels after this one.

        Returns:
            A copy of the list of finished sentences (``width ** (length + 1)``
            entries when starting from an empty list).
        """
        # A mutable default would be shared between calls and keep growing.
        if sentences_tree is None:
            sentences_tree = []
        for pred_token in self.predict_tokens(sentence, width=width):
            sent = sentence + pred_token + " "
            if length > 0:
                self.expand_predictions(sent, sentences_tree=sentences_tree, width=width, length=length-1)
            else:
                sentences_tree.append(sent)

        return sentences_tree.copy()

    def run(self, sentence):
        """Return the five most likely next tokens for ``sentence``."""
        return self.predict_tokens(sentence, width=5)

In [55]:
# Instantiate the GPT2 wrapper class defined in the previous cell.
model = GPT2()

In [70]:
# Prompt to continue (an alternative prompt to try: "My dear ").
sentence = "Before boarding your rocket to Mars, remember to pack these items "
# Explore the 2 best tokens at each of 11 steps, collecting into a fresh list.
model.expand_predictions(sentence, sentences_tree=[], width=2, length=10)

# BETO

In [2]:
# First install the library and download the models from GitHub

#!pip install transformers
#!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz 
#!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt 
#!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json 
#!tar -xzvf pytorch_weights.tar.gz
#!mv config.json pytorch/.
#!mv vocab.txt pytorch/.

In [4]:
# Import the required libraries

import torch
import numpy as np
from transformers import BertForMaskedLM, BertTokenizer

In [8]:
# Load the BETO tokenizer and masked-LM model from the local weights directory.
# do_lower_case=False keeps the input casing (the download cell above fetches the "cased_2M" files).
# Note: this rebinds `model`, which until here referred to the GPT2 wrapper.

tokenizer = BertTokenizer.from_pretrained("../weights/pytorch/", do_lower_case=False)
model = BertForMaskedLM.from_pretrained("../weights/pytorch/")
# NOTE(review): from_pretrained is expected to return the model already in eval mode -
# confirm for the installed transformers version; otherwise re-enable the call below.
#model.eval()

Some weights of the model checkpoint at ../weights/pytorch/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at ../weights/pytorch/ and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# Length to pad up to (create_model_inputs right-pads with [PAD] until T tokens)
T = 12

# Alternative sentence to try:
#sentence = "[CLS] Para [MASK] los [MASK] de Chile, el ministro debe [MASK] de inmediato. [SEP]"
sentence = "Tengo sed, dame un [MASK] de [MASK]"

# Inputs
token_ids, attn_mask, seg_ids, padded_tokens = create_model_inputs(sentence, tokenizer, T)

# Feed them to BERT. This is inference only, so skip building the autograd graph.
with torch.no_grad():
    hidden_reps = model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)[0]

In [56]:
# Vocabulary scores at sequence position 1. hidden_reps (computed in the previous cell from the
# same inputs) already holds them, so there is no need to run the model a second time.
hidden_reps[0, 1]

tensor([-6.1508, -7.5639, -5.4450,  ...,  1.7980, -5.7680, -0.5667],
       grad_fn=<SelectBackward>)

In [49]:
# Vocabulary ids at sequence position 1, ordered from highest to lowest score.
# Reuses hidden_reps instead of repeating the forward pass.
torch.argsort(hidden_reps[0, 1], descending=True)

tensor([ 2190,  1847,  2903,  ..., 30838, 27206, 28800])

In [58]:
# predict_masks returns a single list (one entry per [MASK]); unpacking it into two names
# only succeeded by coincidence when the sentence had exactly two masks.
predicted_tokens = predict_masks(padded_tokens, hidden_reps, tokenizer)

MASK 0: ['poco', 'vaso', 'trago', 'poquito', 'beso']
MASK 1: ['agua', 'vino', 'leche', 'amor', 'lluvia']


# SCIBERT

In [4]:
# Import only the names this cell uses instead of the whole transformers namespace.
from transformers import AutoTokenizer, BertForMaskedLM

# Load SciBERT (uncased scientific vocabulary); rebinds the notebook-level `tokenizer` and `model`.
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertForMaskedLM.from_pretrained('allenai/scibert_scivocab_uncased')

In [5]:
import torch
import numpy as np

# Length to pad up to. create_model_inputs pads but never truncates, so a sentence with
# more than T tokens is passed to the model at its full length.
T = 12

# Sentence (alternatives kept for experimentation)
#sentence = 'Character-level modeling of [MASK] language text is [MASK], for several [MASK].'
#sentence = 'Prognosis may be essentially understood as the [MASK]' #the [MASK] of long-[MASK] predictions for a [MASK] indicator, made with the purpose'
sentence = 'The evaluation of these integrals, though, may be difficult and/or may require significant [MASK]'

# Inputs
token_ids, attn_mask, seg_ids, padded_tokens = create_model_inputs(sentence, tokenizer, T)

# Feed them to BERT. This is inference only, so skip building the autograd graph.
with torch.no_grad():
    hidden_reps = model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)[0]

In [7]:
# Print and collect the top-ranked candidate tokens for every [MASK] position.
predicted_tokens = predict_masks(padded_tokens, hidden_reps, tokenizer)

MASK 0: ['computational', '.', 'numerical', 'mathematical', 'computation']


## Multiple

In [80]:
import torch
import numpy as np

# Length to pad up to
T = 100

# Sentence ending in a [MASK]; each round replaces the mask with the best prediction
# and appends a new trailing [MASK], growing the sentence one token at a time.
#sentence = 'Character-level modeling of [MASK] language text is [MASK], for several [MASK].'
sentence = 'The evaluation of these integrals, though, may be difficult and/or may require significant [MASK]'

for _ in range(5):
    # Inputs
    token_ids, attn_mask, seg_ids, padded_tokens = create_model_inputs(sentence, tokenizer, T)

    # Feed them to BERT. This is inference only, so skip building the autograd graph.
    with torch.no_grad():
        hidden_reps = model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)[0]

    for i, midx in enumerate(np.where(np.array(padded_tokens) == '[MASK]')[0]):
        idxs = torch.argsort(hidden_reps[0,midx], descending=True)
        predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
        # Keep the text before the first [MASK], append the best token and a new [MASK].
        sentence = sentence.split("[MASK]")[0] + predicted_token[0] + " [MASK]"
        print(sentence)

The evaluation of these integrals, though, may be difficult and/or may require significant computational [MASK]
The evaluation of these integrals, though, may be difficult and/or may require significant computational effort [MASK]
The evaluation of these integrals, though, may be difficult and/or may require significant computational effort . [MASK]
The evaluation of these integrals, though, may be difficult and/or may require significant computational effort . ( [MASK]
The evaluation of these integrals, though, may be difficult and/or may require significant computational effort . ( ) [MASK]


# BERT

In [10]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer, BertForMaskedLM

# Load pretrained BERT with its masked-language-model head (BertForMaskedLM, not the bare BertModel)
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Load the matching tokenizer; this rebinds the notebook-level `tokenizer` used by the earlier sections
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [78]:
# Length to pad up to
T = 20

# Sentence
sentence = 'Character-level modeling of [MASK] language text is [MASK], for several [MASK].'

# Inputs
token_ids, attn_mask, seg_ids, padded_tokens = create_model_inputs(sentence, tokenizer, T)

# Feed them to BERT. This is inference only, so skip building the autograd graph.
with torch.no_grad():
    hidden_reps = bert_model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)[0]

In [79]:
# Print and collect the top-ranked candidate tokens for every [MASK] position.
predicted_tokens = predict_masks(padded_tokens, hidden_reps, tokenizer)

MASK 0: ['regurg', 'quasi', '##wn', 'here', 'marx']
MASK 1: ['konnten', 'norm', 'lighting', 'leb', '##ю']
MASK 2: ['lighting', '##match', 'konnten', 'ses', 'leb']


## FINE TUNING

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SSTDataset(Dataset):
    """Dataset yielding fixed-length BERT inputs for sentence classification.

    Each item is ``(token_ids, attention_mask, label)`` where both tensors have
    length ``maxlen``.
    """

    def __init__(self, filename, maxlen):
        """
        Args:
            filename: path to a tab-separated file with 'sentence' and 'label' columns.
            maxlen: exact number of tokens every example is padded or truncated to.
        """
        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the row labelled ``index``."""
        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']
        label = self.df.loc[index, 'label']

        #Tokenize and insert the CLS and SEP tokens at the beginning and end of the sentence
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']

        #Prune over-long sequences to maxlen, keeping [SEP] as the final token
        if len(tokens) > self.maxlen:
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        #Remember how many real tokens there are before padding is appended
        n_real = len(tokens)
        n_pad = self.maxlen - n_real
        tokens = tokens + ['[PAD]'] * n_pad

        #Obtaining the indices of the tokens in the BERT vocabulary
        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids_tensor = torch.tensor(tokens_ids)

        #Attention mask: 1s for real tokens, 0s for padding. It is built from the token
        #counts rather than from `id != 0`, so it does not depend on [PAD] having id 0
        #and never hides a real token whose vocabulary id happens to be 0.
        attn_mask = torch.tensor([1] * n_real + [0] * n_pad, dtype=torch.long)

        return tokens_ids_tensor, attn_mask, label

In [None]:
from torch.utils.data import DataLoader

#Creating instances of training and validation set
train_set = SSTDataset(filename = 'DATA/trainDevTestTrees_PTB/trees/train.txt', maxlen = 30)
val_set = SSTDataset(filename = 'DATA/trainDevTestTrees_PTB/trees/dev.txt', maxlen = 30)

#Creating instances of training and validation dataloaders.
#The training loader reshuffles every epoch so batches are not tied to the file order;
#the validation loader keeps a fixed order.
train_loader = DataLoader(train_set, batch_size = 64, shuffle = True, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 64, num_workers = 5)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):
    """BERT encoder followed by a single linear unit producing one logit per sequence."""

    def __init__(self, freeze_bert = True):
        """
        Args:
            freeze_bert: when true, gradients are disabled for every BERT
                parameter so only the classification layer is trainable.
        """
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #Classification layer, sized from the encoder's own configuration
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
        Returns:
            Tensor of shape [B, 1] with one logit per sequence
        '''

        #Feeding the input to BERT model to obtain contextualized representations.
        #Index the first output instead of tuple-unpacking: this works whether the model
        #returns a plain tuple (of any length) or an output object that supports [0].
        cont_reps = self.bert_layer(seq, attention_mask = attn_masks)[0]

        #Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [None]:
# Build the classifier with BERT's parameters frozen, so only the linear classification layer requires gradients.
net = SentimentClassifier(freeze_bert = True)

In [None]:
import torch.nn as nn
import torch.optim as optim

# Binary cross-entropy computed directly on the raw logit produced by the classifier.
criterion = nn.BCEWithLogitsLoss()
# Adam over all of the network's parameters.
opti = optim.Adam(params=net.parameters(), lr=2e-5)

In [None]:
def _accuracy_from_logits(logits, labels):
    """Return the fraction of examples whose sigmoid(logit) > 0.5 prediction equals the 0/1 label."""
    import torch

    preds = (torch.sigmoid(logits.squeeze(-1)) > 0.5).long()
    return (preds == labels.long()).float().mean().item()


def train(net, criterion, opti, train_loader, val_loader, args):
    """Run the training loop for ``args.max_eps`` epochs.

    Args:
        net: model called as ``net(seq, attn_masks)`` and returning logits.
        criterion: loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        opti: optimizer over ``net``'s parameters.
        train_loader: iterable of ``(seq, attn_masks, labels)`` batches.
        val_loader: accepted for interface compatibility; not used here.
        args: object with ``gpu``, ``max_eps`` and ``print_every`` attributes.
    """
    import torch

    # Use the requested GPU when CUDA is present, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        device = torch.device('cuda') if args.gpu is None else torch.device('cuda', args.gpu)
    else:
        device = torch.device('cpu')

    # The model must live on the same device as the batches. Module.to moves the
    # parameters in place, so the optimizer keeps referring to the same objects.
    net.to(device)

    for ep in range(args.max_eps):

        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Moving the batch to the training device
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if (it + 1) % args.print_every == 0:
                acc = _accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

In [None]:
class Arguments:
    """Plain container for the training hyper-parameters (stands in for a parsed CLI namespace)."""

    def __init__(self, gpu=0, freeze_bert=True, maxlen=25, batch_size=32, lr=2e-5, print_every=100, max_eps=5):
        """
        Args:
            gpu: index of the CUDA device to train on.
            freeze_bert: boolean flag. The previous default was the argparse action
                string "store_true", which is merely truthy rather than a real boolean.
            maxlen: maximum sequence length.
            batch_size: examples per batch.
            lr: learning rate.
            print_every: report progress every this many iterations.
            max_eps: number of training epochs.
        """
        self.gpu = gpu
        self.freeze_bert = freeze_bert
        self.maxlen = maxlen
        self.batch_size = batch_size
        self.lr = lr
        self.print_every = print_every
        self.max_eps = max_eps

In [None]:
# Use the default hyper-parameters defined by Arguments.
args = Arguments()

In [None]:
# Run the training loop. val_loader is passed through, but train() as defined above never reads it.
train(net, criterion, opti, train_loader, val_loader, args)