In [35]:
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np

In [46]:
# --- Data locations -------------------------------------------------------
# NOTE(review): these are absolute paths on the author's machine; they must
# be edited before the notebook can run anywhere else.
PATH_TRAIN = "C:/Users/kelis/ITU/Year_2/4th_semester/NLP/project/handout/project/en_ewt-ud-train.iob2"
PATH_DEV = "C:/Users/kelis/ITU/Year_2/4th_semester/NLP/project/handout/project/en_ewt-ud-dev.iob2"
PATH_TEST = "C:/Users/kelis/ITU/Year_2/4th_semester/NLP/project/handout/project/en_ewt-ud-test-masked.iob2"
PATH_OUTPUT = "C:/Users/kelis/ITU/Year_2/4th_semester/NLP/project/scripts/outputs.txt"

# --- Special symbols ------------------------------------------------------
# UNK is the string handed to labels2lookup as its id-0 entry; PAD is the
# integer used to pad both wordpiece ids and label ids.
UNK = "[UNK]"
PAD = 0

# --- Model and optimisation settings --------------------------------------
MLM = 'distilbert-base-cased'
BATCH_SIZE = 32
LEARNING_RATE = 1e-5
EPOCHS = 3

# Number of sentences kept from the start of each split.
MAX_SENTS = 64

# Use the first GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

In [3]:
class ClassModel(torch.nn.Module):
    def __init__(self, nlabels: int, mlm: str):
        """
        Model for token-level classification with transformers.

        The architecture of this model is simple: a transformer based
        language model followed by one linear layer that converts the
        representation of every wordpiece into a score per label.

        Parameters
        ----------
        nlabels : int
            Vocabulary size of output space (i.e. number of labels)
        mlm : str
            Name of the transformers language model to use, can be found on:
            https://huggingface.co/models
        """
        super().__init__()

        # The transformer model to use
        self.mlm = AutoModel.from_pretrained(mlm)

        # Find the size of the output of the language model; different
        # architectures expose it under different config attribute names.
        config = self.mlm.config
        if hasattr(config, 'hidden_size'):
            self.mlm_out_size = config.hidden_size
        elif hasattr(config, 'dim'):
            self.mlm_out_size = config.dim
        else:  # if not found, guess
            self.mlm_out_size = 768

        print(f"Hidden size: {self.mlm_out_size}")

        # Create prediction layer
        self.hidden_to_label = torch.nn.Linear(self.mlm_out_size, nlabels)

    def forward(self, input: torch.Tensor):
        """
        Forward pass

        Note that no attention mask is passed to the transformer, so padded
        positions are processed like any other wordpiece.

        Parameters
        ----------
        input : torch.Tensor
            Tensor with wordpiece indices. shape=(batch_size, max_sent_len).

        Returns
        -------
        output_scores : torch.Tensor
            Unnormalised score of every label for every wordpiece.
            shape=(batch_size, max_sent_len, nlabels)
        """
        # Run transformer model on input and keep only the last layer:
        # shape=(batch_size, max_sent_len, hidden_size)
        mlm_out = self.mlm(input).last_hidden_state

        # Every position is kept (one prediction per wordpiece). The tensor
        # is deliberately NOT squeezed: squeezing would drop the batch axis
        # for a batch of one sentence and break callers that index axis 2.
        output_scores = self.hidden_to_label(mlm_out)

        return output_scores

    def run_eval(self, feats_batches, labels_batches):
        """
        Run evaluation: predict and score

        Switches the model to eval mode (it is left in eval mode afterwards)
        and computes accuracy over all positions whose gold label id is not 0.

        Parameters
        ----------
        feats_batches : iterable of torch.Tensor
            Batches of wordpiece indices, each of shape
            (batch_size, max_sent_len).
        labels_batches : iterable of torch.Tensor
            Batches of gold label ids aligned with feats_batches; id 0 marks
            positions that are excluded from scoring.

        Returns
        -------
        score : float
            Accuracy of the model on labels_batches given feats_batches, or
            0.0 when there is no position to score.
        """
        self.eval()
        # Inputs may live on another device than the model parameters.
        device = self.hidden_to_label.weight.device
        match = 0
        total = 0
        # No gradients are needed for scoring; skipping them saves memory.
        with torch.no_grad():
            for sents, labels in zip(feats_batches, labels_batches):
                output_scores = self.forward(sents.to(device))
                predicted_tags = torch.argmax(output_scores, dim=-1)
                labels = labels.to(device)
                scored = labels != 0
                total += int(scored.sum().item())
                match += int(((predicted_tags == labels) & scored).sum().item())
        if total == 0:
            return 0.0
        return match / total

In [47]:
def labels2lookup(labels, PAD):
    """
    Build the fixed id<->label lookup tables.

    :param labels: not used; the label inventory is hard-coded
    :param PAD: entry stored at id 0
    :returns: (id2label, label2id) where id2label is a list indexed by id
              and label2id is the inverse dictionary
    """
    id2label = [PAD] + ['O', 'B-LOC', 'I-LOC', 'B-PER', 'B-ORG', 'I-ORG', 'I-PER']

    # Invert the list: the position of a label is its id.
    label2id = {}
    for idx, name in enumerate(id2label):
        label2id[name] = idx

    return id2label, label2id

def read_data(path):
    """
    read in iob2 file

    Lines starting with '#' are skipped as comments. Every other non-empty
    line is split on tabs; column 1 is taken as the word and column 2 as its
    tag. Empty lines separate sentences.

    :param path: path to read from
    :returns: tuple (data_words, data_tags): two parallel lists holding, for
              each sentence, the list of its words and the list of its tags
    :raises IndexError: if a token line has fewer than three tab-separated
                        columns
    """
    data_words = []
    data_tags = []
    current_words = []
    current_tags = []

    # The context manager guarantees the file is closed, also on errors.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()

            if line:
                if line[0] == '#':
                    continue # skip comments
                tok = line.split('\t')

                current_words.append(tok[1])
                current_tags.append(tok[2])
            else:
                if current_words:  # skip empty lines
                    data_words.append(current_words)
                    data_tags.append(current_tags)
                current_words = []
                current_tags = []

    # check for last one (file that does not end with an empty line)
    if current_tags != []:
        data_words.append(current_words)
        data_tags.append(current_tags)
    return data_words, data_tags

def find_max_len(data):
    """
    Length of the longest element of data.

    :param data: iterable of sized elements (e.g. a list of sentences)
    :returns: the largest len(x) over all x in data, or 0 when data is empty
    """
    # default=0 keeps an empty dataset from raising ValueError.
    return max((len(x) for x in data), default=0)

def pad_data(data, PAD, N):
    """
    Right-pad every sentence in data with PAD up to length N.

    Sentences that already have N or more elements are copied unchanged
    (nothing is truncated). The input sentences are not modified.

    :param data: iterable of lists
    :param PAD: value appended as padding
    :param N: target length
    :returns: new list with one padded copy per input sentence
    """
    padded = []

    for sent in data:
        # A negative count yields an empty filler, i.e. no padding.
        filler = [PAD] * (N - len(sent))

        extended = sent.copy()
        extended.extend(filler)
        padded.append(extended)

    return padded

In [11]:
def get_data():
    """
    Read the train, dev and test files and keep the first MAX_SENTS
    sentences of each.

    :returns: tuple (train_text, train_labels, dev_text, dev_labels,
              test_text, test_labels)
    """
    truncated = []

    # Same order as the returned tuple: train, dev, test.
    for split_path in (PATH_TRAIN, PATH_DEV, PATH_TEST):
        text, labels = read_data(split_path)
        truncated.append(text[:MAX_SENTS])
        truncated.append(labels[:MAX_SENTS])

    return tuple(truncated)

In [17]:
def encode_labels(labels):
    """
    Convert string labels to their integer ids.

    The input is left untouched: a fresh nested list is returned, so the
    function can be called again on the same data.

    :param labels: list of sentences, each a list of label strings
    :returns: list of sentences, each a list of label ids
    :raises KeyError: if a label is not part of the lookup table
    """
    # Use the function's own argument rather than a notebook global.
    _, label2id = labels2lookup(labels, UNK)

    return [[label2id[label] for label in label_list] for label_list in labels]

In [87]:
def tokenize_words(words, tags, tokenizer, label_all_tokens=True):
    """
    Tokenize pre-split sentences and align the word-level tags with the
    resulting tokens.

    Tokens that map to no word (word id None) get label 0. The first token
    of a word gets the word's label. Further tokens of the same word get the
    word's label when label_all_tokens is true, otherwise 0.

    :param words: list of sentences, each a list of words
    :param tags: list of sentences, each a list of word-level labels
    :param tokenizer: callable returning an encoding that offers
                      word_ids(batch_index=...), item assignment and .data
    :param label_all_tokens: whether continuation tokens inherit the label
    :returns: the encoding's .data with an extra "labels" entry
    """
    encoding = tokenizer(words, truncation=True, is_split_into_words=True)
    aligned = []

    for sent_idx, sent_tags in enumerate(tags):
        row = []
        last_word = None

        for word in encoding.word_ids(batch_index=sent_idx):
            if word is None:
                # token without a source word
                row.append(0)
            elif word == last_word and not label_all_tokens:
                # continuation piece that should not be labelled
                row.append(0)
            else:
                row.append(sent_tags[word])
            last_word = word

        aligned.append(row)

    encoding["labels"] = aligned

    return encoding.data

In [75]:
def preprocess_text(text_data, label_data):
    """
    Tokenize the sentences, align their labels and pad the wordpiece ids
    into a single tensor.

    :param text_data: list of sentences, each a list of words
    :param label_data: list of sentences, each a list of word-level labels
    :returns: tuple (words, tok_data, label_data): the padded wordpiece-id
              tensor, the tokenizer output dictionary, and the (unpadded)
              token-level labels
    """
    # The tokenizer matching the configured language model.
    tokenizer = AutoTokenizer.from_pretrained(MLM)
    tok_data = tokenize_words(text_data, label_data, tokenizer)
    piece_ids = tok_data["input_ids"]
    aligned_labels = tok_data["labels"]

    max_sent_len = find_max_len(piece_ids)
    print(f"Max sent len is {max_sent_len}")

    print(f"Padding all sentences to {max_sent_len} tokens using pad token {PAD}")

    # list of lists -> numpy array -> torch tensor
    words = torch.tensor(np.array(pad_data(piece_ids, PAD, max_sent_len)))

    print(f"Text data converted to pytorch tensor, shape {words.shape}")

    return words, tok_data, aligned_labels

In [65]:
def preprocess_labels(label_data):
    """
    Pad all label sequences to the length of the longest one and convert
    them to a tensor.

    :param label_data: list of sentences, each a list of label ids
    :returns: torch tensor with one row of PAD-padded labels per sentence
    """
    target_len = find_max_len(label_data)
    return torch.tensor(pad_data(label_data, PAD, target_len))

In [63]:
def get_batches(text_data):
    """
    Group the rows of a 2-D tensor into batches of BATCH_SIZE rows.

    Rows that do not fill a complete batch at the end are left out.

    :param text_data: tensor with one row per sentence
    :returns: view of the kept rows with shape
              (num_batches, BATCH_SIZE, row length)
    """
    # Only complete batches are kept.
    num_batches = len(text_data) // BATCH_SIZE
    print(f"num batches: {num_batches}")

    kept_rows = text_data[:BATCH_SIZE * num_batches]
    batches = kept_rows.view(num_batches, BATCH_SIZE, find_max_len(text_data))

    print(f"Got {num_batches} batches of size {BATCH_SIZE}, final tensor shape is {batches.shape}")

    return batches

In [88]:
# Load the (truncated) train / dev / test splits as words and string labels.
(
    train_text, train_labels,
    dev_text, dev_labels,
    test_text, test_labels,
) = get_data()

In [89]:
# Map the string labels of every split to integer ids (train, dev, test).
enc_labels_train, enc_labels_dev, enc_labels_test = (
    encode_labels(split_labels)
    for split_labels in (train_labels, dev_labels, test_labels)
)

In [90]:
# Tokenize + pad the text of every split; labels come back aligned to tokens.
pre_train, tok_data_train, labels_train = preprocess_text(
    text_data=train_text, label_data=enc_labels_train
)
pre_dev, tok_data_dev, labels_dev = preprocess_text(
    text_data=dev_text, label_data=enc_labels_dev
)
# The aligned test labels are not needed afterwards.
pre_test, tok_data_test, _ = preprocess_text(
    text_data=test_text, label_data=enc_labels_test
)

# Pad the token-level labels of the splits that are scored.
pre_train_labels, pre_dev_labels = (
    preprocess_labels(labels_train),
    preprocess_labels(labels_dev),
)

Max sent len is 47
Padding all sentences to 47 tokens using pad token 0
Text data converted to pytorch tensor, shape torch.Size([64, 47])
Max sent len is 77
Padding all sentences to 77 tokens using pad token 0
Text data converted to pytorch tensor, shape torch.Size([64, 77])
Max sent len is 51
Padding all sentences to 51 tokens using pad token 0
Text data converted to pytorch tensor, shape torch.Size([64, 51])


In [91]:
# Cut features and labels of train and dev into batches, in this order:
# train features, train labels, dev features, dev labels.
train_batches, train_label_batches, dev_batches, dev_label_batches = (
    get_batches(padded)
    for padded in (pre_train, pre_train_labels, pre_dev, pre_dev_labels)
)

num batches: 2
Got 2 batches of size 32, final tensor shape is torch.Size([2, 32, 47])
num batches: 2
Got 2 batches of size 32, final tensor shape is torch.Size([2, 32, 47])
num batches: 2
Got 2 batches of size 32, final tensor shape is torch.Size([2, 32, 77])
num batches: 2
Got 2 batches of size 32, final tensor shape is torch.Size([2, 32, 77])


In [92]:
# Label inventory; its size is the width of the classification layer.
id2label, label2id = labels2lookup(train_labels, UNK)
NLABELS = len(id2label)

# Build the model and place it on the selected device
# (Module.to returns the module itself).
model = ClassModel(NLABELS, MLM).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Targets with id 0 do not contribute to the loss; the remaining
# per-token losses are summed rather than averaged.
loss_function = torch.nn.CrossEntropyLoss(reduction='sum', ignore_index=0)

Hidden size: 768


In [94]:
# Padded sentence length shared by all training batches.
MAX_SENT_LEN = train_batches.shape[2]

for epoch in range(EPOCHS):
    print('=====================')
    print('starting epoch ' + str(epoch))
    # run_eval switches to eval mode, so re-enable training mode every epoch.
    model.train()

    # Loop over batches
    loss = 0
    for batch_idx in range(0, len(train_batches)):

        print(f"---running batch idx {batch_idx}---")
        print(f"size of current batch: {train_batches[batch_idx].shape}")

        optimizer.zero_grad()

        # The model was moved to DEVICE, so inputs and targets must live
        # there as well; otherwise the forward pass / loss fails on GPU.
        batch_feats = train_batches[batch_idx].to(DEVICE)
        batch_labels = train_label_batches[batch_idx].to(DEVICE)

        # Call the module (not .forward) so registered hooks are honoured.
        output_scores = model(batch_feats)

        # Flatten to one row per token, as CrossEntropyLoss expects
        # (N, nlabels) scores against (N,) targets.
        flat_labels = batch_labels.view(BATCH_SIZE * MAX_SENT_LEN)
        output_scores = output_scores.view(BATCH_SIZE * MAX_SENT_LEN, -1)

        print("output scores shape", output_scores.shape)

        batch_loss = loss_function(output_scores, flat_labels)

        predicted_labels = torch.argmax(output_scores, 1)

        print("train labels in a single batch: ", train_label_batches[batch_idx].shape)
        print(f"predicted labels size: {predicted_labels.shape}")

        loss += batch_loss.item()

        batch_loss.backward()

        optimizer.step()

    # Scoring needs no gradients; dev inputs are moved to the model's device.
    with torch.no_grad():
        dev_score = model.run_eval(dev_batches.to(DEVICE), dev_label_batches)
    print('Loss: {:.2f}'.format(loss))
    print('Acc(dev): {:.2f}'.format(100*dev_score))
    print()

starting epoch 0
---running batch idx 0---
size of current batch: torch.Size([32, 47])
output scores shape torch.Size([1504, 8])
train labels in a single batch:  torch.Size([32, 47])
predicted labels size: torch.Size([1504])
---running batch idx 1---
size of current batch: torch.Size([32, 47])
output scores shape torch.Size([1504, 8])
train labels in a single batch:  torch.Size([32, 47])
predicted labels size: torch.Size([1504])
Loss: 2242.91
Acc(dev): 11.46

starting epoch 1
---running batch idx 0---
size of current batch: torch.Size([32, 47])
output scores shape torch.Size([1504, 8])
train labels in a single batch:  torch.Size([32, 47])
predicted labels size: torch.Size([1504])
---running batch idx 1---
size of current batch: torch.Size([32, 47])
output scores shape torch.Size([1504, 8])
train labels in a single batch:  torch.Size([32, 47])
predicted labels size: torch.Size([1504])
Loss: 1988.14
Acc(dev): 49.34

starting epoch 2
---running batch idx 0---
size of current batch: torch.