In [1]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback, IntervalStrategy
from datasets import load_metric
from datasets import Dataset
import numpy as np
import pandas as pd

In [2]:
PATH_TRAIN = "train_perturbed.bio"
PATH_DEV = "dev.bio"
PATH_TEST = "test.bio"
label_all_tokens = True

In [3]:
def read_bio_file(path):
    
    data = []
    current_words = []
    current_tags = []

    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()[2:]
        
    for line in lines:
        
        line = line.strip()
        
        if line: # if line is not an empty line
            tok = line.split('\t')
            current_words.append(tok[0])
            current_tags.append(tok[3])
            
        else:
            if current_words:
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []
            
            
    if current_tags != []:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]
    
    return df

In [4]:
train_data = read_bio_file(PATH_TRAIN)
dev_data = read_bio_file(PATH_DEV)
test_data = read_bio_file(PATH_TEST)

In [5]:
class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}
        self.idx2word = []

    def getIdx(self, word, add=False):
        if word is None or word == self.pad_unk:
            return None
        if word not in self.word2idx:
            if add:
                idx = len(self.idx2word)
                self.word2idx[word] = idx
                self.idx2word.append(word)
                return idx
            else:
                return None
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]

label_indices = Vocab()
tags_column = train_data["tags"]

for tags in tags_column:
    for tag in tags:
        label_indices.getIdx(tag, add=True)

print(label_indices.word2idx)

{'O': 0, 'B-ORG': 1, 'B-PER': 2, 'B-LOC': 3, 'I-PER': 4, 'B-MISC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


In [6]:
train_data['tag_idx'] = train_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
dev_data['tag_idx'] = dev_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
test_data['tag_idx'] = test_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])

model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=True)

In [7]:
def tokenize_and_align_labels(dataset, word_column, tag_column, tokenizer):
    tokenized_inputs = tokenizer(dataset[word_column].tolist(), truncation=True, is_split_into_words=True, padding = True)

    labels = []
    for i, label in enumerate(dataset[tag_column]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs.data

In [8]:
tokenized_data = tokenize_and_align_labels(train_data, "words", "tag_idx", tokenizer)
tokenized_dev_data = tokenize_and_align_labels(dev_data, "words", "tag_idx", tokenizer)
tokenized_test_data = tokenize_and_align_labels(test_data, "words", "tag_idx", tokenizer)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
train_dataset = Dataset.from_dict({
    'id': range(len(tokenized_data['input_ids'])),
    'input_ids': tokenized_data['input_ids'],
    'attention_mask': tokenized_data['attention_mask'],
    'labels': tokenized_data['labels']
})

dev_dataset = Dataset.from_dict({
    'id': range(len(tokenized_dev_data['input_ids'])),
    'input_ids': tokenized_dev_data['input_ids'],
    'attention_mask': tokenized_dev_data['attention_mask'],
    'labels': tokenized_dev_data['labels']
})

test_dataset = Dataset.from_dict({
    'id': range(len(tokenized_test_data['input_ids'])),
    'input_ids': tokenized_test_data['input_ids'],
    'attention_mask': tokenized_test_data['attention_mask'],
    'labels': tokenized_test_data['labels']
})

In [10]:
dev_dataset_new = Dataset.from_dict({
    'input_ids': dev_dataset['input_ids'],
    'attention_mask': dev_dataset['attention_mask'],
    'labels': dev_dataset['labels']
})

test_dataset_new = Dataset.from_dict({
    'input_ids': test_dataset['input_ids'],
    'attention_mask': test_dataset['attention_mask'],
    'labels': test_dataset['labels']
})

## Loading trained model from HuggingFace hub

Models' names used to load them:
- Model with no noise in training data: **gabizh/dbbuc_OG**
- Model with 5% noise in training data: **cria111/dbbuc_5p**
- Model with 10% noise: **gabizh/dbbuc_10p**
- Model with 20% noise: **bozhidara-pesheva/dbbuc_20p**
- Model with 30% noise: **lilzzz/dbbuc_30p** 

To load any of them, use the code in the cell below.

In [11]:
loaded_model = AutoModelForTokenClassification.from_pretrained("gabizh/dbbuc_OG")

In [12]:
trainer = Trainer(model = loaded_model)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
label_list = label_indices.idx2word
batch_size = 16

In [14]:
predictions, labels, _ = trainer.predict(test_dataset_new)
predictions = np.argmax(predictions, axis=2)

  0%|          | 0/151 [00:00<?, ?it/s]

In [15]:
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

In [16]:
x = tokenizer.convert_ids_to_tokens(test_dataset_new["input_ids"][0])

In [17]:
def un_tok_labs(list_of_labels, list_of_words):
    tokenized_inputs = tokenizer(list_of_words, truncation=True, is_split_into_words=True)
    print(tokenized_inputs)
    labels = []
    for i, label in enumerate(list_of_labels):
        print(label)
        label_copy = label.copy()  # Create a copy of the label list

        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        print(word_ids)
        print(tokenizer.convert_ids_to_tokens(tokenized_inputs["input_ids"][i]))
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            print("word_idx", word_idx)
            if word_idx is None:  # Only label the first token of a given word.
                continue
            elif word_idx == previous_word_idx:
                label_copy.pop(word_idx)
                continue
            else:
                label_ids.append(label_copy[word_idx])
            previous_word_idx = word_idx 
        labels.append(label_ids)
    return labels

In [18]:
test_words = read_bio_file(PATH_TEST)
test_words = list(test_words["words"])

In [19]:
untok_labs = un_tok_labs(true_predictions, test_words)

{'input_ids': [[101, 1030, 5310, 15136, 12521, 2053, 1010, 1045, 1005, 1049, 2025, 1012, 2009, 1005, 1055, 5791, 2025, 1037, 10687, 1012, 102], [101, 2047, 4310, 13383, 999, 13585, 13528, 2007, 2715, 999, 24471, 2140, 12521, 2620, 2509, 3081, 1030, 5310, 23632, 2629, 102], [101, 19387, 1030, 5310, 2581, 2575, 2581, 1024, 2009, 1005, 1055, 2042, 1017, 2086, 2144, 1996, 2713, 1997, 1996, 2189, 2678, 1997, 5255, 4792, 1012, 24471, 2140, 2575, 24434, 102], [101, 1001, 2330, 14876, 7174, 2860, 2005, 1047, 16340, 2545, 2074, 2128, 2102, 28394, 2102, 102], [101, 19387, 1030, 5310, 25746, 2575, 1024, 26452, 1018, 17710, 7377, 1012, 1012, 1012, 1012, 1996, 5409, 1012, 15488, 2232, 15488, 2232, 1012, 1012, 100, 102], [101, 19387, 1030, 5310, 17465, 22407, 1024, 2079, 1050, 1005, 1056, 5376, 2000, 15306, 1012, 2079, 1050, 1005, 1056, 2202, 2054, 1996, 2865, 2758, 2012, 2227, 3643, 1012, 2079, 2115, 2219, 2470, 1004, 2994, 6727, 1012, 2644, 4634, 1529, 102], [101, 19387, 1030, 5310, 22932, 2629, 1

In [20]:
def save_preds(filename, tok, untok_labs):
    with open(filename, "w", encoding="utf-8") as f: 
        for idx, pair in enumerate(zip(tok, untok_labs)): 
            t, l = pair
            if len(t) != len(l):
                    print(idx)
                    print(t)
                    print(l)
            try:
                for i in range(len(t)): 
                    f.write(f"{i+1}\t{t[i]}\t{l[i]}\n")
            except:
                continue

            f.write("\n")
    return ("File has been saved")

## Saving predictions as a iob2 file

In [21]:
filename = "preds_perturbed_model.iob2"

save_preds(filename, test_words, untok_labs)

134
['It', "'s", 'raining', '!!', '\ue330\ue04b']
['O', 'O', 'O', 'O']
159
['RT', '@USER337', ':', 'I', 'just', 'missed', 'the', 'best', 'twitpic', 'opportunity', 'smh', '..', 'This', 'nigga', 'was', 'bald', 'all', 'over', 'but', 'had', 'a', 'pony', 'tail', 'of', 'dreads', 'lol', '<-', '\ue40b\ue107\ue107\ue107']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


'File has been saved'