In [1]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from datasets import Dataset
import numpy as np
import pandas as pd

In [2]:
import random
import re
import math
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Input files for the three splits; each one is parsed with read_bio_file.
PATH_TRAIN = "train.bio"
PATH_DEV = "dev.bio"
PATH_TEST = "test.bio"
# NOTE(review): not referenced anywhere in the visible part of this notebook —
# confirm whether it is still needed.
label_all_tokens = True

# Loading the data

In [4]:
def read_bio_file(path):
    """Parse a tab-separated BIO file into a DataFrame of sentences.

    The first two lines of the file are skipped. Every following non-blank
    line is split on tabs: field 0 is taken as the word and field 3 as the
    tag. A blank line closes the current sentence.

    Parameters
    ----------
    path : str
        Location of the file, read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with columns ``id`` (the row index), ``words``
        (list of str) and ``tags`` (list of str).

    Raises
    ------
    IndexError
        If a non-blank line has fewer than four tab-separated fields.
    """
    with open(path, "r", encoding="utf-8") as handle:
        body = handle.readlines()[2:]

    sentences = []
    words, tags = [], []

    for raw in body:
        stripped = raw.strip()

        if not stripped:
            # Sentence boundary: store what was collected (if anything) and reset.
            if words:
                sentences.append((words, tags))
            words, tags = [], []
            continue

        fields = stripped.split('\t')
        words.append(fields[0])
        tags.append(fields[3])

    # The file may end without a trailing blank line.
    if tags:
        sentences.append((words, tags))

    frame = pd.DataFrame(sentences, columns=['words', 'tags'])
    frame['id'] = frame.index
    return frame[['id', 'words', 'tags']]

In [5]:
# Parse the three splits into DataFrames with columns id / words / tags.
train_data = read_bio_file(PATH_TRAIN)
dev_data = read_bio_file(PATH_DEV)
test_data = read_bio_file(PATH_TEST)
train_data.head()

Unnamed: 0,id,words,tags
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


# Extracting tag => index dictionary

In [6]:
class Vocab():
    """Two-way mapping between items and consecutive integer indices.

    Indices are handed out in order of first insertion, starting at 0.
    ``None`` and the ``pad_unk`` marker are never stored and always map to
    ``None``.
    """

    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}   # item -> index
        self.idx2word = []   # index -> item

    def getIdx(self, word, add=False):
        """Return the index of `word`, optionally registering it first.

        Returns ``None`` for ``None``, for the pad/unknown marker, and for an
        unseen item when `add` is false.
        """
        if word is None or word == self.pad_unk:
            return None

        known = self.word2idx.get(word)
        if known is not None:
            return known

        if not add:
            return None

        new_idx = len(self.idx2word)
        self.word2idx[word] = new_idx
        self.idx2word.append(word)
        return new_idx

    def getWord(self, idx):
        """Return the item stored at position `idx`."""
        return self.idx2word[idx]

# Build the tag -> index vocabulary from the training split only; indices are
# assigned in order of first appearance.
label_indices = Vocab()
tags_column = train_data["tags"]

for tags in tags_column:
    for tag in tags:
        label_indices.getIdx(tag, add=True)

print(label_indices.word2idx)

{'O': 0, 'B-ORG': 1, 'B-PER': 2, 'B-LOC': 3, 'I-PER': 4, 'B-MISC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


Now we map the tags to indices and add them as a column

In [7]:
# Convert every sentence's tag strings to integer ids with the vocabulary built
# from the training split.
# NOTE(review): the direct word2idx lookup raises KeyError for any dev/test tag
# that never occurs in the training split.
train_data['tag_idx'] = train_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
dev_data['tag_idx'] = dev_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])
test_data['tag_idx'] = test_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])

model_checkpoint = "distilbert/distilbert-base-uncased"
# NOTE(review): `padding` is normally passed when calling the tokenizer, not to
# from_pretrained; presumably it has no effect here — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=True)

train_data.head()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Perturbations
Functions for perturbing the dataset. The perturbations are applied to the raw words, before tokenization, and they must never change the number of words in a sentence, so that every tag still corresponds to its word.

In [8]:
def is_valid_word(word):
    """Return True when `word` contains at least one ASCII letter (a-z or A-Z)."""
    match = re.search('[a-zA-Z]', word)
    return match is not None

In [9]:
def deletion_sentence(sent, word_ids):
    """Delete one character from each selected word of a sentence.

    For every position in `word_ids` a mistake type is drawn at random:

    1. the last letter is dropped,
    2. the first letter is dropped,
    3. a random letter that is neither first nor last is dropped.

    Words of length 0 or 1 are left untouched so that no word becomes empty.
    A two-letter word has no middle letter; when type 3 is drawn for such a
    word the last letter is dropped instead, so every word of length >= 2
    loses exactly one character per selection.

    Parameters
    ----------
    sent : list of str
        The sentence. It is modified in place.
    word_ids : iterable of int
        Positions in `sent` to perturb. A position listed more than once is
        perturbed once per occurrence.

    Returns
    -------
    list of str
        The same list object that was passed in as `sent`.
    """
    for word_id in word_ids:

        # Drawn for every selected word, even ones too short to edit, so the
        # sequence of random numbers consumed does not depend on word length.
        type_of_mistake = random.randint(1, 3)

        word = sent[word_id]
        if len(word) > 1:
            if type_of_mistake == 1:
                # mistake type 1: missed last letter
                word = word[:-1]
            elif type_of_mistake == 2:
                # mistake type 2: missed first letter
                word = word[1:]
            elif len(word) >= 3:
                # mistake type 3: missed random middle letter
                # (never the first or the last one)
                del_idx = random.randint(1, len(word) - 2)
                word = word[:del_idx] + word[del_idx + 1:]
            else:
                # mistake type 3 on a two-letter word: there is no middle
                # letter, so fall back to dropping the last one instead of
                # silently leaving the word unperturbed.
                word = word[:-1]

        sent[word_id] = word

    return sent

In [10]:
def deletion_dataset(data, sent_ids, sent_words, prints=False):
    """Build perturbed copies of selected sentences using character deletion.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``words`` column holding one list of words per row.
        It is not modified.
    sent_ids : list of int
        Row positions of the sentences to perturb.
    sent_words : dict
        Maps each entry of `sent_ids` to the word positions to perturb in
        that sentence.
    prints : bool, default False
        When true, print each sentence before and after perturbation and
        whether the two are equal.

    Returns
    -------
    tuple
        ``(len(sent_ids), sent_ids, new_data)`` where `new_data` contains only
        the selected rows, in the order of `sent_ids`, with perturbed words.
    """
    new_data = data.iloc[sent_ids].copy()
    source_words = data["words"]

    perturbed_sentences = []
    for idx in sent_ids:

        # Rows are addressed by position throughout, matching the iloc
        # selection above.
        original = source_words.iloc[idx]
        word_ids = sent_words[idx]

        if prints:
            print(f"Perturbing sentence idx {idx} | Original: ")
            print(original)

        # deletion_sentence edits its argument in place, so give it a private copy.
        new_sent = deletion_sentence(list(original), word_ids)
        perturbed_sentences.append(new_sent)

        if prints:
            print(f"Perturbed version:")
            print(new_sent)

            print(original == new_sent)

    # Replace the whole column in one step. Writing through chained indexing
    # (new_data["words"][idx] = ...) does not update the frame when pandas
    # Copy-on-Write is active.
    new_data["words"] = pd.Series(perturbed_sentences, index=new_data.index, dtype=object)

    return len(sent_ids), sent_ids, new_data

In [11]:
def get_alphabet_list():
    """Return the lowercase English letters 'a' through 'z' as a list of one-character strings."""
    first_code, last_code = ord("a"), ord("z")
    return list(map(chr, range(first_code, last_code + 1)))

def insert_at_idx(word, letter, idx):
    """Return `word` with `letter` inserted so that it starts at position `idx`.

    `idx` follows normal slice semantics: 0 prepends, ``len(word)`` appends.
    """
    head, tail = word[:idx], word[idx:]
    return "".join((head, letter, tail))

def insert_letter(word, prints=False):
    """Insert one random lowercase English letter at a random position of `word`.

    Parameters
    ----------
    word : str
        The word to perturb.
    prints : bool, default False
        When true, print the chosen letter and position. The message used to
        be printed unconditionally; it is now opt-in, like the `prints` flag
        of insert_multiple_letters.

    Returns
    -------
    str
        A new string one character longer than `word`.
    """
    # random.randint is inclusive on both ends, so len(word) (append) is possible.
    insert_at = random.randint(0, len(word))

    letter = random.choice(get_alphabet_list())

    if prints:
        print(f"word {word} insert letter {letter} at idx {insert_at}")

    return insert_at_idx(word, letter, insert_at)

def insert_multiple_letters(word, N=1, prints=False):
    """Insert `N` random lowercase English letters into `word`, one at a time.

    All `N` letters are drawn first; afterwards each one is inserted at a
    position drawn uniformly from the current (growing) string, including the
    position after the last character.

    Parameters
    ----------
    word : str
        The word to perturb.
    N : int, default 1
        Number of letters to insert.
    prints : bool, default False
        When true, print the drawn letters and every insertion step.

    Returns
    -------
    str
        The perturbed word.
    """
    alphabet = get_alphabet_list()

    letters = []
    for _ in range(N):
        letters.append(random.choice(alphabet))

    if prints:
        print(f"word {word}")
        print(f"Letters to insert: {letters}")

    result = word
    for letter in letters:
        # inclusive upper bound: the letter may be appended at the very end
        position = random.randint(0, len(result))
        if prints:
            print(f"Inserting letter {letter} at index {position} of word {result}")
        result = insert_at_idx(result, letter, position)

    return result

def perturb_sentence(sent, word_ids, n_letters):
    """Insert `n_letters` random letters into each selected word of a sentence.

    When `word_ids` is empty the original `sent` object is returned as is;
    otherwise a copy is perturbed and returned, leaving `sent` untouched.
    """
    if len(word_ids) == 0:
        return sent

    perturbed = sent.copy()
    for position in word_ids:
        current = perturbed[position]
        perturbed[position] = insert_multiple_letters(current, n_letters)

    return perturbed

In [12]:
def insertion_perturb(data, sent_ids, sent_words, n_letters=1, prints=False):
    """Build perturbed copies of selected sentences using character insertion.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``words`` column holding one list of words per row.
        It is not modified.
    sent_ids : list of int
        Row positions of the sentences to perturb.
    sent_words : dict
        Maps each entry of `sent_ids` to the word positions to perturb in
        that sentence.
    n_letters : int, default 1
        Number of letters inserted into every selected word.
    prints : bool, default False
        When true, print each sentence before and after perturbation and
        whether the two are equal.

    Returns
    -------
    tuple
        ``(len(sent_ids), sent_ids, new_data)`` where `new_data` contains only
        the selected rows, in the order of `sent_ids`, with perturbed words.
    """
    new_data = data.iloc[sent_ids].copy()
    source_words = data["words"]

    perturbed_sentences = []
    for idx in sent_ids:

        # Rows are addressed by position throughout, matching the iloc
        # selection above.
        original = source_words.iloc[idx]

        if prints:
            print(f"Perturbing sentence idx {idx} | Original: ")
            print(original)

        word_ids = sent_words[idx]
        # Pass a private copy so the list stored in `data` can never be altered.
        new_sent = perturb_sentence(list(original), word_ids, n_letters)
        perturbed_sentences.append(new_sent)

        if prints:
            print(f"Perturbed version:")
            print(new_sent)

            print(original == new_sent)

    # Replace the whole column in one step. Writing through chained indexing
    # (new_data["words"][idx] = ...) does not update the frame when pandas
    # Copy-on-Write is active.
    new_data["words"] = pd.Series(perturbed_sentences, index=new_data.index, dtype=object)

    return len(sent_ids), sent_ids, new_data

In [13]:
def swap_neighboring_letters(word):
    """Swap one randomly chosen pair of adjacent characters in `word`.

    Words with fewer than two characters are returned unchanged and no random
    number is drawn for them.
    """
    if len(word) <= 1:
        return word

    # Left position of the pair; the upper bound keeps idx + 1 inside the word.
    idx = random.randint(0, len(word) - 2)

    chars = list(word)
    left, right = chars[idx], chars[idx + 1]
    chars[idx] = right
    chars[idx + 1] = left

    return ''.join(chars)

def swap_letters_in_sentences(df, sent_ids, sent_words, apply_to_all=True):
    """Build perturbed copies of selected sentences by swapping adjacent letters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``words`` column holding one list of words per row.
        It is not modified.
    sent_ids : list of int
        Row positions of the sentences to perturb.
    sent_words : dict
        Maps each entry of `sent_ids` to the word positions to perturb in
        that sentence.
    apply_to_all : bool, default True
        Accepted for backward compatibility; it is not used.

    Returns
    -------
    tuple
        ``(len(sent_ids), sent_ids, df_copy)`` where `df_copy` contains only
        the selected rows, in the order of `sent_ids`, with perturbed words.
    """
    df_copy = df.iloc[sent_ids].copy()
    source_words = df["words"]

    swapped_sentences = []
    for sent_id in sent_ids:
        # Rows are addressed by position throughout, matching the iloc
        # selection above; work on a private copy of the word list.
        sentence = list(source_words.iloc[sent_id])

        for word_id in sent_words[sent_id]:
            sentence[word_id] = swap_neighboring_letters(sentence[word_id])

        swapped_sentences.append(sentence)

    # Replace the whole column in one step. Writing through chained indexing
    # (df_copy["words"][sent_id] = ...) does not update the frame when pandas
    # Copy-on-Write is active.
    df_copy["words"] = pd.Series(swapped_sentences, index=df_copy.index, dtype=object)

    return len(sent_ids), sent_ids, df_copy


In [14]:
def perturb_data(dataset, perc_sent, perc_words, n_letters_insert, apply_to_all=True, seed=21):
    """Append perturbed copies of a random subset of sentences to `dataset`.

    A fraction `perc_sent` of the rows is sampled. For each sampled sentence,
    ``ceil(len(sentence) * perc_words)`` word positions are drawn with
    replacement among the words accepted by is_valid_word. The sampled
    sentences are split into three groups: the first third gets neighbouring
    letters swapped, the second third gets letters inserted, and the rest
    (including any remainder) gets letters deleted. The perturbed copies are
    appended after the original rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``words`` column of word lists. It is not modified.
    perc_sent : float
        Fraction of sentences to perturb.
    perc_words : float
        Fraction of words to perturb inside each selected sentence.
    n_letters_insert : int
        Number of letters inserted per word by the insertion perturbation.
    apply_to_all : bool, default True
        Forwarded to swap_letters_in_sentences.
    seed : int, default 21
        Seed passed to ``random.seed`` before sampling. Note that this
        reseeds the global ``random`` generator.

    Returns
    -------
    tuple
        ``(sent_words, merged_df)``: the dict mapping each selected row
        position to its chosen word positions, and the frame holding the
        original rows followed by the deletion, insertion and swap copies,
        with a fresh 0..n-1 index and ``id`` column.
    """
    random.seed(seed)
    n_rows = dataset.shape[0]
    n_sent = math.ceil(perc_sent * n_rows)
    sent_ids = random.sample(range(n_rows), n_sent)

    words_column = dataset["words"]
    sent_words = {}

    for sent_id in sent_ids:
        # sent_ids are row positions (sampled from range(n_rows)).
        sent = words_column.iloc[sent_id]
        n_words = math.ceil(len(sent) * perc_words)
        word_ids = []

        # The rejection sampling below only terminates if the sentence has at
        # least one valid word. A sentence without any (e.g. only emojis,
        # numbers or punctuation) gets no word selected instead of looping
        # forever; no random numbers are consumed for it.
        if any(is_valid_word(word) for word in sent):
            for _ in range(n_words):
                word_id = random.randint(0, len(sent) - 1)
                while not is_valid_word(sent[word_id]):
                    word_id = random.randint(0, len(sent) - 1)
                word_ids.append(word_id)

        sent_words[sent_id] = word_ids

    third = len(sent_ids) // 3
    sent_ids_swap = sent_ids[:third]
    sent_ids_insert = sent_ids[third:third * 2]
    sent_ids_del = sent_ids[third * 2:]

    print(sent_ids)
    print(len(sent_ids_swap))
    print(len(sent_ids_insert))
    print(len(sent_ids_del))

    num_swap, ids_swap, df_swap = swap_letters_in_sentences(dataset, sent_ids_swap, sent_words, apply_to_all)
    num_ins, ids_ins, df_ins = insertion_perturb(dataset, sent_ids_insert, sent_words, n_letters_insert)
    num_del, ids_del, df_del = deletion_dataset(dataset, sent_ids_del, sent_words)

    merged_df = pd.concat([dataset, df_del, df_ins, df_swap]).reset_index(drop=True)
    merged_df["id"] = list(range(merged_df.shape[0]))

    return sent_words, merged_df

In [15]:
# Trial run on the dev split: perturb 5% of the sentences and 15% of the words
# in each of them, inserting one letter per word for the insertion group.
sent_words, merged_df = perturb_data(dev_data, 0.05, 0.15, 1)
merged_df

[168, 428, 706, 650, 288, 490, 221, 486, 524, 187, 517, 540, 241, 3, 14, 379, 599, 438, 70, 148, 237, 238, 43, 447, 416, 630, 452, 34, 336, 553, 509, 118, 664, 377, 23, 158]
12
12
12


Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER333, :, Never, give, up, on, somethi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[RT, @USER1300, :, The, deal, is, very, simple...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[RT, @USER2184, :, school, staff, be, like, "",...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,"[Put, some, respeck, on, my, name]","[O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0]"
4,4,"[RT, @USER1317, :, "", The, EU, tackles, climat...","[O, O, O, O, O, B-ORG, O, O, O, O, O, O, O, O,...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
741,741,"[RT, @USE1R265, :, [, PICTURE, ], @USER859, co...","[O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
742,742,"[RT, @USER206, :, value, people, who, see, you...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
743,743,"[Sport, Scouting, the, Bruins, -, Lightning, E...","[O, O, O, B-ORG, O, B-ORG, O, O, O, O, B-ORG, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
744,744,"[This, ., Tshi, exactly, ., 😂😭, URL425]","[O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0]"


In [16]:
# Sanity check: after perturbation every sentence must still have exactly one
# tag per word.
sent_lens = [len(x) for x in merged_df["words"]]
tags_lens = [len(x) for x in merged_df["tags"]]
sent_lens == tags_lens

True

In [8]:
# Output file for the perturbed test split.
PATH_PERTURBED_TEST = "30p_test.bio"

# Perturbation settings passed to write_perturbed_files:
P_SENT = 0.3     # fraction of sentences to perturb
P_WORDS = 0.15   # fraction of words to perturb in each selected sentence
N_LETTERS = 1    # letters inserted per word by the insertion perturbation

In [9]:
# Output file for the perturbed train split.
# NOTE(review): no visible cell passes PATH_PERTURBED_TRAIN to
# write_perturbed_files — confirm where the train file is written.
PATH_PERTURBED_TRAIN = "30p_train.bio"

# Same perturbation settings as in the cell above (repeated here).
P_SENT = 0.3
P_WORDS = 0.15
N_LETTERS = 1

In [18]:
def format_data(data):
    """Serialise a sentence frame to the tab-separated BIO text format.

    The output starts with the header line ``-DOCSTART- -X- -X- O`` followed
    by a blank line. Every word becomes one line
    ``<word>\\t-\\t-\\t<tag>``, and sentences are separated by a single blank
    line (none after the last sentence).

    Rows are processed in their stored order, independent of the frame's
    index labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``words`` and ``tags`` columns holding equally long lists
        of strings per row.

    Returns
    -------
    str
        The complete file content.

    Raises
    ------
    IndexError
        If a sentence has fewer tags than words.
    """
    header = "-DOCSTART- -X- -X- O\n\n"

    sentence_blocks = []
    for words, tags in zip(data["words"], data["tags"]):
        # Index the tags by word position so that a sentence with too few
        # tags fails loudly instead of being truncated.
        lines = [word + "\t-" + "\t-\t" + tags[word_id] + "\n"
                 for word_id, word in enumerate(words)]
        sentence_blocks.append("".join(lines))

    # Joining once avoids repeatedly rebuilding a growing string.
    return header + "\n".join(sentence_blocks)

In [19]:
def write_perturbed_files(train_data, perc_sent, perc_words, n_letters,
                          path_train):
    """Perturb a dataset and write the result to disk in BIO format.

    Parameters
    ----------
    train_data : pandas.DataFrame
        The split to perturb; it is passed to perturb_data as is.
    perc_sent, perc_words, n_letters
        Forwarded to perturb_data as the sentence fraction, the word fraction
        and the number of letters to insert.
    path_train : str
        Destination file; it is overwritten and encoded as UTF-8.
    """
    # perturb_data also returns the chosen word positions; only the frame is needed here.
    _, perturbed_frame = perturb_data(train_data, perc_sent, perc_words, n_letters)

    serialized = format_data(perturbed_frame)

    with open(path_train, "w", encoding="utf-8") as out_file:
        out_file.write(serialized)

    print("File writing complete!")

In [52]:
# Perturb the test split with the settings above and write it to PATH_PERTURBED_TEST.
write_perturbed_files(test_data, P_SENT, P_WORDS, N_LETTERS, PATH_PERTURBED_TEST)

[337, 856, 1199, 576, 981, 442, 972, 1049, 375, 1034, 1080, 483, 6, 28, 759, 876, 141, 296, 475, 476, 86, 894, 833, 904, 69, 673, 1106, 1018, 237, 754, 47, 317, 181, 255, 40, 928, 329, 1130, 702, 821, 992, 309, 382, 76, 962, 481, 152, 404, 427, 145, 250, 1147, 975, 1144, 794, 411, 698, 299, 1097, 838, 597, 747, 924, 1026, 790, 651, 1160, 791, 544, 54, 191, 325, 1015, 5, 1006, 143, 1069, 454, 652, 230, 370, 348, 713, 680, 709, 627, 503, 211, 345, 134, 356, 878, 767, 327, 75, 1064, 494, 1178, 334, 29, 1116, 1005, 1036, 769, 506, 1114, 332, 1184, 899, 834, 989, 117, 589, 106, 528, 593, 71, 859, 509, 349, 807, 142, 692, 1143, 1159, 980, 101, 516, 285, 1196, 57, 434, 749, 315, 755, 872, 1134, 976, 630, 284, 565, 1072, 20, 783, 257, 48, 541, 667, 511, 862, 526, 830, 283, 273, 290, 194, 812, 192, 1158, 51, 915, 288, 147, 243, 1197, 259, 622, 664, 811, 1011, 449, 340, 371, 376, 773, 412, 94, 116, 809, 951, 8, 396, 1182, 885, 65, 407, 782, 580, 179, 874, 728, 238, 258, 987, 1118, 619, 321, 763,

## Testing for correctness

In [29]:
# Reload previously written perturbed files to inspect them.
# NOTE(review): these paths (datasets/5p_*.bio) differ from the ones written in
# the cells above (30p_*.bio in the working directory); presumably the files
# were produced in earlier runs with other settings — confirm.
PATH_PERTURBED_TRAIN = "datasets/5p_train.bio"
PATH_PERTURBED_TEST = "datasets/5p_test.bio"
train_data_perturbed = read_bio_file(PATH_PERTURBED_TRAIN)
test_data_perturbed = read_bio_file(PATH_PERTURBED_TEST)

In [30]:
train_data_perturbed

Unnamed: 0,id,words,tags
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...,...
1716,1716,"[hTe, plants, will, be, considreed, a, palm, t...","[O, O, O, O, O, O, O, O, O, O]"
1717,1717,"[RT, @USER1730, :, I, ca, n't, od, school, any...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1718,1718,"[RT, @USRE789, :, Man, loses, everything, ni, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1719,1719,"[RT, @USER799, :, @USER334, @USER1374, @USER17...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [31]:
test_data_perturbed

Unnamed: 0,id,words,tags
0,0,"[@USER1812, No, ,, I, 'm, not, ., It, 's, defi...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,1,"[new, unique, backpack, !, combines, vintage, ...","[O, O, O, O, O, O, O, O, O, O, O, O]"
2,2,"[RT, @USER767, :, It, 's, been, 3, years, sinc...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,3,"[#openfollow, for, kpopers, just, retweet]","[O, O, O, O, O]"
4,4,"[RT, @USER526, :, empathy, 4, Kesha, ...., the...","[O, O, O, O, O, B-PER, O, O, O, O, O, O, O, O]"
...,...,...,...
1257,1257,"[RT, @USER333, :, Sometimes, I, feel, like, we...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1258,1258,"[My, book, traielr, -, WORST, SINGING, EVER, !...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1259,1259,"[I, m', not, difficult, I, 'm, just, abuot, my...","[O, O, O, O, O, O, O, O, O, O, O]"
1260,1260,"[I, liked, a, @USER1490, video, URL986, LQIUID...","[O, O, O, O, O, O, O, O, O, O, O]"


In [32]:
# Make sure words and tags are still aligned (same length per sentence) after
# the perturbed files were written and read back: first the test split ...
sent_lens = [len(x) for x in test_data_perturbed["words"]]
tags_lens = [len(x) for x in test_data_perturbed["tags"]]
print(sent_lens == tags_lens)

# ... then the train split.
sent_lens = [len(x) for x in train_data_perturbed["words"]]
tags_lens = [len(x) for x in train_data_perturbed["tags"]]
print(sent_lens == tags_lens)

True
True


In [34]:
train_data.iloc[-1]

id                                                      1638
words      [RT, @USER364, :, Donald, Trump, is, the, Clea...
tags       [O, O, O, B-PER, I-PER, O, O, O, O, O, O, O, B...
tag_idx    [0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, ...
Name: 1638, dtype: object

In [35]:
train_data_perturbed.iloc[1638]

id                                                    1638
words    [RT, @USER364, :, Donald, Trump, is, the, Clea...
tags     [O, O, O, B-PER, I-PER, O, O, O, O, O, O, O, B...
Name: 1638, dtype: object

In [38]:
train_data_perturbed.iloc[-1]

id                                                    1720
words    [BETBRIGHT, -, Get, a, £, 30, FREE, matched, e...
tags     [B-ORG, O, O, O, O, O, O, O, O, O, O, O, O, O,...
Name: 1720, dtype: object

In [45]:
paths_to_train = ["datasets/5p_train.bio", "datasets/10p_train.bio", "datasets/20p_train.bio", "datasets/30p_train.bio"]
paths_to_test = ["datasets/5p_test.bio", "datasets/10p_test.bio", "datasets/20p_test.bio", "datasets/30p_test.bio"]
# Check that each perturbed dataset is larger than the original one by the
# expected fraction: the printed ratio (perturbed size / original size) should
# be about 1 + the perturbed share, for train and test in turn.
for i in range(len(paths_to_train)):
    traindata = read_bio_file(paths_to_train[i])
    testdata = read_bio_file(paths_to_test[i])
    print(np.round(len(traindata)/len(train_data), 2))
    print(np.round(len(testdata)/len(test_data), 2))

1.05
1.05
1.1
1.1
1.2
1.2
1.3
1.3
