In [38]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from datasets import Dataset
import numpy as np
import pandas as pd
from huggingface_hub import notebook_login

In [34]:
PATH_TRAIN = "C:/Users/kelis/ITU/Year_2/4th_semester/NLP/project/scripts/NLP_project2024-main/train.bio"

### Functions to read and preprocess data before perturbations

In [37]:
def read_bio_file(path):
    
    data = []
    current_words = []
    current_tags = []

    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()[2:]
        
    for line in lines:
        
        line = line.strip()
        
        if line: # if line is not an empty line
            tok = line.split('\t')
            current_words.append(tok[0])
            current_tags.append(tok[3])
            
        else:
            if current_words:
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []
            
            
    if current_tags != []:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]
    
    return df

class Vocab():
    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}
        self.idx2word = []

    def getIdx(self, word, add=False):
        if word is None or word == self.pad_unk:
            return None
        if word not in self.word2idx:
            if add:
                idx = len(self.idx2word)
                self.word2idx[word] = idx
                self.idx2word.append(word)
                return idx
            else:
                return None
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word[idx]

label_indices = Vocab()
tags_column = train_data["tags"]

for tags in tags_column:
    for tag in tags:
        label_indices.getIdx(tag, add=True)

print(label_indices.word2idx)

{'O': 0, 'B-ORG': 1, 'B-PER': 2, 'B-LOC': 3, 'I-PER': 4, 'B-MISC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


### Read data

In [40]:
train_data = read_bio_file(PATH_TRAIN)

train_data['tag_idx'] = train_data['tags'].apply(lambda x: [label_indices.word2idx[tag] for tag in x])

model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=True)

train_data.head()

Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


### Helper functions for the insertion

In [41]:
def get_alphabet_list():
        
    # loops through the ascii codes of all lower case english letters
    # and makes a list of the characters corresponding to those codes
    return [chr(ascii_code) for ascii_code in range(ord("a"), ord("z")+1)]

def insert_at_idx(word, letter, idx):

    new_str = "" 
    new_str += word[:idx] + letter + word[idx:] #insert chosen letter at chosen index
    
    return new_str

def insert_letter(word, seed=456):
    
    random.seed(seed) #set seed for reproducibility
    
    insert_at = random.randint(0, len(word)) #choose a random index in the word to insert at
    # note: random.randint(start,end) is a closed interval so it takes the "end" number as well
    
    alph = get_alphabet_list()
    letter = random.choice(alph) #choose a random english alphabet letter to be inserted
    
    print(f"word {word} insert letter {letter} at idx {insert_at} (seed {seed})")
    
    new_str = insert_at_idx(word, letter, insert_at) #insert chosen letter at chosen index
    
    return new_str

def insert_multiple_letters(word, N, seed=456, set_seed=False, prints=False):
    
    if set_seed:
        random.seed(seed) #set seed for reproducibility
    
    alph = get_alphabet_list()
    letters = [random.choice(alph) for i in range(N)] # choose N random letters from
    # the english alphabet to insert at the chosen indices
    
    if prints:
        print(f"word {word} | (seed {seed})")
        print(f"Letters to insert: {letters}")
    
    new_str = word

    for i in range(N):
        
        chosen_idx = random.randint(0, len(new_str)) # choose a random index to insert at
        if prints:
            print(f"Inserting letter {letters[i]} at index {chosen_idx} of word {new_str}")
        new_str = insert_at_idx(new_str, letters[i], chosen_idx) # update the word with the chosen insertion
    
    return new_str

def perturb_sentence(sent, perturb_func, perc, seed=456, set_seed=False):
    
    n_words = int(perc * len(sent))
    if n_words == 0:
        return sent
    
    if set_seed:
        random.seed(seed) #set seed for reproducibility
    
    new_sent = sent.copy()
    
    idxs = [x for x in random.sample(list(range(len(sent))), n_words)]
    
    for idx in idxs:
        new_sent[idx] = perturb_func(new_sent[idx], n_words)
        
    return new_sent

In [63]:
def perturb_dataset(data, perturb_func, perc_sents, perc_words, prints=False, seed=456, set_seed=False):
    
    n_sents = int(perc_sents * train_data.shape[0])
    if set_seed:
        random.seed(seed) #set seed for reproducibility
    
    new_data = data.copy()
    
    idxs = [x for x in random.sample(list(range(n_sents)), n_sents)]
    print(idxs)
    
    for idx in idxs:
        
        if prints:
            print(f"Perturbing sentence idx {idx} | Original: ")
            print(data["words"][idx])
            
        new_sent = (perturb_sentence((new_data["words"][idx]).copy(), perturb_func, perc_words)).copy()
        new_data["words"][idx] = new_sent
        
        if prints:
            print(f"Perturbed version:")
            print(new_data["words"][idx])
        
            print(data["words"][idx] == new_data["words"][idx])
        
    return len(idxs), idxs, new_data

In [64]:
p_sents = 0.2
p_words = 0.3

n_modified, idxs, perturbed_data = perturb_dataset(train_data, insert_multiple_letters, p_sents, p_words, prints=False)
print(n_modified)
print(idxs)

[275, 103, 86, 228, 97, 14, 152, 257, 243, 156, 262, 187, 194, 40, 115, 110, 265, 172, 140, 21, 155, 293, 185, 30, 236, 197, 295, 196, 4, 74, 241, 300, 232, 204, 313, 159, 53, 177, 24, 326, 234, 163, 88, 25, 207, 56, 189, 218, 301, 296, 31, 179, 85, 71, 48, 3, 6, 139, 256, 190, 230, 304, 252, 323, 260, 255, 251, 77, 160, 108, 60, 220, 67, 186, 161, 20, 138, 312, 199, 36, 93, 247, 65, 266, 233, 267, 318, 168, 290, 250, 66, 146, 305, 72, 62, 183, 130, 222, 38, 264, 157, 285, 319, 80, 44, 129, 132, 324, 52, 299, 317, 136, 70, 151, 310, 98, 297, 206, 26, 17, 94, 153, 274, 202, 289, 209, 244, 208, 259, 178, 131, 133, 200, 203, 162, 246, 282, 81, 128, 188, 91, 195, 41, 311, 78, 182, 120, 214, 142, 248, 279, 105, 205, 126, 239, 210, 211, 240, 12, 309, 87, 298, 54, 102, 7, 35, 227, 79, 2, 0, 242, 96, 22, 121, 320, 258, 191, 224, 43, 134, 125, 112, 231, 46, 137, 1, 68, 281, 219, 111, 175, 106, 270, 47, 180, 27, 42, 5, 216, 100, 254, 57, 122, 59, 104, 23, 307, 245, 170, 11, 15, 223, 225, 117, 21

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data["words"][idx] = new_sent


In [65]:
perturbed_data

Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER2362, :, Farmall, Heart, wOkxarf, Td...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[#Volunteeakodrvs, are, key, members, of, #CHE...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[@USErR2092ej, is, n't, fihtg, fguunnyy, how, ...","[O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,"[RT, @USER80, :, Silence, igsf, better, than, ...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[I, just, spent, twenty, minutes, trycinekgd, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
1634,1634,"[RT, @USER1701, :, FT, ISLAND, -, I, Hope, (, ...","[O, O, O, O, B-PER, O, B-MISC, I-MISC, O, O, O...","[0, 0, 0, 0, 2, 0, 5, 7, 0, 0, 0, 0]"
1635,1635,"[@USER1321, @USER2526, Probably, ., He, is, n'...","[O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1636,1636,"[RT, @USER1920, :, @USER1260, @USER2624, it, '...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1637,1637,"[You, have, that, right, ,, nor, do, they, int...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [68]:
np.all(perturbed_data["words"] == train_data["words"])

False