In [2]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from datasets import Dataset
import numpy as np
import pandas as pd
from huggingface_hub import notebook_login

In [3]:
import random
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides library warnings (e.g. from pandas) that may point at real problems.
warnings.filterwarnings('ignore')

# Loading the data

In [4]:
def read_bio_file(path, skip_lines=2, word_col=0, tag_col=3, sep='\t'):
    """Read a column-based BIO file into a DataFrame with one row per sentence.

    Every non-blank line describes one token as `sep`-separated columns;
    blank lines separate sentences.

    Parameters:
    path - path of the file to read (opened as UTF-8)
    skip_lines - number of leading lines to ignore (header); defaults to 2
    word_col - column index of the token text; defaults to 0
    tag_col - column index of the tag; defaults to 3
    sep - column separator; defaults to a tab

    Returns: DataFrame with columns ['id', 'words', 'tags'], where 'words' and
    'tags' hold equally long lists and 'id' is the sentence's row number.

    Raises: IndexError if a token line has too few columns.
    """
    data = []
    current_words = []
    current_tags = []

    # Stream the file line by line instead of loading all of it into memory.
    with open(path, "r", encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f):
            if line_no < skip_lines:
                continue

            line = raw_line.strip()

            if line:  # token line
                tok = line.split(sep)
                try:
                    word, tag = tok[word_col], tok[tag_col]
                except IndexError:
                    raise IndexError(
                        f"{path}: line {line_no + 1} has {len(tok)} column(s), "
                        f"cannot read word column {word_col} / tag column {tag_col}"
                    ) from None
                current_words.append(word)
                current_tags.append(tag)
            else:  # blank line: sentence boundary
                if current_words:
                    data.append((current_words, current_tags))
                current_words = []
                current_tags = []

    # The last sentence is not necessarily followed by a blank line.
    if current_words:
        data.append((current_words, current_tags))

    df = pd.DataFrame(data, columns=['words', 'tags'])
    df['id'] = df.index
    df = df[['id', 'words', 'tags']]

    return df

In [5]:
# Read the three dataset splits with the same reader, then preview the training split.
train_data, dev_data, test_data = (
    read_bio_file(bio_path) for bio_path in ("train.bio", "dev.bio", "test.bio")
)
train_data.head()


Unnamed: 0,id,words,tags
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"


# Extracting tag => index dictionary

In [6]:
class Vocab():
    """Two-way mapping between items (e.g. tag strings) and consecutive integer indices.

    Indices are handed out in insertion order, starting at 0. The padding/unknown
    symbol and None never receive an index.
    """

    def __init__(self, pad_unk='<PAD>'):
        self.pad_unk = pad_unk
        self.word2idx = {}   # item -> index
        self.idx2word = []   # index -> item

    def getIdx(self, word, add=False):
        """Return the index of `word`, or None if it has none.

        None and the pad/unk symbol always yield None. An unseen word is
        registered (and its new index returned) only when `add` is True.
        """
        if word is None or word == self.pad_unk:
            return None

        if word in self.word2idx:
            return self.word2idx[word]

        if not add:
            return None

        # The next free index is the current vocabulary size.
        new_idx = len(self.idx2word)
        self.word2idx[word] = new_idx
        self.idx2word.append(word)
        return new_idx

    def getWord(self, idx):
        """Return the item stored at index `idx`."""
        return self.idx2word[idx]

# Build the tag -> index vocabulary from the training split only;
# indices follow the order in which tags are first seen.
label_indices = Vocab()
tags_column = train_data["tags"]

for tag in (tag for tags in tags_column for tag in tags):
    label_indices.getIdx(tag, add=True)

print(label_indices.word2idx)

{'O': 0, 'B-ORG': 1, 'B-PER': 2, 'B-LOC': 3, 'I-PER': 4, 'B-MISC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


Now we map the tags to indices and add them as a column

In [7]:
# Add a 'tag_idx' column to every split: each tag is replaced by its index in label_indices.
# NOTE: the lookup is a plain dict access, so a tag missing from label_indices raises KeyError.
for split_frame in (train_data, dev_data, test_data):
    split_frame['tag_idx'] = split_frame['tags'].apply(
        lambda sentence_tags: [label_indices.word2idx[tag] for tag in sentence_tags]
    )

model_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, padding=True)

train_data.head()

Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


# Perturbations
Functions for perturbing the dataset. We apply them before tokenization, at the word level, so that every perturbed word still lines up with its tag: a perturbation may change the characters of a word, but it must not change the number of words in a sentence.

In [8]:
# Placeholder cell: perturbation functions can be added here.
# For now it only displays the training DataFrame.
train_data

Unnamed: 0,id,words,tags,tag_idx
0,0,"[RT, @USER2362, :, Farmall, Heart, Of, The, Ho...","[O, O, O, B-ORG, O, O, O, O, O, O, O, O, O, O,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,"[#Volunteers, are, key, members, of, #CHEO’s, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"[@USER2092, is, n't, it, funny, how, that, alw...","[O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,3,"[RT, @USER80, :, Silence, is, better, than, li...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,4,"[I, just, spent, twenty, minutes, trying, to, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...
1634,1634,"[RT, @USER1701, :, FT, ISLAND, -, I, Hope, (, ...","[O, O, O, O, B-PER, O, B-MISC, I-MISC, O, O, O...","[0, 0, 0, 0, 2, 0, 5, 7, 0, 0, 0, 0]"
1635,1635,"[@USER1321, @USER2526, Probably, ., He, is, n'...","[O, O, O, O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1636,1636,"[RT, @USER1920, :, @USER1260, @USER2624, it, '...","[O, O, O, O, O, O, O, O, O]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1637,1637,"[You, have, that, right, ,, nor, do, they, int...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [9]:
# Show the words of one dev-set sentence (row label 585) as an example.
dev_data["words"][585]

['Chunky',
 'breads',
 'from',
 '@USER109',
 'are',
 'excellent',
 'for',
 'tearing',
 'and',
 'dunking',
 'into',
 'soup',
 '🍞',
 'URL596']

In [10]:
#me testing things
def cap_word(word, all_letters=True):
    """Capitalize `word`.

    all_letters=True  -> the whole word in upper case
    all_letters=False -> str.capitalize(): first character upper case, the rest lower case
    """
    return word.upper() if all_letters else word.capitalize()

def to_lower(word):
    """Return `word` converted to lower case."""
    lowered = word.lower()
    return lowered

def irregular_capitalization(df, perc_sent, perc_words, apply_to_all=True):
    '''
    Flip the capitalization of randomly chosen words in randomly chosen sentences.

    A chosen word that starts with an upper-case character is lower-cased;
    any other chosen word is capitalized with cap_word.

    Input:
    df - the dataset in a pandas dataframe with a "words" column (one list of words per row)
    perc_sent - fraction of the sentences to be perturbed; float
    perc_words - fraction of the words to be perturbed in each chosen sentence; float
    apply_to_all - whether to capitalize all letters (True) or only the first one (False)
    Return: tuple (number of chosen sentences, list of chosen row positions, altered copy of df).
    A chosen sentence that is too short for perc_words stays unchanged but is still listed.
    The input dataframe and its word lists are not modified.
    '''
    df_copy = df.copy(deep=True)
    num_of_sent = int(perc_sent*len(df_copy))

    # Choose random sentences by row position (0-based), independent of the index labels.
    sentences_ids = random.sample(range(len(df_copy)), num_of_sent)

    # Edit a plain Python list of the column and write it back in one go.
    # Writing through df_copy["words"][i] = ... is chained assignment, which pandas
    # does not guarantee to propagate to the frame.
    all_sentences = df_copy["words"].tolist()

    for sentence_id in sentences_ids:
        # Copy the word list: the deep copy of the frame still shares the list objects.
        sentence = list(all_sentences[sentence_id])
        num_words = int(perc_words*len(sentence)) #how many words to perturb
        if num_words > 0:
            words_ids = random.sample(range(len(sentence)), num_words)
            for word_id in words_ids:
                word = sentence[word_id]
                # word[:1] instead of word[0] so an empty string does not raise IndexError
                if word[:1].isupper():
                    sentence[word_id] = word.lower()
                else:
                    sentence[word_id] = cap_word(word, all_letters=apply_to_all)

        all_sentences[sentence_id] = sentence

    df_copy["words"] = pd.Series(all_sentences, index=df_copy.index, dtype=object)

    return num_of_sent, sentences_ids, df_copy

In [11]:
def swap_neighboring_letters(word):
    """Return `word` with one randomly chosen pair of adjacent characters swapped.

    Words with fewer than two characters are returned unchanged.
    """
    if len(word) < 2:
        return word

    chars = [*word]

    # Pick the left character of the pair; the last character has no right neighbor.
    left = random.randint(0, len(word) - 2)

    # Moving the right neighbor in front of the left character swaps the two.
    chars.insert(left, chars.pop(left + 1))

    return ''.join(chars)

def swap_letters_in_sentences(df, perc_sent, perc_words, apply_to_all=True):
    '''
    Swap two neighboring letters in randomly chosen words of randomly chosen sentences.

    Input:
    df - the dataset in a pandas dataframe with a "words" column (one list of words per row)
    perc_sent - fraction of the sentences to be perturbed; float
    perc_words - fraction of the words to be perturbed in each chosen sentence; float
    apply_to_all - unused here; kept so the signature matches irregular_capitalization
    Return: tuple (number of chosen sentences, list of chosen row positions, altered copy of df).
    The input dataframe and its word lists are not modified.
    '''
    df_copy = df.copy(deep=True)
    num_of_sent = int(perc_sent*len(df_copy))

    # Choose random sentences by row position (0-based), independent of the index labels.
    sentences_ids = random.sample(range(len(df_copy)), num_of_sent)

    # Edit a plain Python list of the column and write it back in one go.
    # Writing through df_copy["words"][i] = ... is chained assignment, which pandas
    # does not guarantee to propagate to the frame.
    all_sentences = df_copy["words"].tolist()

    for sentence_id in sentences_ids:
        # Copy the word list: the deep copy of the frame still shares the list objects.
        sentence = list(all_sentences[sentence_id])
        num_words = int(perc_words*len(sentence)) #how many words to perturb
        if num_words > 0:
            words_ids = random.sample(range(len(sentence)), num_words)
            for word_id in words_ids:
                sentence[word_id] = swap_neighboring_letters(sentence[word_id])
        all_sentences[sentence_id] = sentence

    df_copy["words"] = pd.Series(all_sentences, index=df_copy.index, dtype=object)

    return num_of_sent, sentences_ids, df_copy

# Swap 20% of sentences, swapping 20% of words in each selected sentence

In [12]:
def get_alphabet_list():
    """Return the 26 lower-case English letters 'a'..'z' as a list of one-character strings."""
    first_code, last_code = ord("a"), ord("z")
    # map every character code in the closed range a..z back to its character
    return list(map(chr, range(first_code, last_code + 1)))

def insert_at_idx(word, letter, idx):
    """Return `word` with `letter` inserted in front of position `idx`.

    idx == 0 prepends the letter, idx == len(word) appends it.
    """
    head, tail = word[:idx], word[idx:]
    return "".join((head, letter, tail))

def insert_letter(word, seed=456, set_seed=False, prints=False):
    """Insert one random lower-case English letter at a random position of `word`.

    Parameters:
    word - the string to perturb
    seed - seed for the random module; only applied when set_seed is True
    set_seed - whether to re-seed the random module before drawing (for reproducibility)
    prints - whether to print which letter is inserted where

    Returns: the new string, one character longer than `word`.
    """
    if set_seed:
        random.seed(seed) #set seed for reproducibility

    # random.randint(start, end) is a closed interval, so len(word) (= append) is possible
    insert_at = random.randint(0, len(word))

    letter = random.choice(get_alphabet_list()) #random english alphabet letter to be inserted

    if prints:
        # Only mention the seed when it was actually applied.
        seed_note = f" (seed {seed})" if set_seed else ""
        print(f"word {word} insert letter {letter} at idx {insert_at}{seed_note}")

    return insert_at_idx(word, letter, insert_at)

def insert_multiple_letters(word, N, seed=456, set_seed=False, prints=False):
    """Insert N random lower-case English letters into `word`, one after another.

    Each letter goes to a random position of the word as it looks at that moment,
    so positions are drawn against the growing string.

    Parameters:
    word - the string to perturb
    N - number of letters to insert
    seed - seed for the random module; only applied when set_seed is True
    set_seed - whether to re-seed the random module first
    prints - whether to print every insertion step

    Returns: the new string.
    """
    if set_seed:
        random.seed(seed) #set seed for reproducibility

    alphabet = get_alphabet_list()
    # all N letters are drawn up front, before any position is drawn
    letters = [random.choice(alphabet) for _ in range(N)]

    if prints:
        print(f"word {word} | (seed {seed})")
        print(f"Letters to insert: {letters}")

    result = word
    for letter in letters:
        position = random.randint(0, len(result)) # closed interval: appending is possible
        if prints:
            print(f"Inserting letter {letter} at index {position} of word {result}")
        result = insert_at_idx(result, letter, position)

    return result

def perturb_sentence(sent, perturb_func, perc, seed=456, set_seed=False):
    """Apply `perturb_func` to a random fraction `perc` of the words of `sent`.

    Parameters:
    sent - list of words
    perturb_func - callable invoked as perturb_func(word, n_words); returns the new word
    perc - fraction of the words to perturb
    seed - seed for the random module; only applied when set_seed is True
    set_seed - whether to re-seed the random module first

    Returns: a perturbed copy of `sent`, or `sent` itself (same object) when
    int(perc * len(sent)) is 0.

    NOTE(review): the second argument handed to perturb_func is the number of words
    being perturbed in this sentence; with insert_multiple_letters this becomes the
    number of inserted letters per word - confirm this coupling is intended.
    """
    n_words = int(perc * len(sent))
    if not n_words:
        return sent

    if set_seed:
        random.seed(seed) #set seed for reproducibility

    positions = list(range(len(sent)))
    chosen = random.sample(positions, n_words)

    new_sent = sent.copy()
    for position in chosen:
        new_sent[position] = perturb_func(new_sent[position], n_words)

    return new_sent

In [13]:
def insertion_perturb(data, perturb_func, perc_sents, perc_words, prints=False, seed=456, set_seed=False):
    """Perturb a random fraction of the sentences of `data` with `perturb_func`.

    Parameters:
    data - DataFrame with a "words" column (one list of words per row)
    perturb_func - word-level perturbation, forwarded to perturb_sentence
    perc_sents - fraction of the sentences to perturb
    perc_words - fraction of the words to perturb in each chosen sentence
    prints - whether to print every sentence before and after the perturbation
    seed - seed for the random module; only applied when set_seed is True
    set_seed - whether to re-seed the random module first

    Returns: tuple (number of chosen sentences, list of chosen row positions,
    perturbed copy of data). The input frame and its word lists are not modified.
    """
    n_sents = int(perc_sents * data.shape[0])
    if set_seed:
        random.seed(seed) #set seed for reproducibility

    new_data = data.copy()

    # Row positions (0-based), independent of the index labels.
    idxs = random.sample(list(range(data.shape[0])), n_sents)
    print(idxs)

    # Edit a plain Python list of the column and write it back in one go.
    # Writing through new_data["words"][idx] = ... is chained assignment, which pandas
    # does not guarantee to propagate to the frame.
    original_sentences = data["words"].tolist()
    new_sentences = list(original_sentences)

    for idx in idxs:

        if prints:
            print(f"Perturbing sentence idx {idx} | Original: ")
            print(original_sentences[idx])

        # Hand over a copy so the caller's word list is never touched.
        new_sentences[idx] = perturb_sentence(list(original_sentences[idx]), perturb_func, perc_words)

        if prints:
            print(f"Perturbed version:")
            print(new_sentences[idx])

            print(original_sentences[idx] == new_sentences[idx])

    new_data["words"] = pd.Series(new_sentences, index=new_data.index, dtype=object)

    return len(idxs), idxs, new_data

In [14]:
def deletion_sentence(sent, perc_words):
    """Simulate missed keystrokes: drop one letter from a random fraction of the words.

    Parameters:
    sent - list of words; it is modified IN PLACE, so pass a copy to keep the original
    perc_words - fraction of the words to perturb

    Returns: the same list object `sent`.

    For every chosen word one of three mistake types is drawn:
    1 - last letter missing, 2 - first letter missing, 3 - a middle letter missing.
    One-character words are never changed; type 3 leaves two-character words unchanged.
    """
    n_words = int(len(sent) * perc_words)
    if n_words == 0:
        return sent

    def drop_letter(word, type_of_mistake):
        # one-character words are left alone
        if len(word) <= 1:
            return word
        if type_of_mistake == 1:
            return word[:-1]
        if type_of_mistake == 2:
            return word[1:]
        if type_of_mistake == 3 and len(word) >= 3:
            # the removed letter is neither the first nor the last one
            del_idx = random.randint(1, len(word) - 2)
            return word[:del_idx] + word[del_idx+1:]
        return word

    for word_idx in random.sample(list(range(len(sent))), n_words):
        # the mistake type is drawn for every chosen word, even if the word ends up unchanged
        type_of_mistake = random.randint(1,3)
        sent[word_idx] = drop_letter(sent[word_idx], type_of_mistake)

    return sent

In [15]:
def deletion_dataset(data, perc_sent, perc_words, prints=False):
    """Apply the letter-deletion perturbation to a random fraction of the sentences.

    Parameters:
    data - DataFrame with a "words" column (one list of words per row)
    perc_sent - fraction of the sentences to perturb
    perc_words - fraction of the words to perturb in each chosen sentence
    prints - whether to print every sentence before and after the perturbation

    Returns: tuple (number of chosen sentences, list of chosen row positions,
    perturbed copy of data). The input frame and its word lists are not modified.
    """
    n_sent = int(perc_sent * data.shape[0])
    new_data = data.copy()

    # Row positions (0-based), independent of the index labels.
    idxs = random.sample(list(range(data.shape[0])), n_sent)

    # Edit a plain Python list of the column and write it back in one go.
    # Writing through new_data["words"][idx] = ... is chained assignment, which pandas
    # does not guarantee to propagate to the frame.
    original_sentences = data["words"].tolist()
    new_sentences = list(original_sentences)

    for idx in idxs:

        if prints:
            print(f"Perturbing sentence idx {idx} | Original: ")
            print(original_sentences[idx])

        # deletion_sentence edits its argument in place, so hand over a copy.
        new_sentences[idx] = list(deletion_sentence(list(original_sentences[idx]), perc_words))

        if prints:
            print(f"Perturbed version:")
            print(new_sentences[idx])

            print(original_sentences[idx] == new_sentences[idx])

    new_data["words"] = pd.Series(new_sentences, index=new_data.index, dtype=object)

    return len(idxs), idxs, new_data

In [16]:
def perturb_data(dataset, perc_sent, perc_words, apply_to_all=True):
    """Run all four perturbations in sequence: capitalization, swap, insertion, deletion.

    Each stage works on the output of the previous one and draws its own random
    sentences, so a sentence can be hit by several perturbations.

    Parameters:
    dataset - DataFrame with a "words" column
    perc_sent - fraction of the sentences each stage perturbs
    perc_words - fraction of the words perturbed in each chosen sentence
    apply_to_all - forwarded to the capitalization and swap stages

    Returns: tuple (ids of all chosen sentences, in stage order and without
    de-duplication; copy of the fully perturbed data).
    """
    # stage 1: irregular capitalization
    _, ids_cap, df_cap = irregular_capitalization(dataset, perc_sent, perc_words, apply_to_all)
    print(f"Capped sent ids: {ids_cap}")
    print("###################cap done, start swapping##############")

    # stage 2: swapped neighboring letters
    _, ids_swapped, df_swapped = swap_letters_in_sentences(df_cap, perc_sent, perc_words, apply_to_all)
    print(f"Swapped sent ids: {ids_swapped}")
    print()
    print("#########swapping done, start insertion##################")

    # stage 3: inserted letters
    _, ids_ins, df_ins = insertion_perturb(df_swapped, insert_multiple_letters, perc_sent, perc_words, prints=False)
    print("#########insertion done, start deletion##################")

    # stage 4: deleted letters
    _, ids_del, df_del = deletion_dataset(df_ins, perc_sent, perc_words)
    print()
    print("perturbations done, start merging lists")

    # collect the ids of every stage in the first stage's list
    for later_ids in (ids_swapped, ids_ins, ids_del):
        ids_cap.extend(later_ids)
    print("final df")

    return ids_cap, df_del.copy()

In [17]:
# Seed the random module first so the same sentences and words are perturbed on every run;
# without it the perturbed dev set changes each time this cell is executed.
random.seed(456)
ids_of_mod_sentences, new_perturbed_dev_data = perturb_data(dev_data, 0.01, 0.4)

Capped sent ids: [670, 635, 332, 266, 85, 333, 119]
###################cap done, start swapping##############
Swapped sent ids: [636, 628, 548, 451, 329, 353, 666]

#########swapping done, start insertion##################
[106, 382, 638, 373, 378, 677, 300]
#########insertion done, start deletion##################

perturbations done, start merging lists
final df


In [18]:
# Sort the collected ids in place so they are easier to read.
# perturb_data concatenates the ids of all four perturbations without de-duplicating them.
ids_of_mod_sentences.sort()
print(ids_of_mod_sentences)

[4, 45, 76, 85, 106, 107, 119, 266, 296, 300, 329, 332, 333, 353, 368, 373, 378, 382, 451, 548, 576, 628, 635, 636, 638, 666, 670, 677]


In [19]:
# Total number of collected ids, summed over the four perturbations.
len(ids_of_mod_sentences)

28

In [20]:
# For every chosen sentence print the perturbed version above the original one,
# and flag the sentences that the perturbation left unchanged.
for i in ids_of_mod_sentences:
    print("###########################")
    perturbed_words = new_perturbed_dev_data["words"][i]
    print(perturbed_words)
    original_words = dev_data["words"][i]
    print(original_words)
    if perturbed_words == original_words:
        print("SAME!")

###########################
['RT', '@USER1317', ':', '"', 'The', 'EU', 'tackles', 'climate', 'change', '"', 'by', 'gnoring', 'th', 'building', 'f', 'coal', 'fired', 'power', 'stations', 'in', 'Germany', 'and', 'puts', 'VAT', 'on', 'soar', 'p', '…']
['RT', '@USER1317', ':', '"', 'The', 'EU', 'tackles', 'climate', 'change', '"', 'by', 'ignoring', 'the', 'building', 'of', 'coal', 'fired', 'power', 'stations', 'in', 'Germany', 'and', 'puts', 'VAT', 'on', 'solar', 'p', '…']
###########################
['USER1095', 'Al', 'the', 'more', 'reason', 'o', 'mov', 'to', 'CA', '!', 'Humidity', 'i', 'virtually', 'nonexistent', '!']
['@USER1095', 'All', 'the', 'more', 'reason', 'to', 'move', 'to', 'CA', '!', 'Humidity', 'is', 'virtually', 'nonexistent', '!']
###########################
['@USER2149', 'Whe', 'you', 'going', 'nex', '?', ':)', 'Xxx']
['@USER2149', 'When', 'you', 'going', 'next', '?', ':)', 'Xxxx']
###########################
['Big', 'East', 'ceos', 'ok', 'commish', 'to', 'PURSUE', 'expans

# Tokenization

In [21]:
# True: every sub-token of a word gets the word's label.
# False: only the first sub-token gets it, the following ones get -100.
label_all_tokens = True
def tokenize_and_align_labels(dataset, word_column, tag_column, tokenizer):
    '''
    Tokenize pre-split sentences and align the word-level labels with the sub-word tokens.

    Input:
    dataset - pandas dataframe with one sentence per row
    word_column - name of the column holding the list of words of each sentence
    tag_column - name of the column holding the list of label ids (one per word)
    tokenizer - callable tokenizer; its result must provide word_ids(batch_index=...),
                item assignment and a .data attribute
    Return: the .data of the tokenizer output, extended with a "labels" entry that
    holds one list of label ids per sentence (one id per token).

    Tokens without a source word (word id None) are labelled -100.
    '''
    tokenized_inputs = tokenizer(
        dataset[word_column].tolist(),
        truncation=True,
        is_split_into_words=True,
        padding = True,
    )

    labels = []
    for i, label in enumerate(dataset[tag_column]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # token that does not belong to any word
                token_label = -100
            elif word_idx == previous_word_idx and not label_all_tokens:
                # continuation sub-token of the previous word, not labelled in this mode
                token_label = -100
            else:
                # first sub-token of a word, or a continuation when all tokens are labelled
                token_label = label[word_idx]
            label_ids.append(token_label)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs.data

In [22]:
# Length (in words) of the shortest training sentence.
sentence_lengths = [len(list_) for list_ in train_data["words"]]
min_Len = np.min(sentence_lengths)
min_Len

1

Tokenize the data and align the labels. The tokenizer splits words into subword tokens, so the word-level labels have to be expanded to one label per token.

In [23]:
# Tokenize every split the same way: words from "words", labels from "tag_idx".
tokenized_data, tokenized_dev_data, tokenized_test_data = (
    tokenize_and_align_labels(split_frame, "words", "tag_idx", tokenizer)
    for split_frame in (train_data, dev_data, test_data)
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
