# Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import typo
import random

# Data loading

In [None]:
# Load the train and test splits. Both files are read as tab-separated text
# without a header row; the two columns are named 'label' and 'sent'.
full_train = pd.read_csv('./data/full_train.tsv', sep="\t", header=None, names=['label', 'sent'])
test = pd.read_csv('./data/test.tsv', sep="\t", header=None, names=['label', 'sent'])

# Random sample

In [None]:
# Stratified sample: draw 25% of the rows of every 'label' group of the test split.
# No random_state is passed, so the selected rows differ from run to run.
sample_df = test.groupby('label').apply(lambda x: x.sample(frac=0.25))

In [None]:
# Remove the outermost index level (level 0) of the sampled frame.
# NOTE(review): presumably that level is the 'label' group key added by
# groupby().apply(), which leaves the original row index of `test` - confirm.
sample_df = sample_df.droplevel(level=0)

In [None]:
# Preview the first rows of the sample.
sample_df.head()

# Drop rows that were sampled from original df

In [None]:
# Remove the sampled rows from `test` by index label. With inplace=False,
# drop() returns a new frame, which is rebound to `test`.
test = test.drop(sample_df.index, axis=0, inplace=False)

In [None]:
# Preview the first rows of the remaining test data.
test.head()

# Functions for the simulated errors:

In [None]:
def keyboard_typo_noise(sent, ent):
    """Corrupt the text of one tagged entity with ``typo``'s ``nearby_char`` error.

    The sentence is tokenised on single spaces. The tokens between the tags of
    entity 1 (when ``ent == 1``) or entity 2 (any other value) are joined,
    passed through ``typo.StrErrer(...).nearby_char()`` and put back between
    the same tags.
    NOTE(review): judging by its name, ``nearby_char`` substitutes a
    keyboard-adjacent character - confirm against the ``typo`` package.

    Raises ``ValueError`` when the selected entity's tags are not standalone
    tokens of the sentence.
    """
    tokens = sent.split(" ")

    open_tag, close_tag = ("<e1>", "</e1>") if ent == 1 else ("<e2>", "</e2>")
    open_at = tokens.index(open_tag)
    close_at = tokens.index(close_tag)

    clean_entity = " ".join(tokens[open_at + 1:close_at])
    noisy_entity = typo.StrErrer(clean_entity).nearby_char().result

    # Keep everything up to and including the opening tag, then the noisy
    # entity, then everything from the closing tag onwards.
    rebuilt = [*tokens[:open_at + 1], *noisy_entity.split(" "), *tokens[close_at:]]
    return " ".join(rebuilt)

In [None]:
def swap_typo_noise(sent, ent):
    """Corrupt the text of one tagged entity with ``typo``'s ``char_swap`` error.

    The sentence is tokenised on single spaces. The tokens between the tags of
    entity 1 (when ``ent == 1``) or entity 2 (any other value) are joined,
    passed through ``typo.StrErrer(...).char_swap()`` and put back between the
    same tags.
    NOTE(review): judging by its name, ``char_swap`` transposes characters of
    the text - confirm against the ``typo`` package.

    Raises ``ValueError`` when the selected entity's tags are not standalone
    tokens of the sentence.
    """
    tokens = sent.split(" ")

    open_tag, close_tag = ("<e1>", "</e1>") if ent == 1 else ("<e2>", "</e2>")
    open_at = tokens.index(open_tag)
    close_at = tokens.index(close_tag)

    clean_entity = " ".join(tokens[open_at + 1:close_at])
    noisy_entity = typo.StrErrer(clean_entity).char_swap().result

    # Keep everything up to and including the opening tag, then the noisy
    # entity, then everything from the closing tag onwards.
    rebuilt = [*tokens[:open_at + 1], *noisy_entity.split(" "), *tokens[close_at:]]
    return " ".join(rebuilt)

In [None]:
def left_increasing_span(sent, ent):
    """Widen one tagged entity by a single token, preferring the left side.

    The sentence is tokenised on single spaces. For entity ``ent`` (1 or 2)
    the opening tag is swapped with the token in front of it, so that token
    becomes part of the entity. That move is skipped when the opening tag is
    the first token or directly follows the other entity's closing tag; the
    closing tag is then swapped with the token after it instead, unless it is
    the last token or is directly followed by the other entity's opening tag.
    When neither move is possible, or ``ent`` is neither 1 nor 2, the sentence
    comes back unchanged.

    Raises ``ValueError`` when a tag that has to be looked up is missing.
    """
    tokens = sent.split(" ")

    if ent == 1:
        own_open, own_close, other_open, other_close = "<e1>", "</e1>", "<e2>", "</e2>"
    elif ent == 2:
        own_open, own_close, other_open, other_close = "<e2>", "</e2>", "<e1>", "</e1>"
    else:
        return " ".join(tokens)

    open_at = tokens.index(own_open)
    close_at = tokens.index(own_close)

    if open_at != 0 and tokens.index(other_close) != open_at - 1:
        # Pull the preceding token into the entity.
        tokens[open_at - 1], tokens[open_at] = tokens[open_at], tokens[open_at - 1]
    elif tokens[-1] != own_close and tokens.index(other_open) != close_at + 1:
        # Cannot grow to the left: pull the following token in instead.
        tokens[close_at], tokens[close_at + 1] = tokens[close_at + 1], tokens[close_at]

    return " ".join(tokens)

In [None]:
def right_increasing_span(sent, ent):
    """Widen one tagged entity by a single token, preferring the right side.

    The sentence is tokenised on single spaces. For entity ``ent`` (1 or 2)
    the closing tag is swapped with the token after it, so that token becomes
    part of the entity. That move is skipped when the closing tag is the last
    token or is directly followed by the other entity's opening tag; the
    opening tag is then swapped with the token in front of it instead, unless
    it is the first token or directly follows the other entity's closing tag.
    When neither move is possible, or ``ent`` is neither 1 nor 2, the sentence
    comes back unchanged.

    Raises ``ValueError`` when a tag that has to be looked up is missing.
    """
    tokens = sent.split(" ")

    if ent == 1:
        own_open, own_close, other_open, other_close = "<e1>", "</e1>", "<e2>", "</e2>"
    elif ent == 2:
        own_open, own_close, other_open, other_close = "<e2>", "</e2>", "<e1>", "</e1>"
    else:
        return " ".join(tokens)

    open_at = tokens.index(own_open)
    close_at = tokens.index(own_close)

    if tokens[-1] != own_close and tokens.index(other_open) != close_at + 1:
        # Pull the following token into the entity.
        tokens[close_at], tokens[close_at + 1] = tokens[close_at + 1], tokens[close_at]
    elif open_at != 0 and tokens.index(other_close) != open_at - 1:
        # Cannot grow to the right: pull the preceding token in instead.
        tokens[open_at - 1], tokens[open_at] = tokens[open_at], tokens[open_at - 1]

    return " ".join(tokens)

In [None]:
def splitting_entity_span(sent, ent):
    """Shrink one tagged entity so that only part of it stays inside the tags.

    The sentence is tokenised on single spaces and the tokens between the tags
    of entity 1 (``ent == 1``) or entity 2 (any other value) are split in two.
    One part stays tagged; the other is moved directly outside the tags, so
    the word order of the sentence never changes.

    * Single-token entity: the token is cut at a random character position
      and either half is kept as the entity (chosen at random).
    * Entity containing both "(" and ")":
        - if the parenthetical spans the whole entity, cut at a random token
          boundary and keep the left part;
        - otherwise choose at random between removing the parenthetical from
          the entity and using the parenthetical as the entity. When no token
          holds a "(" without a ")", fall back to a random token-boundary cut
          keeping the left part.
    * Any other multi-token entity: cut at a random token boundary and keep
      the left part.

    Returns the re-joined sentence.

    Raises:
        ValueError: if the entity tags are missing, or the entity is empty or
            a single character (there is no position to cut at).
        IndexError: if a parenthetical opens on the first token but no token
            holds a ")" without a "(".
    """
    split_sent = sent.split(" ")

    if ent == 1:
        start = split_sent.index("<e1>")
        end = split_sent.index("</e1>")
    else:
        start = split_sent.index("<e2>")
        end = split_sent.index("</e2>")

    entity = " ".join(split_sent[start + 1:end])
    split_ent = entity.split(" ")

    before, opening = split_sent[:start], split_sent[start:start + 1]
    closing, after = split_sent[end:end + 1], split_sent[end + 1:]

    def keep_left(parts, cut):
        # parts[:cut] stays tagged; the rest follows the closing tag.
        return " ".join(before + opening + parts[:cut] + closing + parts[cut:] + after)

    def keep_right(parts, cut):
        # parts[cut:] stays tagged; the rest precedes the opening tag.
        return " ".join(before + parts[:cut] + opening + parts[cut:] + closing + after)

    # Single word: cut inside the word, then pick which half stays tagged.
    if len(split_ent) == 1:
        pos = random.randint(1, len(entity) - 1)
        halves = [entity[:pos], entity[pos:]]
        if random.randint(0, 1) == 0:
            return keep_right(halves, 1)
        return keep_left(halves, 1)

    last_index = len(split_ent) - 1

    if "(" in entity and ")" in entity:
        first_open = next(i for i, tok in enumerate(split_ent) if "(" in tok)
        first_close = next(i for i, tok in enumerate(split_ent) if ")" in tok)

        # The parenthetical is the whole entity: nothing to separate, so cut at random.
        # The last position is taken from the length, because list.index() would
        # return an earlier position if the last token also occurs before.
        if first_open == 0 and first_close == last_index:
            return keep_left(split_ent, random.randint(1, last_index))

        remove_parenthetical = random.randint(0, 1) == 0

        open_only = [i for i, tok in enumerate(split_ent) if "(" in tok and ")" not in tok]
        if not open_only:
            # No multi-token parenthetical to isolate: cut at random.
            return keep_left(split_ent, random.randint(1, last_index))

        paren_start = open_only[0]
        if paren_start != 0:
            # Shape "... (...)": the parenthetical is the right-hand part.
            if remove_parenthetical:
                return keep_left(split_ent, paren_start)
            return keep_right(split_ent, paren_start)

        # Shape "(...) ...": the parenthetical is the left-hand part.
        close_only = [i for i, tok in enumerate(split_ent) if ")" in tok and "(" not in tok]
        paren_end = close_only[0] + 1
        if remove_parenthetical:
            # Place the parenthetical directly in front of the opening tag
            # (not at the start of the sentence) to preserve word order.
            return keep_right(split_ent, paren_end)
        return keep_left(split_ent, paren_end)

    # Multi-word entity without a parenthetical: cut at a random token boundary.
    return keep_left(split_ent, random.randint(1, last_index))
    


In [None]:
def whitespace_ent(string):
    """Return ``string`` with whitespace on both sides of each entity tag.

    The tags are handled in the order ``<e1>``, ``<e2>``, ``</e1>``, ``</e2>``
    (first occurrence of each). A single space is inserted wherever a tag
    touches a non-whitespace character. Nothing is inserted in front of an
    opening tag at the very start of the string or behind a closing tag at
    the very end.

    Raises ``ValueError`` if any of the four tags is missing.
    """
    markers = ('<e1>', '<e2>', '</e1>', '</e2>')

    # Look up every tag once before editing, so a missing tag fails immediately.
    for marker in markers:
        string.index(marker)

    for marker in markers:
        width = len(marker)
        is_closing = marker.startswith('</')
        at = string.index(marker)

        # Left side: an opening tag at position 0 has no left neighbour to check.
        if is_closing or at != 0:
            if not string[at - 1].isspace():
                string = string[:at] + ' ' + string[at:]
                at = string.index(marker)

        # Right side: a closing tag that ends the string has no right neighbour.
        if not is_closing or at + width != len(string):
            if not string[at + width].isspace():
                string = string[:at + width] + ' ' + string[at + width:]

    return string

In [None]:
# Create the 'new_sent' column, initialised to a single space in every row.
sample_df["new_sent"] = " "

In [None]:
# Pool of span-level perturbations to draw from. The character-level typo
# noises (keyboard_typo_noise, swap_typo_noise) are not part of the pool.
noise = [left_increasing_span,
         right_increasing_span,
         splitting_entity_span]

skipped = 0
normalised_sents = []
noised_sents = []
for sent in sample_df['sent']:
    # Make sure every entity tag is a standalone, space-separated token.
    sent = whitespace_ent(sent)

    # choose randomly to change entity 1 or 2
    k = random.randint(1, 2)

    try:
        new_sent = random.choice(noise)(sent, k)
    except Exception:
        # Best effort: a sentence the chosen noise cannot handle keeps its
        # (whitespace-normalised) text and is printed for inspection.
        skipped += 1
        new_sent = sent
        print(sent)
        print()

    normalised_sents.append(sent)
    noised_sents.append(new_sent)

# Rows yielded by DataFrame.iterrows() may be copies, so values assigned to
# them are not guaranteed to reach the frame. Collect the results and assign
# whole columns instead.
sample_df['sent'] = normalised_sents
sample_df['new_sent'] = noised_sents
print(skipped)

In [None]:
# Preview the first rows of the sample, including the 'new_sent' column.
sample_df.head()

# Write to tsv 

In [None]:
# Keep only the label and the noised sentence of each sampled row.
augmented_data = sample_df[['label', 'new_sent']]

In [None]:
# Rename 'new_sent' to 'sent' so the columns match those of `test`.
augmented_data = augmented_data.rename(columns={'new_sent':'sent'})

In [None]:
# Stack the remaining test rows and the noised rows, sort them by index and
# write the result as a tab-separated file without header or index column.
pd.concat([test, augmented_data]).sort_index().to_csv('./data/test_25_NER.tsv', 
                                                      header=False, index=False, sep="\t")

# Create BLANK dataset

In [None]:
# Read both splits from disk again, replacing the current `full_train` and
# `test` frames. The files are tab-separated without a header row; the two
# columns are named 'label' and 'sent'.
full_train = pd.read_csv('./data/full_train.tsv', sep="\t", header=None, names=['label', 'sent'])
test = pd.read_csv('./data/test.tsv', sep="\t", header=None, names=['label', 'sent'])

In [None]:
def whitespace_ent(string):
    """Return ``string`` with whitespace on both sides of each entity tag.

    The tags are handled in the order ``<e1>``, ``<e2>``, ``</e1>``, ``</e2>``
    (first occurrence of each). A single space is inserted wherever a tag
    touches a non-whitespace character. Nothing is inserted in front of an
    opening tag at the very start of the string or behind a closing tag at
    the very end.

    Raises ``ValueError`` if any of the four tags is missing.
    """
    markers = ('<e1>', '<e2>', '</e1>', '</e2>')

    # Look up every tag once before editing, so a missing tag fails immediately.
    for marker in markers:
        string.index(marker)

    for marker in markers:
        width = len(marker)
        is_closing = marker.startswith('</')
        at = string.index(marker)

        # Left side: an opening tag at position 0 has no left neighbour to check.
        if is_closing or at != 0:
            if not string[at - 1].isspace():
                string = string[:at] + ' ' + string[at:]
                at = string.index(marker)

        # Right side: a closing tag that ends the string has no right neighbour.
        if not is_closing or at + width != len(string):
            if not string[at + width].isspace():
                string = string[:at + width] + ' ' + string[at + width:]

    return string

In [None]:
def _blank_entities(sent):
    """Return ``sent`` with the text of both tagged entities replaced by [BLANK].

    The sentence is first passed through ``whitespace_ent``. Everything
    between ``<e1>``/``</e1>`` and between ``<e2>``/``</e2>`` is then replaced
    by `` [BLANK] ``; the tags themselves are kept. The entity whose opening
    tag comes first in the string is handled first.
    NOTE(review): the slicing assumes the two entities do not overlap or
    nest - confirm against the data.

    Raises ``ValueError`` if any of the four tags is missing.
    """
    sent = whitespace_ent(sent)

    start1 = sent.index('<e1>')
    start2 = sent.index('<e2>')
    end1 = sent.index('</e1>')
    end2 = sent.index('</e2>')

    # Defined before branching so that both orderings can use it.
    new_ent = ' [BLANK] '

    if start1 < start2:
        return sent[:start1+4] + new_ent + sent[end1:start2+4] + new_ent + sent[end2:]
    return sent[:start2+4] + new_ent + sent[end2:start1+4] + new_ent + sent[end1:]


# Rows yielded by DataFrame.iterrows() may be copies, so values assigned to
# them are not guaranteed to reach the frame. Rebuild the whole column instead.
full_train['sent'] = full_train['sent'].map(_blank_entities)
test['sent'] = test['sent'].map(_blank_entities)


In [None]:
# Write the train frame as a tab-separated file without header or index column.
full_train.to_csv('./data/full_train_blank.tsv', header=False, index=False, sep="\t")

In [None]:
# Write the test frame as a tab-separated file without header or index column.
test.to_csv('./data/test_blank.tsv', header=False, index=False, sep="\t")