In [17]:
import pandas as pd

practice = pd.read_csv("data/practice/task2.csv", sep=";", 
                     names = ['index', 'text', 'cause', 'effect', 'offset_sentence2',
                              'offset_sentence3', 'cause_start', 'cause_end', 
                              'effect_start', 'effect_end', 'sentence'],
                    header=0)
trial = pd.read_csv("data/trial/task2.csv", sep=";", 
                     names = ['index', 'text', 'cause', 'effect', 'offset_sentence2',
                              'offset_sentence3', 'cause_start', 'cause_end', 
                              'effect_start', 'effect_end', 'sentence'],
                    header=0)
comb = pd.concat([practice, trial])

In [14]:
import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.load('en')

doc = nlp("I like London and Paris.")
entities = [(7, 23, "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
print(tags)

['O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']


In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [23]:
text = "The Sunshine State drew in a net influx of about $17.7 billion in adjusted gross income (AGI)  -  most of which (72 percent) came from those aged 55 and older. It is consistently one of the most popular destinations for retirees due to affordability and low taxes. Florida's $17.7 billion in net AGI dwarves the remaining 19 states that saw a positive net influx of income  -  which combined for a total of $19.4 billion."

cause = 'It is consistently one of the most popular destinations for retirees due to affordability and low taxes.'
effect = 'The Sunshine State drew in a net influx of about $17.7 billion in adjusted gross income (AGI)  -  most of which (72 percent) came from those aged 55 and older.'

In [43]:
 text_ec = " ".join([ str(t) for t in tokenizer.encode(text)])

In [44]:
cause_ec = 

'0 20 14995 331 4855 11 10 1161 15849 9 59 68 1360 4 406 325 11 5493 4200 1425 36 3450 100 43 1437 111 1437 144 9 61 36 4956 135 43 376 31 167 5180 3490 8 2530 4 85 16 6566 65 9 5 144 1406 11633 13 21156 528 7 21254 8 614 2556 4 1261 18 68 1360 4 406 325 11 1161 5680 100 48253 5 2405 753 982 14 794 10 1313 1161 15849 9 1425 1437 111 1437 61 2771 13 10 746 9 68 1646 4 306 325 4 2'

In [59]:
tag_list = ["X", "B-cause", "I-cause", "L-cause", "B-effect", "I-effect", "L-effect", "O"]
tag2idx = {"X":0, "B-cause":1, "I-cause":2, "L-cause":3, "B-effect":4, "I-effect":5, "L-effect":6, "O": 7}

In [91]:
from tqdm import tqdm

def get_tok_tags(text: str, type: str, tokenizer: AutoTokenizer) -> [str, str]:
    """
        Encode subspan and align tags to encoded subspan
        Return: encoded subspan and aligned tags
    """
    toks = tokenizer.encode(text)[1:-1] 
    if type == "cause":
        toks_tags = " ".join(["B-cause"] + ["I-cause"] * (len(toks) -2) + ["L-cause"])
    else:
        toks_tags = " ".join(["B-effect"] + ["I-effect"] * (len(toks) -2) + ["L-effect"])
    toks = " ".join([ str(t) for t in toks])
    return toks, toks_tags


dr_inputs = []
dr_tags = [] 

for i,v in tqdm(comb.iterrows(), total=len(comb)):
    text_ec = " ".join([str(t) for t in tokenizer.encode(v["text"])])

    ctoks, ctags = get_tok_tags(v["cause"], "cause", tokenizer)
    etoks, etags = get_tok_tags(v["effect"], "effect", tokenizer)

    text_tags = text_ec.replace(ctoks, ctags)
    text_tags = text_tags.replace(etoks, etags)  
    text_tags = [tag2idx[tok] if tok in tag_list else tag2idx["O"] for tok in text_tags.split()] 
    text_tags_encoded = " ".join([str(t) for t in text_tags])
    
    dr_inputs.append(text_ec)
    dr_tags.append(text_tags_encoded)

comb["dr_input"] = dr_inputs
comb["dr_tag"] = dr_tags
comb["dr_pad"] = 1

100%|██████████| 1750/1750 [00:01<00:00, 1057.50it/s]


In [92]:
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

b_inputs = []
b_tags = [] 

for i,v in tqdm(comb.iterrows(), total=len(comb)):
    text_ec = " ".join([str(t) for t in bert_tokenizer.encode(v["text"])])

    ctoks, ctags = get_tok_tags(v["cause"], "cause", bert_tokenizer)
    etoks, etags = get_tok_tags(v["effect"], "effect", bert_tokenizer)

    text_tags = text_ec.replace(ctoks, ctags)
    text_tags = text_tags.replace(etoks, etags)  
    text_tags = [tag2idx[tok] if tok in tag_list else tag2idx["O"] for tok in text_tags.split()] 
    text_tags_encoded = " ".join([str(t) for t in text_tags])
     
    b_inputs.append(text_ec)
    b_tags.append(text_tags_encoded)


comb["bert_input"] = b_inputs
comb["bert_tag"] = b_tags
comb["bert_pad"] = 0

100%|██████████| 1750/1750 [00:04<00:00, 432.57it/s]


In [94]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(comb, random_state = 1988, test_size=.15)

In [76]:

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [98]:
train.to_csv("data/train_task2.csv", index=False)
test.to_csv("data/test_task2.csv", index=False)