In [3]:
import spacy
import pandas as pd

# Load spaCy's small English pipeline once; tokenize() below reuses this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")


# Function for tokenising text, keeping each whitespace as a separate token

In [4]:
#tokenize text

def tokenize(raw):
    """Split ``raw`` into token strings, keeping whitespace as separate items.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    token contributes its text, followed by its trailing whitespace as an
    extra list item whenever that whitespace is non-empty.

    Returns a flat list of strings.
    """
    pieces = []
    for tok in nlp(raw):
        trailing = tok.whitespace_
        # Tokens without trailing whitespace contribute only their own text.
        pieces += [tok.text, trailing] if trailing else [tok.text]
    return pieces

# Illustrative example on English Legal Train set

In [5]:

# Location of the English legal training split (machine-specific absolute path).
train_json_path = 'C:/Users/sadan/OneDrive/Surrey/acronyms/PR-AAAI22-SDU-ST1-AE/data/english/legal/train.json'
data = pd.read_json(train_json_path, encoding='utf8')

In [6]:
# Add a 'tokenized' column holding, for each row, the token list that tokenize() returns for its text.
data['tokenized'] =data['text'].apply(tokenize)

In [10]:
# Rename the 'long-forms' column to 'longforms': the hyphenated name is not a valid
# Python identifier, so it could not be read with attribute access (x.longforms)
# when bio() is applied to each row.
data = data.rename(columns={"long-forms": "longforms"})
data.head()

Unnamed: 0,text,acronyms,longforms,ID,tokenized
0,12).; Terms of reference A Correspondence Gro...,"[[194, 199]]","[[164, 192]]",1,"[12, ), ., ;, , , Terms, , of, , reference..."
1,The comprehensive list of currently identifie...,"[[233, 238]]",[],2,"[ , The, , comprehensive, , list, , of, , ..."
2,Subregional activities for development Legisl...,"[[142, 147]]","[[85, 140]]",3,"[ , Subregional, , activities, , for, , dev..."
3,OIOS recommended that Secretariat programmes t...,"[[239, 247], [142, 146], [0, 4]]","[[167, 237]]",4,"[OIOS, , recommended, , that, , Secretariat..."
4,98. The Ministry of Education and Culture has...,"[[82, 86]]","[[71, 80]]",5,"[ , 98, ., , The, , Ministry, , of, , Educ..."


#  The function for annotating with BIO:
 1. Long-forms are annotated as: B-LF, I-LF
 2. Acronyms are annotated as: B-AC (all subwords are concatenated, e.g. "un-women" is treated as one acronym)
 3. Other is annotated as: B-O

In [11]:
def bio(text,longforms,acronyms,tokens):
    """Label the tokens of one sentence as long form, acronym or other.

    The tokens are walked left to right while tracking the character offset
    at which each one starts; that offset is compared with the annotated
    spans to pick a label.

    Parameters
    ----------
    text : str
        The raw sentence. Not used by the labeling itself; kept so existing
        callers keep working.
    longforms : list of [start, end]
        Character spans of long forms. ``end`` is exclusive.
    acronyms : list of [start, end]
        Character spans of acronyms, in any order. ``end`` is exclusive.
    tokens : list of str
        Tokens of the sentence, whitespace tokens included. Offsets are
        derived from the token lengths, so the tokens must concatenate back
        to the text the spans refer to.

    Returns
    -------
    list of (str, str)
        ``(token, label)`` pairs. The first token of a long form is
        ``'B-LF'`` and the following ones ``'I-LF'``. All tokens of one
        acronym are concatenated into a single ``'B-AC'`` entry. Everything
        else is ``'B-O'``. Single spaces and parentheses outside an acronym
        are skipped and produce no entry.
    """
    sent_labeled = []
    acr_acc = ''      # text of the acronym currently being assembled
    acr_end = None    # end offset of that acronym; None when no acronym is open
    char_index = 0    # offset in the text at which the current token starts

    for w in tokens:
        # Close the open acronym once its own span has been passed. Doing this
        # first, and against the open span only, keeps another acronym's end
        # offset from cutting this one short, and lets the space or
        # parenthesis that follows it be skipped like any other.
        if acr_end is not None and char_index >= acr_end:
            sent_labeled.append((acr_acc, 'B-AC'))
            acr_acc = ''
            acr_end = None

        # Spaces and parentheses are only kept while inside an acronym.
        if acr_end is None and w in (' ', '(', ')'):
            char_index += 1
            continue

        labeled = False

        # Long forms: the token starting exactly at the span start opens it.
        for span in longforms:
            if char_index == span[0]:
                sent_labeled.append((w, 'B-LF'))
                labeled = True
                break
            if span[0] < char_index < span[1]:
                sent_labeled.append((w, 'I-LF'))
                labeled = True
                break

        # Acronyms: accumulate the pieces; the entry is emitted when closed.
        if acr_end is not None:
            acr_acc += w
            labeled = True
        else:
            for span in acronyms:
                # A token that lands inside a span without hitting its start
                # also opens the acronym, so its text is emitted, not dropped.
                if char_index == span[0] or span[0] < char_index < span[1]:
                    acr_acc = w
                    acr_end = span[1]
                    labeled = True
                    break

        if not labeled:
            sent_labeled.append((w, 'B-O'))
        char_index += len(w)

    # An acronym that runs to the end of the sentence is still open here.
    if acr_end is not None:
        sent_labeled.append((acr_acc, 'B-AC'))

    return sent_labeled

# Apply the function to the dataframe directly


In [15]:
def _bio_for_row(row):
    """Run bio() on the relevant columns of a single dataframe row."""
    return bio(row["text"], row["longforms"], row["acronyms"], row["tokenized"])


# axis=1 passes each row to the helper, giving one labeled token list per sentence.
bio_eng_train_legal = data.apply(_bio_for_row, axis=1)

# Save the BIO annotations to a text file, one "token label" pair per line

In [16]:
# Write one "token label" pair per line. The encoding is given explicitly because
# the input was read as UTF-8, and the platform default used by open() (for example
# cp1252 on Windows) cannot encode every character that may occur in the tokens.
with open('bio_eng_leg_train.txt', 'w', encoding='utf8') as outfile:
    for labeled_sentence in bio_eng_train_legal:
        for pair in labeled_sentence:
            outfile.write(" ".join(map(str, pair)) + '\n')