In [1]:
import pandas as pd

# Trying with English Legal Training Data

In [316]:
# Read the annotated acronym spreadsheet and preview its first ten rows.
data = pd.read_excel(io='acronyms_with_text.xlsx')
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,text,acronyms,long-forms,ID,acronym_text,longform_text
0,0,12).; Terms of reference A Correspondence Gro...,"[[194, 199]]","[[164, 192]]",1,"[""IBC's""]",['intermediate bulk containers']
1,1,The comprehensive list of currently identifie...,"[[233, 238]]",[],2,['INT 1'],[]
2,2,Subregional activities for development Legisl...,"[[142, 147]]","[[85, 140]]",3,['ESCAP'],['Economic and Social Commission for Asia and ...
3,3,OIOS recommended that Secretariat programmes t...,"[[239, 247], [142, 146], [0, 4]]","[[167, 237]]",4,"['UN-Women', 'OIOS', 'OIOS']",['United Nations Entity for Gender Equality an...
4,4,98. The Ministry of Education and Culture has...,"[[82, 86]]","[[71, 80]]",5,['NoRa'],['No Racism']
5,5,DRC = Disability Rights Commission ECHR = Conv...,"[[151, 154], [0, 3], [35, 39], [188, 191]]","[[157, 187], [6, 34], [114, 149], [194, 210]]",6,"['EOC', 'DRC', 'ECHR', 'HRA']","['Equal Opportunities Commission', 'Disability..."
6,6,Other travel-related costs Total expenditure B...,"[[324, 327], [366, 371], [69, 75], [168, 175],...","[[330, 364], [374, 429], [78, 166], [178, 230]...",7,"['SBI', 'SBSTA', 'AWG-KP', 'AWG-LCA', 'ADP']","['Subsidiary Body for Implementation', 'Subsid..."
7,7,The Alliance: (a) Led the European (Economic C...,"[[456, 464], [497, 501], [68, 71]]","[[388, 454], [36, 66]]",8,"['UN-Women', 'NGOs', 'ECE']",['United Nations Entity for Gender Equality an...
8,8,10 a.m. - 1 p.m. Closed meeting Conference Roo...,"[[174, 177], [3, 7], [12, 16], [51, 54], [125,...",[],9,"['NLB', 'a.m.', 'p.m.', 'NLB', 'a.m.']",[]
9,9,The Secretary-General has received the followi...,"[[248, 257], [348, 354], [279, 282]]","[[195, 245], [129, 167]]",10,"['I.F.W.L.C', 'ECOSOC', 'NGO']",['International Federation of Women in Legal C...


In [317]:
# Number of rows in the loaded frame.
data.shape[0]

3564

# Clean data: drop rows that have no acronym annotations

In [318]:
# Rows whose 'acronyms' cell is the literal string '[]' carry no annotated span;
# collect their index labels and drop them, then show the remaining row count.
has_no_acronyms = data["acronyms"].eq('[]')
delete_row = data.loc[has_no_acronyms].index
data = data.drop(index=delete_row)
len(data)

3554

# Function for creating a dictionary of the form {'entities': [(start, end, 'Acronym'), ...]} from the acronym indices
### The end index is decremented by 1

In [331]:
def create_entity(row):
    """Turn a stringified list of [start, end] pairs into an annotation dict.

    The end index of every pair is decremented by 1 before it is stored.

    Parameters
    ----------
    row : str
        Text such as ``'[[239, 247], [142, 146]]'``. An empty list (``'[]'``)
        is accepted and produces no entities.

    Returns
    -------
    dict
        ``{'entities': [(start, end - 1, 'Acronym'), ...]}`` in input order.

    Raises
    ------
    ValueError
        If a token is not an integer or the offsets cannot be paired up.

    NOTE(review): spaCy character-offset annotations use an exclusive end
    index; if the incoming offsets are already end-exclusive, use
    ``create_entity2`` instead, which leaves the end index untouched.
    """
    # Drop the list brackets and ignore blank tokens so that '[]' yields an
    # empty offset list instead of failing on int('').
    tokens = [tok for tok in row.replace('[', '').replace(']', '').split(',') if tok.strip()]
    offsets = [int(tok) for tok in tokens]

    if len(offsets) % 2:
        raise ValueError('expected an even number of offsets, got %d' % len(offsets))

    # Walk the flat offset list two items at a time: (start, end).
    entities = [
        (start, end - 1, 'Acronym')
        for start, end in zip(offsets[::2], offsets[1::2])
    ]
    return {'entities': entities}
    


# Same function as above without subtracting 1 from end index

In [362]:
def create_entity2(row):
    """Turn a stringified list of [start, end] pairs into an annotation dict.

    Unlike ``create_entity``, the end index is stored exactly as given.

    Parameters
    ----------
    row : str
        Text such as ``'[[239, 247], [142, 146]]'``. An empty list (``'[]'``)
        is accepted and produces no entities.

    Returns
    -------
    dict
        ``{'entities': [(start, end, 'Acronym'), ...]}`` in input order.

    Raises
    ------
    ValueError
        If a token is not an integer or the offsets cannot be paired up.
    """
    # Drop the list brackets and ignore blank tokens so that '[]' yields an
    # empty offset list instead of failing on int('').
    tokens = [tok for tok in row.replace('[', '').replace(']', '').split(',') if tok.strip()]
    offsets = [int(tok) for tok in tokens]

    if len(offsets) % 2:
        raise ValueError('expected an even number of offsets, got %d' % len(offsets))

    # Walk the flat offset list two items at a time: (start, end).
    entities = [
        (start, end, 'Acronym')
        for start, end in zip(offsets[::2], offsets[1::2])
    ]
    return {'entities': entities}
    


# Creating a new column with the dictionaries

In [363]:

# The offsets in the 'acronyms' column are already end-exclusive (e.g. "NoRa",
# 4 characters, is annotated as [82, 86]), which is the convention spaCy uses
# for character-offset entity spans. The end index must therefore be kept
# as-is, so the non-decrementing parser is applied here.
data['entities'] = data['acronyms'].apply(create_entity2)

In [333]:
# Sanity check: the cell at index label 3 should hold a dict.
type(data.loc[3, 'entities'])

dict

In [334]:
# Show the annotation dict stored at index label 3.
data.loc[3, 'entities']

{'entities': [(239, 247, 'Acronym'), (142, 146, 'Acronym'), (0, 4, 'Acronym')]}

# Zipping the text with the dictionaries to create the spaCy data format

In [335]:
# One (text, {'entities': [...]}) tuple per row, in row order.
spacydata = [
    (text, annotation)
    for text, annotation in zip(data['text'], data['entities'])
]

In [336]:
# Each item is a (text, annotation) pair, so the last element is the annotation.
type(spacydata[3][-1])

dict

In [328]:
# Persist the frame, including the 'entities' column, to an Excel workbook.
# NOTE(review): the row index is written as an extra column by default; the
# 'Unnamed: 0' columns in the loaded sheet suggest this has accumulated over
# earlier round-trips - consider index=False if the index is not needed.
data.to_excel(excel_writer='acronyms_withspacydata.xlsx')

In [364]:
# Persist the same frame as CSV.
# NOTE(review): the row index is written as an extra column by default;
# consider index=False if the index is not needed downstream.
data.to_csv(path_or_buf='acronyms_withspacydata2.csv')

# Train a blank spaCy model for our NER (acronyms)

In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

In [None]:
# Training configuration.
n_iter = 100  # number of passes over the training examples
model = None  # name/path of an existing pipeline to load; None means start from a blank one
# NOTE(review): no cell visible in this notebook reads output_dir - confirm
# where the trained pipeline is meant to be saved.
output_dir = Path("/content/drive/MyDrive/Surrey_acronyms")

In [None]:
# Build the pipeline object: start from a blank English pipeline unless an
# existing model name/path was configured.
if model is None:
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
else:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

# Make sure the pipeline has an entity recogniser and keep a handle on it.
if 'ner' in nlp.pipe_names:
    ner = nlp.get_pipe('ner')
else:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)

In [None]:
# Register every entity label found in the training annotations with the
# NER component (the label is the third item of each entity tuple).
for _, annotations in spacydata:
    entity_spans = annotations.get('entities')
    for span in entity_spans:
        ner.add_label(span[2])


# Train a blank model with 100 iterations

In [None]:
# Disable every pipe except 'ner' for the duration of training so that only
# the entity recogniser is updated.
other_pipes = list(filter(lambda name: name != 'ner', nlp.pipe_names))
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for epoch in range(n_iter):
        # Reorder the examples in place before each pass.
        random.shuffle(spacydata)
        losses = {}
        for sample_text, sample_annotations in tqdm(spacydata):
            # One example per update call.
            nlp.update(
                [sample_text],
                [sample_annotations],
                drop=0.5,
                sgd=optimizer,
                losses=losses,
            )
        # Accumulated losses for this pass.
        print(losses)

# Try the trained model (note: the loop below runs over the training texts, not new data)

In [None]:
# Run the pipeline over every text in spacydata and print the predicted
# entities as (text, label) pairs.
for text, _ in spacydata:
    doc = nlp(text)
    predicted = [(ent.text, ent.label_) for ent in doc.ents]
    print('Entities', predicted)