In [1]:
import pandas as pd
import pickle
import rdflib
from glob import glob
from pandas import DataFrame, notnull
import os

# Create Gold Set

In [2]:
class FileReader:
    def __init__(self,dir_name,out_filename, multiple_folders):
        self.ann_dict = dict()     #stores information  contained in .ann files
        self.dir_name = dir_name   #name of the directory containing data
        self.org_name = []              #columns of the table
        self.org_entity_id = []         #columns of the table
        self.org_found = []             #columns of the table
        self.org_snippet = []           #columns of the table
        self.org_offset_start = []      #columns of the table
        self.org_offset_end = []        #columns of the table
        self.org_folder = []            #columns of the table
        self.org_filename = []          #columns of the table
        self.mention_offset_start = []  #columns of the table
        self.mention_offset_end = []    #columns of the table
        self.is_acronym = []            #columns of the table
        self.multiple_folders = multiple_folders #stores the multiple folders input
        self.df = DataFrame()       #dataframe
        
        #Annotation information is obtained using different functions
        #depending on the structure of the input directory.
        if self.multiple_folders:
            self.get_annotation_information_multiple_folders() #fill ann_dict with info from the files
        else:
            self.get_annotation_information()                  #fill ann_dict with info from the files
        self.get_columns()                  #construct columns 
        self.create_df()                    #create dataframe
        self.df.to_csv(out_filename)        #write output dataframe to csv
         
    def extract_values(self, list_line):
        """
        Input: Line of annotation file as a list where line is splitted with whitespace
        Output: Key,Value pair containing information of the line
        Some annotation classes are not implemented yet.  
        """
        annotation_type = list_line[1]
        key = None
        value = None
        if annotation_type == "Organization" or annotation_type == 'Sponsor':
            #key = ('Organization','annotation_row_id')
            #value = [Offset_1,Offset_2,Organization_Name]
            key = ('Organization',list_line[0])
            #Workaround for annotations like ";"
            end_ind = 3
            while ";" in list_line[end_ind]:
                end_ind += 1    
            value = [int(list_line[2]),int(list_line[end_ind]),' '.join(map(str,list_line[(end_ind+1):]))]
        
        elif annotation_type == "Organization-Acronym" or annotation_type == 'Sponsor-Acronym':
            #key = ('Organization-Acronym','annotation_row_id')
            #value = [Offset_1,Offset_2,Organization_Acronym]
            key = ('Organization-Acronym',list_line[0])
            #Workaround for annotations like ";"
            end_ind = 3
            while ";" in list_line[end_ind]:
                end_ind += 1    
            value = [int(list_line[2]),int(list_line[end_ind]),' '.join(map(str,list_line[(end_ind+1):]))]

        elif annotation_type == "Found_In_FundRef":
            #key = ('Found_In_FundRef','annotation_row_id_of_organization')
            #value = True/False
            key = ('Found_In_FundRef',list_line[2])
            value = True if list_line[3] == 'Yes' else False
        elif annotation_type == "AnnotatorNotes":
            #key = ('AnnotatorNotes','annotation_row_id_of_organization')
            #value = Entity_id
            key = ('AnnotatorNotes',list_line[2])
            value = list_line[3]
        elif annotation_type == "Funding_Snippet" :
            #key = ('Funding_Snippet',annotation_row_id)
            #value = [start_snippet_row_id,end_snippet_row_id]
            key = ('Funding_Snippet',list_line[0])
            value =[list_line[2][5:],list_line[3][5:]]
        elif annotation_type == "SnippetStart" :
            #key = ('SnippetStart','start_snippet_row_id')
            #value = [Offset_1,Offset_2]
            key = ('SnippetStart',list_line[0])
            value =  [int(list_line[2]),int(list_line[3])]
        elif annotation_type == "SnippetEnd" :
            #key = ('SnippetEnd','end_snippet_row_id')
            #value = [Offset_1,Offset_2]
            key = ('SnippetEnd',list_line[0])
            value =  [int(list_line[2]),int(list_line[3])]
        elif annotation_type == "No_FB_Found_In_This_Article" :
            #key = ('No_FB_Found_In_This_Article',None)
            #value = None
            key = ('No_FB_Found_In_This_Article',None)
        elif annotation_type == "AcronymOf" :
            #This is a bidirectional relationship. 
            #That is why it returnes two (key, value) pairs 
            #key = [('AcronymOf',acronym_row_id), ('AcronymOf',organization_row_id)]
            #value = [organization_row_id, acronym_row_id]
            key = [('AcronymOf',list_line[2][5:]), ('AcronymOf',list_line[3][5:])]
            value = [list_line[3][5:], list_line[2][5:]]
        return key, value
    
    def preprocess_snippet_info(self,ann_info):
        """Get offsets of funding snippets"""
        funding_snippets = [v for k, v in ann_info.items() if k[0] == 'Funding_Snippet']
        snippet_idxs = []
        for funding_snippet in funding_snippets:
            #Get first offset from SnippetStart to get the start point of the paragraph
            start_idx = ann_info[('SnippetStart',funding_snippet[0])][0] 
            #Get second offset from SnippetEnd to get the start point of the paragraph
            end_idx = ann_info[('SnippetEnd',funding_snippet[1])][1] 
            snippet_idxs.append((start_idx,end_idx))
        return snippet_idxs
    
    def get_annotation_information_multiple_folders(self):
        """Fill ann_dict with information from the .ann files"""
        #Loop over folders in data directory
        for folder1 in glob(self.dir_name + "*"):
            for folder in glob(folder1 + '\\*'):
                #Loop over annotation files
                for file_ in glob(folder +"\*.ann"):
                    #Get file id
                    file_id = file_.replace(folder,"")[1:-4]
                    #Initialize a dictionary for the file
                    file_dict = dict()
                    is_correct = True
                    with open(file_, "r", encoding="utf8") as f:
                        #Process information on each line
                        for line in f:
                            key = None
                            value = None
                            try:
                                key, value = self.extract_values(line.strip().split())
                            except (ValueError, IndexError) as ve:
                                print("Error on annotation file: ",file_, "\nError str: ",ve,"\n\n")
                                #Discard this annotation file
                                is_correct = False
                                break
                            #This if clause can be removed when all fields are used
                            if key is not None:
                                #Key,value of AcronymOf is a list
                                if type(key) == list:
                                    for i in range(2):
                                        file_dict[key[i]] = value[i]
                                else:
                                    file_dict[key] = value
                        #Add information from annotation file to dictionary if no errors
                        if is_correct and len(file_dict) != 0:
                            #Preprocess the folder name
                            folder_processed = folder.split('\\')[-2:]
                            folder_processed = folder_processed[0] + '\\' + folder_processed[1]
                            self.ann_dict[(folder_processed,file_id)]=file_dict
                            
    def get_annotation_information(self):
        """Fill ann_dict with information from the .ann files"""
        #Loop over folders in data directory
        for folder in glob(self.dir_name + "*"):
            #Loop over annotation files
            for file_ in glob(folder +"\*.ann"):
                #Get file id
                file_id = file_.replace(folder,"")[1:-4]
                #Initialize a dictionary for the file
                file_dict = dict()
                is_correct = True
                with open(file_, "r", encoding="utf8") as f:
                    #Process information on each line
                    for line in f:
                        key = None
                        value = None
                        try:
                            key, value = self.extract_values(line.strip().split())
                        except (ValueError, IndexError) as ve:
                            print("Error on annotation file: ",file_, "\nError str: ",ve,"\n\n")
                            #Discard this annotation file
                            is_correct = False
                            break
                        #This if clause can be removed when all fields are used
                        if key is not None:
                            #Key,value of AcronymOf is a list
                            if type(key) == list:
                                for i in range(2):
                                    file_dict[key[i]] = value[i]
                            else:
                                file_dict[key] = value
                    #Add information from annotation file to dictionary if no errors
                    if is_correct and len(file_dict) != 0:
                        folder_processed = folder.split('\\')[-1]
                        self.ann_dict[(folder_processed,file_id)]=file_dict
                                
    def get_columns(self):
        """Construct the columns using the information in ann_dict"""
        #Loop over information extracted on each annotation file
        for file_key, ann_info in self.ann_dict.items(): 
            #Get snippets from the annotation file
            try:
                snippet_idxs = self.preprocess_snippet_info(ann_info)
            except KeyError as ke:
                print("KeyError on annotation file: " + file_key[0]+"\\"+file_key[1]+".ann\nError str: ",ke,"\n\n")
                #Skip this iteration as there is an error
                continue
            #Get organization row ids
            org_ids = [(k[1],'org') for k in ann_info.keys() if k[0] == 'Organization']
            org_ids = org_ids + [(k[1],'acr') for k in ann_info.keys() if k[0] == 'Organization-Acronym']
            #For each organization in the annotation file
            for org_id in org_ids:
                #Organization name and offsets
                org_info = None
                #If it is an acronym
                if org_id[1] == 'acr':
                    org_info = ann_info.get(('Organization-Acronym',org_id[0]))
                    self.is_acronym.append(True)
                #If it is not an acronym
                else:
                    org_info = ann_info.get(('Organization',org_id[0]))
                    self.is_acronym.append(False)
                org_id = org_id[0]
                self.org_name.append(org_info[2])
                self.mention_offset_start.append(org_info[0])
                self.mention_offset_end.append(org_info[1])
                #Get related snippet information
                found = False
                for snippet_idx in snippet_idxs:
                    #If this snippet includes this organization's information
                    if snippet_idx[0] <= org_info[0] and snippet_idx[1] >= org_info[1] and not found:
                        try:
                            #extract snippet from txt file
                            text_file_path = ''
                            if self.multiple_folders:
                                text_file_path = self.dir_name + '\\' + file_key[0].split('\\')[-1]+"\\"+file_key[1]+".txt"
                            else:
                                text_file_path = self.dir_name+"\\"+file_key[1]+".txt"
                            with open(text_file_path, "r", encoding="utf8") as f:
                                self.org_snippet.append(f.read()[snippet_idx[0]:snippet_idx[1]])
                            #get the relative offset
                            self.org_offset_start.append(org_info[0]-snippet_idx[0])
                            self.org_offset_end.append(org_info[1]-snippet_idx[0])
                            found = True
                        except FileNotFoundError as e:
                            print("FileNotFoundError on text file: " + text_file_path  +'\nError str: ',e,"\n\n")
                #If there is no snippet information
                if not found: 
                    self.org_snippet.append(None)
                    self.org_offset_start.append(None)
                    self.org_offset_end.append(None)
                #Find entity_id and whether the annotator found this org in taxonomy
                #Check if this organization is linked to an entity
                ann_notes = ann_info.get(('AnnotatorNotes',org_id),None)
                #If it is not linked to an entity
                if ann_notes is None:
                    #Check if it is linked to an organization/acronym with the 
                    #AcronymOf relation
                    other_id = ann_info.get(('AcronymOf',org_id),None)
                    #If it is linked to an organization/acronym 
                    #append its linking as the linking of organization/acronym
                    if other_id is not None:
                        self.org_entity_id.append(ann_info.get(('AnnotatorNotes',other_id ),None))
                        self.org_found.append(ann_info.get(('Found_In_FundRef',other_id ),None))
                    #If not this is a NIL linking
                    else:
                        self.org_entity_id.append(None)
                        self.org_found.append(ann_info.get(('Found_In_FundRef',org_id),None))
                #If it is linked to an entity, we do not need to check the
                #AcronymOf relationship
                else:
                    self.org_entity_id.append(ann_info.get(('AnnotatorNotes',org_id),None))
                    self.org_found.append(ann_info.get(('Found_In_FundRef',org_id),None))
                #Add information on which file and folder this is in
                self.org_folder.append(file_key[0])
                self.org_filename.append(file_key[1])
                
    def create_df(self):
        """Create the dataframe using the columns"""
        #Replace NaN's with None's 
        #When pandas infers data type to be numeric, it converts None's to NaN's
        self.df = DataFrame(list(zip(self.org_folder,self.org_filename,self.org_name, self.org_entity_id, self.is_acronym,
                                     self.org_found,self.org_snippet,self.org_offset_start,self.org_offset_end,
                                     self.mention_offset_start, self.mention_offset_end)),
                            columns =["Folder","Filename","Organization","Entity_ID","Is_Acronym","Found_in_FundRef","Snippet",
                            "Rel_Offset_Start","Rel_Offset_End", 'Mention_Offset_Start','Mention_Offset_End'])
        self.df = self.df.where(notnull(self.df), None)

In [3]:
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\Regression"
out_filename = "temp.csv"
multiple_folders = False

df_reg = FileReader(dir_name,out_filename, multiple_folders).df
df_reg['Folder'] = "GoldSet_2019\\Regression"
print('Number of samples: ',df_reg.shape[0])
####################
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\KPI"
out_filename = "temp.csv"
multiple_folders = False

df_kpi = FileReader(dir_name,out_filename, multiple_folders).df
df_kpi['Folder'] = "GoldSet_2019\\KPI"
print('Number of samples: ',df_kpi.shape[0])
####################
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\Trainin-Train"
out_filename = "temp.csv"
multiple_folders = False

df_ttra = FileReader(dir_name,out_filename, multiple_folders).df
df_ttra['Folder'] = "GoldSet_2019\\Trainin-Train"
print('Number of samples: ',df_ttra.shape[0])
####################
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\Trainin-Test"
out_filename = "temp.csv"
multiple_folders = False


df_tte = FileReader(dir_name,out_filename, multiple_folders).df
df_tte['Folder'] = "GoldSet_2019\\Trainin-Test"
print('Number of samples: ',df_tte.shape[0])
####################
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\Trainin-Validation"
out_filename = "temp.csv"
multiple_folders = False

df_tval = FileReader(dir_name,out_filename, multiple_folders).df
df_tval['Folder'] = "GoldSet_2019\\Trainin-Validation"
print('Number of samples: ',df_tval.shape[0])
####################
dir_name = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\Improvement"
out_filename = "temp.csv"
multiple_folders = True

df_imp = FileReader(dir_name,out_filename, multiple_folders).df
print('Number of samples: ',df_imp.shape[0])
####################
df = pd.concat([df_reg,df_kpi,df_ttra,df_tte,df_tval,df_imp])

Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Regression\02022R_619867865.ann 
Error str:  invalid literal for int() with base 10: '2810;2811' 


Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Regression\02034R_622431326.ann 
Error str:  list index out of range 


Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Regression\02535R_622459459.ann 
Error str:  list index out of range 


Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Regression\02782R_622758516.ann 
Error str:  list index out of range 


Error on annotation file:  C:\Users\

Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Trainin-Train\19006T_605827728.ann 
Error str:  invalid literal for int() with base 10: '5581;5582' 


KeyError on annotation file: Trainin-Train\14850T_354625742.ann
Error str:  ('SnippetEnd', 'TMariappan5') 


Number of samples:  27883
Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Trainin-Test\06141T_623784283.ann 
Error str:  list index out of range 


Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020July\Dataset2020July\GoldSet_2019\Trainin-Test\08576T_620737251.ann 
Error str:  list index out of range 


Error on annotation file:  C:\Users\aydxng\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\Desktop\ElsevierData\Dataset2020

In [4]:
df.head()

Unnamed: 0,Folder,Filename,Organization,Entity_ID,Is_Acronym,Found_in_FundRef,Snippet,Rel_Offset_Start,Rel_Offset_End,Mention_Offset_Start,Mention_Offset_End
0,GoldSet_2019\Regression,00000R_618242926,NSF,100000001.0,True,True,We thank K. Pierce who assisted with mesocosm ...,184,187,63693,63696
1,GoldSet_2019\Regression,00001R_619263419,JSPS,501100001691.0,True,True,This study was partially supported by JSPS Gra...,38,42,20746,20750
2,GoldSet_2019\Regression,00002R_618805515,ADAPT Centre for Digital Content Technology,,False,False,The authors would like to acknowledge the fund...,119,162,45965,46008
3,GoldSet_2019\Regression,00002R_618805515,Irish Research Council,501100002081.0,False,True,The authors would like to acknowledge the fund...,71,93,45917,45939
4,GoldSet_2019\Regression,00003R_621409062,Ministerio de Ciencia e Innovación,501100004837.0,False,True,This work was supported in part by the Ministe...,39,73,2735,2769


### Filenames in Monitor Dataset

In [5]:
monitor_fnames = []
with open("C:\\Users\\aydxng\\Documents\\ds-fundingbodies-linkingcomponent-masterthesis\\Thesis\\PrepareBERT_forDatabricks\\CleanCode\\MONITOR_FNAMES.txt",'r') as f:
    monitor_fnames = [x.strip() for x in f.readlines()]

### Filenames in Validation Dataset

In [6]:
path = "C:\\Users\\aydxng\\Documents\\ds-fundingbodies-linkingcomponent-masterthesis\\Thesis\\CreateNERSilverSet_NewPipelineOutput\\"
with open(path+'df_sent_divided.pkl','rb') as f:
    df_sent= pickle.load(f)
validation_fnames = df_sent[df_sent.Dataset=='Validation'].Filename.unique()

### Filenames in Test Dataset

In [7]:
test_fnames = os.listdir("C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\GoldSet_2019\\Regression")
test_fnames = [x[:-4] for x in test_fnames]
test_fnames = list(set(test_fnames))

### All Entities in Taxonomy

In [8]:
filename = 'C:\\Users\\aydxng\\Documents\\FilesForExperiment\\NewTaxononmy\\FundRef_v2020-11_20201112_nowrapper_shadow\\FundRef_v2020-11_20201112_nowrapper_shadow.rdf'
g = rdflib.Graph()
g.load(filename)

entity_ids = []
for x in g:
    #If the element is a "Concept", it is an entity.
    #We add the entity ID to the list
    if x[2] == rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#Concept'):
        entity_ids.append(str(x[0]))
        
entity_ids =[x.split("/")[-1] for x in entity_ids]

# Create Dict of NEs

In [9]:
#Get all the extracted mentions. Store them in set to delete duplicate mentions
ne_dict = dict()
for index, row in df.iterrows():
    key = row['Folder'] + "\\" + row['Filename']
    if ne_dict.get(key,None) is None:
        ne_dict[key] = set()
    ne_dict[key].add((row['Mention_Offset_Start'],row['Mention_Offset_End'],row['Organization'],row['Entity_ID']))

In [10]:
#This dataframe has all the processed sentences
df_sent.head()

Unnamed: 0,Folder,Filename,ID,Sentence,Start_Idx,End_Idx,Sentence_Tokenized,Token_Spans,Gold_Span_Tags_ORG_IOB,Gold_Span_Tags_GRT_IOB,Gold_Span_Tags_IOB,Pipeline_Span_Tags_ORG_IOB,Pipeline_Span_Tags_GRT_IOB,Pipeline_Span_Tags_IOB,Dataset
0,GoldSet_2019\KPI,00000K_622440654,2,The work was supported by funding from NIH (EB...,46958,47235,"[The, work, was, supported, by, funding, from,...","[(46958, 46961), (46962, 46966), (46967, 46970...","[O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ...","[O, O, O, O, O, O, O, B_ORG, O, B_GRT, O, O, O...","[O, O, O, O, O, O, O, B, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ...","[O, O, O, O, O, O, O, B_ORG, O, B_GRT, O, O, O...",Training
1,GoldSet_2019\KPI,00001K_613572066,1,We appreciate the generous financial support f...,38478,39070,"[We, appreciate, the, generous, financial, sup...","[(38478, 38480), (38481, 38491), (38492, 38495...","[O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B_ORG, I_...","[O, O, O, O, O, O, O, O, O, O, O, O, B, I, I, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, B_ORG, I_...",Training
2,GoldSet_2019\KPI,00002K_365094003,0,Universidad de Zaragoza gratefully acknowledge...,29863,30013,"[Universidad, de, Zaragoza, gratefully, acknow...","[(29863, 29874), (29875, 29877), (29878, 29886...","[O, O, O, O, O, O, O, O, O, B, I, I, I, I, I, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, B_ORG, I_ORG, I_OR...","[O, O, O, O, O, O, O, O, O, B, I, I, I, I, I, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, B_ORG, I_ORG, I_OR...",Validation
3,GoldSet_2019\KPI,00003K_362108299,0,Q. Wei was partially supported by the National...,32927,33095,"[Q, ., Wei, was, partially, supported, by, the...","[(32927, 32928), (32928, 32929), (32930, 32933...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, B_ORG, I_ORG, I_ORG, ...","[O, O, O, O, O, O, O, O, B, I, I, I, I, I, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, B_ORG, I_ORG, I_ORG, ...",Validation
4,GoldSet_2019\KPI,00003K_362108299,0,The research is partially supported by the Hon...,33096,33202,"[The, research, is, partially, supported, by, ...","[(33096, 33099), (33100, 33108), (33109, 33111...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B, I, ...","[O, O, O, O, O, O, O, O, O, B_ORG, O, O, O, B_...","[O, O, O, O, O, O, O, O, O, B, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B, ...","[O, O, O, O, O, O, O, O, O, B_ORG, O, O, O, O,...",Validation


In [11]:
#Store the mentions for training set here
training_named_entities = []
for key, value in ne_dict.items():
    train=False
    if "Regression" not in key:
        #Definitely training set
        if "KPI" not in key:
            train=True
        else:
            filename = key.split("\\")[-1]
            #Definitely training set
            if filename not in monitor_fnames and filename not in validation_fnames:
                train=True
    #If this file is in training data
    if train:
        folder = "\\".join(key.split("\\")[:-1])
        filename = key.split("\\")[-1]
        #Loop over named entities of this file
        for ne in value:
            #If the link is None or link exists in taxonomy we are interested
            if ne[3] is None or ne[3] in entity_ids:
                training_named_entities.append((folder,filename,ne[0],ne[1],ne[2],ne[3]))

In [12]:
#Store the mentions for monitor set here
monitor_named_entities = []
for key, value in ne_dict.items():
    #All monitor filenames are from KPI
    if "KPI" in key:
        folder = "\\".join(key.split("\\")[:-1])
        filename = key.split("\\")[-1]
        #Definitely monitor
        if filename in monitor_fnames:
            #Loop over named entities of this file
            for ne in value:
                #If the link is None or link exists in taxonomy we are interested
                if ne[3] is None or ne[3] in entity_ids:
                    monitor_named_entities.append((folder,filename,ne[0],ne[1],ne[2],ne[3]))

In [13]:
#Store the mentions for validation set here
valid_named_entities = []
for key, value in ne_dict.items():
    #All valid filenames are from KPI
    if "KPI" in key:
        folder = "\\".join(key.split("\\")[:-1])
        filename = key.split("\\")[-1]
        #Definitely monitor
        if filename in validation_fnames:
            #Loop over named entities of this file
            for ne in value:
                #If the link is None or link exists in taxonomy we are interested
                if ne[3] is None or ne[3] in entity_ids:
                    valid_named_entities.append((folder,filename,ne[0],ne[1],ne[2],ne[3]))

Print the lengths and make sure there is no overlap

In [14]:
print(len(training_named_entities))

103003


In [15]:
print(len(monitor_named_entities))

5745


In [16]:
print(len(valid_named_entities))

20107


In [17]:
set.intersection(set(training_named_entities),set(valid_named_entities))

set()

In [18]:
set.intersection(set(valid_named_entities),set(monitor_named_entities))

set()

In [19]:
set.intersection(set(training_named_entities),set(monitor_named_entities))

set()

In [20]:
#These will store the samples in the format we will write them
to_write_train = []
to_write_monitor = []
to_write_valid = []

In [21]:
#We may need to process more sentences to get the context. However, this is expensive
#That is why, first we make a deep copy of this list, then, when we create a training example,
#we delete it from this list. This means that we already have the sentence for that training example,
#we may need to keep processing sentences for others
training_named_entities_copy = [x for x in training_named_entities]

In [22]:
for i in range(len(training_named_entities)):
    if i%1000==0:
        print(i)
    item = training_named_entities[i]
    temp = df_sent[(df_sent['Folder']==item[0]) & (df_sent['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            training_named_entities_copy.remove(item)
            to_write_train.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000


In [23]:
monitor_named_entities_copy = [x for x in monitor_named_entities]
valid_named_entities_copy = [x for x in valid_named_entities]

In [24]:
for i in range(len(monitor_named_entities)):
    item = monitor_named_entities[i]
    temp = df_sent[(df_sent['Folder']==item[0]) & (df_sent['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            monitor_named_entities_copy.remove(item)
            to_write_monitor.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

In [25]:
for i in range(len(valid_named_entities)):
    item = valid_named_entities[i]
    temp = df_sent[(df_sent['Folder']==item[0]) & (df_sent['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            valid_named_entities_copy.remove(item)
            to_write_valid.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

In [None]:
#Now we check the articles, for which we have named entities but no sentences.
#For those, we need to check negative sentences

In [26]:
process_these = set()
for item in training_named_entities_copy:
    process_these.add((item[0],item[1]))
for item in valid_named_entities_copy:
    process_these.add((item[0],item[1]))
for item in monitor_named_entities_copy:
    process_these.add((item[0],item[1]))

In [27]:
len(process_these)

14199

In [28]:
import re
import numpy as np
import time
def get_sentences(to_be_processed):
    path_to_goldset = "C:\\Users\\aydxng\\OneDrive - Reed Elsevier Group ICO Reed Elsevier Inc\\Desktop\\ElsevierData\\Dataset2020July\\Dataset2020July\\"
    path_to_pipeline = "C:\\Users\\aydxng\PIPELINEOUTPUT\\"
    #Get sentences
    sentences = []
    #Loop over the fiels again
    start = time.time()
    for i, item in enumerate(to_be_processed):
        folder= item[0]
        fname=item[1]
        if i%100==0:
            print(i," ",time.time()-start)
        content = None
        #Open the original text file
        with open(path_to_goldset+folder+"\\"+fname+".txt",'r',encoding='utf-8') as f:
            content = f.read()
        index_array = np.ones(len(content))
        for i in range(len(content)):
            if content[i] == " ":
                index_array[i] = 0
        content_no_ws = content.replace(" ","")
        #Open the file
        with open(path_to_pipeline+folder+"\\"+fname+".txt",'r',encoding='utf-8') as f:
            #Read file line by line
            for line in f.readlines():
                #Split line by whitespace
                line_vec = line.strip().split()
                try:
                    #If the line contains a sentence
                    if 'Sentence' in line_vec[1]:
                        #If this is a negative sentence
                        if line_vec[2] == 'negative':
                            #Get sentence text
                            text = line[re.search('negative',line).span()[1]+1:][:-1].replace(" ", "")
                            #Get relative span wrt the corresponding Section
                            temp_start_idx = content_no_ws.find(text)
                            temp_end_idx = temp_start_idx + len(text)
                            start_idx = -1
                            end_idx = -1
                            ctr = 0
                            for i in range(len(index_array)):
                                if ctr == temp_start_idx:
                                    start_idx = i
                                if ctr == temp_end_idx-1:
                                    end_idx = i+1
                                ctr+=index_array[i]
                            #Add row
                            sentences.append([folder,fname,content[start_idx:end_idx],start_idx,end_idx])
                except IndexError:
                    pass        
    #Create sentences dataframe
    df_sent = pd.DataFrame(sentences,columns=['Folder','Filename','Sentence','Start_Idx','End_Idx'])
    print("Problematic Sentences: ",len(df_sent[(df_sent.Start_Idx==-1)|(df_sent.End_Idx==-1)])," out of ",len(df_sent),"(",100*(len(df_sent[(df_sent.Start_Idx==-1)|(df_sent.End_Idx==-1)])/len(df_sent)),"%)")
    #Return both dataframes
    return df_sent

In [29]:
df_sent_negative = get_sentences(process_these)

0   0.0
100   18.08596658706665
200   44.339909076690674
300   66.60586166381836
400   105.14178037643433
500   152.25073862075806
600   181.0206789970398
700   222.1665599346161
800   321.6925354003906
900   359.78242588043213
1000   383.8383822441101
1100   416.32737374305725
1200   450.1572802066803
1300   530.605183839798
1400   563.636093378067
1500   587.8510899543762
1600   620.2520360946655
1700   645.5199964046478
1800   677.8929092884064
1900   756.7547886371613
2000   794.3007309436798
2100   820.6097304821014
2200   862.7506713867188
2300   906.5676126480103
2400   942.510555267334
2500   965.3304917812347
2600   985.1574954986572
2700   1015.2344539165497
2800   1079.5673732757568
2900   1101.5893383026123
3000   1196.3352122306824
3100   1239.3631224632263
3200   1260.949096441269
3300   1279.8751137256622
3400   1311.1990621089935
3500   1338.6780326366425
3600   1404.6639091968536
3700   1434.5448760986328
3800   1465.183839559555
3900   1490.6778416633606
4000   1514.1

Now we can get the context for others as well.

In [30]:
len(training_named_entities_copy)

35903

In [31]:
for i in range(len(training_named_entities_copy)):
    if i%1000==0:
        print(i)
    item = training_named_entities_copy[i]
    temp = df_sent_negative[(df_sent_negative['Folder']==item[0]) & (df_sent_negative['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            to_write_train.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000


In [32]:
for i in range(len(monitor_named_entities_copy)):
    item = monitor_named_entities_copy[i]
    temp = df_sent_negative[(df_sent_negative['Folder']==item[0]) & (df_sent_negative['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            to_write_monitor.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

In [33]:
for i in range(len(valid_named_entities_copy)):
    item = valid_named_entities_copy[i]
    temp = df_sent_negative[(df_sent_negative['Folder']==item[0]) & (df_sent_negative['Filename']==item[1])]
    for index, row in temp.iterrows():
        if item[2] >= row['Start_Idx'] and item[3] <= row['End_Idx']:
            to_write_valid.append({"mention":item[4],
                                   "context_left": row['Sentence'][0:item[2]-row['Start_Idx']],
                                   "context_right":row['Sentence'][item[3]+1-row['Start_Idx']:],
                                   "label_id":item[5],
                                   "folder":item[0],
                                   "filename":item[1]})
            break

Now we write everything.

In [34]:
print(len(to_write_train))
print(len(to_write_monitor))
print(len(to_write_valid))

95761
5618
19765


In [35]:
import json

with open('train_all.jsonl', 'w') as outfile:
    for entry in to_write_train:
        json.dump(entry, outfile)
        outfile.write('\n')
with open('monitor_all.jsonl', 'w') as outfile:
    for entry in to_write_monitor:
        json.dump(entry, outfile)
        outfile.write('\n')
with open('valid_all.jsonl', 'w') as outfile:
    for entry in to_write_valid:
        json.dump(entry, outfile)
        outfile.write('\n')

# Preprocess Entity Information

In [1]:
from fuzzywuzzy import fuzz
import numpy as np
import pickle
import rdflib
from transformers import  BertTokenizerFast

In [2]:
filename = 'C:\\Users\\aydxng\\Documents\\FilesForExperiment\\NewTaxononmy\\FundRef_v2020-11_20201112_nowrapper_shadow\\FundRef_v2020-11_20201112_nowrapper_shadow.rdf'
g = rdflib.Graph()
g.load(filename)

In [3]:
entity_ids = []
for x in g:
    #If the element is a "Concept", it is an entity.
    #We add the entity ID to the list
    if x[2] == rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#Concept'):
        entity_ids.append(str(x[0]))

In [4]:
with open('entities.pkl','rb') as f:
    entity_dict=pickle.load(f)

In [5]:
ENT_LABEL_TAG = "[unused2]"
ENT_COUNTRY_TAG = "[unused3]"

In [6]:
def get_input_ids_and_tokens(tokenizer,cand_labels,cand_country,max_cand_length):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    
    cand_tokens = [cls_token]
    for label_ in cand_labels:
        cand_tokens += tokenizer.tokenize(label_) + [ENT_LABEL_TAG]
    cand_tokens = cand_tokens[:-1]
    cand_tokens += [ENT_COUNTRY_TAG] + tokenizer.tokenize(cand_country) + [sep_token]
    
    input_ids = tokenizer.convert_tokens_to_ids(cand_tokens)
    padding = [0] * (max_cand_length - len(input_ids))
    input_ids += padding
    return input_ids, cand_tokens

In [7]:
def reduce_one(lst):
    sim_scores = []
    for i in range(len(lst)):
        max_sim =0
        for j in range(len(lst)):
            if i!=j:
                max_sim = max(0,fuzz.token_sort_ratio(lst[i],lst[j]))
        sim_scores.append(max_sim)
    idx = np.argmax(sim_scores)
    new_lst = lst[0:idx] + lst[idx+1:]
    return new_lst

In [8]:
def get_candidate_representation(label_idx, tokenizer, max_cand_length):
    
    cand_labels = entity_dict[str(label_idx)]['Labels']
    cand_labels = list(set(cand_labels))
    cand_country = entity_dict[str(label_idx)]['Country']
    
    input_ids, cand_tokens = get_input_ids_and_tokens(tokenizer,cand_labels,cand_country,max_cand_length)
    
    while len(input_ids) != max_cand_length:
        cand_labels = reduce_one(cand_labels)
        input_ids, cand_tokens = get_input_ids_and_tokens(tokenizer,cand_labels,cand_country,max_cand_length)
    
    assert len(input_ids) == max_cand_length
    
    return {
        "tokens": cand_tokens,
        "ids": input_ids,
    }

In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")



In [10]:
all_dicts = dict()
for e_id in entity_ids:
    e_id=e_id.split("/")[-1]
    all_dicts[e_id] = get_candidate_representation(e_id, tokenizer, 256)

In [11]:
list(all_dicts.items())[0]

('501100001230',
 {'tokens': ['[CLS]',
   'Macquarie',
   'University',
   '[unused3]',
   'Australia',
   '[SEP]'],
  'ids': [101,
   26828,
   1239,
   3,
   1754,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0

In [12]:
with open('entities_256.pkl','wb') as f:
    pickle.dump(all_dicts,f)

128 -> 96
256 -> 36
512 -> 23