# Inputs

1. The inputs to this model are sentences. You can either read them from a file or enter them manually. To read them from a file, write the input sentences line by line in a file called `sentences.txt`. The code block below (also found in the second cell) reads this file.
```python
#Input sentences from file
file_path = "sentences.txt"
with open(file_path,'r',encoding='utf-8') as f:
         sentences = [[sent] for sent in f.read().splitlines()]
```
If you would like to input the sentences manually, uncomment the following block in the second cell and comment out the code block mentioned above.
```python
#Input sentences manually.
sentences = [["Sentence 1"],
                ["Sentence 2"],
                ["Sentence 3"],
                ["..."]]
```
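2. The `path` variable should point to the directory that contains all required files. File names are appended directly to `path`, so a non-empty value must end with a path separator: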
    * `hardneg_m_4.pt`, `hardneg_ctxt_model_4.pt`: Linear scoring layer and mention encoder for the candidate selector.
    * `entity_embeds_4.pkl`: Entity Embeddings computed with the latest entity encoder.
    * `entities.pkl`: See `ED For Funding Organizations/Train GBM Reranker.ipynb`
    * `link_prob.json`, `commonness.json`, `popularity.json`: See `ED For Funding Organizations/Train GBM Reranker.ipynb`
    * `lgbm12.pkl`: The reranker model
    * `BERT_SC_NER_Model.pt`: The NER model

3. Set the batch sizes and the device to be used.

# Output

The predictions are stored in the dataframe `df_sent_new`. Important columns:
* `Sentence`: The input sentence	
* `Sentence_Tokenized`: Tokenized version of the input sentence
* `Token_Spans`: List of tuples, length is equal to number of tokens. For each token, the tuple contains start and end character of the token.
* `Preds`:	List of strings, predicted NER tag for each token.
* `Probs`:	List of dictionaries, NER tag probability distribution for each token.
* `Entities`: Annotations for each sentence. Each annotation is contained in a dictionary:
    * `Mention`: Mention extracted by the NER component.
    * `Type`: Type of mention. 'ORG' for organization and 'GRT' for grant.
    * `Character_Spans`: Spans of mention in terms of character index.
    * `Token_Spans`: Spans of mention in terms of tokens
    * `Link_to_Fundref`: Linked FundRef entity ID. None if NIL mention, "-" if non-applicable (e.g. for grant mentions)
    * `Link_Confidence`: Probability assigned by the reranker, "-" if non-applicable (e.g. for grant mentions and NIL mentions)

In [None]:
from transformers import BertTokenizerFast, BertModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
import pandas as pd
import numpy as np
from scipy.special import softmax
import torch
from seqeval.metrics.sequence_labeling import get_entities
from NERFunctions import *
from EDFunctions import *
import random
import time
from fuzzywuzzy import fuzz
import pickle
import json
import lightgbm as lgb
import annoy

In [None]:
############INPUTS################
# User-editable settings for the whole notebook; every name defined in this
# cell is read by later cells.

# Directory holding the model and data files. Later cells build file names as
# `path + '<file name>'`, so a non-empty value must end with a path separator.
path = ""

# Batch sizes: `ner_batch_size` is used by the NER DataLoader and
# `ctxt_batch_size` by the mention-encoder DataLoader.
ner_batch_size =16
ctxt_batch_size= 32
# Device string: 'cuda' for GPU, 'cpu' for CPU.
device_str = 'cuda'
# Default ED threshold; passed to get_thresholded_preds in the reranking cell.
threshold = 0.042


### INPUT SENTENCES ###
# Input sentences from file: one sentence per line. Each sentence is wrapped in
# a one-element list so the result loads as a single-column DataFrame.
with open("sentences.txt",'r',encoding='utf-8') as f:
    sentences = [[sent] for sent in f.read().splitlines()]
# Input sentences manually (uncomment this block and comment out the file
# reader above).
#sentences = [["Sentence 1"],
#             ["Sentence 2"],
#             ["Sentence 3"],
#             ["..."]]

In [None]:
############VARIABLES################
# Load every model and lookup table used by the pipeline.
device = torch.device(device_str)

# Path to NER model.
# `map_location` remaps the checkpoint's tensors onto the selected device, so a
# checkpoint that was saved on a GPU also loads on a CPU-only machine.
# NOTE(review): the .pt/.pkl files are unpickled on load; only use trusted files.
model = torch.load(path+'BERT_SC_NER_Model.pt', map_location=device)
# Load tokenizer (shared by the NER step and the ED step)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
# Map NER class ids to tag names
id2tag = {0: 'I-GRT', 1: 'O', 2: 'B-GRT', 3: 'B-ORG', 4: 'I-ORG'}

# Load the reranker features, the reranker itself, and the entity data
with open(path+'commonness.json','r',encoding='utf-8') as f:
    commonness = json.load(f)
with open(path+'link_prob.json','r',encoding='utf-8') as f:
    link_probability = json.load(f)
with open(path+'popularity.json','r',encoding='utf-8') as f:
    popularity = json.load(f)
with open(path+'lgbm12.pkl','rb') as f:
    model_lgb = pickle.load(f)
with open(path+'entity_embeds_4.pkl',"rb") as f:
    entity_emebeddings=pickle.load(f)
with open(path+'entities.pkl','rb') as f:
    entity_labels=pickle.load(f)

# Mention encoder runs on the selected device; the scoring layer `m` stays on
# the CPU because its weights are later converted to numpy.
ctxt_model = torch.load(path+'hardneg_ctxt_model_4.pt', map_location=device).to(device_str)
m = torch.load(path+'hardneg_m_4.pt', map_location='cpu').to('cpu')
ctxt_model.eval()
m.eval()

## 1. Mention Extraction

In [None]:
########PROCESS DATASET##########
# Turn the raw input sentences into the padded tensors-to-be that the NER model
# consumes. tokenize_input_bert, split_long_sentences_old, get_labels,
# add_and_pad and FB_Dataset are project helpers (star-imported); their exact
# semantics are not visible here, so notes about them below are hedged.

#Create a dataframe with the input sentences.
df_sent = pd.DataFrame(sentences,columns=['Sentence'])
# Character offsets of each sentence: start at 0, end at the sentence length.
df_sent['Start_Idx'] = 0
df_sent['End_Idx'] = [len(x) for x in df_sent.Sentence.values]
# Placeholder column; presumably filled by split_long_sentences_old for the
# chunks of split sentences (the merge step later groups by 'ID') — TODO confirm.
df_sent['ID'] = None

#This variable will contain the dataset with long sentences
df_sent_withlong = df_sent.copy(deep=True)

#Add BERT tokenization to the dataset to prepare input
res = tokenize_input_bert(df_sent_withlong,'Sentence',tokenizer)
df_sent_withlong['Sentence_Tokenized'] = res[0]
df_sent_withlong['Token_Spans'] = res[1]
#Token encodings do not contain [CLS] and [SEP] at this point
df_sent_withlong['Token_Encoding'] = res[2]


#Define max token length (512-2=510), leaving room for the two special tokens
#Split big sentences
#df_sent -> big sentences split into chunks
#df_sent_withlong -> original
max_len = 510
df_sent, too_long_df_sent =split_long_sentences_old(df_sent_withlong,max_len)

#Prepare "df_sent" for the model
# From here on max_len is the full padded length, special tokens included.
max_len = 512
df_sent_texts = df_sent['Sentence_Tokenized'].values
df_sent_encodings = df_sent['Token_Encoding'].values
# +2 accounts for the two special tokens added by add_and_pad below.
df_sent_seq_lens = df_sent['Token_Encoding'].apply(lambda x: len(x)+2)
#This is to distinguish the wordpieces
df_sent_labels = get_labels(df_sent)
# Labels are padded with -1; the decoding step later drops every position whose
# label is -1.
df_sent_labels = add_and_pad(df_sent_labels,max_len,-1,-1,-1)
# Presumably 101/102 are the [CLS]/[SEP] ids and 0 the padding id — TODO confirm
# against add_and_pad and the tokenizer vocabulary.
df_sent_encodings = add_and_pad(df_sent_encodings,max_len,101,102,0)
# Attention mask: 0 wherever the encoding id is 0, 1 everywhere else.
df_sent_attention_mask = [[0 if num==0 else 1 for num in lst]  for lst in df_sent_encodings]

#Create datasets
sent_dataset = FB_Dataset(df_sent_encodings, df_sent_labels,df_sent_attention_mask,df_sent_seq_lens)

In [None]:
########PREDICT NER##########
# Run the NER model over `sent_dataset` and store the logits in `preds`, a
# float array of shape (number of rows in the dataset, 512, 5).
model.eval()
model.to(device)
data_loader = DataLoader(sent_dataset, batch_size=ner_batch_size, shuffle=False)
print('Getting Predictions...')
# Padded logits of each minibatch; concatenated once after the loop instead of
# re-copying the growing result array on every iteration.
batch_logits = []
with torch.no_grad():
    #Loop over minibatches
    for i, batch in enumerate(data_loader):
        print(i,"/",len(data_loader))
        #Get the max length in this batch and crop based on that
        max_len_for_batch = int(batch['seq_len'].max())
        #Get inputs for that batch, crop, and move them to the device
        input_ids_ = batch['input_ids'][:, :max_len_for_batch].to(device)
        attention_mask_ = batch['attention_mask'][:, :max_len_for_batch].to(device)
        #Do a forward pass
        outputs = model(input_ids_, attention_mask=attention_mask_)
        #Save the predictions
        these_preds = outputs[0].cpu().numpy()
        #Pad the predictions back to the fixed length; padded positions get -100
        new_preds = np.full((len(input_ids_), 512, 5), -100.0)
        new_preds[:, :max_len_for_batch, :] = these_preds
        batch_logits.append(new_preds)
# An empty dataset still yields a correctly shaped (0, 512, 5) array.
preds = np.concatenate(batch_logits, axis=0) if batch_logits else np.zeros((0, 512, 5))

In [None]:
############PREPROCESS THE OUTPUT################
# Convert the raw logits into tag names and per-tag probabilities, then keep
# only the positions whose padded label is not -1.
taglist = ['I-GRT','O','B-GRT', 'B-ORG',  'I-ORG']
# Softmax over the tag axis turns logits into probabilities
preds = softmax(preds, axis=2)
# Index of the most probable tag at every position
preds_labels = preds.argmax(axis=2)
# One {tag: probability rounded to 4 decimals} dictionary per position
preds_probs = [
    [dict(zip(taglist, np.round(tok_probs, 4))) for tok_probs in sent_probs]
    for sent_probs in preds
]
# Drop the positions labelled -1 and map tag ids to tag names
preds_tagged = []
preds_probs_tagged = []
for sent_labels, sent_tag_ids, sent_prob_dicts in zip(df_sent_labels, preds_labels, preds_probs):
    kept_positions = [pos for pos, label in enumerate(sent_labels) if label != -1]
    preds_tagged.append([id2tag[sent_tag_ids[pos]] for pos in kept_positions])
    preds_probs_tagged.append([sent_prob_dicts[pos] for pos in kept_positions])
df_sent['Preds'] = preds_tagged
df_sent['Probs'] = preds_probs_tagged


# Re-assemble one row per input sentence: rows that were never split are taken
# as they are, and the chunks of split sentences are merged back together.
# NOTE(review): the index arithmetic below assumes split_long_sentences_old
# places the unsplit sentences first in df_sent, followed by the chunks — confirm.
num_unsplit = len(df_sent_withlong) - len(too_long_df_sent)
#Part of the data without the split sentences
df_sent_ok = df_sent[df_sent.index < num_unsplit].copy(deep=True)
#Part of the data with the split sentences
df_sent_merge = df_sent[df_sent.index >= num_unsplit]


def _concat_chunks(chunks):
    """Concatenate the per-chunk lists of one group into a single flat list."""
    return [item for chunk in chunks for item in chunk]


#Get the merged predictions for the split sentences.
# An explicit function is used instead of the builtin `sum`: applying `sum`
# only worked because pandas silently replaced it with its own reduction, a
# substitution that is deprecated; the builtin itself cannot add lists to 0.
preds = df_sent_merge.groupby('ID').Preds.apply(_concat_chunks)
probs = df_sent_merge.groupby('ID').Probs.apply(_concat_chunks)

#Extract the part that we will paste (these are the long sentences)
to_be_pasted = df_sent_withlong[df_sent_withlong.index.isin(too_long_df_sent)].copy(deep=True)

#Append predictions for the long sentences, looked up by the original row label
new_preds = [preds.loc[index] for index in to_be_pasted.index]
new_probs = [probs.loc[index] for index in to_be_pasted.index]
to_be_pasted['Preds'] = new_preds
to_be_pasted['Probs'] = new_probs

#Construct the new dataset by merging them
df_sent_new = pd.concat([df_sent_ok,to_be_pasted])

#Make sure we did not miss anything (prints True when the row counts match)
print(df_sent_new.shape[0] == df_sent_withlong.shape[0])

df_sent_new.reset_index(drop=True,inplace=True)

df_sent_new.drop(['Token_Encoding','ID'],axis=1,inplace=True)

#Get the tokenized words (wordpieces alltogether)
res = tokenize_with_bert(df_sent_new,tokenizer)
df_sent_new['Sentence_Tokenized'] = res[0]
df_sent_new['Token_Spans'] = res[1]

##### Extract Named Entities with Spans ####
# For every sentence, convert the predicted tag sequence into a list of mention
# dictionaries. Annotation ids are numbered consecutively across all sentences.
entities = []
ann_number = 0
for _, sent_row in df_sent_new.iterrows():
    sentence = sent_row['Sentence']
    token_spans = sent_row['Token_Spans']
    sent_mentions = []
    # get_entities yields (type, first token index, last token index) triples
    for ent_type, first_tok, last_tok in get_entities(sent_row['Preds']):
        char_start = token_spans[first_tok][0]
        char_end = token_spans[last_tok][1]
        sent_mentions.append({
            'Mention': sentence[int(char_start):int(char_end)],
            'Type': ent_type,
            'Character_Spans': (char_start, char_end),
            # Token span end is exclusive, hence the +1
            'Token_Spans': (first_tok, last_tok + 1),
            'Annotation_ID': ann_number,
        })
        ann_number += 1
    entities.append(sent_mentions)
df_sent_new['Entities'] = entities

In [None]:
# Notebook display: preview the first rows of the mention-extraction output.
df_sent_new.head()

## 3. Prepare for ED

In [None]:
# Build one ED sample per extracted organization mention ('ORG'); mentions of
# any other type are not linked. Each sample holds the mention text plus the
# sentence text to its left and to its right.
ed_input = [
    {
        'Annotation_ID': ann['Annotation_ID'],
        'mention': ann['Mention'],
        'context_left': sent_row['Sentence'][0:int(ann['Character_Spans'][0])],
        'context_right': sent_row['Sentence'][int(ann['Character_Spans'][1]):],
    }
    for _, sent_row in df_sent_new.iterrows()
    for ann in sent_row['Entities']
    if ann['Type'] == 'ORG'
]

In [None]:
# Notebook display: show the first ED sample.
# NOTE(review): raises IndexError when no 'ORG' mention was extracted.
ed_input[0]

## 4. Candidate Selection with Biencoder

In [None]:
#Create dataset
# process_mention_data_2 is a project helper; of its two return values only
# `train_tensor_data` is used in the visible code. Despite the `train_` prefix,
# these objects are used here for inference only.
train_data, train_tensor_data = process_mention_data_2(ed_input,tokenizer)

# Sequential sampling keeps the batches in the same order as the dataset, so
# the i-th mention embedding computed later corresponds to the i-th sample.
train_sampler = SequentialSampler(train_tensor_data)
train_dataloader = DataLoader(train_tensor_data, sampler=train_sampler, batch_size=ctxt_batch_size)

In [None]:
#Get mention embeddings
# Encode every ED sample with the mention encoder and collect one 1-D numpy
# vector per mention in `mention_embeddings`, in dataloader order.
ctxt_model.eval()
print(len(train_dataloader))
mention_embeddings = []
with torch.no_grad():
    start = time.time()
    for step, context_input in enumerate(train_dataloader):
        if step%10==0:
            print("Step: ",step," ",time.time()-start)
        # Each batch is a tuple; its first element holds the model input.
        context_input = context_input[0]
        # Keep the vector at sequence position 0 of the first model output.
        ctxt_rep = ctxt_model(context_input.to(device))[0][:,0,:]
        # Move the whole batch to the host once, then append its rows, instead
        # of issuing one device-to-host copy per mention.
        mention_embeddings.extend(ctxt_rep.cpu().numpy())

In [None]:
#Build annoy index for nearest neighbor search
# Row 1 of the first parameter of the scoring layer `m` (per the original note,
# the parameters for the positive class).
m_second_param = list(m.parameters())[0][1].detach().numpy()

# Annoy items are addressed by consecutive integers, so remember which entity
# key each integer stands for.
keys_map = dict(enumerate(entity_emebeddings))
# Entity embeddings multiplied element-wise by the scoring-layer parameters
entity_emebeddings_with_m = {
    item_id: np.multiply(m_second_param, entity_emebeddings[entity_key])
    for item_id, entity_key in keys_map.items()
}

# 768-dimensional index using the dot product as its metric
t = annoy.AnnoyIndex(768, 'dot')

# Fixed seed for a reproducible index
t.set_seed(0)

for item_id, scaled_embedding in entity_emebeddings_with_m.items():
    t.add_item(item_id, scaled_embedding)
t.build(1000, n_jobs=-1)

In [None]:
#Get top 12 candidates
num_cands=12

# cands[i] is the candidate list of the i-th mention: (score, entity key)
# pairs ordered from highest to lowest score.
cands = []
start = time.time()
#Loop over mentions
for i, this_ment_embed in enumerate(mention_embeddings):
    if i%100 == 0:
        print(i, " ",time.time()-start)

    #Query the index for the num_cands nearest entities; search_k is set to the
    #number of indexed items
    neighbor_ids, neighbor_dots = t.get_nns_by_vector(this_ment_embed, num_cands, search_k=len(entity_emebeddings_with_m), include_distances=True)
    #Score = 1 - 1/(1 + exp(dot)), paired with the entity key of each neighbor
    scored = [
        (1- 1/(1 + np.exp(dot)), keys_map[item_id])
        for item_id, dot in zip(neighbor_ids, neighbor_dots)
    ]
    #Highest score first (stable for ties)
    cands.append(sorted(scored, key=lambda pair: pair[0], reverse=True))

## 5. Reranking with GBM

In [None]:
#Get LGBM preds
# Build one feature row per (mention, candidate) pair, score the rows with the
# LightGBM reranker, keep the best-scoring candidate of every mention and apply
# the linking threshold.
bert_scores = []
fw_scores2 = []
unique_id = []
entity_ = []
commonness_ = []
popularity_ = []
link_probability_ = []

for i, ed_sample in enumerate(ed_input):
    this_mention = ed_sample['mention']
    # Mention-level lookups do not depend on the candidate; do them once.
    mention_key = this_mention.lower()
    mention_commonness = commonness.get(mention_key, {})
    mention_link_prob = link_probability.get(mention_key, 0.)
    # Iterate over the candidates that were actually returned rather than over
    # range(num_cands): the search may return fewer than num_cands items.
    for bert_score, entity_id in cands[i]:
        entity_key = str(entity_id)
        #Get FW score: best fuzzy match between the mention and any entity label
        this_ent_labels = entity_labels[entity_key]['Labels']

        fw_score2 = 0
        for lbl in this_ent_labels:
            fw_score2 = max(fw_score2,fuzz.token_sort_ratio(this_mention,lbl)/100)
        fw_scores2.append(fw_score2)

        #Get BERT score
        bert_scores.append(bert_score)

        commonness_.append(mention_commonness.get(entity_key,0.))
        popularity_.append(popularity.get(entity_key,0.))
        link_probability_.append(mention_link_prob)

        # `ID` is the position of the mention in ed_input
        unique_id.append(i)
        entity_.append(entity_id)
df=pd.DataFrame({'ID':unique_id,'Commonness':commonness_,'BERT':bert_scores,
                 'Popularity':popularity_,'Link_Probability':link_probability_,
                 'FW2':fw_scores2,'Entity':entity_})
# Column order matters: it is the feature order handed to the reranker.
preds = model_lgb.predict(df[['Commonness', 'BERT', 'FW2','Popularity' ,'Link_Probability']])

df['Score'] = preds
temp = df.copy(deep=True)
# Keep, for every mention, the row with the highest reranker score
temp=temp.loc[temp.groupby('ID').Score.idxmax().values][['ID','Score','Entity']]
#Get Entities
entities = temp.Entity.values
#Get scores
scores = temp.Score.values
#Apply threshold
temp['Entity']=get_thresholded_preds(threshold,entities,scores)
# After the reset, row position i corresponds to mention i of ed_input
temp.reset_index(drop=True,inplace=True)

In [None]:
# Map each linked annotation id to its (entity, score) pair. Row labels of
# `temp` are used as positions into `ed_input`.
annotation_dict = {
    int(ed_input[position]['Annotation_ID']): (linked_entity, link_score)
    for position, linked_entity, link_score in zip(temp.index, temp['Entity'], temp['Score'])
}

In [None]:
# Attach the linking result to every extracted mention. Mentions that were not
# sent to the linker (no entry in annotation_dict) get "-" for both fields.
new_entities = []
for index, row in df_sent_new.iterrows():
    this_new_entities = []
    for ann in row['Entities']:
        linked_entity, link_score = annotation_dict.get(int(ann['Annotation_ID']),('-','-'))
        ann['Link_to_Fundref'] = linked_entity
        # A NIL mention may be reported either as the None object or as the
        # string 'None'; in both cases there is no confidence to report.
        is_nil = linked_entity is None or linked_entity == 'None'
        ann['Link_Confidence'] = "-" if is_nil else link_score
        this_new_entities.append(ann)
    new_entities.append(this_new_entities)
df_sent_new['Entities'] = new_entities

In [None]:
# Notebook display: preview the final output, with the linking fields added to
# each entry of the 'Entities' column.
df_sent_new.head()