## Instructions

### Inputs
Change the values in the second cell for inputs.
1. `batch_size`: The batch size for model predictions. A larger value makes prediction faster but uses more memory; decrease it if you run out of memory. Set it to an integer that is a power of 2.
2. `device_str`: Set to `"cuda"` to use GPU, `"cpu"` to use CPU.
3. `path_to_model`: Path to the NER model
4. `file_path`: Write the sentences line by line to a text file. This variable should hold the path to the text file.
5. If you would like to input just a few sentences, comment out this code piece:
```python
#Input sentences from file
file_path = "sentences.txt"
with open(file_path,'r',encoding='utf-8') as f:
         sentences = [[sent] for sent in f.read().splitlines()]
```
and then uncomment this code piece to input your sentences there:
```python
#Input sentences manually.
sentences = [["Sentence 1"],
                ["Sentence 2"],
                ["Sentence 3"],
                ["..."]]
```

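Sentences longer than 512 tokens are supported: they are split into shorter pieces for prediction, and the predictions of the pieces are merged back together afterwards.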

### Run
Run the whole notebook after setting the variables

### Output
The output of the model, with some extra information, is contained in the DataFrame variable `df_sent_new`. Columns:
* `Sentence`: Input sentence
* `Start_Idx`: Start index of the sentence (== 0)
* `End_Idx`: End index of the sentence (== length)
* `Sentence_Tokenized`: Tokenized version of the sentence
* `Token_Spans`: For each token, contains a tuple representing the token's span indexes (Start_Index, End_Index)
* `Preds`: For each token, contains the predicted tag.
* `Probs`: For each token, contains the predicted probability of each tag.
* `Entities`: For each sentence, contains a list of named entities with following information:
    * Mention
    * Type
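    * Character_Spans: (start, end) character indexes of the mention in the sentence
    * Token_Spans: (start, end) token indexes of the mention (end is exclusive)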

## Library versions
* transformers == 3.5.1
* torch == 1.7.1 (+cu110)
* pandas == 1.1.3
* numpy == 1.19.2
* scipy == 1.5.2
* seqeval == 1.2.2

In [None]:
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from scipy.special import softmax
import torch
from seqeval.metrics.sequence_labeling import get_entities
from NERFunctions import *

In [None]:
############INPUTS################

# Number of sentences per forward pass (lower it if you run into memory errors)
batch_size = 16
# Compute device: 'cuda' runs on the GPU, 'cpu' runs on the CPU
device_str = 'cuda'
# Location of the serialized NER model
path_to_model = "bert_sc_ner.pt"
# Option 1: read the sentences from a text file, one sentence per line.
# Every sentence is wrapped in its own single-element list so that the list
# of rows maps directly onto a one-column DataFrame later on.
file_path = "sentences.txt"
with open(file_path, mode='r', encoding='utf-8') as sentence_file:
    sentences = [[line] for line in sentence_file.read().splitlines()]


# Option 2: type the sentences in by hand (uncomment and edit).
#sentences = [["Sentence 1"],
#             ["Sentence 2"],
#             ["Sentence 3"],
#             ["..."]]


In [None]:
############VARIABLES################
# Device the model and all input tensors are placed on
device = torch.device(device_str)
# Load the serialized NER model.
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint that was saved from a GPU can also be loaded with
# device_str = 'cpu' on a machine without CUDA (without it torch.load tries
# to restore the tensors to the device they were saved from and fails).
# NOTE: torch.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = torch.load(path_to_model, map_location=device)
# Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
# Mapping from the model's class ids to tag names
id2tag = {0: 'I-GRT', 1: 'O', 2: 'B-GRT', 3: 'B-ORG', 4: 'I-ORG'}

In [None]:
########PROCESS DATASET##########
# Base table: one row per input sentence, each spanning its full character
# range (start 0, end = number of characters).
df_sent = pd.DataFrame(sentences, columns=['Sentence'])
df_sent['Start_Idx'] = 0
df_sent['End_Idx'] = [len(text) for text in df_sent['Sentence'].values]
df_sent['ID'] = None

# Independent copy that keeps every sentence unsplit, however long it is
df_sent_withlong = df_sent.copy(deep=True)

# Run the BERT tokenization helper and store its first three outputs as columns
res = tokenize_input_bert(df_sent_withlong, 'Sentence', tokenizer)
df_sent_withlong['Sentence_Tokenized'] = res[0]
df_sent_withlong['Token_Spans'] = res[1]
# At this point the encodings carry neither [CLS] nor [SEP]
df_sent_withlong['Token_Encoding'] = res[2]


# Split the over-long sentences. 510 tokens leaves room for the two special
# tokens inside the 512-token limit.
#   df_sent          -> sentences after splitting
#   df_sent_withlong -> original, unsplit sentences
max_len = 510
df_sent, too_long_df_sent = split_long_sentences_old(df_sent_withlong, max_len)

# Build the model inputs from "df_sent"
max_len = 512
df_sent_texts = df_sent['Sentence_Tokenized'].values
df_sent_encodings = df_sent['Token_Encoding'].values
# +2 accounts for the two special tokens wrapped around each encoding below
df_sent_seq_lens = df_sent['Token_Encoding'].apply(lambda enc: 2 + len(enc))
# Labels are used to tell wordpieces apart; -1 fills every added/padded slot
# (presumably add_and_pad takes: rows, length, first value, last value, pad
# value -- confirm against NERFunctions)
df_sent_labels = add_and_pad(get_labels(df_sent), max_len, -1, -1, -1)
# 101 / 102 / 0 are passed as the first / last / padding ids
df_sent_encodings = add_and_pad(df_sent_encodings, max_len, 101, 102, 0)
# Attend to every position whose id is non-zero
df_sent_attention_mask = [
    [int(token_id != 0) for token_id in encoded_row]
    for encoded_row in df_sent_encodings
]

# Create the dataset consumed by the prediction loop
sent_dataset = FB_Dataset(
    df_sent_encodings,
    df_sent_labels,
    df_sent_attention_mask,
    df_sent_seq_lens,
)

In [None]:
########PREDICT##########
# Run the model over all (split) sentences and collect the per-token logits
# in `preds`, an array of shape (number of sentences, 512, 5).
model.eval()
model.to(device)
data_loader = DataLoader(sent_dataset, batch_size=batch_size, shuffle=False)
num_batches = len(data_loader)
print('Getting Predictions...')
# Allocate the full result once instead of concatenating after every batch
# (repeated np.concatenate re-copies all earlier predictions each time).
# Positions that receive no model output keep the filler value -100.
preds = np.full((len(sent_dataset), 512, 5), -100.0)
next_row = 0
with torch.no_grad():
    #Loop over minibatches
    for i, batch in enumerate(data_loader):
        print(i,"/",num_batches)
        #Longest sequence in this batch; everything beyond it is cropped
        max_len_for_batch = int(batch['seq_len'].max())
        #Crop the inputs and move them to the target device.
        #contiguous() gives the model a dense tensor, as a freshly built one would be
        input_ids_ = batch['input_ids'][:, :max_len_for_batch].contiguous().to(device)
        attention_mask_ = batch['attention_mask'][:, :max_len_for_batch].contiguous().to(device)
        #Do a forward pass
        outputs = model(input_ids_, attention_mask=attention_mask_)
        #First model output is taken as the per-token logits
        these_preds = outputs[0].cpu().detach().numpy()
        #Write the cropped logits into their rows; the tail stays at -100
        rows_in_batch = these_preds.shape[0]
        preds[next_row:next_row + rows_in_batch, :max_len_for_batch, :] = these_preds
        next_row += rows_in_batch

In [None]:
############PREPROCESS THE OUTPUT################
# Turns the raw logits in `preds` into per-token tags and probabilities,
# merges the pieces of split sentences back together and extracts the named
# entities. The result is the DataFrame `df_sent_new`.

# Tag names in class-id order, derived from id2tag so the two cannot drift apart
taglist = [id2tag[tag_id] for tag_id in range(len(id2tag))]
# Softmax over the tag axis. Stored under a new name so the raw logits in
# `preds` are left untouched and this cell can be re-run safely.
tag_probs = softmax(preds, axis=2)
#Get predicted label index
preds_labels = np.argmax(tag_probs, axis=2)
rounded_probs = np.round(tag_probs, 4)

# Keep only the positions whose label is not -1 (special tokens / padding,
# and presumably non-initial wordpieces -- confirm against get_labels).
# The tag -> probability dicts are built for the kept positions only.
preds_tagged = []
preds_probs_tagged = []
for label_row, id_row, prob_row in zip(df_sent_labels, preds_labels, rounded_probs):
    kept_positions = [pos for pos, label in enumerate(label_row) if label != -1]
    preds_tagged.append([id2tag[id_row[pos]] for pos in kept_positions])
    preds_probs_tagged.append([dict(zip(taglist, prob_row[pos])) for pos in kept_positions])
df_sent['Preds'] = preds_tagged
df_sent['Probs'] = preds_probs_tagged


# The split pieces are the rows of df_sent from this index on; the rows before
# it are sentences that did not need splitting.
first_split_row = len(df_sent_withlong) - len(too_long_df_sent)
#Part of the data without the split sentences
df_sent_ok = df_sent[df_sent.index < first_split_row].copy(deep=True)
#Part of the data with the split sentences
df_sent_merge = df_sent[df_sent.index >= first_split_row]

# Concatenate the predictions of all pieces that share an ID, in row order.
# Done explicitly rather than with groupby(...).apply(sum), which depends on
# pandas substituting its own reduction for the builtin sum.
merged_preds = {}
merged_probs = {}
for piece_id, piece_preds, piece_probs in zip(
        df_sent_merge['ID'], df_sent_merge['Preds'], df_sent_merge['Probs']):
    merged_preds.setdefault(piece_id, []).extend(piece_preds)
    merged_probs.setdefault(piece_id, []).extend(piece_probs)

#Extract the part that we will paste (these are the long sentences)
to_be_pasted = df_sent_withlong[df_sent_withlong.index.isin(too_long_df_sent)].copy(deep=True)

# Attach the merged predictions; the ID of a piece is looked up by the index
# label of its original long sentence.
to_be_pasted['Preds'] = [merged_preds[row_label] for row_label in to_be_pasted.index]
to_be_pasted['Probs'] = [merged_probs[row_label] for row_label in to_be_pasted.index]

# Final table: unsplit sentences first, then the re-assembled long sentences
df_sent_new = pd.concat([df_sent_ok,to_be_pasted])

#Make sure we did not miss anything (prints True when the row counts match)
print(df_sent_new.shape[0] == df_sent_withlong.shape[0])

df_sent_new.reset_index(drop=True,inplace=True)

df_sent_new.drop(['Token_Encoding','ID'],axis=1,inplace=True)

#Get the tokenized words (wordpieces alltogether)
res = tokenize_with_bert(df_sent_new,tokenizer)
df_sent_new['Sentence_Tokenized'] = res[0]
df_sent_new['Token_Spans'] = res[1]

##### Extract Named Entities with Spans ####
entities = []
for _, row in df_sent_new.iterrows():
    sentence = row['Sentence']
    token_spans = row['Token_Spans']
    these_entities = []
    # get_entities yields (type, first token index, last token index) chunks
    for entity_type, first_token, last_token in get_entities(row['Preds']):
        char_start = token_spans[first_token][0]
        char_end = token_spans[last_token][1]
        these_entities.append({
            'Mention': sentence[int(char_start):int(char_end)],
            'Type': entity_type,
            'Character_Spans': (char_start, char_end),
            # +1 turns the last token index into an exclusive end
            'Token_Spans': (first_token, last_token + 1),
        })
    entities.append(these_entities)
df_sent_new['Entities'] = entities

In [None]:
# Preview the first rows of the final results table
df_sent_new.head()