## OECD - Semantic Role Labeling

**Note:** It is highly recommended to run this notebook on a GPU to keep the execution time reasonable.

This notebook generates (Subject, Verb, Object) tuples for the entire OECD corpus of documents.

### 1. Import the data

In [None]:
from gensim.test.utils import datapath
from gensim import utils
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import json
import string 
# import nltk
# from nltk.corpus import stopwords
# nltk.download('punkt')
from pathlib import Path
import os
# The data files live in a "data-files" folder that sits next to the
# notebook's current working directory (i.e. inside its parent folder).
path = Path.cwd()
data_dir = os.path.join(path.parents[0], "data-files")

In [None]:
# Load the pre-processed corpus: a JSON object whose values are the document texts.
with open(os.path.join(data_dir, "processed_ngram_ner_data.json"), encoding='utf-8') as f:
    datajson = json.load(f)

# Split every document into sentences and collect them in one flat list.
processed_sentences = []
for document_text in datajson.values():
    processed_sentences.extend(sent_tokenize(document_text))

# Sentence count before de-duplication.
print(len(processed_sentences))

# Remove duplicate sentences while keeping first-seen order.
# dict.fromkeys is used instead of set() because set ordering of strings
# depends on the per-process hash seed, which would make the saved corpus
# (and everything derived from it) ordered differently on every run.
processed_sentences = list(dict.fromkeys(processed_sentences))

# Sentence count after de-duplication.
print(len(processed_sentences))

In [None]:
# processed corpus into cleaned list of sentences
# processed_sentences = get_preprocessed_corpus(corpus)

In [None]:
# Persist the de-duplicated sentence list so it can be reloaded later
# without re-tokenising the corpus.
import pickle

corpus_pickle_path = os.path.join(data_dir, "srl_corpus_new.pkl")
with open(corpus_pickle_path, 'wb') as corpus_file:
    pickle.dump(processed_sentences, corpus_file)

If `processed_sentences` has already been saved to file (`data-files/srl_corpus_new.pkl`), you can load it with the code in the cell below instead of rebuilding it:

In [None]:
# import pickle
# with open(os.path.join(data_dir, "srl_corpus_new.pkl"), 'rb') as f:
#     processed_sentences = pickle.load(f)

### 2. Initialise or prepare SRL model

In [None]:
# Load a pretrained SRL predictor through the allennlp-models wrapper.
from allennlp_models import pretrained

# 'structured-prediction-srl' is the BiLSTM model;
# 'structured-prediction-srl-bert' can be used as an alternative.
srl_model_id = 'structured-prediction-srl'
predictor = pretrained.load_predictor(srl_model_id)

If you encounter any runtime / installation errors with allennlp, try installing spacy-transformers below:

In [None]:
# !pip install git+https://github.com/explosion/spacy-transformers

### 3. Generate SRL tuples for each sentence in the corpus

In [None]:
# function to parse SRL output per sentence
def get_srl_tag_words(sentence):
    """Extract the verb, ARG0 and ARG1 spans from a bracket-tagged SRL string.

    The input is scanned for ``[...]`` segments. A segment whose content
    starts with ``V:``, ``ARG0:`` or ``ARG1:`` contributes the text after
    that label (whitespace-stripped) to the corresponding slot. Segments
    with any other label are ignored. If a label occurs more than once,
    the last occurrence wins.

    Args:
        sentence: String containing bracketed, labelled spans such as
            ``"[ARG0: The committee] [V: approved] [ARG1: the budget]"``.

    Returns:
        A ``(verb, arg0, arg1)`` tuple; each element is a string, or
        ``None`` when no segment with that label was found.
    """
    spans = {'V:': None, 'ARG0:': None, 'ARG1:': None}
    for token in re.findall(r'\[(.*?)\]', sentence):
        for label in spans:
            if token.startswith(label):
                # Strip only the leading label. str.replace would also delete
                # any later occurrence of the label text inside the span.
                spans[label] = token[len(label):].strip()
                break  # the labels are mutually exclusive prefixes

    return spans['V:'], spans['ARG0:'], spans['ARG1:']
            
# Sentences with this many whitespace-separated words or more are skipped
# instead of being sent to the SRL model.
MAX_SENTENCE_WORDS = 512

# Raw SRL predictions for every sentence that has at least one verb frame
# with a non-empty verb, ARG0 and ARG1.
preds_list = []
for sent in processed_sentences:
    if len(sent.split()) >= MAX_SENTENCE_WORDS:
        continue

    preds = predictor.predict(sent)

    # get_srl_tag_words returns (verb, arg0, arg1) with None for missing
    # parts, so all(...) is true only when all three are non-empty strings.
    has_complete_frame = any(
        all(get_srl_tag_words(frame['description']))
        for frame in preds["verbs"]
    )

    # Store the sentence's prediction once. Appending inside the per-verb
    # loop would add the same prediction dict once per qualifying verb
    # and produce duplicate entries in the saved results.
    if has_complete_frame:
        preds_list.append(preds)
                

### 4. Save RAW SRL results to file

In [None]:
# Write the collected raw SRL predictions to disk as a pickle file.
import pickle

predictions_pickle_path = os.path.join(data_dir, "srl_predictions_big.pkl")
with open(predictions_pickle_path, 'wb') as predictions_file:
    pickle.dump(preds_list, predictions_file)

In [None]:
# with open(os.path.join(data_dir, "srl_predictions_big.pkl"), 'rb') as f:
#     srl_results = pickle.load(f)

In [None]:
# print("number of pairs: ", len(srl_results))