## OECD - Semantic Role Labeling

**Note:** It is highly recommended to run this notebook on a GPU for a reasonable execution time.

This notebook generates (Subject, Verb, Object) tuples for the entire OECD corpus of documents.

### 1. Import the data

In [1]:
from gensim.test.utils import datapath
from gensim import utils
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import json
import string 
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

# The stop-word corpus must be present locally before stopwords.words() can
# read it; without it a clean environment fails with a LookupError.
nltk.download('stopwords')

# Standard English stop words shipped with NLTK.
stop_words = set(stopwords.words('english'))

# Extra tokens filtered out in addition to the NLTK stop words
# (single letters, citation fragments, URL fragments, ...).
irrelevant_tokens = ['et', 'al.', 'x', 'pdf', 'yes', 'abbrev','also','fe',
                            'page', 'pp', 'p', 'er', 'doi', 'can', 'b', 'c', 'd', 'e',
                            'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'q', 'r', 's', 'herein', 'furthermore',
                            't', 'u', 'v', 'w', 'y', 'z', 'www', 'com', 'org', 'de', 'dx', 'th', 'ii', 'le']

# De-duplicated union of both sources, kept as a list under its original name.
stop_words_list = list(stop_words.union(irrelevant_tokens))

# Load the pre-processed documents: a JSON object whose values are the
# document texts.
with open('../data-files/processed_ngram_ner_data.json', encoding='utf-8') as f:
    datajson = json.load(f)

# Membership tests against a set are constant-time; scanning the list for
# every token of every document is needlessly slow on a large corpus.
stop_words_lookup = set(stop_words_list)

# One stop-word-filtered string per document.
corpus_doclist = []
for doc_text in datajson.values():
    word_tokens = word_tokenize(doc_text)
    filtered_doc = [w for w in word_tokens if w.lower() not in stop_words_lookup]
    corpus_doclist.append(' '.join(filtered_doc))

# All unfiltered documents concatenated, each followed by a single space
# (same result as appending `text + ' '` in a loop, built in one pass).
corpus = ''.join(doc_text + ' ' for doc_text in datajson.values())
    
# Mapping of n-gram strings to their replacement strings, consumed by
# get_preprocessed_corpus. The context manager closes the file handle, and
# the explicit encoding matches the other JSON read in this cell.
with open('../data-files/ngram_replacements.json', encoding='utf-8') as f:
    ngram_replacements = json.load(f)
        
def replace_all(text, dic):
    """Apply every ``key -> value`` substitution in ``dic`` to ``text``.

    Substitutions are applied one after another in the dictionary's
    iteration order, so the output of an earlier replacement can be
    rewritten by a later one.

    Returns the resulting string; ``text`` is returned unchanged when
    ``dic`` is empty.
    """
    result = text
    for needle in dic:
        result = result.replace(needle, dic[needle])
    return result

def get_preprocessed_corpus(corpus):
    """Turn the raw corpus string into a list of cleaned sentences.

    The corpus is split with ``sent_tokenize``. In each sentence the n-grams
    listed in the module-level ``ngram_replacements`` mapping are substituted
    via ``replace_all``, then all round brackets are removed.

    Returns the cleaned sentences in their original order.
    """
    return [
        replace_all(raw_sentence, ngram_replacements).replace('(', '').replace(')', '')
        for raw_sentence in sent_tokenize(corpus)
    ]

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


In [4]:
# Split the full corpus into sentences and clean each one
# (n-gram replacement and bracket removal, see get_preprocessed_corpus).
processed_sentences = get_preprocessed_corpus(corpus)

In [5]:
# Save the cleaned sentence list to disk so it can be reloaded later
# instead of being recomputed.
import pickle
with open('../data-files/srl_corpus.pkl', 'wb') as f:
    pickle.dump(processed_sentences, f)

If `processed_sentences` has already been saved to file (`data-files/srl_corpus.pkl`), you can load it with the code in the cell below instead of recomputing it:

In [None]:
# import pickle
# with open('../data-files/srl_corpus.pkl', 'rb') as f:
#     processed_sentences = pickle.load(f)

### 2. Initialise or prepare SRL model

In [6]:
# Initialise the SRL predictor through the allennlp_models wrapper.
# 'structured-prediction-srl' is the BiLSTM model; pass
# 'structured-prediction-srl-bert' instead to use the BERT-based alternative.
# The output below shows a model download being started on this call.
from allennlp_models import pretrained
predictor = pretrained.load_predictor('structured-prediction-srl')

roberta-rte is not a registered model.
lerc is not a registered model.


downloading:   0%|          | 0/54185577 [00:00<?, ?B/s]



If you encounter any runtime / installation errors with allennlp, try installing spacy-transformers below:

In [7]:
# !pip install git+https://github.com/explosion/spacy-transformers

### 3. Generate SRL tuples for each sentence in the corpus

In [9]:
# function to parse SRL output per sentence
def get_srl_tag_words(sentence):
    tokens = re.findall(r'\[(.*?)\]', sentence)
    verb = None
    arg0 = None 
    arg1 = None
    for token in tokens:
        if  token.startswith('V:'):
            verb = token.replace('V:','').strip()
        if  token.startswith('ARG0:'):
            arg0 = token.replace('ARG0:','').strip()
        if  token.startswith('ARG1:'):
            arg1 = token.replace('ARG1:','').strip()

    return verb, arg0, arg1
            
# Run the SRL predictor over every sentence and keep the raw prediction of
# each sentence that yields at least one complete (verb, ARG0, ARG1) frame.
preds_list = []
total_sentences = len(processed_sentences)
for index, sent in enumerate(processed_sentences, start=1):
    print('sentence', index, '/', str(total_sentences))
    # Sentences of 512 or more whitespace-separated words are skipped
    # (presumably to stay within the model's input limit - TODO confirm).
    if len(sent.split()) >= 512:
        continue
    preds = predictor.predict(sent)
    for frame in preds["verbs"]:
        verb, arg0, arg1 = get_srl_tag_words(frame['description'])
        # Non-empty strings only: None and '' are both falsy.
        if verb and arg0 and arg1:
            # Store the sentence-level prediction once, even when several of
            # its verb frames qualify; appending per frame duplicated the
            # same `preds` object in the list.
            preds_list.append(preds)
            break
                

sentence 1 / 127867
sentence 2 / 127867
sentence 3 / 127867
sentence 4 / 127867
sentence 5 / 127867
sentence 6 / 127867
sentence 7 / 127867
sentence 8 / 127867
sentence 9 / 127867
sentence 10 / 127867
sentence 11 / 127867
sentence 12 / 127867
sentence 13 / 127867
sentence 14 / 127867
sentence 15 / 127867
sentence 16 / 127867
sentence 17 / 127867
sentence 18 / 127867
sentence 19 / 127867
sentence 20 / 127867
sentence 21 / 127867
sentence 22 / 127867
sentence 23 / 127867
sentence 24 / 127867
sentence 25 / 127867
sentence 26 / 127867
sentence 27 / 127867
sentence 28 / 127867
sentence 29 / 127867
sentence 30 / 127867
sentence 31 / 127867
sentence 32 / 127867
sentence 33 / 127867
sentence 34 / 127867
sentence 35 / 127867



KeyboardInterrupt



### 4. Save RAW SRL results to file

In [None]:
import pickle
# Save the raw predictor outputs collected in preds_list to disk.
with open('../data-files/srl_predictions_big.pkl', 'wb') as f:
    pickle.dump(preds_list, f)

In [None]:
# with open('../data-files/srl_predictions_big.pkl', 'rb') as f:
#     srl_results = pickle.load(f)

In [None]:
# print("number of pairs: ", len(srl_results))

### 5. Exploratory analysis of document term matrix

In [2]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# v = TfidfVectorizer()
# x = v.fit_transform(corpus_doclist)
# dtm = v.transform(corpus_doclist)

In [3]:
# import pandas as pd
# # Select the first five rows from the data set
# td = pd.DataFrame(x.todense()).iloc[:10]  
# td.columns = v.get_feature_names_out()
# term_document_matrix = td.T
# term_document_matrix.columns = ['Doc '+str(i) for i in range(1, 11)]
# term_document_matrix['total_count'] = term_document_matrix.sum(axis=1)

# # Top 25 words 
# term_document_matrix = term_document_matrix.sort_values(by ='total_count',ascending=False)[:25] 

# # Print the first 10 rows 
# print(term_document_matrix.drop(columns=['total_count']).head(10))

                Doc 1     Doc 2     Doc 3     Doc 4     Doc 5     Doc 6  \
water        0.447218  0.616300  0.185371  0.515143  0.312592  0.381654   
oecd         0.151829  0.153343  0.026068  0.159456  0.143150  0.001332   
countries    0.103725  0.082532  0.062997  0.017483  0.096407  0.015985   
moldova      0.000000  0.000000  0.355790  0.000000  0.000907  0.217215   
wss          0.012327  0.000000  0.470274  0.000000  0.000000  0.063357   
development  0.044346  0.044928  0.057928  0.053654  0.032970  0.094581   
public       0.048104  0.026371  0.045619  0.060889  0.009599  0.037966   
management   0.048104  0.063486  0.014482  0.090127  0.028380  0.105904   
may          0.018791  0.088392  0.087617  0.065712  0.112683  0.022646   
risk         0.026307  0.195829  0.005793  0.084099  0.097659  0.037299   

                Doc 7     Doc 8     Doc 9    Doc 10  
water        0.277393  0.425145  0.242719  0.358689  
oecd         0.093125  0.172985  0.166716  0.138756  
countries   