In [1]:
# Download and extract from: https://1drv.ms/u/s!ApPZx_TWwibImHl49ZBwxOU0ktHv
# available in data/neural_oie
# Or even better the "span model" dataset for 
# https://arxiv.org/abs/1901.10879
import re
import json
from transformers import DistilBertTokenizerFast
import h5py


from pathlib import Path
# Directory holding the span-model OIE dataset, given relative to the working directory.
data_path = Path('../data', 'span_model_oie')

In [2]:
# You will need ~6 GB of RAM to load the entire JSON in memory.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform/locale default encoding of `open`.
with open(data_path / 'structured_data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

### Data Preparation

* threshold at 0.9 score, because some of the lower-scored training entries are quite poor (e.g. "Descartes" used as relation, score 0.39)
* the start of each input should be alphanumeric, as well as both ends of the triple fragments
* we'll ignore position indexes and POS tags, as we will let the WordPiece tokenizer retokenize everything, and we are (currently) not interested in creating pointers between the input string and the output triple
* also constrain the extraction to some "common sense" approximation of a typical noun phrase, e.g. at most 6 words in the subject entry and at most 7 words in the object entry.
   * Example of what we want to avoid:
     ```
      'a 17th-18th century french writer whose defensio religionis a 251-page critique of the pantheism of john toland'
     ```
   * and what we'd like to instead keep:
     ```
      a 17th-18th century french writer
     ```
   * valid maximal length example: (*third son of a humble farmer*, **will be a part of**, *a much anticipated next generation reality show*)
   * as we don't have a truly "stable" way to vet the training data ourselves, we'll simply discard the extra long annotations to avoid that kind of noise. We are also more interested in triples that are closer to "ontological", so best to avoid confounding training.
* another cleaning condition - drop relations unless every word in them appears in the unix dictionary (e.g. eliminating names that wrongly appear as relations -- too much noise!)


In [3]:
# System word list, one entry per line, held as a set for membership tests
# against relation words.
unix_words = set(Path('/usr/share/dict/words').read_text().splitlines())

In [4]:
# Runs of non-alphanumeric characters (anything matching \W, plus the
# underscore) anchored at the start / end of a string.
# Raw string literals are used so that `\W` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape, which newer
# Python versions warn about.
alphanum_start = re.compile(r'^[\W_]+', re.UNICODE)
alphanum_end = re.compile(r'[\W_]+$', re.UNICODE)


def sanitize(w):
    """Strip leading and trailing non-alphanumeric characters from ``w``.

    Underscores count as non-alphanumeric here. Characters in the interior
    of the string are left untouched.

    Args:
        w: The string to clean.

    Returns:
        ``w`` without its leading/trailing run of non-alphanumeric
        characters; the empty string if nothing alphanumeric remains.
    """
    w = alphanum_start.sub('', w)
    return alphanum_end.sub('', w)

# Build the cleaned dataset as [sentence, [subject, predicate, object]] entries.
filtered_data = []
for record in data:
    clean_sentence = sanitize(record['sentence'])
    for extraction in record['tuples']:
        # Keep only extractions scored above 0.9.
        if not (extraction['score'] > 0.9):
            continue

        subj = sanitize(extraction['arg0'])
        pred = sanitize(extraction['relation'])
        pred_words = pred.split(' ')

        # Subject: at most 5 spaces. Relation: at most 5 space-separated words,
        # and every one of them *must* be in the unix dictionary, to drop
        # obvious noise.
        keep = (
            subj.count(' ') <= 5
            and len(pred_words) <= 5
            and set(pred_words).issubset(unix_words)
        )
        if not keep:
            continue

        # One entry per object argument that has at most 6 spaces.
        filtered_data.extend(
            [clean_sentence, [subj, pred, obj]]
            for obj in map(sanitize, extraction['args'])
            if obj.count(' ') <= 6
        )

In [5]:
# Report the number of retained triples and the longest whitespace-token
# counts seen for each field.
print("Total triples: %d" % len(filtered_data))

# One (sentence, subject, predicate, object) word-count tuple per entry, so each
# string is split only once.
word_counts = [
    (len(sentence.split()), len(subj.split()), len(pred.split()), len(obj.split()))
    for sentence, (subj, pred, obj) in filtered_data
]

max_sentence_len = max(counts[0] for counts in word_counts)
max_subj_len = max(counts[1] for counts in word_counts)
max_pred_len = max(counts[2] for counts in word_counts)
max_obj_len = max(counts[3] for counts in word_counts)
# Longest combined subject + predicate + object length of any single entry.
max_target_len = max(sum(counts[1:]) for counts in word_counts)

summary = (max_sentence_len, max_target_len, max_subj_len, max_pred_len, max_obj_len)
print("max lengths: sentence=%d, target=%d (subject=%d, predicate=%d, object=%d)" % summary)

Total triples: 1002626
max lengths: sentence=40, target=17 (subject=6, predicate=5, object=7)


### Transfer to Language Modeling paradigm

We would like to use this data in order to fine-tune a language model pretrained on a vast collection of English. In the ideal case, this will allow the underlying transformer architecture the opportunity to generalize the triple extraction procedure over a larger/open-ended subset of English than present in the training data.

Deciding on the neural architecture influences how to prepare the final serialization of this data, and pose it as a fine-tuning downstream task to the LM.

For now, we are going with:
 * Use the `huggingface/transformers` library for reliability and simplicity
 * Currently the only convenient "out of the box" experience for summarization/translation appears to be their `seq2seq` examples
 * Follow their fine-tuning documentation for BART: [README](https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md)
 

In [9]:
# Fixed token widths for the encoded input sentence and the encoded target
# triple; knowing the expected maxima up front lets us use input/output shapes
# with fixed lengths.
max_input_size, max_output_size = 128, 64

# Name of the pretrained checkpoint whose tokenizer is loaded.
distil_bert = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert)

### The following check was used to initially estimate max_input_size prior setting "max_length":
### And similarly for max_output_size
# max_encoded_len = max([len(sent[0]) for sent in encoded_input]) 
# print("max encoded sentence len: %d" % max_encoded_len)
# max_encoded_target_len = max([len(sent[0]) for sent in encoded_target]) 
# print("max encoded target len: %d" % max_encoded_target_len)

In [10]:
# This is quite RAM intensive as well, likely more than reasonable, but works on my beefy PC
# I am seeing ~15 GB of RAM allocated for creating `encoded_input`
#
# Each sentence is tokenized on its own and padded up to `max_input_size`.
# `truncation=True` is passed explicitly: `padding='max_length'` alone only
# pads short sequences, so without it a sentence longer than `max_input_size`
# tokens would produce a wider row than the fixed-width arrays written later
# can hold.
encoded_input = [
    tokenizer(
        datum[0],
        return_tensors='tf',
        max_length=max_input_size,
        padding='max_length',
        truncation=True,
    )
    for datum in filtered_data
]

In [11]:
# and another ~9 GB allocated for `encoded_target`
#
# The target text is "subject [SEP] predicate [SEP] object", padded up to
# `max_output_size`. `truncation=True` is passed explicitly:
# `padding='max_length'` alone only pads short sequences, so without it a
# target longer than `max_output_size` tokens would produce a wider row than
# the fixed-width arrays written later can hold.
encoded_target = [
    tokenizer(
        entry[1][0] + " [SEP] " + entry[1][1] + " [SEP] " + entry[1][2],
        return_tensors='tf',
        max_length=max_output_size,
        padding='max_length',
        truncation=True,
    )
    for entry in filtered_data
]

In [12]:
### To check on some decoded strings use e.g.
#print(tokenizer.decode(encoded_input[180000]['input_ids'][0]))
#print(tokenizer.decode(encoded_target[180000]['input_ids'][0]))

# Index windows for quick prototyping with the modeling setup:
# 10,000 triples for training and 2,500 for testing (an 80/20 ratio).
train_window = slice(180_000, 190_000)
test_window = slice(780_000, 782_500)

sample_input_train, sample_targets_train = (
    encoded_input[train_window],
    encoded_target[train_window],
)
sample_input_test, sample_targets_test = (
    encoded_input[test_window],
    encoded_target[test_window],
)

In [21]:
# Notebook guard - close fp if it is defined (e.g. left open by an earlier,
# interrupted run of this cell), to be able to re-run this cell at any point.
if 'fp' in locals():
    fp.close()

# Write *SAMPLE* data out as an HDF5 dataset
train_chunk_size = len(sample_input_train)
test_chunk_size = len(sample_input_test)

# The `with` block closes the file even if one of the writes below raises, so
# a failed run no longer leaves the file open.
with h5py.File(data_path / "encoded_sample.hdf5", "w") as fp:
    x_train = fp.create_dataset("x_train", (train_chunk_size, max_input_size), dtype="int")
    y_train = fp.create_dataset("y_train", (train_chunk_size, max_output_size), dtype="int")
    x_test = fp.create_dataset("x_test", (test_chunk_size, max_input_size), dtype="int")
    y_test = fp.create_dataset("y_test", (test_chunk_size, max_output_size), dtype="int")

    # Copy the token ids of every encoded sample into the row with the same
    # index of its destination dataset.
    for dataset, samples in (
        (x_train, sample_input_train),
        (y_train, sample_targets_train),
        (x_test, sample_input_test),
        (y_test, sample_targets_test),
    ):
        for index, sample in enumerate(samples):
            dataset[index, :] = sample['input_ids']

In [20]:
# Write *FULL* data out as an HDF5 dataset
# Notebook guard - close a handle left over from an earlier run of this cell.
if 'fullw' in locals():
    fullw.close()

full_chunk_size = 100_000
input_len = len(encoded_input)
# The loop below sends every entry whose index is a multiple of 5 to the test
# split and all others to the train split. Size the datasets with the same
# integer rule so the allocated rows match exactly what the loop writes.
test_size = (input_len + 4) // 5
train_size = input_len - test_size


def _row_chunks(n_rows, n_cols):
    """Return a chunk shape for an (n_rows, n_cols) dataset.

    h5py rejects chunk shapes larger than the dataset itself, so the row
    count is capped at ``n_rows``. ``None`` (no explicit chunking) is
    returned for a dataset without rows.
    """
    if n_rows == 0:
        return None
    return (min(full_chunk_size, n_rows), n_cols)


# The `with` block closes the file even if a write below raises.
with h5py.File(data_path / "encoded_span_oie.hdf5", "w") as fullw:
    x_train = fullw.create_dataset("x_train", (train_size, max_input_size),
                                   chunks=_row_chunks(train_size, max_input_size), dtype="int")
    y_train = fullw.create_dataset("y_train", (train_size, max_output_size),
                                   chunks=_row_chunks(train_size, max_output_size), dtype="int")
    x_test = fullw.create_dataset("x_test", (test_size, max_input_size),
                                  chunks=_row_chunks(test_size, max_input_size), dtype="int")
    y_test = fullw.create_dataset("y_test", (test_size, max_output_size),
                                  chunks=_row_chunks(test_size, max_output_size), dtype="int")

    train_idx = 0
    test_idx = 0
    for index, (x, y) in enumerate(zip(encoded_input, encoded_target)):
        if index % 5 == 0:
            x_test[test_idx, :] = x['input_ids']
            y_test[test_idx, :] = y['input_ids']
            test_idx += 1
        else:
            x_train[train_idx, :] = x['input_ids']
            y_train[train_idx, :] = y['input_ids']
            train_idx += 1