In [1]:
import json
import spacy
import numpy as np

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
with open("dataset/train-v2.0.json", 'r') as handle:
    jdata = json.load(handle)
    data = jdata['data']
contexts = []
questions = []
unanswerable = []
answerable = []
for i in range(len(data)):
    section = data[i]['paragraphs']
    for sec in section:
        context = sec['context']
        contexts.append(context)
        qas = sec['qas']
        for j in range(len(qas)):
            question = qas[j]['question']
            questions.append(question)
            label = qas[j]['is_impossible']
            if label:
                unanswerable.append((len(contexts)-1, len(questions)-1))
            else:
                answerable.append((len(contexts)-1, len(questions)-1))

In [4]:
candidate_questions = list(questions[:500])

In [45]:
parsed_questions = []
for q in candidate_questions:
    parsed_questions.append(nlp(q))

In [30]:
print (parsed_questions[0])
[t for t in parsed_questions[0].noun_chunks]

When did Beyonce start becoming popular?


[Beyonce]

In [25]:
spacy.parts_of_speech.ADV == parsed_questions[0][0].pos

True

In [32]:
for i in np.random.choice(50, 5):
    print(parsed_questions[i])
    print ([t for t in parsed_questions[i].noun_chunks])

Which magazine declared her the most dominant woman musician?
[Which magazine]
Where did Beyonce get her name from?
[Beyonce, her name]
How many records has Beyonce sold in her 19 year career?
[How many records, Beyonce, her 19 year career]
In which decade did Beyonce become famous?
[which decade, Beyonce]
When did Destiny's Child end their group act?
[Destiny's Child, their group act]


In [33]:
# Which magazine declared her the most dominant woman musician?
# [Which] [magazine] [declared] [her] [the [most dominant woman musician] ]
q  = [q for q in parsed_questions if "magazine declared her the most dominant" in q.text][0]

In [38]:
print([c for c in q.noun_chunks])
for t in q:
    print (t, t.pos_, t.dep_, t.tag_)

[Which magazine]
Which ADJ det WDT
magazine NOUN nsubj NN
declared VERB ROOT VBD
her PRON poss PRP
the DET det DT
most ADV advmod RBS
dominant ADJ amod JJ
woman NOUN compound NN
musician NOUN ccomp NN
? PUNCT punct .


# Plan
1) Algorithm to convert sentences to nested spans  
2) Algorithm to map span to bert tokens  
3) generate dataset  

In [39]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [41]:
tokenizer.tokenize(q.text)

['which',
 'magazine',
 'declared',
 'her',
 'the',
 'most',
 'dominant',
 'woman',
 'musician',
 '?']

In [81]:
tested = 0
for q in parsed_questions:
    tokenized = tokenizer.tokenize(q.text)
    j = [0,0]
    for i in range(len(q.text)):
        if q.text[i] == " ":
            continue
        while tokenized[j[0]][j[1]] == "#":
            j[1] += 1
        if ord(q.text[i]) < 128:
            assert q.text[i].lower() == tokenized[j[0]][j[1]],(q, i, j, q.text[i:] )
        if (j[1] + 1) == len(tokenized[j[0]]):
            j[0] += 1
            j[1] = 0
        else:
            j[1] += 1
    tested += 1
tested

500

In [147]:
def mask_tokens_by_chunk(tokens, text, span):
    masked_tokens = list(tokens)
    target = list(tokens)
    tokens_to_mask = set()
    j = [0,0]
    for i in range(len(text)):
        if text[i] == " ":
            continue
        while tokens[j[0]][j[1]] == "#":
            j[1] += 1
            if j[1] == len(tokens[j[0]]):
                return None, None
        if ord(text[i]) < 128:
            if text[i].lower() != tokens[j[0]][j[1]]:#,(text, i, j, text[i:], tokens )
                return None,None
        if i >= span.start_char and i < span.end_char:
            tokens_to_mask.add(j[0])
        if (j[1] + 1) == len(tokens[j[0]]):
            j[0] += 1
            j[1] = 0
        else:
            j[1] += 1
    for i in tokens_to_mask:
        masked_tokens[i] = '[MASK]'
        target[i] = '[MASK]'
        target.append(tokens[i])
    return masked_tokens, target

In [85]:
testq = q
test_tokens = tokenizer.tokenize(q.text)
test_text = q.text
test_span = list(q.noun_chunks)[1]
print(mask_tokens_by_chunk(test_tokens, test_text, test_span))

(['for', 'what', 'does', '[MASK]', 'receive', 'praise', '?'], ['for', 'what', 'does', '[MASK]', 'receive', 'praise', '?', 'beyonce'])


In [92]:
for q in np.random.choice(parsed_questions, 10):
    print(q)
    tokens = tokenizer.tokenize(q.text)
    for chunk in q.noun_chunks:
        masked, target = mask_tokens_by_chunk(tokens, q.text, chunk)
        print (masked)
        print (target)
        print("~"*30)
    for chunk in q.ents:
        masked, target = mask_tokens_by_chunk(tokens, q.text, chunk)
        print (masked)
        print (target)
        print("~"*30)
    print("#"*30)

Who did Beyoncé sing a duet with for "The Best Man" film?
['[MASK]', 'did', 'beyonce', 'sing', 'a', 'duet', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?']
['[MASK]', 'did', 'beyonce', 'sing', 'a', 'duet', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?', 'who']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
['who', 'did', '[MASK]', 'sing', 'a', 'duet', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?']
['who', 'did', '[MASK]', 'sing', 'a', 'duet', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?', 'beyonce']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
['who', 'did', 'beyonce', 'sing', '[MASK]', '[MASK]', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?']
['who', 'did', 'beyonce', 'sing', '[MASK]', '[MASK]', 'with', 'for', '"', 'the', 'best', 'man', '"', 'film', '?', 'a', 'duet']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
['who', 'did', 'beyonce', 'sing', 'a', 'duet', 'with', 'for', '"', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '[MASK]', '?']
['who', 'did', 'beyonce', 'sing', 'a

In [110]:
%time
docs = []
for doc in nlp.pipe(questions, batch_size=500, n_threads=16):
     docs.append(doc)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 8.11 µs


In [111]:
docs[-1][0].pos_

'ADJ'

In [112]:
len(docs)

130319

In [113]:
context_sizes = []
for c in contexts:
    tokens = tokenizer.tokenize(c)
    context_sizes.append(len(tokens))

In [114]:
print(max(context_sizes), min(context_sizes), np.mean(context_sizes))

853 25 151.62348305752562


In [125]:
print(np.sum(np.array(context_sizes) > 256))

1288


In [129]:
total_sizes = []
for ci, qi, in answerable+ unanswerable:
    size = context_sizes[ci]
    tokens = tokenizer.tokenize(questions[qi])
    size += len(tokens)
    total_sizes.append(size)

In [130]:
print(max(total_sizes), min(total_sizes), np.mean(total_sizes))

867 32 167.72663234064103


In [136]:
print(np.sum(np.array(total_sizes) > 256) / len(total_sizes))

0.08909675488608722


In [148]:
training_data = {}
for i, doc in enumerate(docs):
    training_data[i] = []
    tokens = tokenizer.tokenize(doc.text)
    for chunk in doc.noun_chunks:
        masked, target = mask_tokens_by_chunk(tokens, doc.text, chunk)
        training_data[i].append([masked, target])
    for chunk in doc.ents:
        masked, target = mask_tokens_by_chunk(tokens, doc.text, chunk)
        training_data[i].append([masked, target])

In [150]:
c = 0
nones = 0
for k, v in training_data.items():
    c += len(v)
    for vv in v:
        if vv[0] is None:
            nones += 1
print(c, nones)

517932 159


# TODO:
1) make test data  
2) build finetuning model

In [151]:
with open("dataset/dev-v2.0.json", 'r') as handle:
    jdata = json.load(handle)
    data = jdata['data']
val_contexts = []
val_questions = []
val_unanswerable = []
val_answerable = []
for i in range(len(data)):
    section = data[i]['paragraphs']
    for sec in section:
        context = sec['context']
        val_contexts.append(context)
        qas = sec['qas']
        for j in range(len(qas)):
            question = qas[j]['question']
            val_questions.append(question)
            label = qas[j]['is_impossible']
            if label:
                val_unanswerable.append((len(contexts)-1, len(questions)-1))
            else:
                val_answerable.append((len(contexts)-1, len(questions)-1))

In [152]:
val_docs = []
for doc in nlp.pipe(val_questions, batch_size=500, n_threads=16):
     val_docs.append(doc)

In [153]:
val_data = {}
for i, doc in enumerate(val_docs):
    val_data[i] = []
    tokens = tokenizer.tokenize(doc.text)
    for chunk in doc.noun_chunks:
        masked, target = mask_tokens_by_chunk(tokens, doc.text, chunk)
        val_data[i].append([masked, target])
    for chunk in doc.ents:
        masked, target = mask_tokens_by_chunk(tokens, doc.text, chunk)
        val_data[i].append([masked, target])

In [154]:
c = 0
nones = 0
for k, v in val_data.items():
    c += len(v)
    for vv in v:
        if vv[0] is None:
            nones += 1
print(c, nones)

45674 0


In [155]:
val_data[9]

[[['[MASK]',
   'was',
   'the',
   'duke',
   'in',
   'the',
   'battle',
   'of',
   'hastings',
   '?'],
  ['[MASK]',
   'was',
   'the',
   'duke',
   'in',
   'the',
   'battle',
   'of',
   'hastings',
   '?',
   'who']],
 [['who',
   'was',
   '[MASK]',
   '[MASK]',
   'in',
   'the',
   'battle',
   'of',
   'hastings',
   '?'],
  ['who',
   'was',
   '[MASK]',
   '[MASK]',
   'in',
   'the',
   'battle',
   'of',
   'hastings',
   '?',
   'the',
   'duke']],
 [['who',
   'was',
   'the',
   'duke',
   'in',
   '[MASK]',
   '[MASK]',
   'of',
   'hastings',
   '?'],
  ['who',
   'was',
   'the',
   'duke',
   'in',
   '[MASK]',
   '[MASK]',
   'of',
   'hastings',
   '?',
   'the',
   'battle']],
 [['who', 'was', 'the', 'duke', 'in', 'the', 'battle', 'of', '[MASK]', '?'],
  ['who',
   'was',
   'the',
   'duke',
   'in',
   'the',
   'battle',
   'of',
   '[MASK]',
   '?',
   'hastings']],
 [['who', 'was', 'the', 'duke', 'in', 'the', 'battle', 'of', '[MASK]', '?'],
  ['who',
 

In [156]:
import pickle

In [157]:
with open("training_data_chunks.pkl", "wb") as f:
    pickle.dump(training_data, f)
with open("val_data_chunks.pkl", "wb") as f:
    pickle.dump(val_data, f)

# Looking at overlap

In [6]:
import random

In [23]:
# stop list = nlp.Defaults.stop_words

In [46]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def overlapping_spans(c, q):
    qt = tokenizer.tokenize(q)
    ct = tokenizer.tokenize(c)
    # build index
    output_set = set()
    context_ngram_set = set()
    for i in range(len(ct)):
        for j in range(10):
            if j == 0:
                continue
            context_ngram_set.add(tuple(ct[i:i+j]))
    skip = 0
    for i in range(len(qt)):
        longest = None
        if skip:
            skip -= 1
            continue
        for j in range(10):
            if j == 0 or len(qt[i:i+j]) < j:
                continue
            span = tuple(qt[i:i+j])
            if span in context_ngram_set:
                longest = span
            if span not in context_ngram_set and longest:
                output_set.add(longest)
                skip = len(longest) - 1 
                break
    return output_set

In [61]:
cid, qid = random.sample(answerable, 1)[0]
q = questions[qid]
c = contexts[cid]
print(c)
print("~"*20)
print(q)
print("~"*20)
print(overlapping_spans(c,q))

From the 9th to 11th century, Armenian architecture underwent a revival under the patronage of the Bagratid Dynasty with a great deal of building done in the area of Lake Van, this included both traditional styles and new innovations. Ornately carved Armenian Khachkars were developed during this time. Many new cities and churches were built during this time, including a new capital at Lake Van and a new Cathedral on Akdamar Island to match. The Cathedral of Ani was also completed during this dynasty. It was during this time that the first major monasteries, such as Haghpat and Haritchavank were built. This period was ended by the Seljuk invasion.
~~~~~~~~~~~~~~~~~~~~
What Armenian monasteries were built in the 11th century?
~~~~~~~~~~~~~~~~~~~~
{('in', 'the'), ('11th', 'century'), ('were', 'built'), ('monasteries',), ('armenian',)}


In [59]:
print(tokenizer.tokenize(c))
print(tokenizer.tokenize(q))

['the', 'free', 'officers', "'", 'intention', 'was', 'not', 'to', 'install', 'themselves', 'in', 'government', ',', 'but', 'to', 're', '-', 'establish', 'a', 'parliamentary', 'democracy', '.', 'nasser', 'did', 'not', 'believe', 'that', 'a', 'low', '-', 'ranking', 'officer', 'like', 'himself', '(', 'a', 'lieutenant', 'colonel', ')', 'would', 'be', 'accepted', 'by', 'the', 'egyptian', 'people', ',', 'and', 'so', 'selected', 'general', 'na', '##gui', '##b', 'to', 'be', 'his', '"', 'boss', '"', 'and', 'lead', 'the', 'coup', 'in', 'name', '.', 'the', 'revolution', 'they', 'had', 'long', 'sought', 'was', 'launched', 'on', '22', 'july', 'and', 'was', 'declared', 'a', 'success', 'the', 'next', 'day', '.', 'the', 'free', 'officers', 'seized', 'control', 'of', 'all', 'government', 'buildings', ',', 'radio', 'stations', ',', 'and', 'police', 'stations', ',', 'as', 'well', 'as', 'army', 'headquarters', 'in', 'cairo', '.', 'while', 'many', 'of', 'the', 'rebel', 'officers', 'were', 'leading', 'their

## Issues
* verb / noun lemmatization might help (violated vs violating)
* stop words - partial and full (eg. "The administration", "in the")
* quotes interfere with matching (eg. "pristine" vs pristine)
* there are nouns phrases that don't match (eg. Everton Football Clubs vs Everton team etc)
* plural / possessive mistakes (free officer's vs free officers')

In [62]:
s = nlp("What Armenian monasteries were built in the 11th century?")

In [66]:
for token in s:
    print(token, token.dep_)

What dobj
Armenian amod
monasteries nsubjpass
were auxpass
built ROOT
in prep
the det
11th amod
century pobj
? punct


In [72]:
for c in s.noun_chunks:
    print(c, c.root.dep_, c.root.pos_)


What dobj NOUN
Armenian monasteries nsubjpass NOUN
the 11th century pobj NOUN


In [74]:
dir(c)

['_',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_recalculate_indices',
 '_vector',
 '_vector_norm',
 'as_doc',
 'doc',
 'end',
 'end_char',
 'ent_id',
 'ent_id_',
 'ents',
 'get_extension',
 'get_lca_matrix',
 'has_extension',
 'has_vector',
 'label',
 'label_',
 'lefts',
 'lemma_',
 'lower_',
 'merge',
 'n_lefts',
 'n_rights',
 'noun_chunks',
 'orth_',
 'remove_extension',
 'rights',
 'root',
 'sent',
 'sentiment',
 'set_extension',
 'similarity',
 'start',
 'start_char',
 'string',
 'subtree',
 'text',
 'text_with_ws',
 'to_array',
 'upper_',
 'vector',
 'vector_norm',
 'vocab']

In [69]:
for e in s.ents:
    print(e)

Armenian
the 11th century


# Sentence Parsing types

In [7]:
import random

In [9]:
rand_questions = random.sample(questions, 20)

In [11]:
for q in rand_questions:
    print(q)
    print("~~"*20)
    print()
    print("#"*40)

What did employees discuss at the  second Jam event put on by PwC consulting in 2002?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
The interceptors were no longer built because of the shift of the bombing role to what?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
 When did the demand for new NES software get boosted?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What was proposed at the beginning of the 1991 season?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
Who was first to invade Manchuria?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
When did The Dead End Guys run?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What is the name of the Presbyterian church in Brazil with Dutch origins?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

###################################

What did employees discuss at the  second Jam event put on by PwC consulting in 2002?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
The interceptors were no longer built because of the shift of the bombing role to what?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
 When did the demand for new NES software get boosted?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What was proposed at the beginning of the 1991 season?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
Who was first to invade Manchuria?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
When did The Dead End Guys run?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What is the name of the Presbyterian church in Brazil with Dutch origins?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
How many USB ports may a host controller provide?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
For what kind of aircraft is deicing fluid sprayed on the airfield?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What state does the intensive theory conceive pain as being?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What is the population of San Diego's urgan area?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What political party did the Tories have to form a coalition with in 2010?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
Where was the Massey conference held?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What was the average temperature for July 2011?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What might stop a switch from transitioning quickly? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What are Tucson's typical winter high temperatures?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
Which stateroom has a small bow as a feature of the facade?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
What were Men of God expected to stay behind?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
 What is the reason for never using capacitors in power factor correction?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################
For how many years has Melbourne been considered the world's most liveable city?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

########################################