In [1]:
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
import json

In [3]:
res = predictor.predict(
  sentence="If I bring 10 dollars tomorrow, can you buy me lunch?"
)

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


In [4]:
res.keys()

dict_keys(['class_probabilities', 'spans', 'tokens', 'pos_tags', 'num_spans', 'hierplane_tree', 'trees'])

In [5]:
for k, v in res.items():
    if k == 'class_probabilities':
        continue
    print(k)
    print("~"*20)
    print(v)
    print("#"*20)

spans
~~~~~~~~~~~~~~~~~~~~
[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [0, 9], [0, 10], [0, 11], [0, 12], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], [1, 8], [1, 9], [1, 10], [1, 11], [1, 12], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7], [3, 8], [3, 9], [3, 10], [3, 11], [3, 12], [4, 4], [4, 5], [4, 6], [4, 7], [4, 8], [4, 9], [4, 10], [4, 11], [4, 12], [5, 5], [5, 6], [5, 7], [5, 8], [5, 9], [5, 10], [5, 11], [5, 12], [6, 6], [6, 7], [6, 8], [6, 9], [6, 10], [6, 11], [6, 12], [7, 7], [7, 8], [7, 9], [7, 10], [7, 11], [7, 12], [8, 8], [8, 9], [8, 10], [8, 11], [8, 12], [9, 9], [9, 10], [9, 11], [9, 12], [10, 10], [10, 11], [10, 12], [11, 11], [11, 12], [12, 12]]
####################
tokens
~~~~~~~~~~~~~~~~~~~~
['If', 'I', 'bring', '10', 'dollars', 'tomorrow', ',', 'can', 'you', 'buy', 'me', 'lunch', '?']
####################
pos_tags
~~~~~~~~~~~~~~~~~~~~
['IN',

(SQ   
 (SBAR    
  (IN If)    
  (S    
   (NP (PRP I))    
   (VP (VBP bring)    
    (NP (CD 10) (NNS dollars))    
    (NP (NN tomorrow)))   
  )   
 )    
 (, ,)    
 (MD can)    
 (NP (PRP you))    
 (VP (VB buy)    
  (NP (PRP me))    
  (NP (NN lunch))   
 )    
 (. ?)   
)






In [18]:
questions = []
contexts = []
examples = []
with open("dataset/dev-v2.0.json", 'r') as handle:
    jdata = json.load(handle)
    data = jdata['data']
for i in range(len(data)):
    section = data[i]['paragraphs']
    for sec in section:
        context = sec['context']
        contexts.append(context)
        qas = sec['qas']
        for j in range(len(qas)):
            question = qas[j]['question']
            unanswerable = qas[j]['is_impossible']
            questions.append(question)
            examples.append((len(contexts)-1, len(questions)-1))

In [7]:
%%time
trees = []
for q in questions[:10]:
    res = predictor.predict(sentence=q)
    trees.append(res['trees'])

CPU times: user 4.93 s, sys: 26.8 ms, total: 4.96 s
Wall time: 1.24 s


In [8]:
trees

['(SBARQ (IN In) (WHNP (WP what) (NN country)) (SQ (VBZ is) (NP (NNP Normandy)) (VP (VBN located))) (. ?))',
 '(SBARQ (WHADVP (WRB When)) (SINV (VBD were) (NP (DT the) (NNPS Normans)) (PP (IN in) (NP (NNP Normandy)))) (. ?))',
 '(SBARQ (WHPP (IN From) (WHNP (WDT which) (NNS countries))) (SQ (VBD did) (NP (DT the) (NNP Norse)) (VP (VB originate))) (. ?))',
 '(SBARQ (WHNP (WP Who)) (SQ (VP (VBD was) (NP (DT the) (NNP Norse) (NN leader)))) (. ?))',
 '(SBARQ (WHNP (WDT What) (NN century)) (SQ (VBD did) (NP (DT the) (NNPS Normans)) (ADVP (RB first)) (VP (VBP gain) (NP (PRP$ their) (JJ separate) (NN identity)))) (. ?))',
 "(SBARQ (WHNP (WP Who)) (S (VP (VBD gave) (NP (PRP$ their) (NN name)) (PP (IN to) (NP (NNP Normandy))) (PP (IN in) (NP (NP (DT the) (CD 1000) (POS 's)) (CC and) (NP (CD 1100) (POS 's)))))))",
 '(SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (NNP France)) (NP (NP (DT a) (NN region)) (PP (IN of)))) (. ?))',
 '(SBARQ (WHNP (WP Who)) (SQ (VBD did) (NP (NNP King) (NNP Charles) (NNP II

In [9]:
def print_np(entry):
    phrase = ""
    for child in entry['children']:
        if child['nodeType'] != 'DT':
            phrase += child['word']+" "
    return("["+entry['nodeType']+"]"+phrase.strip())

def print_nps(entry, pos=None):
    print("Calling...")
    if entry['nodeType'].startswith('VB'):
        if not nlp.vocab[entry['word'].lower()].is_stop:
            yield "["+entry['nodeType']+"]"+ entry['word']
    elif entry['nodeType'].startswith("WH"):
        yield "["+entry['nodeType']+"]"+ entry['word']
    elif entry['nodeType'] == 'NP':
        keep = True
        for child in entry['children']:
            if child['nodeType'] == 'NP' or child['nodeType'] == 'PP':
                keep = False
        if keep:
            yield print_np(entry) # (entry['word'])
        else:
            if 'children' in entry and entry['children']:
                for  child in entry['children']:
                    print_nps(child)
    else:
        if 'children' in entry and entry['children']:
            for child in entry['children']:
                print_nps(child)

for q in questions[360:370]:
    print(q)
    res = predictor.predict(sentence=q)
    for x in print_nps(res['hierplane_tree']['root']):
        print(x)
    #print(res['trees'])
    print("#"*20)

What is not used for a precise definition of what it means to solve a problem using a given amount of time and space?
Calling...
####################
How is Turing machine M said not to operate?
Calling...
####################
What is the expression used to identify any given series of solutions capable of being solved within time on a deterministic Turing machine?
Calling...
####################
What is the least critical resource measured in assessing the determination of a Turing machine's ability to solve any given set of problems?
Calling...
####################
How can decision problem B be solved in time x(f)?
Calling...
####################
Time and space are both examples of what type of resource?
Calling...
####################
A complexity resource can also be described as what other type of resource?
Calling...
####################
What is typically used to broadly define complexity measures?
Calling...
####################
Communication complexity is an example of what typ

# TODO:
maintain order in x of y scenarios  
capture the NN part of NP where there is a PP inside the NP

* generate all the parts
* include any required dependencies
* generate the training data with masks and different # of conditionals 
* write model to train
* kick off a run

In [13]:
from allennlp.predictors.predictor import Predictor
import json
import pickle
import spacy
import sys

In [14]:
def print_np(entry):
    phrase = ""
    for child in entry['children']:
        if child['nodeType'] != 'DT':
            phrase += child['word']+" "
    return(("["+entry['nodeType']+"]",phrase.strip()))

def get_q_parts(entry, nlp, tokens):
    if entry['nodeType'].startswith('VB'):
        if not nlp.vocab[entry['word'].lower()].is_stop:
            tokens.append(("["+entry['nodeType']+"]", entry['word']))
    elif entry['nodeType'].startswith("WH"):
        tokens.append(("["+entry['nodeType']+"]", entry['word']))
    elif entry['nodeType'] == 'NP':
        keep = True
        for child in entry['children']:
            if child['nodeType'] == 'NP' or child['nodeType'] == 'PP':
                keep = False
        if keep:
            tokens.append(print_np(entry)) # (entry['word'])
        else:
            if 'children' in entry and entry['children']:
                for child in entry['children']:
                    get_q_parts(child, nlp, tokens)
    else:
        if 'children' in entry and entry['children']:
            for child in entry['children']:
                get_q_parts(child, nlp, tokens)


def build_labels(dev_data_file, test_data_file, limit=None):
    predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")
    nlp = spacy.load("en_core_web_sm")
    questions = {}

    for data_file in [dev_data_file, test_data_file]:
        with open(data_file, 'r') as handle: # update
            jdata = json.load(handle)
            data = jdata['data']
        for i in range(len(data)):
            section = data[i]['paragraphs']
            for sec in section:
                qas = sec['qas']
                for j in range(len(qas)):
                    qid = qas[j]['id']
                    question = qas[j]['question']
                    questions[qid] = question

    labels = {}
    counter = 0
    section = 0#int(sys.argv[1])
    chunk = 6000
    for id, q in list(questions.items())[chunk*section:chunk*(section+1)] if limit is None else list(questions.items())[:limit]:
        res = predictor.predict(sentence=q)
        tokens = []
        get_q_parts(res['hierplane_tree']['root'], nlp, tokens)
        labels[id] = tokens
        counter += 1
        if counter % 1000 == 0:
            print("Finished with ", str(counter))
    return labels

#     with open("part_labels"+str(section)+".pkl", "wb") as f:
#         pickle.dump(labels, f)

In [15]:
labels = build_labels("dataset/dev-v2.0.json", "dataset/train-v2.0.json", 20)

In [21]:
from difflib import SequenceMatcher

In [53]:
questions[2]

'From which countries did the Norse originate?'

In [54]:
contexts[0]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

In [104]:
i = 0
t1 = tokenizer.tokenize(questions[2])
t2 = tokenizer.tokenize(contexts[0])
match = SequenceMatcher(None,  
                        t1, 
                        t2, 
                        autojunk=False).get_matching_blocks()
for m in match:
    if m.size == 0:
        continue
    print(t1[m.a:m.a+m.size])


['from']
['the']


In [86]:
i = 0
t1 = tokenizer.tokenize(questions[2])
t2 = tokenizer.tokenize(contexts[0])
while i < len(t1) - 1:
    match = SequenceMatcher(None,  
                            t1, 
                            t2, 
                            autojunk=False).find_longest_match(i, len(t1), 0, len(t2))
    if match.size == 0:
        break
    print(t1[match.a:match.a+match.size])
    i = match.a+match.size
    

['from']
['the']
['norse']


In [63]:
t1

['from', 'which', 'countries', 'did', 'the', 'norse', 'originate', '?']

In [135]:
nlp = spacy.load("en_core_web_sm")

In [230]:
#my_stop = ["a", "an", "and", "as", "or", "the", "that", "which", "when", "whose", "is", "was", "what", "to", "in", "at"]
strip_stop = ["a", "an", "and", "as", "or", "the", "that", "which", "when", "whose", "is", "was", "what", "to", "of"]
#my_stop = strip_stop + ["to", "in", "at", "on", "under"]


def individual_filter(term):
    if term.startswith("##"):
        return False
    if term in nlp.vocab and  (nlp.vocab[term].is_stop or nlp.vocab[term].is_punct):
        return False
    return True

def simple_filter(term):
    if term in strip_stop or (term in nlp.vocab and nlp.vocab[term].is_punct):
        return False
    return True

def strip_terms(phrase):
    fltr = [simple_filter(t) for t in phrase]
    start = fltr.index(True)
    end = list(reversed(fltr)).index(True)
    return phrase[start:len(fltr)-end]

def matcher(source, target):
    i = 0
    matches = []
    while i < len(source):
        ii = 1
        current = []
        for j in range(len(target)):
            if source[i] == target[j]:
                cand = []
                for ii in range(len(source) - i):
                    if j+ii > len(target) - 1:
                        break
                    if source[i+ii] == target[j+ii]:
                        cand.append(source[i+ii])
                    else:
                        if len(cand) > len(current):
                            current = list(cand)
                        break
                if len(cand) > len(current):
                    current = list(cand)
        if current:
            matches.append(current)
        i += len(current) if current else 1
        
    ## filters
    matches = [m for m in matches if sum(
        [individual_filter(token) for token in m]) > 0]
    
    matches = [strip_terms(m) for m in matches]
    return matches

In [144]:
print(matcher(t1, t2))

[['norse']]


In [141]:
nlp.vocab[','].is_punct

True

In [44]:
from pytorch_pretrained_bert.tokenization import BertTokenizer


In [45]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)


In [99]:
def get_matches(q, c):
    t1 = tokenizer.tokenize(q)
    t2 = tokenizer.tokenize(c)
    matches = []
    i = 0
    while i < len(t1) - 1:
        match = SequenceMatcher(None,  
                                t1, 
                                t2, 
                                autojunk=False).find_longest_match(i, len(t1), 0, len(t2))
        if match.size == 0:
            break
        matches.append(t1[match.a:match.a+match.size])
        i = match.a+match.size
    return matches

In [100]:
import random

In [257]:
def x_in_y(query, base):
    try:
        l = len(query)
    except TypeError:
        l = 1
        query = type(base)((query,))

    for i in range(len(base)):
        if base[i:i+l] == query:
            return True
    return False

for i in random.sample(range(len(examples)), 20):
    q = questions[examples[i][1]]
    c = contexts[examples[i][0]]
    print("QUESTION",q)#print(c, "\n", q)
    matched_terms = matcher(tokenizer.tokenize(q),tokenizer.tokenize(c))
    print("MATCHED WORDS", matched_terms)
    res = predictor.predict(sentence=q)
    tokens = []
    get_q_parts(res['hierplane_tree']['root'], nlp, tokens)
    spans = {}
    gather_spans(res['hierplane_tree']['root'], spans)
    print("SPANS FOUND", spans)
    for s in matched_terms:
        min_span = (None,None)
        for cand in spans.keys():
            if x_in_y(s, tokenizer.tokenize(cand)) and ( min_span[0] is None or len(cand) < len(min_span[0]) ):
                min_span = (cand, spans[cand])
        print("~~~~",s, "->", min_span)
    print("TOKENS", tokens)
    print("#"*20)

QUESTION What are both Branko Milanovic and Joseph Stiglitz?
MATCHED WORDS [['bran', '##ko', 'milano', '##vic'], ['joseph', 'st', '##ig', '##litz']]
SPANS FOUND {'What are both Branko Milanovic and Joseph Stiglitz ?': 'SBARQ', 'What': 'WP', 'are both Branko Milanovic and Joseph Stiglitz': 'SQ', 'are': 'VBP', 'both Branko Milanovic and Joseph Stiglitz': 'NP', 'both': 'DT', 'Branko Milanovic and Joseph Stiglitz': 'NP', 'Branko Milanovic': 'NP', 'Branko': 'NNP', 'Milanovic': 'NNP', 'and': 'CC', 'Joseph Stiglitz': 'NP', 'Joseph': 'NNP', 'Stiglitz': 'NNP', '?': '.'}
~~~~ ['bran', '##ko', 'milano', '##vic'] -> ('Branko Milanovic', 'NP')
~~~~ ['joseph', 'st', '##ig', '##litz'] -> ('Joseph Stiglitz', 'NP')
TOKENS [('[WHNP]', 'What'), ('[NP]', 'Branko Milanovic'), ('[NP]', 'Joseph Stiglitz')]
####################
QUESTION How many people did Hamas kill between 2010 to 2017?
MATCHED WORDS [['people'], ['hamas']]
SPANS FOUND {'How many people did Hamas kill between 2010 to 2017 ?': 'SBARQ', 'How 

SPANS FOUND {'When were the talks held for braodcast right to the Primier league for a five year period from the 1992 season ?': 'SBARQ', 'When': 'WRB', 'were the talks held for braodcast right to the Primier league for a five year period from the 1992 season': 'SQ', 'were': 'VBD', 'the talks': 'NP', 'the': 'DT', 'talks': 'NNS', 'held for braodcast right to the Primier league for a five year period from the 1992 season': 'VP', 'held': 'VBN', 'for braodcast right to the Primier league': 'PP', 'for': 'IN', 'braodcast right': 'NP', 'braodcast': 'NN', 'right': 'NN', 'to the Primier league': 'PP', 'to': 'IN', 'the Primier league': 'NP', 'Primier': 'NNP', 'league': 'NN', 'for a five year period from the 1992 season': 'PP', 'a five year period from the 1992 season': 'NP', 'a five year period': 'NP', 'a': 'DT', 'five': 'CD', 'year': 'NN', 'period': 'NN', 'from the 1992 season': 'PP', 'from': 'IN', 'the 1992 season': 'NP', '1992': 'CD', 'season': 'NN', '?': '.'}
~~~~ ['talks'] -> ('talks', 'NNS

SPANS FOUND {'What new product did Bank of America introduce in 1958 ?': 'SBARQ', 'What new product': 'WHNP', 'What': 'WDT', 'new': 'JJ', 'product': 'NN', 'did Bank of America introduce in 1958': 'SQ', 'did': 'VBD', 'Bank of America': 'NP', 'Bank': 'NNP', 'of America': 'PP', 'of': 'IN', 'America': 'NNP', 'introduce in 1958': 'VP', 'introduce': 'VBP', 'in 1958': 'PP', 'in': 'IN', '1958': 'CD', '?': '.'}
~~~~ ['new', 'product'] -> ('What new product', 'WHNP')
~~~~ ['bank', 'of', 'america'] -> ('Bank of America', 'NP')
~~~~ ['1958'] -> ('1958', 'CD')
TOKENS [('[WHNP]', 'What new product'), ('[NP]', 'Bank'), ('[NP]', 'America'), ('[VBP]', 'introduce'), ('[NP]', '1958')]
####################
QUESTION What's the name of where the Rhine branches off near Dordrecht?
MATCHED WORDS [['rhine'], ['branches', 'off'], ['near', 'do', '##rd', '##recht']]
SPANS FOUND {"What 's the name of where the Rhine branches off near Dordrecht ?": 'SBARQ', 'What': 'WP', "'s the name of where the Rhine branches off

In [244]:
def gather_spans(o, out):
    out[o['word']] = o['nodeType']
    if 'children' in o:
        for child in o['children']:
            gather_spans(child, out)

out = {}
gather_spans(res['hierplane_tree']['root'], out)
print(out)

{'What measurement do scientists used to determine the quality of water ?': 'SBARQ', 'What measurement': 'WHNP', 'What': 'WDT', 'measurement': 'NN', 'do scientists used to determine the quality of water': 'SQ', 'do': 'VBP', 'scientists': 'NNS', 'used to determine the quality of water': 'VP', 'used': 'VBN', 'to determine the quality of water': 'VP', 'to': 'TO', 'determine the quality of water': 'VP', 'determine': 'VB', 'the quality of water': 'NP', 'the quality': 'NP', 'the': 'DT', 'quality': 'NN', 'of water': 'PP', 'of': 'IN', 'water': 'NN', '?': '.'}


In [235]:
res['hierplane_tree']['root']

{'word': 'What measurement do scientists used to determine the quality of water ?',
 'nodeType': 'SBARQ',
 'attributes': ['SBARQ'],
 'link': 'SBARQ',
 'children': [{'word': 'What measurement',
   'nodeType': 'WHNP',
   'attributes': ['WHNP'],
   'link': 'WHNP',
   'children': [{'word': 'What',
     'nodeType': 'WDT',
     'attributes': ['WDT'],
     'link': 'WDT'},
    {'word': 'measurement',
     'nodeType': 'NN',
     'attributes': ['NN'],
     'link': 'NN'}]},
  {'word': 'do scientists used to determine the quality of water',
   'nodeType': 'SQ',
   'attributes': ['SQ'],
   'link': 'SQ',
   'children': [{'word': 'do',
     'nodeType': 'VBP',
     'attributes': ['VBP'],
     'link': 'VBP'},
    {'word': 'scientists',
     'nodeType': 'NP',
     'attributes': ['NP'],
     'link': 'NP',
     'children': [{'word': 'scientists',
       'nodeType': 'NNS',
       'attributes': ['NNS'],
       'link': 'NNS'}]},
    {'word': 'used to determine the quality of water',
     'nodeType': 'VP',
  

In [206]:
 
#[['of', 'victorian', '##s'], ['christian']]

q ="What says that formations must be older than the inclusions inside them?"
c = "The principle of inclusions and components states that, with sedimentary rocks, if inclusions (or clasts) are found in a formation, then the inclusions must be older than the formation that contains them. For example, in sedimentary rocks, it is common for gravel from an older formation to be ripped up and included in a newer layer. A similar situation with igneous rocks occurs when xenoliths are found. These foreign bodies are picked up as magma or lava flows, and are incorporated, later to cool in the matrix. As a result, xenoliths are older than the rock which contains them."
print(matcher(tokenizer.tokenize(q),tokenizer.tokenize(c)))


[['must', 'be', 'older', 'than'], ['inclusion', '##s']]


In [213]:
res = predictor.predict(sentence="How much did Silas B. Cobb pledge to the university?")
tokens = []
get_q_parts(res['hierplane_tree']['root'], nlp, tokens)

In [214]:
tokens

[('[WHNP]', 'How much'), ('[NP]', 'Silas B. Cobb'), ('[NP]', 'university')]

In [215]:
res['hierplane_tree']['root']

{'word': 'How much did Silas B. Cobb pledge to the university ?',
 'nodeType': 'SBARQ',
 'attributes': ['SBARQ'],
 'link': 'SBARQ',
 'children': [{'word': 'How much',
   'nodeType': 'WHNP',
   'attributes': ['WHNP'],
   'link': 'WHNP',
   'children': [{'word': 'How',
     'nodeType': 'WRB',
     'attributes': ['WRB'],
     'link': 'WRB'},
    {'word': 'much', 'nodeType': 'JJ', 'attributes': ['JJ'], 'link': 'JJ'}]},
  {'word': 'did Silas B. Cobb pledge to the university',
   'nodeType': 'SQ',
   'attributes': ['SQ'],
   'link': 'SQ',
   'children': [{'word': 'did',
     'nodeType': 'VBD',
     'attributes': ['VBD'],
     'link': 'VBD'},
    {'word': 'Silas B. Cobb',
     'nodeType': 'NP',
     'attributes': ['NP'],
     'link': 'NP',
     'children': [{'word': 'Silas',
       'nodeType': 'NNP',
       'attributes': ['NNP'],
       'link': 'NNP'},
      {'word': 'B.', 'nodeType': 'NNP', 'attributes': ['NNP'], 'link': 'NNP'},
      {'word': 'Cobb',
       'nodeType': 'NNP',
       'attrib

In [290]:
for i, q in enumerate(questions):
    if "What is the name of the private day school for K-12 students the university runs?" in q:
         print(i)

7370


In [294]:
for i, e in enumerate(examples):
    
    if e[1] == 7370:
        print(i)
        print(e[0])

7370
732


In [292]:
print(contexts[732])

The university runs a number of academic institutions and programs apart from its undergraduate and postgraduate schools. It operates the University of Chicago Laboratory Schools (a private day school for K-12 students and day care), the Sonia Shankman Orthogenic School (a residential treatment program for those with behavioral and emotional problems), and four public charter schools on the South Side of Chicago administered by the university's Urban Education Institute. In addition, the Hyde Park Day School, a school for students with learning disabilities, maintains a location on the University of Chicago campus. Since 1983, the University of Chicago has maintained the University of Chicago School Mathematics Project, a mathematics program used in urban primary and secondary schools. The university runs a program called the Council on Advanced Studies in the Social Sciences and Humanities, which administers interdisciplinary workshops to provide a forum for graduate students, faculty

In [223]:
for c in contexts:
    if "NP" in c:
        print (c)

This motivates the concept of a problem being hard for a complexity class. A problem X is hard for a class of problems C if every problem in C can be reduced to X. Thus no problem in C is harder than X, since an algorithm for X allows us to solve any problem in C. Of course, the notion of hard problems depends on the type of reduction being used. For complexity classes larger than P, polynomial-time reductions are commonly used. In particular, the set of problems that are hard for NP is the set of NP-hard problems.
If a problem X is in C and hard for C, then X is said to be complete for C. This means that X is the hardest problem in C. (Since many problems could be equally hard, one might say that X is one of the hardest problems in C.) Thus the class of NP-complete problems contains the most difficult problems in NP, in the sense that they are the ones most likely not to be in P. Because the problem P = NP is not solved, being able to reduce a known NP-complete problem, Π2, to another

In [221]:
contexts[0]

'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'

# TODO:
x remove stop words (to, the)
- random "nots" and "don'ts" that appear bewteen stuff
    - What is the process that asks a more specific question about all possible algorithms that could not be used to solve the same problem?
[['question', 'about', 'all', 'possible', 'algorithms', 'that', 'could'], ['be', 'used', 'to', 'solve', 'the', 'same', 'problem']]
- maybe use LSTM for minor changes and limit in length (Vitamin c vs vitamin d)
- many answerable questions use synonyms ("isolated" vs "seperated") but unanswerable less so, but probabilities of copy is always higher than paraphrase (input explicitly?????)
- close match (What do gravitational forces act between?
    [['gravitational'], ['forces'], ['act'], ['between']] -> " gravitational force acts between masses"
    - practice vs practices
- typoes (primier league vs primer)
- overlap with q words (may not matter since we still use those)
- Check how model handles this:
    - The principle of inclusions and components states that, with sedimentary rocks, if inclusions (or clasts) are found in a formation, then the inclusions must be older than the formation that contains them. For example, in sedimentary rocks, it is common for gravel from an older formation to be ripped up and included in a newer layer. A similar situation with igneous rocks occurs when xenoliths are found. These foreign bodies are picked up as magma or lava flows, and are incorporated, later to cool in the matrix. As a result, xenoliths are older than the rock which contains them. 
     - What says that formations must be older than the inclusions inside them?  
     
x single endings (["'"], ['t']) or ##s, ##ed, etc  
x remove commas  

# Data processing and "Parsing"

In [282]:
#my_stop = ["a", "an", "and", "as", "or", "the", "that", "which", "when", "whose", "is", "was", "what", "to", "in", "at"]
strip_stop = ["a", "an", "and", "as", "or", "the", "that", "which", "when", "whose", "is", "was", "what", "to", "of"]
#my_stop = strip_stop + ["to", "in", "at", "on", "under"]


def individual_filter(term):
    if term.startswith("##"):
        return False
    if term in nlp.vocab and  (nlp.vocab[term].is_stop or nlp.vocab[term].is_punct):
        return False
    return True

def simple_filter(term):
    if term in strip_stop or (term in nlp.vocab and nlp.vocab[term].is_punct):
        return False
    return True

def strip_terms(phrase):
    fltr = [simple_filter(t) for t in phrase]
    start = fltr.index(True)
    end = list(reversed(fltr)).index(True)
    return phrase[start:len(fltr)-end]

def matcher(source, target):
    i = 0
    matches = []
    while i < len(source):
        ii = 1
        current = []
        for j in range(len(target)):
            if source[i] == target[j]:
                cand = []
                for ii in range(len(source) - i):
                    if j+ii > len(target) - 1:
                        break
                    if source[i+ii] == target[j+ii]:
                        cand.append(source[i+ii])
                    else:
                        if len(cand) > len(current):
                            current = list(cand)
                        break
                if len(cand) > len(current):
                    current = list(cand)
        if current:
            matches.append(current)
        i += len(current) if current else 1
        
    ## filters
    matches = [m for m in matches if sum(
        [individual_filter(token) for token in m]) > 0]
    
    matches = [strip_terms(m) for m in matches]
    return matches

def gather_spans(o, out):
    out[o['word']] = o['nodeType']
    if 'children' in o:
        for child in o['children']:
            gather_spans(child, out)

def x_in_y(query, base):
    try:
        l = len(query)
    except TypeError:
        l = 1
        query = type(base)((query,))

    for i in range(len(base)):
        if base[i:i+l] == query:
            return True
    return False            
            
def get_copies(q, c, parse):
    copies =  matcher(tokenizer.tokenize(q), tokenizer.tokenize(c))
    spans = {}
    gather_spans(parse, spans)
    annotated_copies = []
    for s in copies:
        min_span = (None,None)
        for cand in spans.keys():
            if x_in_y(s, tokenizer.tokenize(cand)) and ( min_span[0] is None or len(cand) < len(min_span[0]) ):
                min_span = (cand, spans[cand])
        annotated_copies.append((s, min_span))
    return annotated_copies
        

def get_q_parts(entry, nlp, tokens):
    if entry['nodeType'].startswith('VB'):
        if not nlp.vocab[entry['word'].lower()].is_stop:
            tokens.append(("["+entry['nodeType']+"]", entry['word']))
    elif entry['nodeType'].startswith("WH"):
        tokens.append(("["+entry['nodeType']+"]", entry['word']))
    elif entry['nodeType'] == 'NP':
        keep = True
        for child in entry['children']:
            if child['nodeType'] == 'NP' or child['nodeType'] == 'PP':
                keep = False
        if keep:
            tokens.append(print_np(entry)) # (entry['word'])
        else:
            if 'children' in entry and entry['children']:
                for child in entry['children']:
                    get_q_parts(child, nlp, tokens)
    else:
        if 'children' in entry and entry['children']:
            for child in entry['children']:
                get_q_parts(child, nlp, tokens)

def get_const_chunks(q):
    res = predictor.predict(sentence=q)
    tokens = []
    get_q_parts(res['hierplane_tree']['root'], nlp, tokens)
    return tokens, res['hierplane_tree']['root']

def get_ordered_qsegs(q, c):
    print(q)
    q_chunks, parse = get_const_chunks(q)
    print(q_chunks)
    copied_chunks = get_copies(q, c, parse)
    print(copied_chunks)
    #print(parse)
    assert False
    combined_chunks = combine_chunks(copied_chunks, q_chunks, parse)
    ordered_qsegs = order_chunks(combined_chunks, parse)
    return ordered_qsegs

In [295]:
#####FIX THIS SHITS
for i in random.sample(range(len(examples)), 1):
    print(i)
    i = 7370
    q = questions[examples[i][1]]
    c = contexts[examples[i][0]]
    print(get_ordered_qsegs(q,c))

2426
What is the name of the private day school for K-12 students the university runs?
[('[WHNP]', 'What'), ('[NP]', 'name'), ('[NP]', 'private day school'), ('[NP]', 'K-12 students'), ('[NP]', 'university'), ('[VBZ]', 'runs')]
[(['private', 'day', 'school', 'for', 'k', '-', '12', 'students'], ('the private day school for K-12 students the university runs', 'NP')), (['university', 'runs'], ('the university runs', 'S'))]


AssertionError: 