In [1]:
from allennlp.predictors.predictor import Predictor
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz")


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [19]:
import json

In [2]:
res = predictor.predict(
  sentence="If I bring 10 dollars tomorrow, can you buy me lunch?"
)

Your label namespace was 'pos'. We recommend you use a namespace ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by default to your vocabulary.  See documentation for `non_padded_namespaces` parameter in Vocabulary.


In [4]:
res.keys()

dict_keys(['class_probabilities', 'spans', 'tokens', 'pos_tags', 'num_spans', 'hierplane_tree', 'trees'])

In [6]:
for k, v in res.items():
    if k == 'class_probabilities':
        continue
    print(k)
    print("~"*20)
    print(v)
    print("#"*20)

spans
~~~~~~~~~~~~~~~~~~~~
[[0, 0], [0, 1], [0, 2], [0, 3], [0, 4], [0, 5], [0, 6], [0, 7], [0, 8], [0, 9], [0, 10], [0, 11], [0, 12], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], [1, 8], [1, 9], [1, 10], [1, 11], [1, 12], [2, 2], [2, 3], [2, 4], [2, 5], [2, 6], [2, 7], [2, 8], [2, 9], [2, 10], [2, 11], [2, 12], [3, 3], [3, 4], [3, 5], [3, 6], [3, 7], [3, 8], [3, 9], [3, 10], [3, 11], [3, 12], [4, 4], [4, 5], [4, 6], [4, 7], [4, 8], [4, 9], [4, 10], [4, 11], [4, 12], [5, 5], [5, 6], [5, 7], [5, 8], [5, 9], [5, 10], [5, 11], [5, 12], [6, 6], [6, 7], [6, 8], [6, 9], [6, 10], [6, 11], [6, 12], [7, 7], [7, 8], [7, 9], [7, 10], [7, 11], [7, 12], [8, 8], [8, 9], [8, 10], [8, 11], [8, 12], [9, 9], [9, 10], [9, 11], [9, 12], [10, 10], [10, 11], [10, 12], [11, 11], [11, 12], [12, 12]]
####################
tokens
~~~~~~~~~~~~~~~~~~~~
['If', 'I', 'bring', '10', 'dollars', 'tomorrow', ',', 'can', 'you', 'buy', 'me', 'lunch', '?']
####################
pos_tags
~~~~~~~~~~~~~~~~~~~~
['IN',

(SQ   
 (SBAR    
  (IN If)    
  (S    
   (NP (PRP I))    
   (VP (VBP bring)    
    (NP (CD 10) (NNS dollars))    
    (NP (NN tomorrow)))   
  )   
 )    
 (, ,)    
 (MD can)    
 (NP (PRP you))    
 (VP (VB buy)    
  (NP (PRP me))    
  (NP (NN lunch))   
 )    
 (. ?)   
)






In [21]:
questions = []
with open("dataset/dev-v2.0.json", 'r') as handle:
    jdata = json.load(handle)
    data = jdata['data']
for i in range(len(data)):
    section = data[i]['paragraphs']
    for sec in section:
        context = sec['context']
        #contexts.append(context)
        qas = sec['qas']
        for j in range(len(qas)):
            question = qas[j]['question']
            unanswerable = qas[j]['is_impossible']
            questions.append(question)

In [27]:
%%time
trees = []
for q in questions[:10]:
    res = predictor.predict(sentence=q)
    trees.append(res['trees'])

CPU times: user 4.86 s, sys: 22.7 ms, total: 4.88 s
Wall time: 1.23 s


In [28]:
trees

['(SBARQ (IN In) (WHNP (WP what) (NN country)) (SQ (VBZ is) (NP (NNP Normandy)) (VP (VBN located))) (. ?))',
 '(SBARQ (WHADVP (WRB When)) (SINV (VBD were) (NP (DT the) (NNPS Normans)) (PP (IN in) (NP (NNP Normandy)))) (. ?))',
 '(SBARQ (WHPP (IN From) (WHNP (WDT which) (NNS countries))) (SQ (VBD did) (NP (DT the) (NNP Norse)) (VP (VB originate))) (. ?))',
 '(SBARQ (WHNP (WP Who)) (SQ (VP (VBD was) (NP (DT the) (NNP Norse) (NN leader)))) (. ?))',
 '(SBARQ (WHNP (WDT What) (NN century)) (SQ (VBD did) (NP (DT the) (NNPS Normans)) (ADVP (RB first)) (VP (VBP gain) (NP (PRP$ their) (JJ separate) (NN identity)))) (. ?))',
 "(SBARQ (WHNP (WP Who)) (S (VP (VBD gave) (NP (PRP$ their) (NN name)) (PP (IN to) (NP (NNP Normandy))) (PP (IN in) (NP (NP (DT the) (CD 1000) (POS 's)) (CC and) (NP (CD 1100) (POS 's)))))))",
 '(SBARQ (WHNP (WP What)) (SQ (VBZ is) (NP (NNP France)) (NP (NP (DT a) (NN region)) (PP (IN of)))) (. ?))',
 '(SBARQ (WHNP (WP Who)) (SQ (VBD did) (NP (NNP King) (NNP Charles) (NNP II

In [102]:
def print_np(entry):
    phrase = ""
    for child in entry['children']:
        if child['nodeType'] != 'DT':
            phrase += child['word']+" "
    return("["+entry['nodeType']+"]"+phrase.strip())

def print_nps(entry, pos=None):
    print("Calling...")
    if entry['nodeType'].startswith('VB'):
        if not nlp.vocab[entry['word'].lower()].is_stop:
            yield "["+entry['nodeType']+"]"+ entry['word']
    elif entry['nodeType'].startswith("WH"):
        yield "["+entry['nodeType']+"]"+ entry['word']
    elif entry['nodeType'] == 'NP':
        keep = True
        for child in entry['children']:
            if child['nodeType'] == 'NP' or child['nodeType'] == 'PP':
                keep = False
        if keep:
            yield print_np(entry) # (entry['word'])
        else:
            if 'children' in entry and entry['children']:
                for  child in entry['children']:
                    print_nps(child)
    else:
        if 'children' in entry and entry['children']:
            for child in entry['children']:
                print_nps(child)

for q in questions[360:370]:
    print(q)
    res = predictor.predict(sentence=q)
    for x in print_nps(res['hierplane_tree']['root']):
        print(x)
    #print(res['trees'])
    print("#"*20)

What is not used for a precise definition of what it means to solve a problem using a given amount of time and space?
Calling...
####################
How is Turing machine M said not to operate?
Calling...
####################
What is the expression used to identify any given series of solutions capable of being solved within time on a deterministic Turing machine?
Calling...
####################
What is the least critical resource measured in assessing the determination of a Turing machine's ability to solve any given set of problems?
Calling...
####################
How can decision problem B be solved in time x(f)?
Calling...
####################
Time and space are both examples of what type of resource?
Calling...
####################
A complexity resource can also be described as what other type of resource?
Calling...
####################
What is typically used to broadly define complexity measures?
Calling...
####################
Communication complexity is an example of what typ

# TODO:
maintain order in x of y scenarios  
capture the NN part of NP where there is a PP inside the NP

* generate all the parts
* include any required dependencies
* generate the training data with masks and different # of conditionals 
* write model to train
* kick off a run

In [50]:
import spacy

In [52]:
nlp = spacy.load("en_core_web_sm")

In [59]:
nlp.vocab['located'].is_stop

False

In [93]:
%load_ext autoreload
%autoreload 2

In [94]:
import examples.build_labels

In [112]:
examples.build_labels.build_labels("dataset/dev-v2.0.json","dataset/train-v2.0.json", 100)

In [114]:
questions = {}
(dev_data_file, test_data_file) = ("dataset/dev-v2.0.json","dataset/train-v2.0.json")
for data_file in [dev_data_file, test_data_file]:
    with open(data_file, 'r') as handle: # update
        jdata = json.load(handle)
        data = jdata['data']
    for i in range(len(data)):
        section = data[i]['paragraphs']
        for sec in section:
            qas = sec['qas']
            for j in range(len(qas)):
                qid = qas[j]['id']
                question = qas[j]['question']
                questions[qid] = question

In [115]:
len(questions)

142192

In [117]:
import pickle

In [120]:
data = pickle.load(open("part_labels.pkl", 'rb'))

In [124]:
data.keys()

dict_keys(['56ddde6b9a695914005b9628', '56ddde6b9a695914005b9629', '56ddde6b9a695914005b962a', '56ddde6b9a695914005b962b', '56ddde6b9a695914005b962c', '5ad39d53604f3c001a3fe8d1', '5ad39d53604f3c001a3fe8d2', '5ad39d53604f3c001a3fe8d3', '5ad39d53604f3c001a3fe8d4', '56dddf4066d3e219004dad5f', '56dddf4066d3e219004dad60', '56dddf4066d3e219004dad61', '5ad3a266604f3c001a3fea27', '5ad3a266604f3c001a3fea28', '5ad3a266604f3c001a3fea29', '5ad3a266604f3c001a3fea2a', '5ad3a266604f3c001a3fea2b', '56dde0379a695914005b9636', '56dde0379a695914005b9637', '5ad3ab70604f3c001a3feb89', '5ad3ab70604f3c001a3feb8a', '56dde0ba66d3e219004dad75', '56dde0ba66d3e219004dad76', '56dde0ba66d3e219004dad77', '5ad3ad61604f3c001a3fec0d', '5ad3ad61604f3c001a3fec0e', '5ad3ad61604f3c001a3fec0f', '5ad3ad61604f3c001a3fec10', '56dde1d966d3e219004dad8d', '5ad3ae14604f3c001a3fec39', '5ad3ae14604f3c001a3fec3a', '56dde27d9a695914005b9651', '56dde27d9a695914005b9652', '5ad3af11604f3c001a3fec63', '5ad3af11604f3c001a3fec64', '5ad3af11