In [1]:
import re
import os
import nltk
import json
import numpy as np
import sklearn
import spacy
from spacy2folia import spacy2folia
from itertools import groupby

In [2]:
def get_pp_attachment_ambiguity(parse):
    """Classify a bracketed constituency parse by its VP/NP/PP label pattern.

    The parse is reduced to a string over ``V``/``N``/``P``: one letter per
    ``VP``/``NP``/``PP`` constituent in order of appearance, with runs of the
    same letter collapsed. That string is then tested against a fixed list of
    patterns.

    Only labels that directly follow an opening bracket are counted, so other
    labels or word tokens that merely contain the letters (``ADVP``, ``WHNP``,
    ``NNP``, ...) do not contribute spurious ``V``/``N``/``P`` entries.

    Args:
        parse: Bracketed parse as one string, e.g.
            ``(S (NP (PRP We)) (VP (VBP have)) (. .))``.

    Returns:
        A tuple ``(ambiguous, pattern)``:

        * ``(True, <pattern>)`` for the first of ``'VNPN'``, ``'VPNPN'``,
          ``'NPNPN'``, ``'NVNVPN'`` found in the label string;
        * ``(False, 'P')`` when the label string contains no ``P``;
        * ``(False, '^NPNV')`` or ``(False, 'PNPN')`` for the two known
          unambiguous patterns;
        * ``(False, <label string>)`` for anything else.

    Side effects:
        Prints a diagnostic line and the parse for every non-ambiguous result,
        except when the label string is empty.
    """
    # A subordinate clause opened by an IN word is counted as a PP.
    step0 = re.sub(r'SBAR \(IN', 'PP (IN', parse)
    # One letter per VP/NP/PP constituent. The "(" anchor and trailing word
    # boundary keep ADVP, WHNP, NNP and similar strings from matching.
    step1 = re.findall(r'\(([VNP])P\b', step0)
    step2 = ''.join(step1)
    # Collapse runs of the same letter.
    # https://stackoverflow.com/questions/18799036/python-best-way-to-remove-duplicate-character-from-string
    phrase_pattern = ''.join(ch for ch, _ in groupby(step2))

    # If no prep phrase, let's not waste time.
    if 'P' not in phrase_pattern:
        if len(phrase_pattern) > 0:
            print("No preps: {}".format(phrase_pattern))
            print(parse)
        return False, 'P'

    # Ambiguous patterns, checked in this order; the first hit is returned.
    ambiguous_patterns = (
        # (V, N, P, N) => ambiguous (Pantel&Lin)
        # Mary ate a salad with a fork.
        # Mary ate a salad with croutons.
        'VNPN',
        # (V, P, N, P, N) => ambiguous (I made this!)
        # I walked with my golf bag to the clubhouse.
        # I walked with my golf bag in a pullcart.
        'VPNPN',
        # (N, P, N, P, N) => ambiguous (I made this!)
        # In at least one image there is a single tree with orange flowers in
        # front of a church with the open door facing forward left.
        # ... [tree with orange] flowers in front of a church ...
        # ... [tree with orange] flowers with five petals on thick branches...
        # ([N, P, {N, P, N], P, N})
        'NPNPN',
        # (N, V, N, V, P, N) => ambiguous (discovered empirically)
        # ... women wearing white bikinis standing next to the water.
        # (simplification)
        # ... A woman is wearing a bikini cooking on a gas stove.
        # ... A woman is holding a spatula cooking in a white bikini.
        'NVNVPN',
    )
    for candidate in ambiguous_patterns:
        if candidate in phrase_pattern:
            return True, candidate

    # We have now passed all the checks for ambiguity.

    # ^(N, P, N, V) => unambiguous (I made this!)
    # The man with the beard sells tacos.
    if phrase_pattern.startswith('NPNV'):
        print("Pattern match (^NPNV=un): {}".format(phrase_pattern))
        print(parse)
        return False, '^NPNV'

    # ...in the image on the left...
    if 'PNPN' in phrase_pattern:
        print("Pattern match (PNPN=un): {}".format(phrase_pattern))
        print(parse)
        return False, 'PNPN'

    # Anything else is reported so that new patterns can be inspected.
    print("New pattern: {}".format(phrase_pattern))
    print(parse)
    return False, phrase_pattern

In [3]:
def has_pp_attachment_ambiguity(parse):
    """Return only the boolean verdict of ``get_pp_attachment_ambiguity(parse)``.

    The pattern label that the underlying function also returns is discarded.
    """
    return get_pp_attachment_ambiguity(parse)[0]

In [4]:
# Load the `en_core_web_lg` spaCy pipeline once; the FoLiA export cell further
# down runs it over each batch of sentences.
nlp = spacy.load('en_core_web_lg')

In [5]:
def load_text_file(file):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        file: Path of the file to read (opened in text mode with the
            platform's default encoding).

    Returns:
        One entry per line, in file order, with leading and trailing
        whitespace (including the newline) removed. Blank lines are kept as
        empty strings.
    """
    with open(file) as handle:
        return [raw.strip() for raw in handle]


In [6]:
# Read the dev sentences and their parses, one entry per line of each file.
# The two lists are paired positionally by the zip() in the next cell, so
# line i of both files is assumed to describe the same sentence - TODO confirm
# the files are aligned and of equal length.
dev_sents = load_text_file('../data/dev.sent')
dev_parses = load_text_file('../data/dev.parse')

In [7]:
# Collect every (sentence, parse) pair flagged as ambiguous: as two parallel
# lists, and grouped by the pattern label that triggered the flag.
ambiguous_sents = []
ambiguous_parses = []
ambiguous_pattern_map = {}
for sent, parse in zip(dev_sents, dev_parses):
    ambig, pat = get_pp_attachment_ambiguity(parse)
    if not ambig:
        continue
    # setdefault creates the bucket the first time a pattern is seen.
    ambiguous_pattern_map.setdefault(pat, []).append((sent, parse))
    ambiguous_sents.append(sent)
    ambiguous_parses.append(parse)
        
        

New pattern: PNVNVN
(S (PP (IN IN) (NP (QP (RB at) (RBS least) (CD one)) (NN image))) (NP (EX there)) (VP (VBP are) (NP (NP (QP (RB at) (JJS least) (CD four)) (NN bottle) (NNS rows)) (SBAR (WHNP (WDT that)) (S (ADVP (RB together)) (VP (VBP make) (NP (DT a) (VBG walking) (NN path))))))) (. .))
New pattern: PNVNVN
(S (PP (IN IN) (NP (QP (RB at) (RBS least) (CD one)) (NN image))) (NP (EX there)) (VP (VBP are) (NP (NP (QP (RB at) (JJS least) (CD four)) (NN bottle) (NNS rows)) (SBAR (WHNP (WDT that)) (S (ADVP (RB together)) (VP (VBP make) (NP (DT a) (VBG walking) (NN path))))))) (. .))
New pattern: PNVNVN
(S (PP (IN IN) (NP (QP (RB at) (RBS least) (CD one)) (NN image))) (NP (EX there)) (VP (VBP are) (NP (NP (QP (RB at) (JJS least) (CD four)) (NN bottle) (NNS rows)) (SBAR (WHNP (WDT that)) (S (ADVP (RB together)) (VP (VBP make) (NP (DT a) (VBG walking) (NN path))))))) (. .))
New pattern: PNVNVN
(S (PP (IN IN) (NP (QP (RB at) (RBS least) (CD one)) (NN image))) (NP (EX there)) (VP (VBP are) (N

Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One) (NN seal)) (PP (IN in) (NP (DT the) (JJ right) (NN image)))) (VP (VBZ has) (S (NP (PRP$ its) (NN mouth)) (ADJP (JJ open)))) (. .))
Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One) (NN seal)) (PP (IN in) (NP (DT the) (JJ right) (NN image)))) (VP (VBZ has) (S (NP (PRP$ its) (NN mouth)) (ADJP (JJ open)))) (. .))
Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One) (NN seal)) (PP (IN in) (NP (DT the) (JJ right) (NN image)))) (VP (VBZ has) (S (NP (PRP$ its) (NN mouth)) (ADJP (JJ open)))) (. .))
Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One) (NN seal)) (PP (IN in) (NP (DT the) (JJ right) (NN image)))) (VP (VBZ has) (S (NP (PRP$ its) (NN mouth)) (ADJP (JJ open)))) (. .))
No preps: NVN
(S (S (NP (PRP We)) (VP (VBP have) (ADVP (RB here)))) (, ,) (NP (CD two) (NNS seals)) (. .))
No preps: NVN
(S (S (NP (PRP We)) (VP (VBP have) (ADVP (RB here)))) (, ,) (NP (CD two) (NNS seals)) (. .))
No preps: NVN
(S (S (NP (PRP We)) (VP (VBP have) (ADVP

Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One)) (PP (IN of) (NP (DT the) (NNS images)))) (VP (VBZ has) (S (NP (DT some) (NN vegetation) (SYM /) (NN greenery)) (ADJP (JJ visible)))) (. .))
Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One)) (PP (IN of) (NP (DT the) (NNS images)))) (VP (VBZ has) (S (NP (DT some) (NN vegetation) (SYM /) (NN greenery)) (ADJP (JJ visible)))) (. .))
Pattern match (^NPNV=un): NPNVN
(S (NP (NP (CD One)) (PP (IN of) (NP (DT the) (NNS images)))) (VP (VBZ has) (S (NP (DT some) (NN vegetation) (SYM /) (NN greenery)) (ADJP (JJ visible)))) (. .))
Pattern match (PNPN=un): PNPNVPN
(S (PP (IN In) (NP (CD one) (NN image))) (NP (NP (DT a) (NN pair)) (PP (IN of) (NP (NNS ferrets)))) (VP (MD can) (VP (VB be) (VP (VBN seen) (S (VP (VBG eating) (PP (IN out) (PP (IN of) (NP (DT a) (NN food) (NN bowl))))))))) (. .))
Pattern match (PNPN=un): PNPNVPN
(S (PP (IN In) (NP (CD one) (NN image))) (NP (NP (DT a) (NN pair)) (PP (IN of) (NP (NNS ferrets)))) (VP (MD can) (VP (VB be)

In [14]:
# Pair each ambiguous sentence with its parse as a list of (sentence, parse) tuples.
ambiguous_sents_and_parses = list(zip(ambiguous_sents, ambiguous_parses))

In [16]:
# Export the ambiguous sentences as FoLiA XML, `batch_size` sentences per file.
batch_size = 10
folia_dir = "../data/folia"
# The save() calls below write into this directory; create it if it is missing.
os.makedirs(folia_dir, exist_ok=True)

# Normalise whitespace and drop repeated sentences, keeping first-seen order.
unique_sents = set()
sent_queue = []
for (s,p) in ambiguous_sents_and_parses:
    s = ' '.join(s.strip().split())
    if s in unique_sents:
        continue
    unique_sents.add(s)
    sent_queue.append(s)

# Write one document per slice of the queue. Slicing also yields the final,
# shorter batch, which was previously never written because a batch was only
# flushed when it reached exactly `batch_size` sentences.
sent_hopper = []
bcount = 0
for start in range(0, len(sent_queue), batch_size):
    sent_hopper = sent_queue[start:start + batch_size]
    bcount += 1
    # One sentence per line within the batch text.
    doc = nlp('\n'.join(sent_hopper))
    batch_id = "nlvr2_batch_{}".format(str(bcount).zfill(4))
    foliadoc = spacy2folia.convert(doc, batch_id, paragraphs=False)
    foliadoc.save("{}/{}.folia.xml".format(folia_dir, batch_id))