In [1]:
import re
import os
import nltk
import json
import numpy as np
import sklearn
import spacy
from spacy2folia import spacy2folia
from itertools import groupby

In [2]:
def get_pp_attachment_ambiguity(parse, verbose=True):
    """Classify a bracketed parse string by its PP-attachment pattern.

    The parse is reduced to a "phrase pattern": every occurrence of ``VP``,
    ``NP`` or ``PP`` contributes its first letter (``V``, ``N`` or ``P``) and
    runs of the same letter are collapsed into one.  ``SBAR (IN`` is rewritten
    to ``PP (IN`` beforehand, so such clauses count as ``P``.  The pattern is
    then compared against a fixed list of ambiguous and unambiguous shapes.

    NOTE: the label regex is ``[VNP]P`` and is not anchored to a bracket, so
    any text containing those two letters (e.g. ``NNP``, ``WHNP``) also
    contributes a letter.

    Args:
        parse: the parse as a string.
        verbose: when True (the default, matching the historical behaviour)
            a diagnostic line and the parse itself are printed for every
            parse that is *not* classified as ambiguous.  Pass False to
            classify silently, e.g. when running over a whole corpus.

    Returns:
        A ``(is_ambiguous, tag)`` tuple.  ``tag`` is the matched pattern name
        (``'VNPN'``, ``'VPNPN'``, ``'NPNPN'``, ``'NVNVPN'``, ``'^NPNV'``,
        ``'PNPN'``), ``'P'`` when the pattern contains no preposition at all,
        or the full phrase pattern when nothing known matched.
    """
    def _report(message):
        # All diagnostics have the same shape: one message line, then the parse.
        if verbose:
            print(message)
            print(parse)

    # Collapsing repeated characters with groupby:
    # https://stackoverflow.com/questions/18799036/python-best-way-to-remove-duplicate-character-from-string
    normalized = re.sub(r'SBAR \(IN', 'PP (IN', parse)
    letters = (label[0] for label in re.findall(r'[VNP]P', normalized))
    phrase_pattern = ''.join(ch for ch, _ in groupby(letters))

    # If no prep phrase, let's not waste time.
    if 'P' not in phrase_pattern:
        if phrase_pattern:
            _report("No preps: {}".format(phrase_pattern))
        return False, 'P'

    # Ambiguous shapes, checked in this order; the first hit wins.
    ambiguous_patterns = (
        # (V, N, P, N) => ambiguous (Pantel&Lin)
        #   Mary ate a salad with a fork.
        #   Mary ate a salad with croutons.
        'VNPN',
        # (V, P, N, P, N) => ambiguous (I made this!)
        #   I walked with my golf bag to the clubhouse.
        #   I walked with my golf bag in a pullcart.
        'VPNPN',
        # (N, P, N, P, N) => ambiguous (I made this!)
        #   In at least one image there is a single tree with orange flowers
        #   in front of a church with the open door facing forward left.
        #   ... [tree with orange] flowers in front of a church ...
        #   ... [tree with orange] flowers with five petals on thick branches...
        #   ([N, P, {N, P, N], P, N})
        'NPNPN',
        # (N, V, N, V, P, N) => ambiguous (discovered empirically)
        #   ... women wearing white bikinis standing next to the water.
        #   (simplification)
        #   ... A woman is wearing a bikini cooking on a gas stove.
        #   ... A woman is holding a spatula cooking in a white bikini.
        'NVNVPN',
    )
    for candidate in ambiguous_patterns:
        if candidate in phrase_pattern:
            return True, candidate

    # We have now passed all the checks for ambiguity.

    # ^(N, P, N, V) => unambiguous (I made this!)
    #   The man with the beard sells tacos.
    if phrase_pattern.startswith('NPNV'):
        _report("Pattern match (^NPNV=un): {}".format(phrase_pattern))
        return False, '^NPNV'

    # ...in the image on the left...
    if 'PNPN' in phrase_pattern:
        _report("Pattern match (PNPN=un): {}".format(phrase_pattern))
        return False, 'PNPN'

    # Nothing known matched: surface the pattern so it can be catalogued.
    _report("New pattern: {}".format(phrase_pattern))
    return False, phrase_pattern

In [3]:
def has_pp_attachment_ambiguity(parse):
    """Return only the boolean verdict of ``get_pp_attachment_ambiguity``.

    The pattern tag that the underlying function also returns is discarded.
    """
    verdict = get_pp_attachment_ambiguity(parse)
    return verdict[0]

In [4]:
# Load the 'en_core_web_lg' spaCy pipeline once; the batching cell below calls
# `nlp(...)` on each group of sentences.
nlp = spacy.load('en_core_web_lg')

In [5]:
def load_text_file(file, encoding='utf-8'):
    """Read a text file into a list of whitespace-stripped lines.

    Every line of the file yields one list entry, in file order; blank lines
    are kept as empty strings so that line positions are preserved.

    Args:
        file: path of the file to read.
        encoding: text encoding used to decode the file.  Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        list of str, one per line, with leading/trailing whitespace removed.
    """
    with open(file, encoding=encoding) as text_in:
        return [line.strip() for line in text_in]


In [7]:
# Read the dev split: one stripped line per list entry in each file.
# NOTE(review): the two files are zipped together in the next cell, so they are
# presumably line-aligned (sentence i <-> parse i) -- confirm against the data.
dev_sents = load_text_file('../../data/dev.sent')
dev_parses = load_text_file('../../data/dev.parse')

In [9]:
# Pair every sentence with its parse (zip stops at the shorter of the two
# lists), then shuffle the pairs.
dev_sents_and_parses = list(zip(dev_sents, dev_parses))

# Shuffle with a seeded, private RNG so that a Restart & Run All produces the
# same order -- and therefore the same batch files -- every time, without
# touching NumPy's global random state.
SHUFFLE_SEED = 0
np.random.RandomState(SHUFFLE_SEED).shuffle(dev_sents_and_parses)

In [10]:
# Number of (sentence, parse) pairs available before de-duplication.
print(len(dev_sents_and_parses))

7005


In [12]:
# Convert the unique dev sentences to FoLiA XML, `batch_size` sentences per file.
batch_size = 35
unique_sents = set()
sent_hopper = []
bcount = 0

# Pass 1: normalise whitespace and keep the first occurrence of each sentence,
# preserving the (shuffled) order of `dev_sents_and_parses`.
deduped_sents = []
for (s,p) in dev_sents_and_parses:
    s = ' '.join(s.split())
    if s in unique_sents:
        continue
    unique_sents.add(s)
    deduped_sents.append(s)

# Pass 2: write one FoLiA document per slice of `batch_size` sentences.
# Slicing (instead of flushing only when the hopper is exactly full) means the
# final, shorter batch is written too rather than being silently dropped.
os.makedirs("../../data/folia/dev", exist_ok=True)
for start in range(0, len(deduped_sents), batch_size):
    sent_hopper = deduped_sents[start:start + batch_size]
    bcount += 1
    # One sentence per line in the text handed to spaCy.
    doc = nlp('\n'.join(sent_hopper))
    batch_id = "nlvr2_dev_{}".format(str(bcount).zfill(3))
    foliadoc = spacy2folia.convert(doc, batch_id, paragraphs=False)
    foliadoc.save("../../data/folia/dev/{}.folia.xml".format(batch_id))

In [14]:
# Rewrite FoLiA XML files
# without the prep attachments
#
# Each file in data/folia/dev is copied line by line to data/folia/dev_stripped.
# A line matching an opening pattern (`<dependencies>` or
# `<dependency-annotation...>`) is copied and switches copying off; the next
# line matching a closing pattern (`</dependencies>` or
# `</dependency-annotation>`) is copied and switches copying back on.  The
# lines in between are dropped.
# NOTE(review): this is line-based -- it assumes an opening tag and its closing
# tag never share a line; confirm against the generated XML.
prepdeps = []  # not used in this cell; kept in case another cell reads it
dep_on = re.compile(r'<dependencies>')
dep_off = re.compile(r'<\/dependencies>')
depann_on = re.compile(r'<dependency-annotation.*>')
depann_off = re.compile(r'<\/dependency-annotation>')
xmlfiles = []  # not used in this cell; kept in case another cell reads it

# Make sure the destination exists so the first write cannot fail.
os.makedirs('../../data/folia/dev_stripped', exist_ok=True)

# sorted() gives a stable processing order across runs and platforms.
for xmlfile in sorted(os.listdir('../../data/folia/dev')):
    # Skip sub-directories: open() on one would raise.
    if not os.path.isfile('../../data/folia/dev/{}'.format(xmlfile)):
        continue
    fnlines = []
    # Explicit UTF-8 keeps decoding independent of the platform locale.
    with open('../../data/folia/dev/{}'.format(xmlfile), encoding='utf-8') as fnin:
        doline = True  # True while lines are being copied
        for line in fnin:
            if doline:
                fnlines.append(line)
                if dep_on.search(line) or depann_on.search(line):
                    doline = False
            elif dep_off.search(line) or depann_off.search(line):
                doline = True
                fnlines.append(line)
    with open('../../data/folia/dev_stripped/{}'.format(xmlfile), 'w', encoding='utf-8') as fnout:
        fnout.writelines(fnlines)


In [15]:
# Number of distinct whitespace-normalised sentences collected by the batching cell.
len(unique_sents)

2010