# Resolving Ambiguity in Prepositional Phrase Attachment

The problem of resolving ambiguity in prepositional phrase attachment is one that remains largely unsolved in NLP, and one that pre-trained language models such as BERT will likely not be of much help with. This notebook shows results of predicting prepositional phrase attachments across a subset of the NLVR2 dataset which has been annotated, leveraging a pre-trained language model commonly known as "BERT" (cite). 

We trained an SVM classifier from the output (hidden layers) of the large uncased model from BERT with whole word masking. The results are presented in terms of Cohen's kappa score and F1 score. 

In [1]:
from IPython.display import Image

# Preliminary Steps

In [2]:
# conda create -n python=3.7 ...
# pip install transformers... 

In [3]:
import os
import json
import numpy as np
import pandas as pd

import sklearn
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import cohen_kappa_score as kappa
from sklearn.metrics import confusion_matrix

from collections import Counter

In [4]:
from generator import HuggingFaceGenerator, MaskedPrepGenerator

In [5]:
np.random.seed(91768)

## Load Dataset (train/test)

In [6]:
datadir = "data"
outputdir = "."

In [7]:
train_data = json.load(open('{}/ppa_train.json'.format(datadir)))
labels_train = [instance['label'] for instance in train_data]

test_data = json.load(open('{}/ppa_test.json'.format(datadir)))
labels_test = [instance['label'] for instance in test_data]

## Using BERT Language Model
We load a pre-trained model from BERT and use it to generate instances for model training. 

In [8]:
bert_model_name = "bert-large-uncased-whole-word-masking"
hf_generator = HuggingFaceGenerator(bert_model_name)

## Transform Dataset (or reload)

In [9]:
train_feature_file = "{}/hf_train.csv".format(outputdir)
test_feature_file = "{}/hf_test.csv".format(outputdir)

In [10]:
if os.path.exists(train_feature_file):
    hf_train = pd.read_csv(train_feature_file, header=None)
else:
    hf_train = hf_generator.generate_dataset(train_data)
    pd.DataFrame(hf_train).to_csv(train_feature_file, header=False,index=False)

In [11]:
if os.path.exists(test_feature_file):
    hf_test = pd.read_csv(test_feature_file, header=None)
else:
    hf_test = hf_generator.generate_dataset(test_data)
    pd.DataFrame(hf_test).to_csv(test_feature_file, header=False,index=False)

# Model Training

In [12]:
clfhf = svm.SVC(gamma=0.0001, C=100., random_state=91768)
clfhf.fit(hf_train, labels_train)

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
preds_test_hf = clfhf.predict(hf_test)

In [14]:
f1_score(labels_test, preds_test_hf, labels=['N','V','O'], average=None)

array([0.90909091, 0.68656716, 0.5       ])

In [15]:
kappa(labels_test, preds_test_hf)

0.6134147542598247

In [16]:
Counter(labels_test)

Counter({'V': 36, 'N': 111, 'O': 10})

In [17]:
f1_score(labels_test, preds_test_hf, labels=['N','V'], average=None)

array([0.90909091, 0.68656716])

In [18]:
confusion_matrix(labels_test, preds_test_hf)

array([[105,   0,   6],
       [  4,   4,   2],
       [ 11,   2,  23]])

In [19]:
discards = [i for i,lbl in enumerate(labels_train) if lbl=='O']

In [20]:
len(train_data)

472

In [21]:
clfhf = svm.SVC(gamma=0.0001, C=100., random_state=91768)
clfhf.fit(hf_train, labels_train)

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [22]:
good_labels = [labels_test[i] for i in range(len(labels_test)) if i not in discards]

In [23]:
good_preds = [preds_test_hf[i] for i in range(len(labels_test)) if i not in discards]

In [24]:
f1_score(good_labels, good_preds, labels=['N','V'], average=None)

array([0.90196078, 0.66666667])

## Applying BERT to the same dataset for the masked prep task... 

In [25]:
mpgen = MaskedPrepGenerator(bert_model_name)

In [26]:
labels,predictions=mpgen.evaluate_dataset(test_data,use_cuda=True)

(82 correct / 100 total)
(160 correct / 200 total)
(236 correct / 300 total)


In [27]:
accuracy_score(labels,predictions)

0.7849462365591398

In [28]:
kappa(labels, predictions)

0.7426407001279878

In [29]:
get_4tpl = lambda x : (x['V']['lemma'],x['N']['lemma'],x['P']['lemma'],x['N2']['lemma'])

In [30]:
train_tuples = [get_4tpl(td) for td in list(train_data)]
test_tuples = [get_4tpl(td) for td in list(test_data)]

In [31]:
test_set = [t for i,t in enumerate(test_tuples)]

In [32]:
Counter(labels_test)

Counter({'V': 36, 'N': 111, 'O': 10})

In [33]:
Counter(preds_test_hf)

Counter({'N': 120, 'V': 31, 'O': 6})

In [34]:
confusion_matrix(labels_test,preds_test_hf)

array([[105,   0,   6],
       [  4,   4,   2],
       [ 11,   2,  23]])

In [35]:
errors = [(i,lbl,pred) for i,(lbl,pred) in enumerate(zip(labels_test,preds_test_hf)) if not lbl==pred]

In [36]:
nvvn = [err for err in errors if 'O' not in err]

In [37]:
nv = [nv for nv in nvvn if nv[1]=='N']

In [38]:
vn = [vn for vn in nvvn if vn[1]=='V']

In [39]:
nv

[(20, 'N', 'V'),
 (57, 'N', 'V'),
 (64, 'N', 'V'),
 (67, 'N', 'V'),
 (116, 'N', 'V'),
 (145, 'N', 'V')]

In [40]:
nvi = [x[0] for x in nv]
nv_data = [td for i,td in enumerate(test_data) if i in nvi]

In [41]:
nv_data

[{'sentence_text': 'A girl in long one piece pajamas is wearing mouse ears on her head in one of the images.',
  'label': 'N',
  'V': {'text': 'wearing',
   'source': 'nlvr2_dev_006.text.s.3.w.9',
   'pos_tag': 'VBG',
   'lemma': 'wear',
   'trail_space': True},
  'N': {'text': 'ears',
   'source': 'nlvr2_dev_006.text.s.3.w.11',
   'pos_tag': 'NNS',
   'lemma': 'ear',
   'trail_space': True},
  'P': {'text': 'on',
   'source': 'nlvr2_dev_006.text.s.3.w.12',
   'pos_tag': 'IN',
   'lemma': 'on',
   'trail_space': True},
  'N2': {'text': 'head',
   'source': 'nlvr2_dev_006.text.s.3.w.14',
   'pos_tag': 'NN',
   'lemma': 'head',
   'trail_space': True},
  'tokenized_sentence': [['A', 'nlvr2_dev_006.text.s.3.w.1', 'DT', 'a', True],
   ['girl', 'nlvr2_dev_006.text.s.3.w.2', 'NN', 'girl', True],
   ['in', 'nlvr2_dev_006.text.s.3.w.3', 'IN', 'in', True],
   ['long', 'nlvr2_dev_006.text.s.3.w.4', 'JJ', 'long', True],
   ['one', 'nlvr2_dev_006.text.s.3.w.5', 'CD', 'one', True],
   ['piece', 'nl

In [42]:
get_4tpl(nv_data[0])

('wear', 'ear', 'on', 'head')

In [43]:
for nv in nv_data:
    print(nv['sentence_text'])
    print(get_4tpl(nv))

A girl in long one piece pajamas is wearing mouse ears on her head in one of the images.
('wear', 'ear', 'on', 'head')
There is exactly one dog in the right image.
('be', 'dog', 'in', 'image')
There is one bottle with a lid and one bottle without a lid.
('be', 'bottle', 'with', 'lid')
Two tall narrow cabinets have at least three upper shelves and have flat tops, but only one has two doors in its lower section.
('have', 'door', 'in', 'section')
there are at least 3 deer in a tree eating in the image pair
('be', 'deer', 'in', 'tree')
There is at least one person on the bus.
('be', 'person', 'on', 'bus')


In [44]:
vni = [x[0] for x in vn]
vn_data = [td for i,td in enumerate(test_data) if i in vni]

In [45]:
for vn in vn_data:
    print(vn['sentence_text'])
    print(get_4tpl(vn))

There are three chow dogs in the image pair.
('be', 'dog', 'in', 'pair')
Each dispenser has a circle shape and an upside-down raindrop shape on its front, and at least one dispenser features the raindrop shape above the circle shape.
('feature', 'shape', 'above', 'shape')
putting their right leg high up on a fence.
('put', 'leg', 'on', 'fence')
At least one of the dogs has a small toy in front of him.
('have', 'toy', 'in', 'front')
The combined images include an uncapped lipstick wand to the right of a capped lipstick, and a squarish smear of reddish rouge left of them.
('include', 'wand', 'to', 'right')
There is an awning over the machines in one of the images.
('be', 'awning', 'over', 'machine')
One image shows a bottle next to a white cylinder but not overlapping it, and the other image shows a single upright bottle.
('show', 'bottle', 'to', 'cylinder')
A shelving unit covers one wall with a unique center area, but identical sections on each side, glass upper doors in one image, and

In [46]:
tpls = [get_4tpl(td) for td in train_data]

In [47]:
bigrams = [(t[0],t[2]) for t in tpls] + [(t[1],t[2]) for t in tpls]

In [48]:
c=Counter(bigrams).most_common(25)

In [49]:
labeled_tpls = [tuple((*get_4tpl(td),td['label'])) for td in train_data]

In [50]:
labeled_bigrams = [(t[0],t[2],t[4]=='V') for t in labeled_tpls] + [(t[1],t[2],t[4]=='N') for t in labeled_tpls]

In [51]:
Counter([b for b in labeled_bigrams if b[0]=='be' and b[1]=='in'])

Counter({('be', 'in', True): 51, ('be', 'in', False): 9})

In [52]:
for ctr in c:
    matches = [lb[2] for lb in labeled_bigrams if (lb[0],lb[1])==(ctr[0][0],ctr[0][1])]
    if len(matches)==0:
        print("Not found: {}".format((ctr[0][0],ctr[0][1])))
    num_pos = len([m for m in matches if m])
    proportion = float(num_pos/len(matches))
    if proportion>0.1 and proportion<0.9:
        print("{}: {}/{}={:.3f}".format(ctr,num_pos,len(matches),proportion))

(('be', 'in'), 60): 51/60=0.850
(('show', 'in'), 18): 2/18=0.111
(('dog', 'in'), 14): 2/14=0.143
(('have', 'in'), 10): 6/10=0.600
(('be', 'on'), 9): 7/9=0.778
(('have', 'on'), 7): 3/7=0.429
(('feature', 'on'), 6): 1/6=0.167
(('show', 'on'), 6): 1/6=0.167


These would seem to be the cases that are not clearly discriminated. 
Unless these are attributable to annotation errors, we may need additional context to distinguish these cases. 


In [53]:
# Recover the context for these cases... 

In [54]:
[t for t in labeled_tpls if t[0]=='be' and t[2]=='in']

[('be', 'skunk', 'in', 'total', 'V'),
 ('be', 'filling', 'in', 'bread', 'N'),
 ('be', 'animal', 'in', 'total', 'V'),
 ('be', 'dog', 'in', 'pair', 'V'),
 ('be', 'man', 'in', 'sleeve', 'N'),
 ('be', 'product', 'in', 'image', 'V'),
 ('be', 'dog', 'in', 'total', 'V'),
 ('be', 'kid', 'in', 'pair', 'V'),
 ('be', 'dog', 'in', 'image', 'V'),
 ('be', 'crab', 'in', 'image', 'V'),
 ('be', 'stingray', 'in', 'pair', 'V'),
 ('be', 'cup', 'in', 'image', 'V'),
 ('be', 'mammal', 'in', 'image', 'N'),
 ('be', 'bottle', 'in', 'image', 'V'),
 ('be', 'pelican', 'in', 'image', 'V'),
 ('be', 'person', 'in', 'library', 'V'),
 ('be', 'ibex', 'in', 'image', 'V'),
 ('be', 'dog', 'in', 'image', 'V'),
 ('be', 'horse', 'in', 'image', 'V'),
 ('be', 'dog', 'in', 'image', 'N'),
 ('be', 'panda', 'in', 'image', 'V'),
 ('be', 'bottle', 'in', 'total', 'V'),
 ('be', 'item', 'in', 'image', 'V'),
 ('be', 'saxophone', 'in', 'total', 'V'),
 ('be', 'bottle', 'in', 'total', 'V'),
 ('be', 'female', 'in', 'bikini', 'N'),
 ('be', 'd

In [55]:
[t for t in labeled_tpls if t[0]=='be' and t[2]=='in' and t[4]=='N']

[('be', 'filling', 'in', 'bread', 'N'),
 ('be', 'man', 'in', 'sleeve', 'N'),
 ('be', 'mammal', 'in', 'image', 'N'),
 ('be', 'dog', 'in', 'image', 'N'),
 ('be', 'female', 'in', 'bikini', 'N'),
 ('be', 'cover', 'in', 'stripe', 'N'),
 ('be', 'woman', 'in', 'gown', 'N'),
 ('be', 'jellyfish', 'in', 'water', 'N'),
 ('be', 'plug', 'in', 'basin', 'N')]

I'm not too sure about some of these. We generally expect 'in image' to attach to the verb. 

### Data format
### -----------
There are train and test files, identified by different prefixes. Each line in the the files is a PP attachment instance, in the following format:
```
<prefix>.preps.words - the preposition.
<prefix>.children.words - the preposition's child.
<prefix>.heads.words - the candidate heads for the PP.
<prefix>.heads.pos - the part-of-speech of the candidate heads for the PP, where "1" is for verbs and "-1" is for nouns.
<prefix>.heads.next.pos - the part-of-speech of the words following the candidate heads of the PP, in text format.
<prefix>.labels - the gold head of the PP, indicated by an index specifying its position in the list of candidate heads (e.g. in <prefix>.heads.words)
<prefix>.nheads - the number of candidate heads. 
```
    


In [56]:
nheads = pd.read_csv('data/wsj.2-21.txt.dep.pp.nheads',header=None)

In [57]:
heads = pd.read_csv('data/wsj.2-21.txt.dep.pp.heads.words',header=None)[nheads[0]==2]

In [58]:
preps = pd.read_csv('data/wsj.2-21.txt.dep.pp.preps.words',header=None)[nheads[0]==2]

In [59]:
labels = pd.read_csv('data/wsj.2-21.txt.dep.pp.labels',header=None)[nheads[0]==2]

In [60]:
pos = pd.read_csv('data/wsj.2-21.txt.dep.pp.heads.pos',header=None)[nheads[0]==2]

In [61]:
nheadslist = nheads[0].tolist()

In [62]:
headcandposlist=[tuple(hh.split("\t")) for hh in pos[0].tolist()]

In [63]:
vnlist = [i for i,hc in enumerate(headcandposlist) if hc[0]>hc[1]] # verb-noun candidates

In [64]:
headslist = pd.read_csv('data/wsj.2-21.txt.dep.pp.heads.words',header=None)[nheads[0]==2][0].tolist()

In [65]:
vnheads = [headslist[i] for i in vnlist]

In [66]:
vnverbs = [vn.split()[0] for vn in vnheads]
vnnouns = [vn.split()[1] for vn in vnheads]

In [67]:
prepslist = preps[0].tolist()
vnpreps = [prepslist[i] for i in vnlist]

In [68]:
labelslist = labels[0].tolist()
relabel = lambda x : 'V' if int(x)==1 else 'N'
vnlabels = [relabel(labelslist[i]) for i in vnlist]

In [69]:
children = pd.read_csv('data/wsj.2-21.txt.dep.pp.children.words',header=None)[nheads[0]==2][0].tolist()
vnchildren = [children[i] for i in vnlist]

In [70]:
wsj_labeled_tuples = [x for x in zip(vnverbs,vnnouns,vnpreps,vnchildren,vnlabels)]

In [71]:
wsj_labeled_tuples[11]

('be', 'charge', 'of', 'research', 'N')

In [72]:
[t for t in wsj_labeled_tuples if t[0]=='be' and t[2]=='in' and t[4]=='N']

[('be', 'volume', 'in', 'stocks', 'N'), ('be', 'wind', 'in', 'sails', 'N')]

In [73]:
labeled_wsj_bigrams = [(t[0],t[2],t[4]=='V') for t in wsj_labeled_tuples] + [(t[1],t[2],t[4]=='N') for t in wsj_labeled_tuples]

In [74]:
wsj_bigrams = [(t[0],t[2]) for t in wsj_labeled_tuples] + [(t[1],t[2]) for t in wsj_labeled_tuples]

In [75]:
c2=Counter(wsj_bigrams).most_common(50)

In [76]:
for ctr in c:
    matches = [lb[2] for lb in labeled_wsj_bigrams if (lb[0],lb[1])==(ctr[0][0],ctr[0][1])]
    if len(matches)==0:
        print("Not found: {}".format((ctr[0][0],ctr[0][1])))
    num_pos = len([m for m in matches if m])
    proportion = float(num_pos/len(matches))
    if proportion>0.1 and proportion<0.9:
        print("{}: {}/{}={:.3f}".format(ctr,num_pos,len(matches),proportion))

(('be', 'in'), 60): 2/4=0.500
Not found: ('contain', 'of')


ZeroDivisionError: division by zero

Problem: this dataset does not use lemma, but that's what I've been doing. 
I can either get the lemma for the new dataset or go back and get the words for my dataset.  
Since the former may introduce another source of error, I'll try the latter. 