# Resolving Ambiguity in Prepositional Phrase Attachment

Resolving ambiguity in prepositional phrase attachment remains a largely unsolved problem in NLP, and one with which pre-trained language models such as BERT are unlikely to be of much help. This notebook shows the results of predicting prepositional phrase attachments across an annotated subset of the NLVR2 dataset, leveraging a pre-trained language model commonly known as "BERT" (Devlin et al., 2019).

We trained an SVM classifier on the output (hidden layers) of the large uncased BERT model with whole-word masking. The results are presented in terms of Cohen's kappa score and F1 score.

In [1]:
from IPython.display import Image

# Preliminary Steps

In [2]:
# conda create -n python=3.7 ...
# pip install transformers... 

In [3]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn import svm
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import cohen_kappa_score as kappa
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from collections import Counter
from operator import itemgetter

In [4]:
from generator import HuggingFaceGenerator, MaskedPrepGenerator, SpacyModel

In [5]:
np.random.seed(91768)

## Load Dataset (train/test)

In [6]:
# Directory the cached feature CSVs (hf_train.csv / hf_test.csv) are written to.
outputdir = "."
# Directory holding the annotated splits (ppa_train.json / ppa_test.json).
datadir = "data"

In [7]:
# Load the annotated train/test splits and pull out each instance's gold label.
# Context managers close the JSON file handles deterministically; a bare
# json.load(open(...)) leaves them open until garbage collection.
with open('{}/ppa_train.json'.format(datadir)) as train_file:
    train_data = json.load(train_file)
labels_train = [instance['label'] for instance in train_data]

with open('{}/ppa_test.json'.format(datadir)) as test_file:
    test_data = json.load(test_file)
labels_test = [instance['label'] for instance in test_data]

## Using BERT Language Model
We load a pre-trained BERT model and use it to generate the instances used for model training.

In [8]:
# Checkpoint name handed to HuggingFaceGenerator (large uncased BERT, whole-word masking).
bert_model_name = "bert-large-uncased-whole-word-masking"
# Generator used below to turn the train/test instances into feature rows.
hf_generator = HuggingFaceGenerator(bert_model_name)

## Transform Dataset (or reload)

In [9]:
# Cache locations for the generated train/test feature matrices.
train_feature_file = f"{outputdir}/hf_train.csv"
test_feature_file = f"{outputdir}/hf_test.csv"

In [10]:
# Reuse the cached training features when present; otherwise compute and cache them.
# Both branches leave hf_train as a DataFrame, so later cells that call
# hf_train.to_numpy() behave the same on a cold run as on a cached one.
if os.path.exists(train_feature_file):
    hf_train = pd.read_csv(train_feature_file, header=None)
else:
    hf_train = pd.DataFrame(hf_generator.generate_dataset(train_data))
    hf_train.to_csv(train_feature_file, header=False, index=False)

In [11]:
# Reuse the cached test features when present; otherwise compute and cache them.
# Both branches leave hf_test as a DataFrame, so later cells that call
# hf_test.to_numpy() behave the same on a cold run as on a cached one.
if os.path.exists(test_feature_file):
    hf_test = pd.read_csv(test_feature_file, header=None)
else:
    hf_test = pd.DataFrame(hf_generator.generate_dataset(test_data))
    hf_test.to_csv(test_feature_file, header=False, index=False)

# Model Training

In [12]:
# SVM over the BERT features using all annotated labels (N / O / V).
# The kernel is left at SVC's default; the fitted estimator's repr shows kernel='rbf'.
clfhf = svm.SVC(gamma=0.0001, C=100., random_state=91768)
clfhf.fit(hf_train, labels_train)

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
preds_test_hf = clfhf.predict(hf_test)

In [14]:
print(sklearn.metrics.classification_report(labels_test, preds_test_hf, digits=3))

              precision    recall  f1-score   support

           N      0.875     0.946     0.909       111
           O      0.667     0.400     0.500        10
           V      0.742     0.639     0.687        36

    accuracy                          0.841       157
   macro avg      0.761     0.662     0.699       157
weighted avg      0.831     0.841     0.832       157



### without 'O'(ther) classification

In [15]:
nvi = [i for i,lbl in enumerate(labels_train) if lbl in ['N','V']]

In [16]:
labels_train_NV = [lbl for lbl in labels_train if lbl in ['N','V']]

In [17]:
# Keep only the training feature rows at the positions in nvi (labels 'N' or 'V').
# NOTE: .to_numpy() requires hf_train to be a DataFrame, which is what the
# pd.read_csv cache branch produces.
hf_train_NV = hf_train.to_numpy()[nvi]
#hf_generator.generate_dataset([td for td,lbl in zip(train_data,labels_train) if lbl in ['N','V']])

In [18]:
hf_train_NV.shape

(435, 16384)

In [19]:
nvi_test = [i for i,lbl in enumerate(labels_test) if lbl in ['N','V']]

In [20]:
# Keep only the test feature rows at the positions in nvi_test (labels 'N' or 'V').
# NOTE: .to_numpy() requires hf_test to be a DataFrame, which is what the
# pd.read_csv cache branch produces.
hf_test_NV = hf_test.to_numpy()[nvi_test]
#hf_generator.generate_dataset([td for td,lbl in zip(test_data,labels_test) if lbl in ['N','V']])

In [21]:
labels_test_NV = [lbl for lbl in labels_test if lbl in ['N','V']]

In [22]:
# Same SVM settings as clfhf, trained only on the rows labeled 'N' or 'V'.
clfhf2 = svm.SVC(gamma=0.0001, C=100., random_state=91768)
clfhf2.fit(hf_train_NV, labels_train_NV)

SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
preds_test_hf2 = clfhf2.predict(hf_test_NV)

In [24]:
confusion_matrix(labels_test, preds_test_hf)

array([[105,   0,   6],
       [  4,   4,   2],
       [ 11,   2,  23]])

In [25]:
confusion_matrix(labels_test_NV, preds_test_hf2)

array([[104,   7],
       [ 12,  24]])

In [26]:
print(sklearn.metrics.classification_report(labels_test_NV, preds_test_hf2, digits=3))

              precision    recall  f1-score   support

           N      0.897     0.937     0.916       111
           V      0.774     0.667     0.716        36

    accuracy                          0.871       147
   macro avg      0.835     0.802     0.816       147
weighted avg      0.867     0.871     0.867       147



## Error Analysis

In [27]:
def get_4tpl(x):
    """Return the (V, N, P, N2) lemma 4-tuple for one annotated instance.

    Args:
        x: mapping with 'V', 'N', 'P' and 'N2' entries, each of which is itself
            a mapping containing a 'lemma' key.

    Returns:
        tuple: the four lemmas in the order (V, N, P, N2).

    Raises:
        KeyError: if any of the four slots, or its 'lemma' entry, is missing.
    """
    return tuple(x[slot]['lemma'] for slot in ('V', 'N', 'P', 'N2'))

In [28]:
spacy_model = SpacyModel()

In [29]:
spacy_preds_train = [spacy_model.predict(t) for t in train_data]

In [30]:
# Lemma 4-tuples (V, N, P, N2) for every train and test instance.
train_tuples = list(map(get_4tpl, train_data))
test_tuples = list(map(get_4tpl, test_data))

In [31]:
# (test index, gold label, predicted label) for every test instance the 3-class SVM got wrong.
errors = [
    (idx, gold, predicted)
    for idx, (gold, predicted) in enumerate(zip(labels_test, preds_test_hf))
    if gold != predicted
]

In [32]:
def get_err_info(err, data):
    """Expand one misclassification into a flat row for the error table.

    Args:
        err: 3-tuple (i, lbl, pred) holding the instance index, the gold label
            and the predicted label.
        data: indexable collection of instances; data[i] must carry the fields
            read by get_4tpl plus a 'sentence_text' entry.

    Returns:
        tuple: (i, <the four lemmas from get_4tpl>, lbl, pred, sentence_text).
    """
    idx, gold, predicted = err
    instance = data[idx]
    lemmas = tuple(get_4tpl(instance))
    return (idx,) + lemmas + (gold, predicted, instance['sentence_text'])

In [33]:
# Build the error-table rows, leaving out (for now) every error whose gold
# label or prediction is 'O' (other).
error_tuples = [
    get_err_info(err, test_data)
    for err in errors
    if 'O' not in err
]

In [34]:
df = pd.DataFrame(error_tuples, columns=['i','V','N1','P','N2','label','pred','sentence_text'])

In [35]:
df

Unnamed: 0,i,V,N1,P,N2,label,pred,sentence_text
0,0,be,dog,in,pair,V,N,There are three chow dogs in the image pair.
1,20,wear,ear,on,head,N,V,A girl in long one piece pajamas is wearing mo...
2,28,feature,shape,above,shape,V,N,Each dispenser has a circle shape and an upsid...
3,57,be,dog,in,image,N,V,There is exactly one dog in the right image.
4,64,be,bottle,with,lid,N,V,There is one bottle with a lid and one bottle ...
5,67,have,door,in,section,N,V,Two tall narrow cabinets have at least three u...
6,72,put,leg,on,fence,V,N,putting their right leg high up on a fence.
7,75,have,toy,in,front,V,N,At least one of the dogs has a small toy in fr...
8,78,include,wand,to,right,V,N,The combined images include an uncapped lipsti...
9,83,be,awning,over,machine,V,N,There is an awning over the machines in one of...


In [36]:
# Not sure:
# There are at least 3 deer in a tree...
# There is a dog on a green rug. 
# also not sure what to do about 'next to' => compound

In [37]:
# Some instances seem to be labeled incorrectly
wrong_label_indices = [1,3,12,15]

In [38]:
# Start from a copy of the gold test labels, then relabel the flagged error rows as 'V'.
# Column 0 of df holds the test-set index ('i') of each error row.
# NOTE(review): df as displayed in this notebook has 10 rows (positions 0-9),
# while wrong_label_indices contains 12 and 15 -- confirm the intended positions.
new_test_labels = list(labels_test)
mislabeled_test_indices = df.to_numpy()[wrong_label_indices, 0]
for test_idx in mislabeled_test_indices:
    new_test_labels[test_idx] = 'V'

## Analyze train/dev

In [39]:
spacy_preds = [spacy_model.predict(t) for t in test_data]

In [111]:
classifiers = ['all_noun','spacy','bert','spacy+pvc']

In [112]:
# One prediction list per entry of `classifiers`, in the same order.
# NOTE(review): corrected_preds is only assigned in a later cell of this notebook
# (the loop that overrides spaCy predictions), so this cell fails on a fresh
# top-to-bottom run.
test_preds = [
    ['N'] * len(labels_test),   # all-noun baseline
    spacy_preds,
    preds_test_hf,
    corrected_preds,
]

In [42]:
# Per-classifier classification reports (as dicts) scored against the original test labels.
# NOTE(review): the next cell repeats this exact statement; one of the two is redundant.
class_reports = {cl: sklearn.metrics.classification_report(labels_test, preds, digits=3, output_dict=True, zero_division=0) 
                 for (cl,preds) in zip(classifiers, test_preds)}

In [113]:
# Per-classifier classification reports (as dicts) scored against the original test labels.
# zero_division=0 reports 0 for metrics that would otherwise divide by zero.
class_reports = {cl: sklearn.metrics.classification_report(labels_test, preds, digits=3, output_dict=True, zero_division=0) 
                 for (cl,preds) in zip(classifiers, test_preds)}

In [114]:
pd.DataFrame([[c, class_reports[c]['weighted avg']['f1-score']] for c in classifiers], columns=['classifier','weighted avg f1-score'])

Unnamed: 0,classifier,weighted avg f1-score
0,all_noun,0.585655
1,spacy,0.722477
2,bert,0.83201
3,spacy+pvc,0.761087


In [115]:
# Same reports, but scored against the manually corrected test labels.
# This rebinds class_reports, replacing the reports computed against labels_test.
# NOTE(review): corrected_labels is only assigned in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run.
class_reports = {cl: sklearn.metrics.classification_report(corrected_labels, preds, digits=3, output_dict=True, zero_division=0) 
                 for (cl,preds) in zip(classifiers, test_preds)}

In [116]:
pd.DataFrame([[c, class_reports[c]['weighted avg']['f1-score']] for c in classifiers], columns=['classifier','weighted avg f1-score'])

Unnamed: 0,classifier,weighted avg f1-score
0,all_noun,0.440713
1,spacy,0.594235
2,bert,0.7509
3,spacy+pvc,0.873773


In [44]:
def get_subtuples(tpl):
    """List the sub-tuples of a (V, N1, P, N2) 4-tuple, from largest to smallest.

    The result holds, in order: the full 4-tuple, three triples, three pairs
    and four singletons. Every triple and pair keeps the preposition
    (presumably following the backed-off PP-attachment scheme of Collins &
    Brooks -- confirm against the intended use).

    Args:
        tpl: a 4-element tuple (V, N1, P, N2).

    Returns:
        list: [tpl,
               (V, N1, P), (V, P, N2), (N1, P, N2),
               (V, P), (N1, P), (P, N2),
               (V,), (N1,), (P,), (N2,)]
    """
    triples = [tpl[:3], (tpl[0], *tpl[2:]), tpl[1:]]
    # Fixed: the middle pair was tpl[1:2], which is the 1-tuple (N1,) rather
    # than the (N1, P) pair.
    doubles = [(tpl[0], tpl[2]), tpl[1:3], tpl[2:]]
    singles = [(item,) for item in tpl]
    return [tpl] + triples + doubles + singles

In [45]:
# Frequency table of verb lemmas in the training tuples, most frequent first.
verb_counts = Counter(verb for (verb, n1, p, n2) in train_tuples)
verbs_df = (
    pd.DataFrame.from_dict(verb_counts, orient='index')
    .reset_index()
    .rename(columns={"index": "verb", 0: "count"})
    .sort_values(by='count', ascending=False, ignore_index=True)
)

In [46]:
verbs_df

Unnamed: 0,verb,count
0,be,117
1,show,91
2,contain,72
3,feature,32
4,have,29
...,...,...
63,plow,1
64,cap,1
65,make,1
66,bear,1


In [47]:
presentation_verb_lemmas = ['include','feature','show','be',
                            'wear','cover','contain','have']

In [48]:
# Boolean mask over verbs_df rows: True where the verb lemma is a presentation verb.
# Series.isin is the vectorized equivalent of mapping a membership lambda.
presentation_verb_test = verbs_df['verb'].isin(presentation_verb_lemmas)

In [49]:
# Boolean mask over verbs_df rows: True where the verb lemma is NOT a presentation verb.
# Series.isin is the vectorized equivalent of mapping a membership lambda.
nonpresentation_verb_test = ~verbs_df['verb'].isin(presentation_verb_lemmas)

In [50]:
# Presentation verbs or not? 
s="hold, face, see, match, depict, look, view, stare, angle, expose, draw, receive, frame, accompany, wave"
pres_candidates = s.split(", ")

What proportion of the training data features presentation verbs?

In [51]:
verbs_df[presentation_verb_test].sum(axis=0,numeric_only=True)/len(train_tuples)

count    0.800847
dtype: float64

The presentation verbs identified so far account for 80% of the training data.
Another 8% or so might or might not belong to this category.
The rest (12-20% of the data) are not in this category and would thus attach to the noun.


In [58]:
# Training tuples whose verb lemma is a presentation verb, paired with their gold label.
pres_verb_rows = [
    list(tpl) + [gold]
    for tpl, gold in zip(train_tuples, labels_train)
    if tpl[0] in presentation_verb_lemmas
]
pres_verb_df = pd.DataFrame(
    pres_verb_rows,
    columns=["verb", "noun", "prep", "pobj", "label"],
)

In [59]:
Counter(pres_verb_df.label.tolist())

Counter({'N': 277, 'V': 82, 'O': 19})

In [60]:
# Per-verb counts among the presentation-verb rows, most frequent first.
(
    pd.DataFrame.from_dict(Counter(pres_verb_df['verb'].tolist()), orient="index")
    .reset_index()
    .rename(columns={"index": "verb", 0: "count"})
    .sort_values(by='count', ascending=False, ignore_index=True)
)


Unnamed: 0,verb,count
0,be,117
1,show,91
2,contain,72
3,feature,32
4,have,29
5,include,18
6,wear,17
7,cover,2


I wanted to review all the cases with presentation verbs to see which ones describe a state that changes over time. I flagged each case with "Y", "N", or "?" accordingly ("?" indicates uncertainty or the need to see the rest of the sentence).

In [61]:
# Export the presentation-verb rows for manual annotation.
# The path is built from datadir, like the other data-loading cells, instead of
# repeating the 'data' directory name.
pres_verb_df.to_csv('{}/presentation_verbs.csv'.format(datadir), index=False)

In [62]:
# Load the annotated presentation-verb table; later cells read its
# 'mod_state_mutable' column.
# The path is built from datadir, like the other data-loading cells.
# NOTE: this rebinds df, which earlier held the error-analysis table.
df = pd.read_csv('{}/presentation_verbs_with_state.csv'.format(datadir))

In [63]:
# Want PP+POBJ for all cases with mod_state_mutable=Y
matches=list(df[df['mod_state_mutable']=='Y'].itertuples(index=False,name=None))
pp_matches = set([(x[2],x[3]) for x in matches])


Using this as a starting point, we can correct some of the parses from spaCy. 

In [69]:
# Override spaCy's prediction with 'V' whenever the (prep, pobj) pair of a test
# tuple is in pp_matches; otherwise keep spaCy's prediction.
# Tuples forced to 'V' whose (patched) gold label is not 'V' are printed and
# collected in ann_vs_predV for review.
corrected_preds = []
ann_vs_predV = []
for test_tuple, gold, spacy_pred in zip(test_tuples, new_test_labels, spacy_preds):
    if (test_tuple[2], test_tuple[3]) not in pp_matches:
        corrected_preds.append(spacy_pred)
        continue

    if gold != 'V':
        print("E:", test_tuple)
        ann_vs_predV.append(test_tuple)
    corrected_preds.append('V')

E: ('form', 'screen', 'in', 'front')
E: ('hold', 'dog', 'of', 'image')
E: ('leave', 'pig', 'in', 'image')
E: ('show', 'hound', 'on', 'grass')
E: ('have', 'hole', 'in', 'front')
E: ('be', 'pillow', 'of', 'image')
E: ('show', 'cover', 'on', 'side')
E: ('feature', 'oval', 'on', 'image')
E: ('view', 'head', 'on', 'left')
E: ('surround', 'table', 'in', 'image')
E: ('flesh', 'fruit', 'on', 'top')
E: ('contain', 'gorilla', 'in', 'image')
E: ('wear', 'pack', 'in', 'image')
E: ('feature', 'dog', 'on', 'left')
E: ('mash', 'potato', 'in', 'bowl')
E: ('be', 'dog', 'on', 'right')
E: ('show', 'person', 'in', 'sleeve')
E: ('show', 'skunk', 'in', 'profile')
E: ('leave', 'display', 'in', 'image')
E: ('be', 'spoon', 'on', 'top')
E: ('show', 'buffalo', 'in', 'water')


In [70]:
err_df = pd.DataFrame(ann_vs_predV,columns=['verb','noun','prep','pobj'])

In [None]:
# Make a table of this, show it... 
# Flag cases I'm still unsure about

In [73]:
# wrong === lbl=N/O, pred=V, but lbl should be V
wrong = [0,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
flagged = []

In [74]:
err_df['wrong'] = wrong

In [109]:
err_df[err_df['wrong']==0]

Unnamed: 0,verb,noun,prep,pobj,wrong
0,form,screen,in,front,0
1,hold,dog,of,image,0
4,have,hole,in,front,0
5,be,pillow,of,image,0


In [92]:
wrong_labels_test=err_df[err_df['wrong']==1][['verb','noun','prep','pobj']]

In [99]:
wrongly_labeled_tpls = list(wrong_labels_test.to_records(index=False))

In [107]:
# Apply the manual label corrections: a test tuple listed in wrongly_labeled_tpls
# gets gold label 'V'; every other instance keeps its label from new_test_labels.
#
# A set lookup emits exactly one label per test instance. The earlier nested loop
# used `continue` where `break` was needed, so a tuple appearing twice in
# wrongly_labeled_tpls would have appended 'V' twice and shifted corrected_labels
# out of alignment with the predictions.
wrongly_labeled_set = {tuple(wtpl) for wtpl in wrongly_labeled_tpls}
corrected_labels = [
    'V' if tuple(tpl) in wrongly_labeled_set else lbl
    for tpl, lbl in zip(test_tuples, new_test_labels)
]

In [110]:
print(sklearn.metrics.classification_report(labels_test, corrected_preds, digits=3))

              precision    recall  f1-score   support

           N      0.916     0.784     0.845       111
           O      0.375     0.300     0.333        10
           V      0.519     0.778     0.622        36

    accuracy                          0.752       157
   macro avg      0.603     0.621     0.600       157
weighted avg      0.790     0.752     0.761       157



In [None]:
# Disappointing, but I think error analysis will show
# that the annotations are wrong, or at least questionable
# ...show test cases where annotation and correction disagree

In [108]:
print(sklearn.metrics.classification_report(corrected_labels, corrected_preds, digits=3))

              precision    recall  f1-score   support

           N      0.905     0.925     0.915        93
           O      0.375     0.429     0.400         7
           V      0.889     0.842     0.865        57

    accuracy                          0.873       157
   macro avg      0.723     0.732     0.727       157
weighted avg      0.876     0.873     0.874       157

