# Resolving Ambiguity in Prepositional Phrase Attachment

This notebook shows results of predicting prepositional phrase attachments across a subset of the NLVR2 dataset which has been annotated. 

The first group of models are trained from the output the large uncased model from BERT with whole word masking. 
This model was subsequently converted to PyTorch/HuggingFace via command-line. 


In [1]:
from IPython.display import Image

Blah blah blah about prepositional phrase attachments... 

Blah blah blah some interesting examples. 

Blah blah blah about NLVR2 paper and dataset

Some stuff about this dataset and how it was collected
and how it was annotated

What this notebook shows... 

(my sig)

Prelims
Imports
outline/toc
Background


## Preliminary Steps

In [2]:
# conda create -n python=3.7 ...
# pip install transformers... 

In [3]:
import sys
import os
import json
import numpy as np
import sklearn
import torch
import spacy
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import cohen_kappa_score as kappa
from itertools import groupby

from sklearn import svm
from collections import Counter

#sys.path.append('/bridge/science/AI/nlp/bert')
#from notebook_source import generate_huggingface_instances

from sklearn.neural_network import MLPClassifier


In [4]:
from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM
from sklearn.neural_network import MLPClassifier

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
np.random.seed(91768)


In [6]:
datadir = "/bridge/data/compositional_semantics"

In [7]:
train_data = json.load(open('{}/ppa_train.json'.format(datadir)))
#train_data = json.load(open(os.path.join([datadir,'ppa-hugging-face-train.json'])))
#test_data = json.load(open(os.path.join([datadir,'ppa-hugging-face-test.json'])))
test_data = json.load(open('{}/ppa_test.json'.format(datadir)))


In [8]:
def generate_huggingface_instances(model,tokenizer,
                                   dataset,
                                   orig_tokenizer=None,
                                   max_length=128,
                                   pad_to_max_length=True,
                                   use_cuda=False):
    sents_all = [instance['sentence_text'] for instance in dataset]
    if use_cuda:
        model.to('cuda')
    retuple = lambda word_attr : (word_attr['text'],
                                  word_attr['source'],
                                  word_attr['pos_tag'],
                                  word_attr['lemma'],
                                  word_attr['trail_space'])
    for instance in dataset:
        annotated4tpl = (retuple(instance['V']),
                         retuple(instance['N']),
                         retuple(instance['P']),
                         retuple(instance['N2']))
        sent = instance['sentence_text']
        label = instance['label']
        if 'tokenized_sentence' in instance:
            orig_tokens = [t[0] for t in instance['tokenized_sentence']]
        elif orig_tokenizer is not None:
            orig_tokens = [t.text for t in orig_tokenizer(sent)]
        else:  
            orig_tokens = sent
        bert_tokens = ["[CLS]"]
        orig_token_indexes = [int(tpl[1].split('.')[-1])-1 for tpl in annotated4tpl]
        orig_4tpl_tokens = [orig_tokens[i] for i in orig_token_indexes]
        orig_bert_token_indexes = []
        word_pieces_array = []
        for orig_token in orig_tokens:
            #orig_to_tok_map.append(len(bert_tokens))
            orig_bert_token_indexes.append(len(bert_tokens))
            word_pieces = tokenizer.tokenize(orig_token)
            word_pieces_array.append(word_pieces)
            bert_tokens.extend(word_pieces)
        bert_tokens.append("[SEP]")
        indexed_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)
        tokens_tensor = tokenizer.encode(indexed_tokens,
                                         max_length=max_length,
                                         pad_to_max_length=pad_to_max_length,
                                         return_tensors='pt')
        if use_cuda:
            tokens_tensor = tokens_tensor.to('cuda')
        x = []
        y = []
        with torch.no_grad():
            # When output_hidden_states = True,
            # the hidden states are output in the third value
            # in the tuple returned from the model.
            # That value is itself a tuple of the embedding matrix and
            # hidden layers, 1-N (where N is the number of hidden layers)
            # We want the last 4 layers of 24, which will be found in
            # elements 21-24 of the second return tuple (embedding matrix is
            # element 0).
            hidden_layers = model(tokens_tensor)[2][21:25]
            for orig_token_idx in orig_token_indexes:
                word_pieces = word_pieces_array[orig_token_idx]
                num_word_pieces = len(word_pieces)
                # If token >1 piece, use layers from word pieces (4 total)
                # 4th-from-top layer from first piece...
                # top layer from 4th (or last) piece

                layeridx = 3
                wpi = 0
                token_layers_values = []        
                orig_bert_token_index = orig_bert_token_indexes[orig_token_idx]
                while layeridx>=0:
                    #tli = 0
                    tli=layeridx
                    #tli = 3-layeridx
                    token_layer_values = hidden_layers[tli][0,orig_bert_token_index+wpi]
                    #token_layers_values.extend([token_layer_values])
                    #y.extend(token_layer_values)
                    x.extend(token_layer_values)
                    layeridx-=1
                    if wpi<(num_word_pieces-1):
                        wpi+=1
                #flattened_layers = [xi for layer in token_layers_values for xi in layer]
                #x.extend(flattened_layers)
        #assert np.array_equal(x,y)
        #print("{} => {}".format(orig_4tpl_tokens,orig_4tpl_pieces))
        yield x

In [9]:
labels_train = [instance['label'] for instance in train_data]
labels_test = [instance['label'] for instance in test_data]

In [10]:
bert_config = BertConfig.from_pretrained("bert-large-uncased-whole-word-masking")
bert_config.output_hidden_states=True

bert_model = BertModel.from_pretrained("bert-large-uncased-whole-word-masking",config=bert_config)
bert_model.eval()

bert_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking',config=bert_config)

In [11]:
train = np.array([x for x in generate_huggingface_instances(
    bert_model,bert_tokenizer,train_data,use_cuda=True)])
test = np.array([x for x in generate_huggingface_instances(
    bert_model,bert_tokenizer,test_data,use_cuda=True)])

In [12]:
clfhf = svm.SVC(gamma=0.0001, C=100., random_state=91768)

clfhf.fit(train, labels_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
labels_test_hf = clfhf.predict(test)
kappa(labels_test, labels_test_hf)

0.6134147542598247

In [14]:
f1_score(labels_test, labels_test_hf, labels=['N','V','O'], average=None)

array([0.90909091, 0.68656716, 0.5       ])