This notebook shows results of predicting prepositional phrase attachments across a subset of the NLVR2 dataset which has been annotated. 

The first group of models is trained on the output of the large uncased BERT model with whole word masking. 
This model was subsequently converted to PyTorch/HuggingFace via the command line. 


In [1]:
import sys
import os
import json
import numpy as np
import sklearn
import torch
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score as kappa
from itertools import groupby

from sklearn import svm
from collections import Counter

sys.path.append('/bridge/science/AI/nlp/bert')
from notebook_source import load_text_file, load_xml_files, generate_tuples
from notebook_source import load_folia_xml
from notebook_source import find_sentence_from_file, find_sentence_from_word_id
from notebook_source import generate_annotated4tpls, generate_sentences_from_4tpls
from notebook_source import generate_google_instances, generate_huggingface_instances
#import tokenization

from transformers import BertConfig, BertTokenizer, BertModel, BertForMaskedLM
from sklearn.neural_network import MLPClassifier


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Seed NumPy's global RNG so that the np.random.shuffle call used for the
# train/test split further down gives the same split on every fresh run.
np.random.seed(91768)


In [3]:
# Directory read by load_folia_xml to obtain the annotated 4-tuples (source of `labels`).
anndir = "/bridge/data/compositional_semantics/folia/jblackmore/done"
# Directory read by load_folia_xml to obtain the tuples/dependencies stored in `sdeps`
# (presumably spaCy parser output, judging by the variable names -- TODO confirm).
spacydir = "/bridge/data/compositional_semantics/folia/dev"

In [4]:
# load_folia_xml returns a pair: `sents` is passed on to the sentence/feature helpers below,
# and `generator` is a callable that yields (4-tuple, dependency) pairs when invoked.
sents, generator = load_folia_xml(anndir)

In [5]:
# Materialise the annotation stream once, then derive both structures from it:
#   annotated4tpls -- every 4-tuple, in the order the generator yields them
#   tdeps          -- preposition entry (element 2 of the 4-tuple) -> annotated dependency
# If a preposition entry repeats, the dependency yielded last wins.
annotated_pairs = list(generator())
annotated4tpls = [tpl for tpl, _ in annotated_pairs]
tdeps = {tpl[2]: head for tpl, head in annotated_pairs}

In [6]:
len(tdeps)

631

In [7]:
# Same extraction as for the annotated files, this time for the files under `spacydir`:
#   spacy4tpls -- every 4-tuple, in generator order
#   sdeps      -- preposition entry (element 2 of the 4-tuple) -> dependency from these files
spacy_sents, spacy_gen = load_folia_xml(spacydir)
spacy_pairs = list(spacy_gen())
spacy4tpls = [tpl for tpl, _ in spacy_pairs]
sdeps = {tpl[2]: head for tpl, head in spacy_pairs}

In [8]:
len(sdeps)

930

In [9]:
len(annotated4tpls)

631

In [10]:
len(spacy4tpls)

930

In [11]:
# Positions in annotated4tpls whose preposition entry (element 2) has no entry in sdeps.
# These positions are also handed to generate_google_instances as omit_indexes later on.
missing_eyes = [i for i,a4tpl in enumerate(annotated4tpls) if a4tpl[2] not in sdeps]

In [12]:
annotated4tpls[missing_eyes[0]]

(('is', 'nlvr2_dev_012.text.s.35.w.2', 'VBZ', 'be', True),
 ('doberman', 'nlvr2_dev_012.text.s.35.w.5', 'NNP', 'doberman', True),
 ('with', 'nlvr2_dev_012.text.s.35.w.6', 'IN', 'with', True),
 ('cut', 'nlvr2_dev_012.text.s.35.w.9', 'NN', 'cut', False))

In [13]:
# Keep only the annotated 4-tuples whose preposition entry (element 2) also occurs in sdeps.
# Testing membership directly selects exactly the tuples that the positions in
# missing_eyes would have kept, but stays correct if this cell is executed again:
# positions computed against the unfiltered list would point at different, valid
# tuples once the list has been shortened. It also avoids scanning the
# missing_eyes list once per tuple.
annotated4tpls = [a4tpl for a4tpl in annotated4tpls if a4tpl[2] in sdeps]

In [14]:
tdeps[annotated4tpls[21][2]]

('flower', 'nlvr2_dev_002.text.s.24.w.6', 'NN', 'flower', True)

In [15]:
def prep_attachment_class(tpl, deps):
    """Classify what the preposition of a 4-tuple is attached to.

    Parameters
    ----------
    tpl : sequence
        4-tuple whose elements 0, 1 and 2 are the verb, noun and preposition
        entries respectively (element 3 is not consulted).
    deps : mapping
        Maps a preposition entry to the entry it depends on.

    Returns
    -------
    str
        '!' if the preposition entry is not a key of ``deps``;
        'V' if its dependency equals ``tpl[0]``;
        'N' if its dependency equals ``tpl[1]``;
        'O' for any other dependency.
        The verb is tested before the noun, so 'V' wins if both compare equal.
    """
    prep = tpl[2]
    if prep not in deps:
        return '!'
    head = deps[prep]
    if head == tpl[0]:
        return 'V'
    if head == tpl[1]:
        return 'N'
    return 'O'

In [16]:
# Attachment class of every retained 4-tuple according to the annotated dependencies (tdeps).
labels = [prep_attachment_class(tpl,tdeps) for tpl in annotated4tpls]

In [17]:
Counter(labels)

Counter({'N': 442, 'V': 140, 'O': 47})

In [18]:
# Attachment class of the same 4-tuples according to sdeps. The '!' class cannot occur here
# because tuples whose preposition entry is missing from sdeps were filtered out above.
spacy_preds = [prep_attachment_class(tpl,sdeps) for tpl in annotated4tpls]

In [19]:
kappa(spacy_preds, labels)

0.275600163537006

In [20]:
Counter(spacy_preds)

Counter({'V': 94, 'N': 465, 'O': 70})

In [21]:
import pandas as pd

# Label both axes so the table can be read on its own: confusion_matrix puts the
# first argument (the annotated labels) on the rows and the second argument (the
# classes derived from sdeps) on the columns, in the order given by `labels=`.
attachment_classes = ['N', 'V', 'O']
pd.DataFrame(
    confusion_matrix(labels, spacy_preds, labels=attachment_classes),
    index=pd.Index(attachment_classes, name='annotated'),
    columns=pd.Index(attachment_classes, name='spacy'),
)

Unnamed: 0,0,1,2
0,361,50,31
1,95,37,8
2,9,7,31


In [22]:
# Directory holding the sentence file and the extracted-feature JSONL exchanged with BERT.
bert_datadir = '/bridge/science/AI/nlp/data/compositional_semantics/BERT'
# Name of the BERT checkpoint used for feature extraction.
bert_model = 'wwm_uncased_L-24_H-1024_A-16'
# Checkpoint directory, derived from bert_model so the two values cannot drift apart.
bert_basedir = os.path.join("/bridge/science/AI/nlp/corpora/BERT", bert_model)

In [23]:
#sents_all = list(generate_sentences_from_4tpls(annotated4tpls,sents))

In [24]:
#from notebook_source import stext
#sents_all=[stext(find_sentence_from_word_id(t4tpl[0][1],sents)) for t4tpl in annotated4tpls]

In [25]:
#len(sents_all)

In [26]:
# Write all sentences with annotations to disk for further
# processing with BERT models. 

#with open(os.path.join(bert_datadir,'sents_all.txt'),'w') as allout:
#    for s in sents_all:
#        allout.write(s)
#        allout.write('\n')

In [27]:
# ... Wait for features from BERT ...
#export BERT_BASE_DIR=/bridge/science/AI/nlp/corpora/BERT/wwm_uncased_L-24_H-1024_A-16
#python extract_features.py --input_file=/bridge/science/AI/nlp/data/compositional_semantics/BERT/sents_all.txt --output_file=/bridge/science/AI/nlp/data/compositional_semantics/BERT/sents_all_wwmu_output.jsonl --vocab_file=$BERT_BASE_DIR/vocab.txt --bert_config_file=$BERT_BASE_DIR/bert_config.json --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt --layers=-1,-2,-3,-4 --max_seq_length=128 --batch_size=8


In [28]:
#X = np.array(X_list)
# Hold out a quarter of the instances (rounded down) for testing.
# The shuffle draws from NumPy's global RNG, seeded near the top of the notebook.
n_instances = len(annotated4tpls)
test_size = n_instances // 4
randidx = list(range(n_instances))
np.random.shuffle(randidx)

# First `test_size` shuffled positions form the test set, the remainder the training set.
testidx, trainidx = randidx[:test_size], randidx[test_size:]
labels_test = [labels[i] for i in testidx]
labels_train = [labels[i] for i in trainidx]


In [29]:
# Features written by the (external) extract_features.py run over sents_all.txt.
jsonl_file = os.path.join(bert_datadir, 'sents_all_wwmu_output.jsonl')
sents_all = list(generate_sentences_from_4tpls(annotated4tpls, sents))

# NOTE(review): annotated4tpls has already been filtered at this point, yet the
# positions in missing_eyes are still passed as omit_indexes -- presumably they
# refer to entries of the JSONL file; confirm against generate_google_instances.
X_goog = list(
    generate_google_instances(
        annotated4tpls,
        sents,
        jsonl_file,
        labels=labels,
        omit_indexes=missing_eyes,
    )
)




In [30]:
# Convert the list of instances to an array so it can be indexed with the
# train/test index lists in the next cell.
X_goog = np.array(X_goog)

In [31]:
# Select rows with the shuffled index lists so they line up with labels_train / labels_test.
train = X_goog[trainidx]
test = X_goog[testidx]


In [32]:
# SVM classifier on the Google-extracted BERT features; the kernel is left at its
# default (reported as 'rbf' in the repr below). random_state reuses the notebook seed.
clf = svm.SVC(gamma=0.0001, C=100., random_state=91768)

# fit() is the last expression of the cell, so the fitted estimator's repr is displayed.
clf.fit(train, labels_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [33]:
kappa(labels_test, clf.predict(test))

0.595881595881596

In [34]:
Counter(labels_test)

Counter({'N': 111, 'V': 36, 'O': 10})

We can convert the same BERT model to work with huggingface with a command-line based converter. 

In [35]:
# Shell commands used for the TensorFlow -> PyTorch conversion (run outside the notebook):
#export HF_MODEL_DIR="/bridge/science/AI/nlp/bert/huggingface"
#transformers-cli convert --model_type bert --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt --config $BERT_BASE_DIR/bert_config.json --pytorch_dump_output $HF_MODEL_DIR/pytorch_model.bin
# Directory and weights file that the conversion above writes to.
hf_model_dir="/bridge/science/AI/nlp/bert/huggingface"
hf_model_file=os.path.join(hf_model_dir,"pytorch_model.bin")

Now, we're going to load the same BERT model through the huggingface
transformers API. 

We need to stack the hidden states so that, for each of the 4 words, layers 4-3-2-1 appear in that order, each layer being read from one of that word's word pieces. 

Ex: <br>
Mary ate noodles with chopsticks. <br>
Mary ate noodles with curry. <br>

4-tuple (VNPN): ate, noodles, with, (chopsticks/curry)

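The BERT tokenizer may split a word into several word pieces, so the 4-tuple can come out as something like 
ate, noodl ##es, with, chop ##sticks / cur ##ry. 
For each of the 4 words we take 4 layers, reading each layer from one of up to 4 word pieces: the 
4th layer of the first piece, then the 
3rd layer of the second (or last) piece, ..., and finally the 
top layer of the fourth (or last) piece. 
So we'll have 16 piece-layers (4 words × 4 layers) for each attachment instance; with a hidden size of 1024 this gives the 16,384 features per instance seen in `XX.shape` below. 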
 

In [36]:
# Load the locally converted checkpoint through the transformers API.
# The commented-out lines are the equivalent calls by published model name, kept for reference.
#config = BertConfig.from_pretrained("bert-large-uncased-whole-word-masking")
xconfig = BertConfig.from_pretrained(os.path.join(hf_model_dir,"config.json"))
# Ask the model to return the hidden states of its layers, not only the final output;
# the feature extraction below combines several layers per word.
xconfig.output_hidden_states=True

#hfxmodel = BertModel.from_pretrained(config=xconfig)
hfxmodel = BertModel.from_pretrained(hf_model_file,config=xconfig)
# Inference only: put the module in evaluation mode.
hfxmodel.eval()

# The tokenizer (vocabulary) is loaded from the same directory as the weights.
hfxtokenizer = BertTokenizer.from_pretrained(hf_model_dir, config=xconfig)
#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
#hftokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking", config=config)

In [37]:
# One row per annotated 4-tuple, computed with the converted HuggingFace model.
XX = np.array(
    list(
        generate_huggingface_instances(
            hfxmodel, hfxtokenizer, annotated4tpls, sents, labels, use_cuda=True
        )
    )
)

In [38]:
XX.shape

(629, 16384)

In [39]:
# Reuse the index lists from the earlier split so all models share one train/test partition.
train_hf = XX[trainidx]
test_hf = XX[testidx]


In [40]:
# Same SVM hyper-parameters as for the Google-extracted features, so the scores are comparable.
clfhf = svm.SVC(gamma=0.0001, C=100., random_state=91768)

clfhf.fit(train_hf, labels_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [41]:
# NOTE(review): despite its name, labels_test_hf holds the classifier's *predictions*
# for the test rows; the reference labels are in labels_test.
labels_test_hf = clfhf.predict(test_hf)
# Cohen's kappa between the reference labels and the predictions.
kappa(labels_test, labels_test_hf)

0.6134147542598247

In [42]:
# NOTE(review): the two commented commands below are identical to the ones in the earlier
# conversion cell and write to $HF_MODEL_DIR, not to the pytorch-1600 directory used here;
# how the pytorch-1600 checkpoint was produced is not recorded in this notebook.
#export HF_MODEL_DIR="/bridge/science/AI/nlp/bert/huggingface"
#transformers-cli convert --model_type bert --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt --config $BERT_BASE_DIR/bert_config.json --pytorch_dump_output $HF_MODEL_DIR/pytorch_model.bin
# Directory and weights file of the second checkpoint to evaluate.
hf2_model_dir="/bridge/science/AI/nlp/bert/model/pytorch-1600"
hf2_model_file=os.path.join(hf2_model_dir,"pytorch_model.bin")

In [43]:
# Load the second checkpoint (hf2_model_dir) the same way as the first one.
#config = BertConfig.from_pretrained("bert-large-uncased-whole-word-masking")
x2config = BertConfig.from_pretrained(os.path.join(hf2_model_dir,"config.json"))
# Return the hidden states of the layers, as for the first model.
x2config.output_hidden_states=True

# NOTE(review): this rebinds hfxmodel, the name that already holds the first model.
# After this cell runs, re-executing the earlier feature-extraction cell would silently
# use the second checkpoint. A distinct name would avoid that, but later cells rely
# on hfxmodel referring to this model.
#hfxmodel = BertModel.from_pretrained(config=xconfig)
hfxmodel = BertModel.from_pretrained(hf2_model_file,config=x2config)
# Inference only: put the module in evaluation mode.
hfxmodel.eval()

hf2tokenizer = BertTokenizer.from_pretrained(hf2_model_dir, config=x2config)

In [45]:
# One row per annotated 4-tuple, computed with the model currently bound to hfxmodel
# (the second checkpoint, loaded in the preceding cell) and its tokenizer.
XX2 = np.array(
    list(
        generate_huggingface_instances(
            hfxmodel, hf2tokenizer, annotated4tpls, sents, labels, use_cuda=True
        )
    )
)

In [47]:
# Same train/test partition as for the other feature sets.
train_hf2 = XX2[trainidx]
test_hf2 = XX2[testidx]


In [48]:
# Same SVM hyper-parameters as the two classifiers above, for a like-for-like comparison.
clfhf2 = svm.SVC(gamma=0.0001, C=100., random_state=91768)

clfhf2.fit(train_hf2, labels_train)

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=91768, shrinking=True,
    tol=0.001, verbose=False)

In [49]:
# NOTE(review): despite its name, labels_test_hf2 holds the classifier's *predictions*
# for the test rows; the reference labels are in labels_test.
labels_test_hf2 = clfhf2.predict(test_hf2)
# Cohen's kappa between the reference labels and the predictions.
kappa(labels_test, labels_test_hf2)

0.6025718914540299