In [1]:
import ndjson
import random

import sys
sys.path.append('../')

MAX_SEQ_LENGTH = 100
TEST_SIZE = 10000
BATCH_SIZE = 32
MODEL = "bert-base-uncased"

file_list = ["/home/yves/projects/Quill-NLP-Tools-and-Datasets/Passive_with_incorrect_be.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Passive_without_be.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Passive_past_tense_as_participle.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Perfect_tense_with_simple_past.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Perfect_tense_without_have.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Perfect_progressive_with_incorrect_be_and_without_have.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Incorrect_verb_with_a_simple_noun_subject.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Incorrect_verb_with_a_pronoun_subject.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Incorrect_negative_verb_with_a_simple_noun_subject.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Subject-verb_agreement_with_neither-nor.ndjson",
             "/home/yves/projects/Quill-NLP-Tools-and-Datasets/Subject-verb_agreement_with_either-or.ndjson",
             ]

data = []
for f in file_list:
    with open(f) as i:
        data.extend(ndjson.load(i))

data = [{"text": item[0], "entities": item[1]["entities"]} for item in data]


In [2]:
label2idx = {"O": 0}

for sentence in data:
    if "entities" in sentence:
        for (_, _, label) in sentence["entities"]:
            if label not in label2idx:
                label2idx[label] = len(label2idx)
            
print(label2idx)
        

{'O': 0, 'Passive with incorrect be': 1, 'Passive without be': 2, 'Passive past tense as participle': 3, 'Perfect tense with simple past': 4, 'Perfect tense without have': 5, 'Perfect progressive with incorrect be and without have': 6, 'Incorrect verb with a simple noun subject': 7, 'Incorrect verb with a pronoun subject': 8, 'Incorrect negative verb with a simple noun subject': 9, 'Subject-verb agreement with neither-nor': 10, 'Subject-verb agreement with either-or': 11}


In [3]:
from transformers import BertForTokenClassification
from transformers import BertTokenizer
from quillnlp.models.bert.preprocessing import convert_data_to_input_items, NLPTask

tokenizer = BertTokenizer.from_pretrained(MODEL)
input_items = convert_data_to_input_items(data, label2idx, MAX_SEQ_LENGTH, tokenizer, NLPTask.SEQUENCE_LABELING)

I0511 20:15:01.468373 139933356599104 file_utils.py:41] PyTorch version 1.2.0+cu92 available.
I0511 20:15:02.473465 139933356599104 file_utils.py:57] TensorFlow version 2.1.0 available.
I0511 20:15:03.080697 139933356599104 tokenization_utils.py:501] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/yves/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [4]:
import random

from quillnlp.models.bert.preprocessing import get_data_loader

random.shuffle(input_items)

test_items = input_items[-TEST_SIZE:]
valid_items = input_items[-2*TEST_SIZE:-TEST_SIZE]
train_items = input_items[:-2*TEST_SIZE]

#train_items = train_items[:1000]

test_dl = get_data_loader(test_items, BATCH_SIZE, NLPTask.SEQUENCE_LABELING, shuffle=False)
dev_dl = get_data_loader(valid_items, BATCH_SIZE, NLPTask.SEQUENCE_LABELING, shuffle=False)
train_dl = get_data_loader(train_items, BATCH_SIZE, NLPTask.SEQUENCE_LABELING, shuffle=True)


In [5]:
from quillnlp.models.bert.train import train
from transformers import BertModel

model = BertForTokenClassification.from_pretrained(MODEL, num_labels=len(label2idx))
model.to("cuda")

train(model, train_dl, dev_dl, BATCH_SIZE, 32/BATCH_SIZE, device="cuda", num_train_epochs=20, learning_rate=1e-5)

I0511 20:15:36.324699 139933356599104 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/yves/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0511 20:15:36.326292 139933356599104 configuration_utils.py:292] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embedd

HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 0 step 1821: dev loss = 0.04009568199705773


Epoch:   5%|▌         | 1/20 [11:50<3:45:03, 710.69s/it]

Lower loss => saving model to /tmp/model.bin.



HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 1 step 3642: dev loss = 0.02121305822606047
Lower loss => saving model to /tmp/model.bin.


Epoch:  10%|█         | 2/20 [23:42<3:33:16, 710.90s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 2 step 5463: dev loss = 0.016693677110508228
Lower loss => saving model to /tmp/model.bin.


Epoch:  15%|█▌        | 3/20 [35:33<3:21:25, 710.93s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 3 step 7284: dev loss = 0.015043783533653191
Lower loss => saving model to /tmp/model.bin.


Epoch:  20%|██        | 4/20 [47:23<3:09:33, 710.82s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 4 step 9105: dev loss = 0.014384063102450734
Lower loss => saving model to /tmp/model.bin.


Epoch:  25%|██▌       | 5/20 [59:14<2:57:40, 710.72s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 5 step 10926: dev loss = 0.013735000782768233
Lower loss => saving model to /tmp/model.bin.


Epoch:  30%|███       | 6/20 [1:11:05<2:45:52, 710.87s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 6 step 12747: dev loss = 0.015369788507284514


Epoch:  35%|███▌      | 7/20 [1:22:56<2:34:00, 710.83s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 7 step 14568: dev loss = 0.016467628872087865


Epoch:  40%|████      | 8/20 [1:34:46<2:22:09, 710.80s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

Epoch 8 step 16389: dev loss = 0.016728581071417694


Epoch:  45%|████▌     | 9/20 [1:46:37<2:10:18, 710.76s/it]




HBox(children=(IntProgress(value=0, description='Training iteration', max=1822, style=ProgressStyle(descriptio…




KeyboardInterrupt: 

In [6]:
import torch
from quillnlp.models.bert.train import evaluate

output_model_file = "/tmp/model.bin"
print("Loading model from", output_model_file)
device="cpu"

model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage)
model = BertForTokenClassification.from_pretrained(MODEL, state_dict=model_state_dict, num_labels=len(label2idx))
model.to(device)

#_, train_correct, train_predicted = evaluate(model, train_dataloader)
#_, dev_correct, dev_predicted = evaluate(model, dev_dataloader)
_, _, test_correct, test_predicted = evaluate(model, test_dl, device)

Loading model from /tmp/model.bin


I0511 22:06:09.915359 139933356599104 configuration_utils.py:256] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/yves/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.7156163d5fdc189c3016baca0775ffce230789d7fa2a42ef516483e4ca884517
I0511 22:06:09.917084 139933356599104 configuration_utils.py:292] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "do_sample": false,
  "eos_token_ids": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embedd

HBox(children=(IntProgress(value=0, description='Evaluation iteration', max=313, style=ProgressStyle(descripti…

In [7]:
from sklearn.metrics import f1_score, classification_report

idx2label = {v:k for k,v in label2idx.items()}

all_correct = []
all_predicted = []

for item, correct, predicted in zip(test_items, test_correct, test_predicted):
    #print(item.text)
    #for error in set(predicted):
    #    print("Found:", idx2label[error])
    #for error in set(correct):
    #    print("Correct:", idx2label[error])
    all_correct.extend(correct)
    all_predicted.extend(predicted)
    
    

In [8]:
f = f1_score(all_correct, all_predicted, average="micro")
print("F1:", f)
print(classification_report(all_correct, all_predicted, labels=list(label2idx.values()), target_names=list(label2idx.keys())))

F1: 0.971634


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                                        precision    recall  f1-score   support

                                                     O       1.00      0.97      0.99    994605
                             Passive with incorrect be       0.13      0.85      0.23       601
                                    Passive without be       0.10      0.82      0.17       798
                      Passive past tense as participle       0.14      0.94      0.25       760
                        Perfect tense with simple past       0.15      0.87      0.26       748
                            Perfect tense without have       0.17      0.88      0.29       632
Perfect progressive with incorrect be and without have       0.16      0.90      0.27       203
             Incorrect verb with a simple noun subject       0.18      0.90      0.30       663
                 Incorrect verb with a pronoun subject       0.26      0.93      0.40       694
    Incorrect negative verb with a simp