# Evaluating the SRL models

In [4]:
import utils

from transformers import AutoModelForTokenClassification, Trainer, AutoTokenizer, DataCollatorForTokenClassification
# One tokenizer and one padding collator are created here and reused by
# every Trainer built further down in the notebook.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Loading the two models

In [2]:
def _load_srl_trainer(checkpoint):
    """Load a token-classification checkpoint and wrap it in a Trainer.

    The model is configured with the label mapping taken from
    ``utils.baseline_ds`` and the Trainer reuses the notebook-level
    ``data_collator`` and ``tokenizer``.

    Args:
        checkpoint: Hub name (or path) passed to ``from_pretrained``.

    Returns:
        A ``(model, trainer)`` pair for that checkpoint.
    """
    label2id = utils.baseline_ds.label_dict
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint,
        num_labels=len(label2id),
        id2label=utils.baseline_ds.label_dict_rev,
        label2id=label2id,
    )
    trainer = Trainer(model=model, data_collator=data_collator, tokenizer=tokenizer)
    return model, trainer


base_model, base_trainer = _load_srl_trainer(
    "dannashao/bert-base-uncased-finetuned-srl_arg"
)
advanced_model, advanced_trainer = _load_srl_trainer(
    "dannashao/bert-base-uncased-finetuned-advanced-srl_arg"
)

## A step-by-step demo using Test 6 and the baseline model

- Creating tokenized dataset from the test file:

In [3]:
# Turn test file 6 into a dataset for the trainer plus a second value
# (test_6_text) that is later handed to utils.print_results for display.
test_6_ds, test_6_text = utils.file_to_ds('./tests/6.txt')

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

- Making predictions using the baseline model:

In [5]:
# The predict() result is unpacked into three parts: the predictions, the
# labels, and a third value that is not needed here and is discarded.
pred,lab,_ = base_trainer.predict(test_6_ds)

- Remove special tokens (-100) and reverse map int labels:

In [6]:
# Strip the special-token positions (-100) and map integer labels back to
# their string names.
# NOTE(review): `pred` is rebound to the converted result, so re-running this
# cell without first re-running the prediction cell passes already-converted
# values back in -- confirm whether remove_and_reverse tolerates that.
pred, true = utils.remove_and_reverse(pred, lab)

- Print the results and failure rate:

In [9]:
# Print a per-sentence comparison of predicted vs. true labels. As the last
# expression of the cell, its return value is also displayed (presumably the
# failure rate -- verify against utils.print_results).
utils.print_results(test_6_text, pred, true)

------Sentence  0 : Failure------
Sent:  ['We', 'arrived', 'school', '.']
Pred:  ['ARG1', '_', 'ARG4', '_']
True:  ['ARG0', '_', 'ARG4', '_']

------Sentence  1 : Success------
Sent:  ['He', 'talked', 'with', 'Jim', '.']
Pred:  ['ARG0', '_', '_', 'ARG2', '_']
True:  ['ARG0', '_', '_', 'ARG2', '_']

------Sentence  2 : Success------
Sent:  ['He', 'talked', 'about', 'it', '.']
Pred:  ['ARG0', '_', '_', 'ARG1', '_']
True:  ['ARG0', '_', '_', 'ARG1', '_']

------Sentence  3 : Failure------
Sent:  ['The', 'plant', 'grows', 'quickly', '.']
Pred:  ['_', 'ARG1', '_', 'ARGM-MNR', '_']
True:  ['_', 'ARG0', '_', 'ARGM-MNR', '_']

------Sentence  4 : Failure------
Sent:  ['The', 'refugees', 'immigrated', 'to', 'Canada', '.']
Pred:  ['_', 'ARG1', '_', '_', 'ARG4', '_']
True:  ['_', 'ARG0', '_', '_', 'ARG4', '_']

------Sentence  5 : Success------
Sent:  ['They', 'eat', 'good', 'foods', '.']
Pred:  ['ARG0', '_', '_', 'ARG1', '_']
True:  ['ARG0', '_', '_', 'ARG1', '_']

------Sentence  6 : Success---

0.2631578947368421

## Wrap up and run all tests

In [10]:
def do_test(path, trainer, print=False):
    """Run one test file through a trainer and return its failure rate.

    Pipeline: build the dataset from ``path``, predict with ``trainer``,
    post-process predictions and labels with ``utils.remove_and_reverse``,
    then compute the failure rate with ``utils.calculate_failrate``.

    Args:
        path: Location of the test file, passed to ``utils.file_to_ds``.
        trainer: Object whose ``predict(dataset)`` result unpacks into three
            values; the first two are used as predictions and labels.
        print: When truthy, additionally call ``utils.print_results`` to show
            the per-sentence breakdown. The name shadows the builtin
            ``print`` inside this function; it is kept so existing keyword
            callers keep working.

    Returns:
        Whatever ``utils.calculate_failrate`` returns for this test file, so
        callers can collect and compare results across files or models.
    """
    testds, test_text = utils.file_to_ds(path)
    pred, lab, _ = trainer.predict(testds)
    pred, true = utils.remove_and_reverse(pred, lab)
    # Computed before the optional breakdown so the summary comes first in
    # the output (assuming calculate_failrate is what prints it -- TODO confirm).
    failrate = utils.calculate_failrate(test_text, pred, true)
    if print:
        utils.print_results(test_text, pred, true)
    return failrate

In [7]:
# Test files evaluated by every run below, in the order they are reported.
paths = [
    './tests/6.txt',
    './tests/8.txt',
    './tests/11.txt',
    './tests/14.txt',
    './tests/18.txt',
]

In [12]:
# Baseline model: run every test file with the per-sentence breakdown off
# (do_test's `print` argument is left at its default).
for path in paths:
    do_test(path,base_trainer)

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.2631578947368421


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Failure rate:  0.4117647058823529


Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Failure rate:  0.23529411764705882


Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Failure rate:  0.5757575757575758


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.5263157894736842


In [11]:
# Advanced model: same test files, per-sentence breakdown off.
# NOTE(review): in the saved outputs the failure rates for this run are
# identical to the baseline run for all five files -- confirm that the inputs
# built by utils.file_to_ds are in the format the advanced model expects.
for path in paths:
    do_test(path,advanced_trainer)

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.2631578947368421


Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Failure rate:  0.4117647058823529


Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Failure rate:  0.23529411764705882


Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Failure rate:  0.5757575757575758


Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.5263157894736842


## Full results

In [13]:
# Baseline model again, this time with the per-sentence breakdown enabled.
for path in paths:
    do_test(path, base_trainer, print=True)

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.2631578947368421
------Sentence  0 : Failure------
Sent:  ['We', 'arrived', 'school', '.']
Pred:  ['ARG1', '_', 'ARG4', '_']
True:  ['ARG0', '_', 'ARG4', '_']

------Sentence  1 : Success------
Sent:  ['He', 'talked', 'with', 'Jim', '.']
Pred:  ['ARG0', '_', '_', 'ARG2', '_']
True:  ['ARG0', '_', '_', 'ARG2', '_']

------Sentence  2 : Success------
Sent:  ['He', 'talked', 'about', 'it', '.']
Pred:  ['ARG0', '_', '_', 'ARG1', '_']
True:  ['ARG0', '_', '_', 'ARG1', '_']

------Sentence  3 : Failure------
Sent:  ['The', 'plant', 'grows', 'quickly', '.']
Pred:  ['_', 'ARG1', '_', 'ARGM-MNR', '_']
True:  ['_', 'ARG0', '_', 'ARGM-MNR', '_']

------Sentence  4 : Failure------
Sent:  ['The', 'refugees', 'immigrated', 'to', 'Canada', '.']
Pred:  ['_', 'ARG1', '_', '_', 'ARG4', '_']
True:  ['_', 'ARG0', '_', '_', 'ARG4', '_']

------Sentence  5 : Success------
Sent:  ['They', 'eat', 'good', 'foods', '.']
Pred:  ['ARG0', '_', '_', 'ARG1', '_']
True:  ['ARG0', '_', '_', 'ARG1', '_

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Failure rate:  0.4117647058823529
------Sentence  0 : Failure------
Sent:  ['The', 'Sky', 'appears', 'blue', '.']
Pred:  ['_', 'ARG1', '_', 'ARGM-PRD', '_']
True:  ['_', 'ARG1', '_', 'C-ARG1', '_']

------Sentence  1 : Success------
Sent:  ['They', 'are', 'happy', '.']
Pred:  ['ARG1', '_', 'ARG2', '_']
True:  ['ARG1', '_', 'ARG2', '_']

------Sentence  2 : Failure------
Sent:  ['The', 'book', 'seems', 'interesting', '.']
Pred:  ['_', 'ARG1', '_', 'ARG1', '_']
True:  ['_', 'ARG1', '_', 'C-ARG1', '_']

------Sentence  3 : Failure------
Sent:  ['She', 'appears', 'confused', '.']
Pred:  ['ARG1', '_', 'ARGM-ADV', '_']
True:  ['ARG1', '_', 'C-ARG1', '_']

------Sentence  4 : Success------
Sent:  ['The', 'dog', 'is', 'playful', '.']
Pred:  ['_', 'ARG1', '_', 'ARG2', '_']
True:  ['_', 'ARG1', '_', 'ARG2', '_']

------Sentence  5 : Success------
Sent:  ['He', 'seems', 'relaxed', '.']
Pred:  ['ARG1', '_', 'C-ARG1', '_']
True:  ['ARG1', '_', 'C-ARG1', '_']

------Sentence  6 : Success------
Sent:

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

Failure rate:  0.23529411764705882
------Sentence  0 : Success------
Sent:  ['It', 'happened', 'at', 'the', 'White', 'House', '.']
Pred:  ['ARG1', '_', '_', '_', '_', 'ARGM-LOC', '_']
True:  ['ARG1', '_', '_', '_', '_', 'ARGM-LOC', '_']

------Sentence  1 : Success------
Sent:  ['It', 'happened', 'in', 'the', 'WW2', '.']
Pred:  ['ARG1', '_', '_', '_', 'ARGM-TMP', 'ARGM-TMP', 'ARGM-TMP', '_']
True:  ['ARG1', '_', '_', '_', 'ARGM-TMP', 'ARGM-TMP', 'ARGM-TMP', '_']

------Sentence  2 : Success------
Sent:  ['It', 'happened', 'near', 'the', 'Middle', 'Ages', '.']
Pred:  ['ARG1', '_', '_', '_', '_', 'ARGM-TMP', '_']
True:  ['ARG1', '_', '_', '_', '_', 'ARGM-TMP', '_']

------Sentence  3 : Failure------
Sent:  ['It', 'happened', 'in', 'near', 'Statue', 'of', 'Liberty', '.']
Pred:  ['ARG1', '_', '_', '_', 'ARGM-LOC', '_', '_', '_']
True:  ['ARG1', '_', '_', '_', '_', '_', 'ARGM-LOC', '_']

------Sentence  4 : Success------
Sent:  ['It', 'happened', 'in', 'the', '20th', 'century', '.']
Pred:  

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Failure rate:  0.5757575757575758
------Sentence  0 : Failure------
Sent:  ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']
Pred:  ['_', 'ARG1', '_', '_', '_', 'ARG2', '_']
True:  ['_', 'ARG0', '_', '_', '_', 'ARGM-LOC', '_']

------Sentence  1 : Failure------
Sent:  ['On', 'the', 'mat', 'sat', 'the', 'cat', '.']
Pred:  ['_', '_', 'ARGM-LOC', '_', '_', 'ARG1', '_']
True:  ['_', '_', 'ARGM-LOC', '_', '_', 'ARG0', '_']

------Sentence  2 : Success------
Sent:  ['The', 'children', 'played', 'in', 'the', 'garden', '.']
Pred:  ['_', 'ARG0', '_', '_', '_', 'ARGM-LOC', '_']
True:  ['_', 'ARG0', '_', '_', '_', 'ARGM-LOC', '_']

------Sentence  3 : Failure------
Sent:  ['In', 'the', 'mgarden', 'played', 'the', 'children', '.']
Pred:  ['_', '_', 'ARG2', 'ARGM-LOC', 'ARG2', '_', '_', 'ARG1', '_']
True:  ['_', '_', 'ARGM-LOC', 'ARGM-LOC', 'ARGM-LOC', '_', '_', 'ARG0', '_']

------Sentence  4 : Failure------
Sent:  ['She', 'walked', 'along', 'the', 'beach', '.']
Pred:  ['ARG0', '_', '_', '_', 'ARG2',

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Failure rate:  0.5263157894736842
------Sentence  0 : Failure------
Sent:  ['He', 'also', 'nominated', 'him', 'a', 'term', '.']
Pred:  ['ARG0', 'ARGM-DIS', '_', 'ARG1', '_', 'ARG1', '_']
True:  ['ARG0', 'ARGM-DIS', '_', 'ARG1', '_', 'ARG2', '_']

------Sentence  1 : Failure------
Sent:  ['Yet', 'we', 'charged', 'them', 'for', 'the', 'evacuation']
Pred:  ['ARGM-DIS', 'ARG0', '_', 'ARG1', '_', '_', 'ARG3']
True:  ['ARGM-DIS', 'ARG0', '_', 'ARG2', '_', '_', 'ARG3']

------Sentence  2 : Success------
Sent:  ['That', "'s", 'right', ',', 'folks', '.']
Pred:  ['ARG1', '_', '_', 'ARG2', '_', 'ARGM-DIS', '_', 'ARG1']
True:  ['ARG1', '_', '_', 'ARG2', '_', 'ARGM-DIS', '_', 'ARG1']

------Sentence  3 : Success------
Sent:  ['And', 'I', 'gave', 'it', 'all', 'my', 'heart', '.']
Pred:  ['ARGM-DIS', 'ARG0', '_', 'ARG2', '_', '_', 'ARG1', '_']
True:  ['ARGM-DIS', 'ARG0', '_', 'ARG2', '_', '_', 'ARG1', '_']

------Sentence  4 : Success------
Sent:  ['John', ',', 'you', 'are', 'in', ',', 'right', '?']
P