In [1]:
import json
import wandb
import click
import torch
import transformers
from pathlib import Path
from copy import deepcopy
from functools import partial
from typing import List, Callable, Iterable, Union
from mytorch.utils.goodies import mt_save_dir

# Local imports
try:
    import _pathfix
except ImportError:
    from . import _pathfix
from loops import training_loop
from models.multitask import BasicMTL
from dataiter import MultiTaskDataIter
from utils.misc import check_dumped_config
from utils.exceptions import ImproperDumpDir
from config import LOCATIONS as LOC, CONFIG, KNOWN_SPLITS
from eval import Evaluator, NERAcc, NERSpanRecognitionPR, PrunerPR, CorefBCubed, CorefMUC, CorefCeafe

Fixing paths from /home/priyansh/Dev/research/coref/mtl/src


In [2]:
! free -h

              total        used        free      shared  buff/cache   available
Mem:            15G        7,7G        2,2G        1,0G        5,4G        6,3G
Swap:          979M         14M        965M


In [3]:

def make_optimizer(model, optimizer_class: Callable, lr: float, freeze_encoder: bool):
    if freeze_encoder:
        return optimizer_class(
            [param for name, param in model.named_parameters() if not name.startswith("encoder")],
            lr=lr
        )
    else:
        return optimizer_class(model.parameters(), lr=lr)


def get_pretrained_dirs(nm: str):
    """Check if the given nm is stored locally. If so, load that. Else, pass it on as is."""
    plausible_parent_dir: Path = LOC.root / "models" / "huggingface" / nm

    if (
            (plausible_parent_dir / "config").exists()
            and (plausible_parent_dir / "tokenizer").exists()
            and (plausible_parent_dir / "encoder").exists()
    ):
        return (
            str(plausible_parent_dir / "config"),
            str(plausible_parent_dir / "tokenizer"),
            str(plausible_parent_dir / "encoder"),
        )
    else:
        return nm, nm, nm


def pick_loss_scale(options: dict, tasks: Iterable[str]):
    key = 'loss_scales_' + '_'.join(sorted(tasks))
    return options[key]


def get_saved_wandb_id(loc: Path):
    with (loc / 'config.json').open('r', encoding='utf8') as f:
        config = json.load(f)

    return config['wandbid']


def get_save_parent_dir(parentdir: Path, tasks: List[str], config: Union[transformers.BertConfig, dict]) -> Path:
    """
        Normally returns parentdir/'_'.join(sorted(tasks)).
        E.g. if tasks are ['coref', 'ner']:
                parentdir/coref_ner
            but if they are arranged like ['ner', coref'], the output would still be
                parentdir/coref_ner
            if tasks are ['ner', 'pruner', 'coref']:
                parentdir/coref_ner_pruner

        However, if we find that trim flag is active in config, or that the run is going to wandb-trials
            then the output is
                parentdir/trial/<tasks concatenated with _ in alphabetical order>
    """

    if config.trim or config.wandb_trial:
        return parentdir / 'trial' / '_'.join(sorted(tasks))
    else:
        return parentdir / '_'.join(sorted(tasks))


# Make MTL A

In [4]:
dataset: str = 'ontonotes'
epochs: int = 10
encoder: str = "bert-base-uncased"
tasks: List[str] = ('coref', 'ner', 'pruner')
device: str = "cpu"
trim: bool = True
train_encoder: bool = False,
ner_unweighted: bool = False
filter_candidates_pos = True
use_wandb = False
save=False

In [5]:

dir_config, dir_tokenizer, dir_encoder = get_pretrained_dirs(encoder)

tokenizer = transformers.BertTokenizer.from_pretrained(dir_tokenizer)
config = transformers.BertConfig(dir_config)
config.max_span_width = 5
config.coref_dropout = 0.3
config.metadata_feature_size = 20
config.unary_hdim = 1000
config.binary_hdim = 2000
config.top_span_ratio = 0.4
config.max_top_antecedents = 50
config.device = device
config.epochs = epochs
config.trim = trim
config.freeze_encoder = not train_encoder
config.ner_ignore_weights = ner_unweighted
config.filter_candidates_pos_threshold = CONFIG['filter_candidates_pos_threshold'] \
    if filter_candidates_pos else -1

# Assign loss scales based on task
loss_scales = pick_loss_scale(CONFIG, tasks)
config.loss_scales = loss_scales.tolist() if not type(loss_scales) is list else loss_scales

if 'ner' in tasks or 'pruner' in tasks:
    # Need to figure out the number of classes. Load a DL. Get the number. Delete the DL.
    temp_ds = MultiTaskDataIter(
        src=dataset,
        config=config,
        tasks=tasks,
        split="development",
        tokenizer=tokenizer,
    )
    if 'ner' in tasks:
        config.ner_n_classes = deepcopy(temp_ds.ner_tag_dict.__len__())
        config.ner_class_weights = temp_ds.estimate_class_weights('ner')
    else:
        config.ner_n_classes = 1
        config.ner_class_weights = [1.0, ]
    if 'pruner' in tasks:
        config.pruner_class_weights = temp_ds.estimate_class_weights('pruner')
    del temp_ds
else:
    config.ner_n_classes = 1
    config.ner_class_weights = [1.0, ]

# # Make the model
model = BasicMTL(dir_encoder, config=config)

# Load the data
train_ds = partial(
    MultiTaskDataIter,
    src=dataset,
    config=config,
    tasks=tasks,
    split=KNOWN_SPLITS[dataset].train,
    tokenizer=tokenizer,
)
dev_ds = partial(
    MultiTaskDataIter,
    src=dataset,
    config=config,
    tasks=tasks,
    split=KNOWN_SPLITS[dataset].dev,
    tokenizer=tokenizer,
)

# Make the optimizer
opt = make_optimizer(model=model, optimizer_class=torch.optim.SGD, lr=0.005, freeze_encoder=config.freeze_encoder)
opt = torch.optim.SGD(model.parameters(), lr=0.001)

# Make the evaluation suite (may compute multiple metrics corresponding to the tasks)
metrics = []
if 'ner' in tasks:
    metrics += [NERAcc(), NERSpanRecognitionPR()]
if 'pruner' in tasks:
    metrics += [PrunerPR()]
if 'coref' in tasks:
    metrics += [CorefBCubed(), CorefMUC(), CorefCeafe()]
train_eval = Evaluator(
    predict_fn=model.pred_with_labels,
    dataset_partial=train_ds,
    metrics=metrics,
    device=device
)
dev_eval = Evaluator(
    predict_fn=model.pred_with_labels,
    dataset_partial=dev_ds,
    metrics=metrics,
    device=device
)

print(config)
print("Training commences!")

Pulled 318 instances from ../data/parsed/ontonotes/development/MultiTaskDatasetDump_coref_ner_pruner.pkl.




BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "binary_hdim": 2000,
  "classifier_dropout": null,
  "coref_dropout": 0.3,
  "device": "cpu",
  "epochs": 10,
  "filter_candidates_pos_threshold": 2000,
  "freeze_encoder": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "loss_scales": [
    0.3923397028791302,
    0.2153205942417397,
    0.3923397028791302
  ],
  "max_position_embeddings": 512,
  "max_span_width": 5,
  "max_top_antecedents": 50,
  "metadata_feature_size": 20,
  "model_type": "bert",
  "ner_class_weights": [
    0.05330982413982514,
    24.68555023923445,
    30.717313646106216,
    51.42822966507177,
    20.887773279352228,
    43.23902111967818,
    101.32128829536528,
    1697.1315789473683,
    38.79157894736842,
    174.06477732793522,
    678.8526315789474,
    135.77052631578948,
    141.42763157894737,
    102.85645933014354,
    678.8526

In [6]:
savedir, save_config, save_objs = None, None, None


In [None]:
outputs = training_loop(
    model=model,
    epochs=epochs,
    trn_dl=train_ds,
    forward_fn=model.pred_with_labels,
    device=device,
    train_eval=train_eval,
    dev_eval=dev_eval,
    opt=opt,
    tasks=tasks,
    loss_scales=torch.tensor(loss_scales, dtype=torch.float, device=device),
    flag_wandb=use_wandb,
    flag_save=save,
    save_dir=savedir,
    save_config=save_config,
    epochs_last_run=config.epoch if hasattr(config, 'epoch') else 0
)

Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Following are the differences found in the configs.
Old:            ner_n_classes: None
New:            ner_n_classes: 19
Old:        ner_class_weights: None
New:        ner_class_weights: [0.05330982413982514, 24.68555023923445, 30.717313646106216, 51.42822966507177, 20.887773279352228, 43.23902111967818, 101.32128829536528, 1697.1315789473683, 38.79157894736842, 174.06477732793522, 678.8526315789474, 135.77052631578948, 141.42763157894737, 102.85645933014354, 678.8526315789474, 1697.1315789473683, 183.47368421052633, 424.2828947368421, 1357.7052631578947]
Old:     pruner_class_weights: None
New:     pruner_class_weights: [0.5103185781885514, 24.728144171779142]




HBox(children=(FloatProgress(value=0.0, max=318.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (1083 > 512). Running this sequence through the model will result in indexing errors
	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1607370124688/work/torch/csrc/utils/python_arg_parser.cpp:882.)
  if gold_labels.nonzero().shape[0] == 0:


TOO LONG:  reeeeeeeeeeeeeeaaaal
TOO LONG:  586.14_588.20_
TOO LONG:  586.14_588.20_a
TOO LONG:  586.14_588.20_a:
TOO LONG:  820.26_822.65_b
TOO LONG:  820.26_822.65_b:
TOO LONG:  965.04_967.89_
TOO LONG:  965.04_967.89_b
TOO LONG:  965.04_967.89_b:



HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



Epoch:   1 | coref Loss: 3.91986 | coref Tr_b_cubed_p: 0.000 | coref Tr_b_cubed_r: 0.027 | coref Tr_b_cubed_f1: 0.001 | coref Tr_muc_p: 0.003 | coref Tr_muc_r: 0.019 | coref Tr_muc_f1: 0.005 | coref Tr_ceafe_p: 0.015 | coref Tr_ceafe_r: 0.007 | coref Tr_ceafe_f1: 0.010
		 | coref Vl_b_cubed_p: 0.000 | coref Vl_b_cubed_r: 0.027 | coref Vl_b_cubed_f1: 0.001 | coref Vl_muc_p: 0.003 | coref Vl_muc_r: 0.019 | coref Vl_muc_f1: 0.005 | coref Vl_ceafe_p: 0.015 | coref Vl_ceafe_r: 0.007 | coref Vl_ceafe_f1: 0.010
	 | ner Loss: 2.94371 | ner Tr_acc: 0.355 | ner Tr_acc_nonzero: 0.043 | ner Tr_spanrec_p: 0.660 | ner Tr_spanrec_r: 0.012
		 | ner Vl_acc: 0.355 | ner Vl_acc_nonzero: 0.043 | ner Vl_spanrec_p: 0.660 | ner Vl_spanrec_r: 0.012
	 | pruner Loss: 0.53346 | pruner Tr_p: 0.095 | pruner Tr_r: 0.018
		 | pruner Vl_p: 0.095 | pruner Vl_r: 0.018
Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Pulled 318 instances from ../data/parsed/ontonotes/development/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



Epoch:   2 | coref Loss: 3.90847 | coref Tr_b_cubed_p: 0.000 | coref Tr_b_cubed_r: 0.024 | coref Tr_b_cubed_f1: 0.001 | coref Tr_muc_p: 0.003 | coref Tr_muc_r: 0.018 | coref Tr_muc_f1: 0.005 | coref Tr_ceafe_p: 0.017 | coref Tr_ceafe_r: 0.006 | coref Tr_ceafe_f1: 0.009
		 | coref Vl_b_cubed_p: 0.000 | coref Vl_b_cubed_r: 0.024 | coref Vl_b_cubed_f1: 0.001 | coref Vl_muc_p: 0.003 | coref Vl_muc_r: 0.018 | coref Vl_muc_f1: 0.005 | coref Vl_ceafe_p: 0.017 | coref Vl_ceafe_r: 0.006 | coref Vl_ceafe_f1: 0.009
	 | ner Loss: 2.94363 | ner Tr_acc: 0.391 | ner Tr_acc_nonzero: 0.038 | ner Tr_spanrec_p: 0.633 | ner Tr_spanrec_r: 0.013
		 | ner Vl_acc: 0.391 | ner Vl_acc_nonzero: 0.038 | ner Vl_spanrec_p: 0.633 | ner Vl_spanrec_r: 0.013
	 | pruner Loss: 0.38629 | pruner Tr_p: 0.084 | pruner Tr_r: 0.016
		 | pruner Vl_p: 0.084 | pruner Vl_r: 0.016
Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Pulled 318 instances from ../data/parsed/ontonotes/development/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



Epoch:   3 | coref Loss: 3.89938 | coref Tr_b_cubed_p: 0.000 | coref Tr_b_cubed_r: 0.026 | coref Tr_b_cubed_f1: 0.001 | coref Tr_muc_p: 0.003 | coref Tr_muc_r: 0.016 | coref Tr_muc_f1: 0.004 | coref Tr_ceafe_p: 0.016 | coref Tr_ceafe_r: 0.007 | coref Tr_ceafe_f1: 0.010
		 | coref Vl_b_cubed_p: 0.000 | coref Vl_b_cubed_r: 0.026 | coref Vl_b_cubed_f1: 0.001 | coref Vl_muc_p: 0.003 | coref Vl_muc_r: 0.016 | coref Vl_muc_f1: 0.004 | coref Vl_ceafe_p: 0.016 | coref Vl_ceafe_r: 0.007 | coref Vl_ceafe_f1: 0.010
	 | ner Loss: 2.94308 | ner Tr_acc: 0.438 | ner Tr_acc_nonzero: 0.033 | ner Tr_spanrec_p: 0.619 | ner Tr_spanrec_r: 0.013
		 | ner Vl_acc: 0.438 | ner Vl_acc_nonzero: 0.033 | ner Vl_spanrec_p: 0.619 | ner Vl_spanrec_r: 0.013
	 | pruner Loss: 0.27350 | pruner Tr_p: 0.095 | pruner Tr_r: 0.017
		 | pruner Vl_p: 0.095 | pruner Vl_r: 0.017
Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Pulled 318 instances from ../data/parsed/ontonotes/development/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



Epoch:   4 | coref Loss: 3.86282 | coref Tr_b_cubed_p: 0.001 | coref Tr_b_cubed_r: 0.023 | coref Tr_b_cubed_f1: 0.002 | coref Tr_muc_p: 0.002 | coref Tr_muc_r: 0.014 | coref Tr_muc_f1: 0.004 | coref Tr_ceafe_p: 0.016 | coref Tr_ceafe_r: 0.013 | coref Tr_ceafe_f1: 0.014
		 | coref Vl_b_cubed_p: 0.001 | coref Vl_b_cubed_r: 0.023 | coref Vl_b_cubed_f1: 0.002 | coref Vl_muc_p: 0.002 | coref Vl_muc_r: 0.014 | coref Vl_muc_f1: 0.004 | coref Vl_ceafe_p: 0.016 | coref Vl_ceafe_r: 0.013 | coref Vl_ceafe_f1: 0.014
	 | ner Loss: 2.94290 | ner Tr_acc: 0.465 | ner Tr_acc_nonzero: 0.027 | ner Tr_spanrec_p: 0.586 | ner Tr_spanrec_r: 0.013
		 | ner Vl_acc: 0.465 | ner Vl_acc_nonzero: 0.027 | ner Vl_spanrec_p: 0.586 | ner Vl_spanrec_r: 0.013
	 | pruner Loss: 0.19529 | pruner Tr_p: 0.096 | pruner Tr_r: 0.017
		 | pruner Vl_p: 0.096 | pruner Vl_r: 0.017
Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


Pulled 318 instances from ../data/parsed/ontonotes/development/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))



Epoch:   5 | coref Loss: 3.72722 | coref Tr_b_cubed_p: 0.002 | coref Tr_b_cubed_r: 0.013 | coref Tr_b_cubed_f1: 0.003 | coref Tr_muc_p: 0.002 | coref Tr_muc_r: 0.005 | coref Tr_muc_f1: 0.002 | coref Tr_ceafe_p: 0.014 | coref Tr_ceafe_r: 0.011 | coref Tr_ceafe_f1: 0.012
		 | coref Vl_b_cubed_p: 0.002 | coref Vl_b_cubed_r: 0.013 | coref Vl_b_cubed_f1: 0.003 | coref Vl_muc_p: 0.002 | coref Vl_muc_r: 0.005 | coref Vl_muc_f1: 0.002 | coref Vl_ceafe_p: 0.014 | coref Vl_ceafe_r: 0.011 | coref Vl_ceafe_f1: 0.012
	 | ner Loss: 2.94266 | ner Tr_acc: 0.486 | ner Tr_acc_nonzero: 0.026 | ner Tr_spanrec_p: 0.562 | ner Tr_spanrec_r: 0.013
		 | ner Vl_acc: 0.486 | ner Vl_acc_nonzero: 0.026 | ner Vl_spanrec_p: 0.562 | ner Vl_spanrec_r: 0.013
	 | pruner Loss: 0.15243 | pruner Tr_p: 0.103 | pruner Tr_r: 0.019
		 | pruner Vl_p: 0.103 | pruner Vl_r: 0.019
Pulled 2455 instances from ../data/parsed/ontonotes/train/MultiTaskDatasetDump_coref_ner_pruner.pkl.


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

In [None]:
onto_train_di = train_ds()

# Make MTL B

In [None]:
dataset: str = 'scierc'
epochs: int = 10
encoder: str = "bert-base-uncased"
tasks: List[str] = ('ner',)
device: str = "cpu"
trim: bool = False
train_encoder: bool = False
ner_unweighted: bool = False
filter_candidates_pos = True


dir_config, dir_tokenizer, dir_encoder = get_pretrained_dirs(encoder)

tokenizer = transformers.BertTokenizer.from_pretrained(dir_tokenizer)
config = transformers.BertConfig(dir_config)
config.max_span_width = 5
config.coref_dropout = 0.3
config.metadata_feature_size = 20
config.unary_hdim = 1000
config.binary_hdim = 2000
config.top_span_ratio = 0.4
config.max_top_antecedents = 50
config.device = device
config.epochs = epochs
config.trim = trim
config.freeze_encoder = not train_encoder
config.ner_ignore_weights = ner_unweighted
config.filter_candidates_pos_threshold = CONFIG['filter_candidates_pos_threshold'] \
    if filter_candidates_pos else -1


# if 'ner' in tasks or 'pruner' in tasks:
if False:
    # Need to figure out the number of classes. Load a DL. Get the number. Delete the DL.
    temp_ds = MultiTaskDataIter(
        src=dataset,
        config=config,
        tasks=tasks,
        split="dev",
        tokenizer=tokenizer,
    )
    if 'ner' in tasks:
        config.ner_n_classes = deepcopy(temp_ds.ner_tag_dict.__len__())
        config.ner_class_weights = temp_ds.estimate_class_weights('ner')
    else:
        config.ner_n_classes = 1
        config.ner_class_weights = [1.0, ]
    if 'pruner' in tasks:
        config.pruner_class_weights = temp_ds.estimate_class_weights('pruner')
    del temp_ds
else:
    config.ner_n_classes = 1
    config.ner_class_weights = [1.0, ]

# # Make the model
# model = BasicMTL(dir_encoder, config=config)

# Load the data
train_ds = partial(
    MultiTaskDataIter,
    src=dataset,
    config=config,
    tasks=tasks,
    split="train",
    tokenizer=tokenizer,
)

sci_train_ds = train_ds()

In [None]:
sci_train_ds[2] 

In [None]:
la = onto_train_di.__len__()
lb = sci_train_ds.__len__()

la, lb, la+lb

In [None]:
sampling_ratio = [0.5, 1]

In [None]:
llen = la + lb

In [None]:
pointers = [int(x* llen / float(sum(sampling_ratio))) for x in sampling_ratio]
pointers

In [None]:
pointers = [2, 6]

In [None]:
import numpy as np

In [None]:
source_indices = []
for i, dataset_specific_ratio in enumerate(pointers):
    source_indices += [i]*dataset_specific_ratio
    
source_indices

In [None]:
np.random.shuffle(source_indices)

source_indices

# Testing Eval

In [None]:
from eval import Evaluator, NERAcc, NERSpanRecognitionPR, PrunerPR, CorefBCubed, CorefMUC, CorefCeafe

In [None]:
eval_bench = Evaluator(
    predict_fn = model.pred_with_labels,
    dataset_partial = valid_ds,
    metrics = [NERAcc(), NERSpanRecognitionPR(), PrunerPR(), CorefBCubed(), CorefMUC(), CorefCeafe()],
    device = 'cpu'
)

In [None]:
eval_bench.run()

# Eval for Coref

In [None]:


def b_cubed(clusters, mention_to_gold):
    num, dem = 0, 0

    for c in clusters:
        if len(c) == 1:
            continue

        gold_counts = Counter()
        correct = 0
        for m in c:
            if m in mention_to_gold:
                gold_counts[tuple(mention_to_gold[m])] += 1
        for c2, count in gold_counts.items():
            if len(c2) != 1:
                correct += count * count

        num += correct / float(len(c))
        dem += len(c)

    return num, dem


def muc(clusters, mention_to_gold):
    tp, p = 0, 0
    for c in clusters:
        p += len(c) - 1
        tp += len(c)
        linked = set()
        for m in c:
            if m in mention_to_gold:
                linked.add(mention_to_gold[m])
            else:
                tp -= 1
        tp -= len(linked)
    return tp, p


def phi4(c1, c2):
    return 2 * len([m for m in c1 if m in c2]) / float(len(c1) + len(c2))


def ceafe(clusters, gold_clusters):
    clusters = [c for c in clusters if len(c) != 1]
    scores = np.zeros((len(gold_clusters), len(clusters)))
    for i in range(len(gold_clusters)):
        for j in range(len(clusters)):
            scores[i, j] = phi4(gold_clusters[i], clusters[j])
    matching = linear_assignment(-scores)
    similarity = sum(scores[matching[0], matching[1]])

    # similarity = sum(scores[matching[:, 0], matching[:, 1]])
    return similarity, len(clusters), similarity, len(gold_clusters)

In [None]:
for i, instance in enumerate(dl):
    outputs = model.pred_with_labels(**instance)
    break

In [None]:
! free -h

In [None]:
instance['coref'].keys(), outputs['coref'].keys(), outputs.keys(), outputs['coref']['eval'].keys()

In [None]:
print('=None, '.join(['clusters', 'gold_clusters', 'mention_to_predicted', 'mention_to_gold']))

In [None]:
ll =  outputs['coref']['eval']

In [None]:
ceafe(ll['clusters'], ll['gold_clusters'])

In [None]:
phi4(ll['clusters'], ll['gold_clusters'])

In [None]:
muc(ll['clusters'], ll['mention_to_gold'])

In [None]:
b_cubed(ll['clusters'], ll['mention_to_gold'])