In [19]:
import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm, trange

from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from utils import *

In [250]:
# ---------- Load train/test split ----------
# NOTE(review): pickle.load can execute arbitrary code; only load split files
# that come from a trusted source.
# The `with` block guarantees the file handle is closed after loading.
with open("train_test_split_0331.p", "rb") as split_file:
    train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(split_file)

# Per-split index lists; they are filled by the cell that scans X_pids.
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

# Candidate "unseen" ids: the "unseen" test category plus every validation id.
unseen_pids = test_pids_cat["unseen"] + valid_pids.tolist()

# Drop every candidate that also appears in any other test category.
# The other categories are gathered into one set first, so the filter is a
# single pass with O(1) lookups instead of one list scan per category per id.
other_category_pids = set()
for category, pids in test_pids_cat.items():
    if category != "unseen":
        other_category_pids.update(pids)
unseen_pids = [pid for pid in unseen_pids if pid not in other_category_pids]

# Displayed as the cell output: size of the raw "unseen" category vs. the filtered list.
len(test_pids_cat["unseen"]), len(unseen_pids)

(8456, 836)

In [269]:
input_file = "../data/ml_datasetname_inputs_flv0.p"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The pickle holds a 3-tuple that is unpacked into X, y and X_pids.
# Using `with` closes the file handle, which a bare open() call leaves dangling.
with open(input_file, "rb") as inputs_fh:
    X, y, X_pids = pickle.load(inputs_fh)

In [251]:
# Start from empty lists inside this cell so that re-running it does not
# append every index a second time (the lists are first created in another cell).
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

# Sets give O(1) membership tests; the id collections are otherwise scanned
# once for every sample in X_pids.
train_pid_set, valid_pid_set = set(train_pids), set(valid_pids)
test_pid_set, unseen_pid_set = set(test_pids), set(unseen_pids)

for i in trange(len(X_pids)):
    pid = X_pids[i]
    # Each sample goes to at most one of train / valid / test, checked in that order.
    if pid in train_pid_set:
        train_idxs.append(i)
    elif pid in valid_pid_set:
        valid_idxs.append(i)
    elif pid in test_pid_set:
        test_idxs.append(i)
    # The unseen flag is checked independently of the split assignment above.
    if pid in unseen_pid_set:
        unseen_idxs.append(i)

# Percentages are relative to train + valid + test (unseen is not part of the total).
tot = len(train_idxs) + len(valid_idxs) + len(test_idxs)
print(f"nSamples: train={len(train_idxs):,} ({len(train_idxs)*100/tot:.2f}%), valid={len(valid_idxs):,} ({len(valid_idxs)*100/tot:.2f}%)")
print(f"test={len(test_idxs):,} ({len(test_idxs)*100/tot:.2f}%), unseen = {len(unseen_idxs):,} ({len(unseen_idxs)*100/tot:.2f}%)")

  0%|          | 0/210185 [00:00<?, ?it/s]

nSamples: train=146,580 (69.74%), valid=21,217 (10.09%)
test=42,388 (20.17%), unseen = 7,608 (3.62%)


In [271]:
# ----------- splitting -------------
# Partition the test indices into seen / unseen once, using a set so each
# membership test is O(1) instead of a scan over the unseen_idxs list.
unseen_idx_set = set(unseen_idxs)
test_seen_idxs = [i for i in test_idxs if i not in unseen_idx_set]
test_unseen_idxs = [i for i in test_idxs if i in unseen_idx_set]

# NOTE(review): if the sequences in X / y have differing lengths, np.array
# builds object arrays on older NumPy and raises on newer releases — confirm
# the inputs are already padded to a common length.
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])
X_te_seen = np.array([X[i] for i in test_seen_idxs])
X_te_unseen = np.array([X[i] for i in test_unseen_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])
y_te_unseen = np.array([y[i] for i in test_unseen_idxs])
y_te_seen = np.array([y[i] for i in test_seen_idxs])

# SciBERT

In [167]:
import os
import math
import random
import csv
import sys
import pickle
# Add the "bert-sklearn" directory under the current working directory to the
# import path; presumably the bert_sklearn imports below resolve from it.
sys.path.append(os.getcwd() + "/bert-sklearn/")
# Restrict CUDA-visible devices to index 2.
# NOTE(review): this likely needs to run before any library initializes CUDA
# in this kernel — confirm, since tensorflow is imported in an earlier cell.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [256]:
# ---------- Load inputs -----------
# NOTE(review): an earlier cell loads the same file name from "../data/";
# confirm which relative path is correct for this notebook's working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The `with` block closes the file handle after the 3-tuple has been read.
with open("./data/ml_datasetname_inputs_flv0.p", "rb") as inputs_fh:
    X, y, X_pids = pickle.load(inputs_fh)

In [257]:
# ----------- splitting -------------
# Same split as the array version, but kept as plain Python lists.
# Partition the test indices once, with a set so each membership test is O(1)
# rather than a scan over the unseen_idxs list.
unseen_idx_set = set(unseen_idxs)
test_seen_idxs = [i for i in test_idxs if i not in unseen_idx_set]
test_unseen_idxs = [i for i in test_idxs if i in unseen_idx_set]

X_tr = [X[i] for i in train_idxs]
X_val = [X[i] for i in valid_idxs]
X_te = [X[i] for i in test_idxs]
X_te_seen = [X[i] for i in test_seen_idxs]
X_te_unseen = [X[i] for i in test_unseen_idxs]

y_tr = [y[i] for i in train_idxs]
y_val = [y[i] for i in valid_idxs]
y_te = [y[i] for i in test_idxs]
y_te_unseen = [y[i] for i in test_unseen_idxs]
y_te_seen = [y[i] for i in test_seen_idxs]

In [181]:
# Alternative checkpoint path (disabled): '../data/scibert_Jo_split.bin'
# NOTE(review): the recorded output below reports loading from
# '../ckpts/scibert.bin', which differs from this path — the output may be stale.
savefile = './checkpoints/scibert.bin'
# Load the saved model from disk; `model` is used by the evaluation cells that follow.
model = load_model(savefile)

Loading model from ../ckpts/scibert.bin...
Defaulting to linear classifier/regressor
Building sklearn token classifier...


In [184]:
# Evaluate on the test samples that are not flagged as unseen.
test_pred = model.predict(X_te_seen)

# Replace missing predictions (None) with the outside tag 'O'.
# NOTE(review): presumably None marks tokens the model did not score — confirm.
preds = [
    ['O' if tag is None else tag for tag in sequence]
    for sequence in test_pred
]

# Cast gold labels to one-character strings ('<U1' keeps only the first character).
test_labels = []
for gold_sequence in y_te_seen:
    test_labels.append(np.array(gold_sequence).astype('<U1').tolist())

print("-"*10, "Test Seen", "-"*10)
print(classification_report(test_labels, preds))

Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 2355/2355 [05:09<00:00,  7.61it/s]


              precision    recall  f1-score   support

           _       0.92      0.93      0.92     70044

   micro avg       0.92      0.93      0.92     70044
   macro avg       0.92      0.93      0.92     70044
weighted avg       0.92      0.93      0.92     70044



In [185]:
# Run the identical predict -> clean -> report steps for each split instead of
# repeating the stanza. "Test" is deliberately last: after the loop,
# test_pred / preds / test_labels hold the full-test results, as they did
# before, so later cells that read `preds` see the same values.
for title, X_split, y_split in (("Valid", X_val, y_val), ("Test", X_te, y_te)):
    test_pred = model.predict(X_split)
    # Replace missing predictions (None) with the outside tag 'O'.
    preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
    # Cast gold labels to one-character strings ('<U1' keeps only the first character).
    test_labels = [np.array(i).astype('<U1').tolist() for i in y_split]
    print("-"*10, title, "-"*10)
    print(classification_report(test_labels, preds))

Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 1327/1327 [02:54<00:00,  7.61it/s]


---------- Valid ----------
              precision    recall  f1-score   support

           _       0.91      0.90      0.90     38680

   micro avg       0.91      0.90      0.90     38680
   macro avg       0.91      0.90      0.90     38680
weighted avg       0.91      0.90      0.90     38680



Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 2650/2650 [05:48<00:00,  7.61it/s]


---------- Test ----------
              precision    recall  f1-score   support

           _       0.91      0.91      0.91     79878

   micro avg       0.91      0.91      0.91     79878
   macro avg       0.91      0.91      0.91     79878
weighted avg       0.91      0.91      0.91     79878



In [186]:
# Entity-level evaluation of the full test split with nervaluate.
# `preds` is expected to hold the predictions for X_te, in the same order as y_te.
# NOTE(review): tags=[""] presumably matches labels with no entity-type suffix — confirm.
evaluator = Evaluator(
    [list(gold_sequence) for gold_sequence in y_te],
    preds,
    tags=[""],
    loader='list',
)
results, results_per_tag = evaluator.evaluate()

# Show the overall metrics as the cell output.
results

{'ent_type': {'correct': 73279,
  'incorrect': 0,
  'partial': 0,
  'missed': 6599,
  'spurious': 6286,
  'possible': 79878,
  'actual': 79565,
  'precision': 0.920995412555772,
  'recall': 0.9173865144345126,
  'f1': 0.9191874212100877},
 'partial': {'correct': 72484,
  'incorrect': 0,
  'partial': 795,
  'missed': 6599,
  'spurious': 6286,
  'possible': 79878,
  'actual': 79565,
  'precision': 0.915999497266386,
  'recall': 0.9124101755176645,
  'f1': 0.9142013133220022},
 'strict': {'correct': 72484,
  'incorrect': 795,
  'partial': 0,
  'missed': 6599,
  'spurious': 6286,
  'possible': 79878,
  'actual': 79565,
  'precision': 0.9110035819769999,
  'recall': 0.9074338366008162,
  'f1': 0.9092152054339168},
 'exact': {'correct': 72484,
  'incorrect': 795,
  'partial': 0,
  'missed': 6599,
  'spurious': 6286,
  'possible': 79878,
  'actual': 79565,
  'precision': 0.9110035819769999,
  'recall': 0.9074338366008162,
  'f1': 0.9092152054339168}}

In [187]:
# Evaluate on the test samples flagged as unseen.
test_pred = model.predict(X_te_unseen)

# Replace missing predictions (None) with the outside tag 'O'.
preds = [
    ['O' if tag is None else tag for tag in sequence]
    for sequence in test_pred
]

# Cast gold labels to one-character strings ('<U1' keeps only the first character).
test_labels = []
for gold_sequence in y_te_unseen:
    test_labels.append(np.array(gold_sequence).astype('<U1').tolist())

print("-"*10, "Unseen", "-"*10)
print(classification_report(test_labels, preds))

Predicting: 100%|█████████████████████████████████████████████████████████████████████████████████████| 295/295 [00:39<00:00,  7.38it/s]


---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.86      0.75      0.80      9834

   micro avg       0.86      0.75      0.80      9834
   macro avg       0.86      0.75      0.80      9834
weighted avg       0.86      0.75      0.80      9834



In [189]:
# Inspect one unseen-test sample: tokens, gold tags and predicted tags.
# `preds` must still hold the predictions for X_te_unseen at this point.
sample_id = 102
print_results(
    X_te_unseen[sample_id],
    y_te_unseen[sample_id],
    preds[sample_id],
    print_all=False,
)

word           | gt| pr
-------------------------
are            | O | O 
with           | O | O 
dense          | O | O 
objects        | O | O 
in             | O | O 
TinyPerson     | B | B 
,              | O | O 
DETECTIONS     | O | O 
PER            | O | O 
IMG            | O | O 


# BERT

In [258]:
# NOTE(review): the recorded output below reports loading from
# '../ckpts/bert_base.bin', which differs from this path — the output may be stale.
savefile = './checkpoints/bert_base.bin'
# Load the saved model from disk; this rebinds `model`, replacing the one loaded earlier.
model = load_model(savefile)

Loading model from ../ckpts/bert_base.bin...
Defaulting to linear classifier/regressor
Building sklearn token classifier...


In [259]:
# Print the loaded classifier's representation (its configuration) for inspection.
print(model)

BertTokenClassifier(bert_config_json={'architectures': ['BertForMaskedLM'],
                                      'attention_probs_dropout_prob': 0.1,
                                      'hidden_act': 'gelu',
                                      'hidden_dropout_prob': 0.1,
                                      'hidden_size': 768,
                                      'initializer_range': 0.02,
                                      'intermediate_size': 3072,
                                      'layer_norm_eps': 1e-12,
                                      'max_position_embeddings': 512,
                                      'model_type': 'bert',
                                      'num_attention_heads': 12,
                                      'num_hidden_layers': 12,
                                      'pad_t...
                                            ('[unused21]', 21),
                                            ('[unused22]', 22),
                                      

In [261]:
def evaluate_split(title, X_split, y_split):
    """Predict tags for one split with the global ``model`` and print a report.

    Args:
        title: Text shown in the banner printed above the report.
        X_split: Input sequences passed to ``model.predict``.
        y_split: Gold tag sequences, in the same order as ``X_split``.

    Returns:
        A tuple ``(raw_pred, filled_pred, gold)``: the raw output of
        ``model.predict``, the same predictions with ``None`` replaced by
        ``'O'``, and the gold tags cast to one-character strings.
    """
    raw_pred = model.predict(X_split)
    # Replace missing predictions (None) with the outside tag 'O'.
    filled_pred = [['O' if tag is None else tag for tag in seq] for seq in raw_pred]
    # '<U1' keeps only the first character of each gold label.
    gold = [np.array(seq).astype('<U1').tolist() for seq in y_split]
    print("-"*10, title, "-"*10)
    print(classification_report(gold, filled_pred))
    return raw_pred, filled_pred, gold


# Each call rebinds test_pred / preds / test_labels, exactly as the repeated
# inline stanzas did, so the notebook state after every step is unchanged.
test_pred, preds, test_labels = evaluate_split("Valid", X_val, y_val)
test_pred, preds, test_labels = evaluate_split("Test Seen", X_te_seen, y_te_seen)
test_pred, preds, test_labels = evaluate_split("Test", X_te, y_te)

# Entity-level evaluation on the full test split; must run while `preds`
# still holds the predictions for X_te.
evaluator = Evaluator([list(i) for i in y_te], preds,
                      tags=[""], loader='list')
results, results_per_tag = evaluator.evaluate()
print(results)

# Run "Unseen" last so `preds` ends up aligned with X_te_unseen for later cells.
test_pred, preds, test_labels = evaluate_split("Unseen", X_te_unseen, y_te_unseen)

Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 1327/1327 [02:55<00:00,  7.55it/s]


---------- Valid ----------
              precision    recall  f1-score   support

           _       0.91      0.91      0.91     38680

   micro avg       0.91      0.91      0.91     38680
   macro avg       0.91      0.91      0.91     38680
weighted avg       0.91      0.91      0.91     38680



Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 2355/2355 [05:10<00:00,  7.57it/s]


---------- Test Seen ----------
              precision    recall  f1-score   support

           _       0.92      0.93      0.92     70044

   micro avg       0.92      0.93      0.92     70044
   macro avg       0.92      0.93      0.92     70044
weighted avg       0.92      0.93      0.92     70044



Predicting: 100%|███████████████████████████████████████████████████████████████████████████████████| 2650/2650 [05:48<00:00,  7.60it/s]


---------- Test ----------
              precision    recall  f1-score   support

           _       0.91      0.91      0.91     79878

   micro avg       0.91      0.91      0.91     79878
   macro avg       0.91      0.91      0.91     79878
weighted avg       0.91      0.91      0.91     79878

{'ent_type': {'correct': 73591, 'incorrect': 0, 'partial': 0, 'missed': 6287, 'spurious': 6469, 'possible': 79878, 'actual': 80060, 'precision': 0.9191981014239321, 'recall': 0.9212924710183029, 'f1': 0.9202440945866525}, 'partial': {'correct': 72822, 'incorrect': 0, 'partial': 769, 'missed': 6287, 'spurious': 6469, 'possible': 79878, 'actual': 80060, 'precision': 0.9143954534099425, 'recall': 0.916478880292446, 'f1': 0.9154359814428091}, 'strict': {'correct': 72822, 'incorrect': 769, 'partial': 0, 'missed': 6287, 'spurious': 6469, 'possible': 79878, 'actual': 80060, 'precision': 0.909592805395953, 'recall': 0.9116652895665891, 'f1': 0.9106278682989658}, 'exact': {'correct': 72822, 'incorrec

Predicting: 100%|█████████████████████████████████████████████████████████████████████████████████████| 295/295 [00:40<00:00,  7.30it/s]


---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.86      0.76      0.81      9834

   micro avg       0.86      0.76      0.81      9834
   macro avg       0.86      0.76      0.81      9834
weighted avg       0.86      0.76      0.81      9834



In [None]:
# Inspect one unseen-test sample: tokens, gold tags and predicted tags.
# `preds` must still hold the predictions for X_te_unseen at this point.
sample_id = 102
print_results(
    X_te_unseen[sample_id],
    y_te_unseen[sample_id],
    preds[sample_id],
    print_all=False,
)