In [1]:
import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm, trange

from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from utils import *

In [13]:
# ---------- Load train test split -----------
# NOTE: pickle can execute arbitrary code on load; only open split files you trust.
with open("./data/train_test_split_0331.p", "rb") as split_file:
    train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(split_file)

# Index lists, filled by the cell that walks over X_pids.
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

# Every pid that appears in any test category other than "unseen".
# A set makes the filter below one pass instead of one list scan per category per pid.
pids_in_other_categories = set()
for category, category_pids in test_pids_cat.items():
    if category != "unseen":
        pids_in_other_categories.update(category_pids)

# Candidates are the "unseen" test pids plus the validation pids; keep those
# that occur in no other category (original order is preserved).
unseen_pids = [
    pid
    for pid in test_pids_cat["unseen"] + valid_pids.tolist()
    if pid not in pids_in_other_categories
]
len(test_pids_cat["unseen"]), len(unseen_pids)

(8456, 836)

# Load Data

In [14]:
# Load the (X, y, X_pids) triple; the context manager closes the file handle
# that a bare open() would leave dangling.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
input_file = "./data/ml_datasetname_inputs_flv0.p"
with open(input_file, "rb") as inputs_fh:
    X, y, X_pids = pickle.load(inputs_fh)

In [15]:
# Sets give O(1) membership tests; testing against the original arrays/lists
# rescans them for every one of the len(X_pids) samples.
train_pid_set, valid_pid_set = set(train_pids), set(valid_pids)
test_pid_set, unseen_pid_set = set(test_pids), set(unseen_pids)

# Start from empty lists so re-running this cell does not append duplicates.
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

for i in trange(len(X_pids)):
    pid = X_pids[i]
    # A sample goes to the first of train / valid / test whose pid list contains it.
    if pid in train_pid_set:
        train_idxs.append(i)
    elif pid in valid_pid_set:
        valid_idxs.append(i)
    elif pid in test_pid_set:
        test_idxs.append(i)
    # "unseen" is checked independently, so it can overlap with the splits above.
    if pid in unseen_pid_set:
        unseen_idxs.append(i)

tot = len(train_idxs) + len(valid_idxs) + len(test_idxs)
print(f"nSamples: train={len(train_idxs):,} ({len(train_idxs)*100/tot:.2f}%), valid={len(valid_idxs):,} ({len(valid_idxs)*100/tot:.2f}%)")
print(f"test={len(test_idxs):,} ({len(test_idxs)*100/tot:.2f}%), unseen = {len(unseen_idxs):,} ({len(unseen_idxs)*100/tot:.2f}%)")

  0%|          | 0/210185 [00:00<?, ?it/s]

nSamples: train=146,580 (69.74%), valid=21,217 (10.09%)
test=42,388 (20.17%), unseen = 7,608 (3.62%)


# BiLSTM

In [5]:
# ---------- Preprocess inputs-----------
# NOTE: X and y are overwritten below, so this cell must run exactly once
# after the raw data has been loaded.

# Vocabulary: every distinct token gets the next free integer id, in first-seen order.
word_to_ix = {}
for sent in X:
    for word in sent:
        # setdefault only inserts when the token has no id yet
        word_to_ix.setdefault(word, len(word_to_ix))
word_to_ix["ENDPAD"] = len(word_to_ix)  # id used to fill padded positions
words = word_to_ix.keys()
ix_to_word = {ix: token for token, ix in word_to_ix.items()}

tag_to_ix = {'O': 0, 'B': 1, 'I': 2}

# Tokens -> integer ids, tags -> one-hot vectors over the three tags.
X = [[word_to_ix[token] for token in sentence] for sentence in X]
y = [[to_categorical(tag_to_ix[tag], num_classes=3) for tag in tags] for tags in y]

# The length of the first sequence is used as the common length for all of them.
max_len = len(X[0])
n_words = len(word_to_ix)
n_tags = len(tag_to_ix)

X = pad_sequences(sequences=X, maxlen=max_len, padding="post", value=word_to_ix["ENDPAD"])
y = pad_sequences(sequences=y, maxlen=max_len, padding="post", value=tag_to_ix["O"])

In [16]:
# ----------- splitting -------------
# A set makes each "is this test sample unseen?" check O(1); scanning the
# unseen_idxs list once per test index (four times over) is needlessly quadratic.
unseen_idx_set = set(unseen_idxs)
test_seen_idxs = [i for i in test_idxs if i not in unseen_idx_set]
test_unseen_idxs = [i for i in test_idxs if i in unseen_idx_set]

X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])
X_te_seen = np.array([X[i] for i in test_seen_idxs])
X_te_unseen = np.array([X[i] for i in test_unseen_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])
y_te_unseen = np.array([y[i] for i in test_unseen_idxs])
y_te_seen = np.array([y[i] for i in test_seen_idxs])

  X_tr = np.array([X[i] for i in train_idxs])
  X_val = np.array([X[i] for i in valid_idxs])
  X_te = np.array([X[i] for i in test_idxs])
  X_te_seen = np.array([X[i] for i in test_idxs if i not in unseen_idxs])
  X_te_unseen = np.array([X[i] for i in test_idxs if i in unseen_idxs])
  y_tr = np.array([y[i] for i in train_idxs])
  y_val = np.array([y[i] for i in valid_idxs])
  y_te = np.array([y[i] for i in test_idxs])
  y_te_unseen = np.array([y[i] for i in test_idxs if i in unseen_idxs])
  y_te_seen = np.array([y[i] for i in test_idxs if i not in unseen_idxs])


## BiLSTM (without pretrained embeddings)

In [7]:
# Rebuild the tagger architecture, then restore the weights stored under
# ./checkpoints/BiLSTM_no_pretrain.
input_ = Input(shape=(max_len,))
embedded = Embedding(input_dim=n_words, output_dim=300, input_length=max_len)(input_)
embedded = Dropout(0.1)(embedded)
encoded = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedded)
# Per-timestep softmax over the n_tags tag classes
out = TimeDistributed(Dense(n_tags, activation="softmax"))(encoded)
model = Model(input_, out)
model.load_weights('./checkpoints/BiLSTM_no_pretrain')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x147f23406580>

In [8]:
# Validation split: predict, convert gold targets and model output with
# pred2labels, and print the classification report.
valid_pred = model.predict(X_val, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_val, valid_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.92      0.77      0.84     38680

   micro avg       0.92      0.77      0.84     38680
   macro avg       0.92      0.77      0.84     38680
weighted avg       0.92      0.77      0.84     38680



In [9]:
# Test samples that are not in unseen_idxs ("seen" subset).
seen_pred = model.predict(X_te_seen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_seen, seen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.93      0.81      0.87     70044

   micro avg       0.93      0.81      0.87     70044
   macro avg       0.93      0.81      0.87     70044
weighted avg       0.93      0.81      0.87     70044



In [10]:
# Full test split (seen and unseen together).
test_pred = model.predict(X_te, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te, test_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.92      0.78      0.85     79878

   micro avg       0.92      0.78      0.85     79878
   macro avg       0.92      0.78      0.85     79878
weighted avg       0.92      0.78      0.85     79878



In [11]:
# Zero-shot: test samples whose index is in unseen_idxs.
unseen_pred = model.predict(X_te_unseen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_unseen, unseen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.88      0.58      0.70      9834

   micro avg       0.88      0.58      0.70      9834
   macro avg       0.88      0.58      0.70      9834
weighted avg       0.88      0.58      0.70      9834



## BiLSTM (GloVe)

In [27]:
# Same architecture as the other BiLSTM sections; only the checkpoint differs
# (./checkpoints/BiLSTM_Glove).
input_ = Input(shape=(max_len,))
embedded = Embedding(input_dim=n_words, output_dim=300, input_length=max_len)(input_)
embedded = Dropout(0.1)(embedded)
encoded = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedded)
# Per-timestep softmax over the n_tags tag classes
out = TimeDistributed(Dense(n_tags, activation="softmax"))(encoded)
model = Model(input_, out)
model.load_weights('./checkpoints/BiLSTM_Glove')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x154df6b2f6a0>

In [18]:
# Validation split: predict, convert gold targets and model output with
# pred2labels, and print the classification report.
valid_pred = model.predict(X_val, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_val, valid_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.90      0.84      0.87     38680

   micro avg       0.90      0.84      0.87     38680
   macro avg       0.90      0.84      0.87     38680
weighted avg       0.90      0.84      0.87     38680



In [19]:
# Test samples that are not in unseen_idxs ("seen" subset).
seen_pred = model.predict(X_te_seen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_seen, seen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.91      0.88      0.89     70044

   micro avg       0.91      0.88      0.89     70044
   macro avg       0.91      0.88      0.89     70044
weighted avg       0.91      0.88      0.89     70044



In [20]:
# Full test split (seen and unseen together).
test_pred = model.predict(X_te, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te, test_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.90      0.85      0.87     79878

   micro avg       0.90      0.85      0.87     79878
   macro avg       0.90      0.85      0.87     79878
weighted avg       0.90      0.85      0.87     79878



In [28]:
# Zero-shot: test samples whose index is in unseen_idxs.
unseen_pred = model.predict(X_te_unseen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_unseen, unseen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.85      0.64      0.73      9834

   micro avg       0.85      0.64      0.73      9834
   macro avg       0.85      0.64      0.73      9834
weighted avg       0.85      0.64      0.73      9834



## BiLSTM(word2vec)

In [21]:
# Same architecture as the other BiLSTM sections; only the checkpoint differs
# (./checkpoints/BiLSTM_word2vec).
input_ = Input(shape=(max_len,))
embedded = Embedding(input_dim=n_words, output_dim=300, input_length=max_len)(input_)
embedded = Dropout(0.1)(embedded)
encoded = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(embedded)
# Per-timestep softmax over the n_tags tag classes
out = TimeDistributed(Dense(n_tags, activation="softmax"))(encoded)
model = Model(input_, out)
model.load_weights('./checkpoints/BiLSTM_word2vec')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x154df70d3070>

In [22]:
# Validation split: predict, convert gold targets and model output with
# pred2labels, and print the classification report.
valid_pred = model.predict(X_val, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_val, valid_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.91      0.85      0.88     38680

   micro avg       0.91      0.85      0.88     38680
   macro avg       0.91      0.85      0.88     38680
weighted avg       0.91      0.85      0.88     38680



In [23]:
# Test samples that are not in unseen_idxs ("seen" subset).
seen_pred = model.predict(X_te_seen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_seen, seen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.91      0.89      0.90     70044

   micro avg       0.91      0.89      0.90     70044
   macro avg       0.91      0.89      0.90     70044
weighted avg       0.91      0.89      0.90     70044



In [24]:
# Full test split (seen and unseen together).
test_pred = model.predict(X_te, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te, test_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.91      0.86      0.88     79878

   micro avg       0.91      0.86      0.88     79878
   macro avg       0.91      0.86      0.88     79878
weighted avg       0.91      0.86      0.88     79878



In [26]:
# Zero-shot: test samples whose index is in unseen_idxs.
unseen_pred = model.predict(X_te_unseen, verbose=1)
test_labels, pred_labels = (pred2labels(scores).tolist() for scores in (y_te_unseen, unseen_pred))
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

           _       0.89      0.65      0.75      9834

   micro avg       0.89      0.65      0.75      9834
   macro avg       0.89      0.65      0.75      9834
weighted avg       0.89      0.65      0.75      9834



# CRF

In [50]:
# load processed data
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
with open("./data/CRF_X.p", "rb") as crf_features_fh:
    X = pickle.load(crf_features_fh)

# The BiLSTM preprocessing cell replaces y with padded one-hot arrays, while the
# CRF cells below work with string tags. Reload the raw tags so this section
# also works when the notebook is run top to bottom.
with open(input_file, "rb") as inputs_fh:
    _, y, _ = pickle.load(inputs_fh)

In [55]:
# ----------- splitting -------------
# A set makes each "is this test sample unseen?" check O(1); scanning the
# unseen_idxs list once per test index (four times over) is needlessly quadratic.
unseen_idx_set = set(unseen_idxs)
test_seen_idxs = [i for i in test_idxs if i not in unseen_idx_set]
test_unseen_idxs = [i for i in test_idxs if i in unseen_idx_set]

X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])
X_te_seen = np.array([X[i] for i in test_seen_idxs])
X_te_unseen = np.array([X[i] for i in test_unseen_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])
y_te_unseen = np.array([y[i] for i in test_unseen_idxs])
y_te_seen = np.array([y[i] for i in test_seen_idxs])

  X_tr = np.array([X[i] for i in train_idxs])
  X_val = np.array([X[i] for i in valid_idxs])
  X_te = np.array([X[i] for i in test_idxs])
  X_te_seen = np.array([X[i] for i in test_idxs if i not in unseen_idxs])
  X_te_unseen = np.array([X[i] for i in test_idxs if i in unseen_idxs])
  y_tr = np.array([y[i] for i in train_idxs])
  y_val = np.array([y[i] for i in valid_idxs])
  y_te = np.array([y[i] for i in test_idxs])
  y_te_unseen = np.array([y[i] for i in test_idxs if i in unseen_idxs])
  y_te_seen = np.array([y[i] for i in test_idxs if i not in unseen_idxs])


In [56]:
from sklearn_crfsuite import CRF  # presumably the class of the pickled model; kept to make the dependency visible

# Load the already-trained CRF tagger from disk.
# No fresh CRF(algorithm='lbfgs', c1=10, c2=0.1, max_iterations=100,
# all_possible_transitions=False) is constructed here: such an instance would be
# overwritten immediately by the pickled model and never used.
# NOTE(review): presumably those were the training hyperparameters - confirm.
# NOTE: pickle can execute arbitrary code on load; only open files you trust.
filename = 'crf_Qi_v2.pkl'
with open(filename, 'rb') as crf_model_fh:
    crf = pickle.load(crf_model_fh)

In [36]:
# load trained CRF model
# filename = 'crf_Qi_v2.pkl'
# crf = pickle.load(open(filename, 'rb'))

In [57]:
# Full test split: tag with the CRF and report against the gold tag lists.
test_pred = crf.predict(X_te)
gold_te = list(map(list, y_te))
print(classification_report(gold_te, test_pred))

              precision    recall  f1-score   support

           _       0.90      0.83      0.86     79878

   micro avg       0.90      0.83      0.86     79878
   macro avg       0.90      0.83      0.86     79878
weighted avg       0.90      0.83      0.86     79878



In [58]:
# Validation split
test_pred = crf.predict(X_val)
# Any prediction that is None is reported as the 'O' tag.
preds = [['O' if tag is None else tag for tag in seq] for seq in test_pred]
# Casting to '<U1' keeps only the first character of each gold label.
test_labels = [np.array(seq).astype('<U1').tolist() for seq in y_val]
rule = "-" * 10
print(rule, "Valid", rule)
print(classification_report(test_labels, preds))

---------- Valid ----------
              precision    recall  f1-score   support

           _       0.90      0.81      0.85     38680

   micro avg       0.90      0.81      0.85     38680
   macro avg       0.90      0.81      0.85     38680
weighted avg       0.90      0.81      0.85     38680



In [59]:
# Unseen subset of the test split
test_pred = crf.predict(X_te_unseen)
# Any prediction that is None is reported as the 'O' tag.
preds = [['O' if tag is None else tag for tag in seq] for seq in test_pred]
# Casting to '<U1' keeps only the first character of each gold label.
test_labels = [np.array(seq).astype('<U1').tolist() for seq in y_te_unseen]
rule = "-" * 10
print(rule, "Unseen", rule)
print(classification_report(test_labels, preds))

---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.89      0.63      0.73      9834

   micro avg       0.89      0.63      0.73      9834
   macro avg       0.89      0.63      0.73      9834
weighted avg       0.89      0.63      0.73      9834



In [60]:
# Seen subset of the test split
test_pred = crf.predict(X_te_seen)
# Any prediction that is None is reported as the 'O' tag.
preds = [['O' if tag is None else tag for tag in seq] for seq in test_pred]
# Casting to '<U1' keeps only the first character of each gold label.
test_labels = [np.array(seq).astype('<U1').tolist() for seq in y_te_seen]
rule = "-" * 10
print(rule, "Seen", rule)
print(classification_report(test_labels, preds))

---------- Seen ----------
              precision    recall  f1-score   support

           _       0.90      0.86      0.88     70044

   micro avg       0.90      0.86      0.88     70044
   macro avg       0.90      0.86      0.88     70044
weighted avg       0.90      0.86      0.88     70044



# BERT

In [12]:
# Restore the saved BERT model from disk with bert_sklearn's load_model.
# Note: this rebinds `model`, the name the BiLSTM sections use for the Keras
# model, so those evaluation cells must not be re-run after this one without
# rebuilding their model first.
from bert_sklearn import load_model
savefile = './checkpoints/bert_base.bin'
model = load_model(savefile)

2022-05-02 12:13:15 bert_sklearn.model.pytorch_pretrained.modeling INFO: Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


Loading model from ./checkpoints/bert_base.bin...


2022-05-02 12:13:18 bert_sklearn.model.pytorch_pretrained.modeling INFO: Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}



Defaulting to linear classifier/regressor
Building sklearn token classifier...


In [17]:
# NOTE(review): the X_* / y_* arrays come from whichever split cell ran last.
# In top-to-bottom order that is the CRF split (X loaded from CRF_X.p);
# presumably the BERT model expects the raw token sequences instead - confirm,
# and re-run the data loading and splitting cells before this one if so.


def _report_split(title, X_split, y_split):
    """Predict one split with `model` and print its classification report.

    Args:
        title: Label printed in the banner above the report.
        X_split: Inputs passed unchanged to ``model.predict``.
        y_split: Gold tag sequences, one sequence per sample.

    Returns:
        Tuple ``(raw_pred, cleaned_pred, gold_labels)``: the predictions as
        returned by the model, the predictions with ``None`` replaced by
        ``'O'``, and the gold labels cut down to their first character.
    """
    raw_pred = model.predict(X_split)
    # Any prediction that is None is reported as the 'O' tag.
    cleaned_pred = [[tag if tag is not None else 'O' for tag in seq] for seq in raw_pred]
    # Casting to '<U1' keeps only the first character of each gold label.
    gold_labels = [np.array(seq).astype('<U1').tolist() for seq in y_split]
    print("-"*10, title, "-"*10)
    print(classification_report(gold_labels, cleaned_pred))
    return raw_pred, cleaned_pred, gold_labels


# The results are assigned to the same module-level names on every call,
# so the cell leaves the same variables behind as the unrolled version did.
test_pred, preds, test_labels = _report_split("Valid", X_val, y_val)
test_pred, preds, test_labels = _report_split("Test Seen", X_te_seen, y_te_seen)
test_pred, preds, test_labels = _report_split("Test", X_te, y_te)

# Entity-level evaluation of the full test split with nervaluate; it must run
# here, while `preds` still holds the predictions for X_te.
evaluator = Evaluator([list(i) for i in y_te], preds,
                      tags=[""], loader='list')
results, results_per_tag = evaluator.evaluate()
print(results)

test_pred, preds, test_labels = _report_split("Unseen", X_te_unseen, y_te_unseen)

Predicting: 100%|██████████| 1327/1327 [01:23<00:00, 15.93it/s]


---------- Valid ----------
              precision    recall  f1-score   support

           _       0.91      0.93      0.92     38680

   micro avg       0.91      0.93      0.92     38680
   macro avg       0.91      0.93      0.92     38680
weighted avg       0.91      0.93      0.92     38680



Predicting: 100%|██████████| 2355/2355 [02:29<00:00, 15.76it/s]


---------- Test Seen ----------
              precision    recall  f1-score   support

           _       0.97      1.00      0.98     70044

   micro avg       0.97      1.00      0.98     70044
   macro avg       0.97      1.00      0.98     70044
weighted avg       0.97      1.00      0.98     70044



Predicting: 100%|██████████| 2650/2650 [02:46<00:00, 15.91it/s]


---------- Test ----------
              precision    recall  f1-score   support

           _       0.97      0.99      0.98     79878

   micro avg       0.97      0.99      0.98     79878
   macro avg       0.97      0.99      0.98     79878
weighted avg       0.97      0.99      0.98     79878

{'ent_type': {'correct': 79510, 'incorrect': 0, 'partial': 0, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9756187344319422, 'recall': 0.9953929742857858, 'f1': 0.9854066615027111}, 'partial': {'correct': 79437, 'incorrect': 0, 'partial': 73, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9751708651852216, 'recall': 0.9949360274418488, 'f1': 0.9849542989930288}, 'strict': {'correct': 79437, 'incorrect': 73, 'partial': 0, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9747229959385008, 'recall': 0.9944790805979118, 'f1': 0.9845019364833463}, 'exact': {'correct': 79437, 'incorrect':

Predicting: 100%|██████████| 295/295 [00:19<00:00, 15.49it/s]


---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.98      0.99      0.99      9834

   micro avg       0.98      0.99      0.99      9834
   macro avg       0.98      0.99      0.99      9834
weighted avg       0.98      0.99      0.99      9834

