# Dataset overview

In [3]:
import pandas as pd
import pickle
import numpy as np
import re
import json
import regex
from ast import literal_eval
from nltk import pos_tag
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

from tqdm.notebook import tqdm, trange

from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from utils import *

In [4]:
# ---------- Load train test split -----------
# train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("train_test_split.p", "rb"))
train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(open("./data/train_test_split_0331.p", "rb"))
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

unseen_pids = test_pids_cat["unseen"] + valid_pids.tolist()
for k in test_pids_cat.keys():
    if k == "unseen":
        continue
    unseen_pids = [i for i in unseen_pids if i not in test_pids_cat[k]]
len(test_pids_cat["unseen"]), len(unseen_pids)

(8456, 836)

In [5]:
test_pids_cat.keys()

dict_keys(['unseen', 'multiword_true', 'multiword_false', 'alphabetOnly_true', 'alphabetOnly_false', 'frequent_true', 'frequent_false'])

In [6]:
input_file = "./data/ml_datasetname_inputs_flv0.p"
X, y, X_pids = pickle.load(open(input_file, "rb"))

for i in range(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
    if X_pids[i] in unseen_pids:
        unseen_idxs.append(i)
        
tot = len(train_idxs) + len(valid_idxs) + len(test_idxs)
print(f"nSamples: train={len(train_idxs):,} ({len(train_idxs)*100/tot:.2f}%), valid={len(valid_idxs):,} ({len(valid_idxs)*100/tot:.2f}%)")
print(f"test={len(test_idxs):,} ({len(test_idxs)*100/tot:.2f}%), unseen = {len(unseen_idxs):,} ({len(unseen_idxs)*100/tot:.2f}%)")

nSamples: train=146,580 (69.74%), valid=21,217 (10.09%)
test=42,388 (20.17%), unseen = 7,608 (3.62%)


In [7]:
# ----------- spliting -------------
X_tr = [X[i] for i in train_idxs]
X_val = [X[i] for i in valid_idxs]
X_te = [X[i] for i in test_idxs]
X_te_seen = [X[i] for i in test_idxs if i not in unseen_idxs]
X_te_unseen = [X[i] for i in test_idxs if i in unseen_idxs]

y_tr = [y[i] for i in train_idxs]
y_val = [y[i] for i in valid_idxs]
y_te = [y[i] for i in test_idxs]
y_te_unseen = [y[i] for i in test_idxs if i in unseen_idxs]
y_te_seen = [y[i] for i in test_idxs if i not in unseen_idxs]

# Model 1: CRF

In [5]:
# CRF feature
def word2features(words, poss, i):
    word = words[i]
    postag = poss[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = str(words[i-1])
        postag1 = str(poss[i-1][1])
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(words)-1:
        word1 = str(words[i+1])
        postag1 = str(poss[i+1][1])
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    poss = pos_tag(sent)
    return [word2features(sent, poss, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [22]:
# NLTK to process the sentence
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tuo96248/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [23]:
# Processing the sequence data to fit the CRF
X = [sent2features(s) for s in tqdm(X)]
pickle.dump(X, open("./data/CRF_X.p", "wb"))

  0%|          | 0/210185 [00:00<?, ?it/s]

# Run CRF

In [58]:
# import pickle
# import numpy as np
# from keras.models import Model, Input
# from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
# from keras.metrics import Precision, Recall
# from tensorflow.keras.utils import to_categorical
# from tqdm.keras import TqdmCallback
# import tensorflow as tf
# from keras.preprocessing.sequence import pad_sequences

# from tqdm.notebook import tqdm, trange

# from nervaluate import Evaluator
# from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# from utils import *

In [3]:
# ---------- Load train test split -----------
# train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("train_test_split.p", "rb"))
train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(open("./data/train_test_split_0331.p", "rb"))
train_idxs, valid_idxs, test_idxs, unseen_idxs = [], [], [], []

unseen_pids = test_pids_cat["unseen"] + valid_pids.tolist()
for k in test_pids_cat.keys():
    if k == "unseen":
        continue
    unseen_pids = [i for i in unseen_pids if i not in test_pids_cat[k]]
len(test_pids_cat["unseen"]), len(unseen_pids)

(8456, 836)

In [60]:
test_pids_cat.keys()

dict_keys(['unseen', 'multiword_true', 'multiword_false', 'alphabetOnly_true', 'alphabetOnly_false', 'frequent_true', 'frequent_false'])

In [4]:
input_file = "./data/ml_datasetname_inputs_flv0.p"
X, y, X_pids = pickle.load(open(input_file, "rb"))

for i in trange(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
    if X_pids[i] in unseen_pids:
        unseen_idxs.append(i)
        
tot = len(train_idxs) + len(valid_idxs) + len(test_idxs)
print(f"nSamples: train={len(train_idxs):,} ({len(train_idxs)*100/tot:.2f}%), valid={len(valid_idxs):,} ({len(valid_idxs)*100/tot:.2f}%)")
print(f"test={len(test_idxs):,} ({len(test_idxs)*100/tot:.2f}%), unseen = {len(unseen_idxs):,} ({len(unseen_idxs)*100/tot:.2f}%)")

  0%|          | 0/210185 [00:00<?, ?it/s]

nSamples: train=146,580 (69.74%), valid=21,217 (10.09%)
test=42,388 (20.17%), unseen = 7,608 (3.62%)


In [5]:
# load processed data
X = pickle.load(open("./data/CRF_X.p", "rb"))

In [6]:
# ----------- spliting -------------
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])
X_te_seen = np.array([X[i] for i in test_idxs if i not in unseen_idxs])
X_te_unseen = np.array([X[i] for i in test_idxs if i in unseen_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])
y_te_unseen = np.array([y[i] for i in test_idxs if i in unseen_idxs])
y_te_seen = np.array([y[i] for i in test_idxs if i not in unseen_idxs])

  X_tr = np.array([X[i] for i in train_idxs])
  X_val = np.array([X[i] for i in valid_idxs])
  X_te = np.array([X[i] for i in test_idxs])
  X_te_seen = np.array([X[i] for i in test_idxs if i not in unseen_idxs])
  X_te_unseen = np.array([X[i] for i in test_idxs if i in unseen_idxs])
  y_tr = np.array([y[i] for i in train_idxs])
  y_val = np.array([y[i] for i in valid_idxs])
  y_te = np.array([y[i] for i in test_idxs])
  y_te_unseen = np.array([y[i] for i in test_idxs if i in unseen_idxs])
  y_te_seen = np.array([y[i] for i in test_idxs if i not in unseen_idxs])


In [11]:
#
len(X_tr)

146580

In [14]:
size_list = list(range(10000,51000,5000))
for i in size_list:
    print(i)

10000
15000
20000
25000
30000
35000
40000
45000
50000


In [8]:
from sklearn_crfsuite import CRF
crf = CRF(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [10]:
#train with mini batch
X_tr_batch = X_tr[0:100]
y_tr_batch = y_tr[0:100]

crf.fit(X_tr_batch, y_tr_batch)



CRF(algorithm='lbfgs', all_possible_transitions=False, c1=10, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [15]:

training_size

'Training size:100'

In [17]:
# Test for unseen
test_pred = crf.predict(X_te_unseen)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te_unseen]

In [22]:
report = classification_report(test_labels, preds)
training_size = "Training size:"+(str(len(X_tr_batch)))

In [29]:
with open('crf_train.txt', "a") as file:
    file.write('\n')
    file.write(training_size)
    file.write('\n')
    file.write(report)
    file.write('\n')
file.close()

In [70]:
# MODEL
# %%time
from sklearn_crfsuite import CRF
crf = CRF(algorithm='lbfgs',
          c1=10,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)
crf.fit(X_tr, y_tr)

with open('crf_Qi_v2.pkl','wb') as f:
    pickle.dump(crf,f)

In [71]:
test_pred = crf.predict(X_te)
print(classification_report([list(i) for i in y_te], test_pred))

              precision    recall  f1-score   support

           _       0.90      0.83      0.86     79878

   micro avg       0.90      0.83      0.86     79878
   macro avg       0.90      0.83      0.86     79878
weighted avg       0.90      0.83      0.86     79878



In [72]:
# validation
test_pred = crf.predict(X_val)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_val]
print("-"*10, "Valid", "-"*10)
print(classification_report(test_labels, preds))

---------- Valid ----------
              precision    recall  f1-score   support

           _       0.90      0.81      0.85     38680

   micro avg       0.90      0.81      0.85     38680
   macro avg       0.90      0.81      0.85     38680
weighted avg       0.90      0.81      0.85     38680



In [73]:
# Test for unseen
test_pred = crf.predict(X_te_unseen)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te_unseen]
print("-"*10, "Unseen", "-"*10)
print(classification_report(test_labels, preds))

---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.89      0.63      0.73      9834

   micro avg       0.89      0.63      0.73      9834
   macro avg       0.89      0.63      0.73      9834
weighted avg       0.89      0.63      0.73      9834



In [74]:
# Test for seen
test_pred = crf.predict(X_te_seen)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te_seen]
print("-"*10, "Seen", "-"*10)
print(classification_report(test_labels, preds))

---------- Seen ----------
              precision    recall  f1-score   support

           _       0.90      0.86      0.88     70044

   micro avg       0.90      0.86      0.88     70044
   macro avg       0.90      0.86      0.88     70044
weighted avg       0.90      0.86      0.88     70044



# Model 2: BERT and Sci-BERT

We use the Sklearn implement of BERT. Please see the github repository: https://github.com/charles9n/bert-sklearn

In [2]:
import os
import math
import random
import csv
import sys
import pickle
sys.path.append(os.getcwd() + "/bert-sklearn/")
os.environ["CUDA_VISIBLE_DEVICES"]="4"
#os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

from tqdm import trange
# ---------- Load inputs-----------
X, y, X_pids = pickle.load(open("./data/ml_datasetname_inputs_flv0.p", "rb"))

#-----Dataset Split------
train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(open("./data/train_test_split_0331.p", "rb"))
# train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("./data/train_test_split.p", "rb"))
train_idxs, valid_idxs, test_idxs = [], [], []

print("storing sample indexs for different split")
for i in trange(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
        
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])

print(f"nSamples: train={len(X_tr):,}, valid={len(X_val):,}, test={len(X_te):,}")



#run the Sci-BERT
#%%time
label_list = ['B', 'I', 'O']
# define model

# Choose between BERT or SciBERT

model = BertTokenClassifier(bert_model='bert-base-cased',
# model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
                            max_seq_length=178,
                            epochs=3,
                            gradient_accumulation_steps=4,
                            learning_rate=5e-5,
                            train_batch_size=16,
                            eval_batch_size=16,
                            validation_fraction=0., 
                            label_list=label_list,                         
                            ignore_label=['O'])


print(model)

# X_tr = X_tr[:2000]
# y_tr = X_tr[:2000]

# finetune model
model.fit(np.array(X_tr), np.array(y_tr))


#----save the model---
savefile = './checkpoints/bert.bin'
model.save(savefile)

# # score model
f1_test = model.score(X_te, y_te, 'macro')
print("Test f1: %0.02f"%(f1_test))


#----save the model---


# make predictions
y_preds = model.predict(np.array(X_te))


#----print test restul
evaluator = Evaluator(y_te, y_preds, tags= [''], loader='list')
results, results_per_tag = evaluator.evaluate()
print(results)

#----save the model---
# savefile = './checkpoints/bert.bin'
# model.save(savefile)

2022-04-30 23:24:26 bert_sklearn.model.pytorch_pretrained.modeling INFO: Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .
  0%|          | 451/210185 [00:00<01:41, 2067.85it/s]

storing sample indexs for different split


100%|██████████| 210185/210185 [05:16<00:00, 663.84it/s]
  X_tr = np.array([X[i] for i in train_idxs])
  X_val = np.array([X[i] for i in valid_idxs])
  X_te = np.array([X[i] for i in test_idxs])
  y_tr = np.array([y[i] for i in train_idxs])
  y_val = np.array([y[i] for i in valid_idxs])
  y_te = np.array([y[i] for i in test_idxs])


nSamples: train=146,580, valid=21,217, test=42,388
Building sklearn token classifier...
BertTokenClassifier(bert_model='bert-base-cased', eval_batch_size=16,
                    gradient_accumulation_steps=4, ignore_label=['O'],
                    label_list=['B', 'I', 'O'], learning_rate=5e-05,
                    max_seq_length=178, train_batch_size=16,
                    validation_fraction=0.0)
Loading bert-base-cased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 42388, validation data size: 0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|██████████| 10597/10597 [13:08<00:00, 13.45it/s, loss=0.00986]
Training  : 100%|██████████| 10597/10597 [13:08<00:00, 13.43it/s, loss=0.00234]
Training  : 100%|██████████| 10597/10597 [13:10<00:00, 13.40it/s, loss=0.0014] 
Predicting: 100%|██████████| 2650/2650 [02:50<00:00, 15.52it/s]


Test f1: 98.58


FileNotFoundError: [Errno 2] No such file or directory: '../checkpoints/bert_base.bin'

In [3]:
savefile = './checkpoints/bert_base.bin'
model.save(savefile)

In [8]:
from bert_sklearn import load_model
savefile = './checkpoints/bert_base.bin'
model = load_model(savefile)

2022-05-01 20:59:33 bert_sklearn.model.pytorch_pretrained.modeling INFO: Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


Loading model from ./checkpoints/bert_base.bin...


2022-05-01 20:59:37 bert_sklearn.model.pytorch_pretrained.modeling INFO: Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}



Defaulting to linear classifier/regressor
Building sklearn token classifier...


In [9]:
test_pred = model.predict(X_val)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_val]
print("-"*10, "Valid", "-"*10)
print(classification_report(test_labels, preds))

test_pred = model.predict(X_te_seen)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te_seen]
print("-"*10, "Test Seen", "-"*10)
print(classification_report(test_labels, preds))

test_pred = model.predict(X_te)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te]
print("-"*10, "Test", "-"*10)
print(classification_report(test_labels, preds))

evaluator = Evaluator([list(i) for i in y_te], preds,
                      tags=[""], loader='list')
results, results_per_tag = evaluator.evaluate()
print(results)

test_pred = model.predict(X_te_unseen)
preds = [[j if j is not None else 'O' for j in i] for i in test_pred]
test_labels = [np.array(i).astype('<U1').tolist() for i in y_te_unseen]
print("-"*10, "Unseen", "-"*10)
print(classification_report(test_labels, preds))

  return np.array(X)
Predicting: 100%|██████████| 1327/1327 [01:22<00:00, 16.02it/s]


---------- Valid ----------
              precision    recall  f1-score   support

           _       0.91      0.93      0.92     38680

   micro avg       0.91      0.93      0.92     38680
   macro avg       0.91      0.93      0.92     38680
weighted avg       0.91      0.93      0.92     38680



Predicting: 100%|██████████| 2355/2355 [02:27<00:00, 15.96it/s]


---------- Test Seen ----------
              precision    recall  f1-score   support

           _       0.97      1.00      0.98     70044

   micro avg       0.97      1.00      0.98     70044
   macro avg       0.97      1.00      0.98     70044
weighted avg       0.97      1.00      0.98     70044



Predicting: 100%|██████████| 2650/2650 [02:45<00:00, 15.97it/s]


---------- Test ----------
              precision    recall  f1-score   support

           _       0.97      0.99      0.98     79878

   micro avg       0.97      0.99      0.98     79878
   macro avg       0.97      0.99      0.98     79878
weighted avg       0.97      0.99      0.98     79878

{'ent_type': {'correct': 79510, 'incorrect': 0, 'partial': 0, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9756187344319422, 'recall': 0.9953929742857858, 'f1': 0.9854066615027111}, 'partial': {'correct': 79437, 'incorrect': 0, 'partial': 73, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9751708651852216, 'recall': 0.9949360274418488, 'f1': 0.9849542989930288}, 'strict': {'correct': 79437, 'incorrect': 73, 'partial': 0, 'missed': 368, 'spurious': 1987, 'possible': 79878, 'actual': 81497, 'precision': 0.9747229959385008, 'recall': 0.9944790805979118, 'f1': 0.9845019364833463}, 'exact': {'correct': 79437, 'incorrect':

Predicting: 100%|██████████| 295/295 [00:18<00:00, 15.68it/s]


---------- Unseen ----------
              precision    recall  f1-score   support

           _       0.98      0.99      0.99      9834

   micro avg       0.98      0.99      0.99      9834
   macro avg       0.98      0.99      0.99      9834
weighted avg       0.98      0.99      0.99      9834



In [58]:
import torch

num_of_gpus = torch.cuda.device_count()
print(num_of_gpus)

1


In [60]:
vailable_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]

In [62]:
torch.cuda.current_device()

0

In [57]:
os.environ["CUDA_VISIBLE_DEVICES"]="100"
if use_cuda:
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 7605
__Number CUDA Devices: 1
__CUDA Device Name: Tesla V100-SXM2-16GB
__CUDA Device Total Memory [GB]: 16.945709056


In [49]:
import torch
os.environ['CUDA_LAUNCH_BLOCKING'] = "4"
foo = torch.tensor([1,2,3])
foo = foo.to('cuda')

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [39]:
torch.__version__

'1.11.0+cu102'

In [30]:
!pip install torch --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu116

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/nightly/cu116


In [36]:
model = BertTokenClassifier(bert_model='bert-base-cased',
# model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
                            max_seq_length=178,
                            epochs=3,
                            gradient_accumulation_steps=4,
                            learning_rate=5e-5,
                            train_batch_size=4,
                            eval_batch_size=4,
                            validation_fraction=0., 
                            label_list=label_list,                         
                            ignore_label=['O'])

model.fit(np.array(X_te), np.array(y_te))

Building sklearn token classifier...
Loading bert-base-cased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 42388, validation data size: 0


RuntimeError: CUDA out of memory. Tried to allocate 86.00 MiB (GPU 0; 15.78 GiB total capacity; 0 bytes already allocated; 7.94 MiB free; 0 bytes reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [12]:
# X_tr = X_tr[:2000]
# y_tr = y_tr[:2000]

# finetune model
model.fit(np.array(X_te), np.array(y_te))

# # score model
f1_test = model.score(X_te, y_te, 'macro')
print("Test f1: %0.02f"%(f1_test))

# make predictions
y_preds = model.predict(np.array(X_te))

#----save the model---
#savefile = 'scibert_Jo_split.bin'
#model.save(savefile)

#----print test restul
evaluator = Evaluator(y_te, y_preds, tags= [''], loader='list')
results, results_per_tag = evaluator.evaluate()
print(results)

#----save the model---
savefile = '../checkpoints/bert_base.bin'
model.save(savefile)

100%|██████████| 213450/213450 [00:00<00:00, 13431463.34B/s]


Loading bert-base-cased model...


100%|██████████| 435779157/435779157 [00:07<00:00, 59623963.88B/s]
100%|██████████| 433/433 [00:00<00:00, 342150.27B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 42388, validation data size: 0


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [11]:
y_tr

array([list([',', '200', 'and', 'NOISE', '=', '0', '%', ',', '50', '%', '.', 'As', 'in', 'Hall', 'et', 'al', '.', '(', '2009', ')', 'we', 'used', 'a', 'subset', 'of', 'the', 'Reuters-21578', 'text', 'categorization', 'test', 'collection', '(', 'Lewis', ',', '1997', ';', 'Sebastiani', ',', '2002', ')', 'to', 'demonstrate', 'the', 'usefulness', 'of', 'EQC', 'and', 'its', 'improved', 'performance', 'over', 'MC', 'and', 'QC', '.', 'The', 'improved', 'performance', 'may', 'be', 'expected', 'since', 'this', 'data', 'set', 'is', 'high-dimensional', ',', 'sparse', 'and', 'the', 'variables', 'are', 'highly', 'skewed']),
       list(['any', 'external', 'feature', 'selections', ',', 'followed', 'by', 'the', 'QC', 'and', 'the', 'MC', '.', 'It', 'was', 'found', 'that', 'most', 'of', 'the', 'quantile-difference', 'transformed', 'variables', 'were', 'constants', ',', 'which', 'can', 'be', 'removed', '.', 'This', 'sparsity', 'may', 'explain', 'the', 'improved', 'performance', 'of', 'the', 'EQC', 'fami

In [33]:
#Jo's code

import os
import math
import random
import csv
import sys
import pickle
sys.path.append(os.getcwd() + "/bert-sklearn/")
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model
##from utils import *

input_file = "../data/ml_datasetname_inputs_flv0.p"
X, y, word_to_ix= pickle.load(open(input_file, "rb"))
#ix_to_word = dict((v, k) for k, v in word_to_ix.items())
#X = [[ix_to_word[i] for i in sample] for sample in X]
#y = pred2labels(y)

#max_len = len(X[0])

# train_idxs = train_idxs[:100]

#X_tr = np.array([X[i] for i in train_idxs])
# X_val = np.array([X[i] for i in valid_idxs])
# X_te = np.array([X[i] for i in test_idxs])

#y_tr = np.array([y[i] for i in train_idxs])
# y_val = np.array([y[i] for i in valid_idxs])
# y_te = np.array([y[i] for i in test_idxs])

In [34]:
# split a dataset into train and test sets
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [37]:
label_list = ['B', 'I', 'O']

In [38]:
%%time
label_list = ['B', 'I', 'O']
# define model

# Choose between BERT or SciBERT

model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
# model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
                            max_seq_length=178,
                            epochs=3,
                            gradient_accumulation_steps=4,
                            learning_rate=5e-5,
                            train_batch_size=16,
                            eval_batch_size=16,
                            validation_fraction=0., 
                            label_list=label_list,                           
                            ignore_label=['O'])


print(model)

# finetune model
model.fit(np.array(X_train), np.array(y_train))

# # score model
f1_test = model.score(X_test, y_test, 'macro')
print("Test f1: %0.02f"%(f1_test))

# make predictions
y_preds = model.predict(np.array(X_test))

Building sklearn token classifier...
BertTokenClassifier(bert_model='scibert-scivocab-cased', eval_batch_size=16,
                    gradient_accumulation_steps=4, ignore_label=['O'],
                    label_list=['B', 'I', 'O'], learning_rate=5e-05,
                    max_seq_length=178, train_batch_size=16,
                    validation_fraction=0.0)


100%|██████████| 410521600/410521600 [01:21<00:00, 5038584.76B/s] 


Loading scibert-scivocab-cased model...


100%|██████████| 410521600/410521600 [01:00<00:00, 6736470.18B/s]


Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 168148, validation data size: 0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  : 100%|██████████| 42037/42037 [52:35<00:00, 13.32it/s, loss=0.00597]
Training  : 100%|██████████| 42037/42037 [53:51<00:00, 13.01it/s, loss=0.00244] 
Training  : 100%|██████████| 42037/42037 [53:39<00:00, 13.06it/s, loss=0.0016]  
  return np.array(X)
Predicting: 100%|██████████| 2628/2628 [02:49<00:00, 15.54it/s]


Test f1: 95.44


Predicting: 100%|██████████| 2628/2628 [02:49<00:00, 15.55it/s]

CPU times: user 2h 39min 38s, sys: 7min 22s, total: 2h 47min
Wall time: 2h 48min 53s





In [39]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.7 MB/s  eta 0:00:01
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16170 sha256=3ec49ffe81a311419a8dbbffd69ec6b42836068bd9264d32a58bc4c9c6764ec1
  Stored in directory: /home/tuo96248/.cache/pip/wheels/ad/5c/ba/05fa33fa5855777b7d686e843ec07452f22a66a138e290e732
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [45]:
savefile = '../checkpoints/scibert_seqlen75.bin'
model.save(savefile)

In [42]:
!pip install nervaluate

Collecting nervaluate
  Downloading nervaluate-0.1.8-py3-none-any.whl (24 kB)
Installing collected packages: nervaluate
Successfully installed nervaluate-0.1.8


In [43]:
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [46]:
print(classification_report(y_test, y_preds))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [47]:
evaluator = Evaluator(y_test, y_preds, tags= [''], loader='list')

In [48]:
results, results_per_tag = evaluator.evaluate()

In [49]:
results

{'ent_type': {'correct': 76619,
  'incorrect': 0,
  'partial': 0,
  'missed': 1281,
  'spurious': 4136,
  'possible': 77900,
  'actual': 80755,
  'precision': 0.9487833570676738,
  'recall': 0.9835558408215661,
  'f1': 0.9658567331631527},
 'partial': {'correct': 76458,
  'incorrect': 0,
  'partial': 161,
  'missed': 1281,
  'spurious': 4136,
  'possible': 77900,
  'actual': 80755,
  'precision': 0.9477865147668875,
  'recall': 0.9825224646983312,
  'f1': 0.9648419526645866},
 'strict': {'correct': 76458,
  'incorrect': 161,
  'partial': 0,
  'missed': 1281,
  'spurious': 4136,
  'possible': 77900,
  'actual': 80755,
  'precision': 0.9467896724661011,
  'recall': 0.9814890885750963,
  'f1': 0.9638271721660207},
 'exact': {'correct': 76458,
  'incorrect': 161,
  'partial': 0,
  'missed': 1281,
  'spurious': 4136,
  'possible': 77900,
  'actual': 80755,
  'precision': 0.9467896724661011,
  'recall': 0.9814890885750963,
  'f1': 0.9638271721660207}}

# Test the Jo's split

In [8]:
# !pip install tensorflow

In [2]:
import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

In [40]:
# ---------- Load inputs-----------
#input_file = "../data/ml_datasetname_inputs_flv0.p"
X, y, X_pids = pickle.load(open("./data/ml_datasetname_inputs_flv0.p", "rb"))

#-----Split------
train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("./data/train_test_split.p", "rb"))
train_idxs, valid_idxs, test_idxs = [], [], []

for i in range(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
        
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])

print(f"nSamples: train={len(X_tr):,}, valid={len(X_val):,}, test={len(X_te):,}")

  X_tr = np.array([X[i] for i in train_idxs])
  X_val = np.array([X[i] for i in valid_idxs])
  X_te = np.array([X[i] for i in test_idxs])
  y_tr = np.array([y[i] for i in train_idxs])


nSamples: train=157,770, valid=17,807, test=34,608


  y_val = np.array([y[i] for i in valid_idxs])
  y_te = np.array([y[i] for i in test_idxs])


In [4]:
import os
import math
import random
import csv
import sys
import pickle
sys.path.append(os.getcwd() + "/bert-sklearn/")
os.environ["CUDA_VISIBLE_DEVICES"]="2"

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model

In [6]:
#load trained model to test
savefile='scibert_Jo_split.bin'
model = load_model(savefile)

Loading model from scibert_Jo_split.bin...
Defaulting to linear classifier/regressor
Building sklearn token classifier...


In [27]:
y_preds = model.predict(X_te)

Predicting: 100%|██████████| 2163/2163 [02:19<00:00, 15.53it/s]


In [60]:
test = y_preds

In [63]:
new = np.array(y_preds,dtype='<U1')

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (34608,) + inhomogeneous part.

In [28]:
from nervaluate import Evaluator
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [51]:
from sklearn.metrics import classification_report

In [53]:
y_val[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype='<U1')

In [42]:
y_kk = y_te.astype(string)

NameError: name 'string' is not defined

In [37]:
type(y_te)

numpy.ndarray

In [36]:
k[0]

array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype='<U1')

In [23]:
test.astype('int')

AttributeError: 'list' object has no attribute 'astype'

In [45]:
y_te.dtype

dtype('O')

In [52]:
print(classification_report(y_te, y_preds))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [47]:
y_te.tolist()

[array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype='<U1'),
 array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], dtype='<U1'),
 array(['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
        'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [16]:
%%time
label_list = ['B', 'I', 'O']
# define model

# Choose between BERT or SciBERT

model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
# model = BertTokenClassifier(bert_model='scibert-scivocab-cased',
                            max_seq_length=178,
                            epochs=3,
                            gradient_accumulation_steps=4,
                            learning_rate=5e-5,
                            train_batch_size=16,
                            eval_batch_size=16,
                            validation_fraction=0., 
                            label_list=label_list,                           
                            ignore_label=['O'])


print(model)

# finetune model
model.fit(np.array(X_tr), np.array(y_tr))

# # score model
f1_test = model.score(X_te, y_te, 'macro')
print("Test f1: %0.02f"%(f1_test))

# make predictions
y_preds = model.predict(np.array(X_te))

Building sklearn token classifier...
BertTokenClassifier(bert_model='scibert-scivocab-cased', eval_batch_size=16,
                    gradient_accumulation_steps=4, ignore_label=['O'],
                    label_list=['B', 'I', 'O'], learning_rate=5e-05,
                    max_seq_length=178, train_batch_size=16,
                    validation_fraction=0.0)
Loading scibert-scivocab-cased model...
Defaulting to linear classifier/regressor
Loading Pytorch checkpoint
train data size: 157770, validation data size: 0


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1055.)
  next_m.mul_(beta1).add_(1 - beta1, grad)
Training  :  56%|█████▌    | 22036/39443 [52:01<41:06,  7.06it/s, loss=0.00792]  


KeyboardInterrupt: 

In [17]:
savefile = '../checkpoints/scibert_Jo_split.bin'
model.save(savefile)

FileNotFoundError: [Errno 2] No such file or directory: '../checkpoints/scibert_Jo_split.bin'

# Test the BiLSTM

In [None]:
import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from tqdm import tqdm, trange

import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
# import tensorflow as tf
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# ---------- Load inputs-----------
#input_file = "../data/ml_datasetname_inputs_flv0.p"
input_file = "./data/ml_datasetname_inputs_flv0.p"
X, y, X_pids = pickle.load(open(input_file, "rb"))

word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent in X:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
word_to_ix["ENDPAD"] = len(word_to_ix) # the corresponding padding
words = word_to_ix.keys()
ix_to_word = dict((v, k) for k, v in word_to_ix.items())

tag_to_ix = {
'O': 0,
'B': 1,
'I': 2,
}

X = [[word_to_ix[w] for w in s] for s in X]
y = [[to_categorical(tag_to_ix[w], num_classes=3) for w in s] for s in y]

max_len = len(X[0])
n_words = len(word_to_ix.keys())
n_tags = len(tag_to_ix.keys())

X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word_to_ix["ENDPAD"])
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag_to_ix["O"])

# ---------- Load train test split -----------
# train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("train_test_split.p", "rb"))
train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(open("train_test_split_0331.p", "rb"))
train_idxs, valid_idxs, test_idxs = [], [], []

for i in trange(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
        
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])

print(f"nSamples: train={len(X_tr):,}, valid={len(X_val):,}, test={len(X_te):,}")

# ---------- pretrained embedding ----------
#https://keras.io/examples/nlp/pretrained_word_embeddings/
import gensim.downloader
embedding_vector =  gensim.downloader.load('word2vec-google-news-300')

#embedding_dim = 100 # without pretrain, it was 50
embedding_dim = len(embedding_vector["he"])
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((n_words, embedding_dim))
for word, i in word_to_ix.items():
    try:
        embedding_matrix[i] = embedding_vector[word]
        hits += 1
    except:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

# ---------- Model -----------
input_ = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=embedding_dim, 
                  embeddings_initializer=Constant(embedding_matrix), 
                  trainable=True, #since there are many missing words in the pretrained
                  input_length=max_len)(input_)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
model = Model(input_, out)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath='./checkpoints/BiLSTM_word2vec',
                                                 save_weights_only=True,
                                                 verbose=1)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy", Precision(), Recall()])

history = model.fit(X_tr, y_tr, batch_size=32, epochs=5, 
                     verbose=0, callbacks=[TqdmCallback(verbose=2),cp_callback],
                     validation_data=(X_val, y_val)) 



100%|██████████| 210185/210185 [07:04<00:00, 495.18it/s]
2022-04-18 20:52:29 gensim.downloader INFO: Creating /home/tuo96248/gensim-data


nSamples: train=146,580, valid=21,217, test=42,388


2022-04-18 20:55:07 gensim.downloader INFO: word2vec-google-news-300 downloaded
2022-04-18 20:55:13 gensim.models.keyedvectors INFO: loading projection weights from /home/tuo96248/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz
2022-04-18 20:56:23 gensim.utils INFO: KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from /home/tuo96248/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-04-18T20:56:23.061313', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-4.15.0-166-generic-x86_64-with-glibc2.10', 'event': 'load_word2vec_format'}


Converted 47772 words (125683 misses)


0epoch [00:00, ?epoch/s]

  0%|          | 0.00/4.58k [00:00<?, ?batch/s]


Epoch 1: saving model to ./checkpoints/BiLSTM_word2vec


  0%|          | 0.00/4.58k [00:00<?, ?batch/s]

In [20]:
import pickle
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras.metrics import Precision, Recall
from tensorflow.keras.utils import to_categorical
from tqdm.keras import TqdmCallback
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
from tqdm import tqdm, trange

# import tensorflow as tf
# print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# ---------- Load inputs-----------
input_file = "./data/ml_datasetname_inputs_flv0.p"
X, y, X_pids = pickle.load(open(input_file, "rb"))

word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent in X:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
word_to_ix["ENDPAD"] = len(word_to_ix) # the corresponding padding
words = word_to_ix.keys()
ix_to_word = dict((v, k) for k, v in word_to_ix.items())

tag_to_ix = {
'O': 0,
'B': 1,
'I': 2,
}

X = [[word_to_ix[w] for w in s] for s in X]
y = [[to_categorical(tag_to_ix[w], num_classes=3) for w in s] for s in y]

max_len = len(X[0])
n_words = len(word_to_ix.keys())
n_tags = len(tag_to_ix.keys())

X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=word_to_ix["ENDPAD"])
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag_to_ix["O"])

# ---------- Load train test split -----------
# train_pids, valid_pids, test_pids, unseen_pids = pickle.load(open("train_test_split.p", "rb"))
train_pids, valid_pids, test_pids, test_pids_cat = pickle.load(open("train_test_split_0331.p", "rb"))
train_idxs, valid_idxs, test_idxs = [], [], []

for i in trange(len(X_pids)):
    if X_pids[i] in train_pids:
        train_idxs.append(i)
    elif X_pids[i] in valid_pids:
        valid_idxs.append(i)
    elif X_pids[i] in test_pids:
        test_idxs.append(i)
        
X_tr = np.array([X[i] for i in train_idxs])
X_val = np.array([X[i] for i in valid_idxs])
X_te = np.array([X[i] for i in test_idxs])

y_tr = np.array([y[i] for i in train_idxs])
y_val = np.array([y[i] for i in valid_idxs])
y_te = np.array([y[i] for i in test_idxs])

print(f"nSamples: train={len(X_tr):,}, valid={len(X_val):,}, test={len(X_te):,}")
embedding_dim = 300 # v1 without pretrain, it was 50

# ---------- pretrained embedding ----------
#https://keras.io/examples/nlp/pretrained_word_embeddings/
PRETRAINED = "" # "", "GLOVE_V1", "word2vec", "glove_300"
if PRETRAINED != "":
    hits, misses = 0, 0

    if PRETRAINED == "GLOVE_V1":
        path_to_glove_file = "glove.6B.100d.txt"
        embeddings_index = {}
        with open(path_to_glove_file) as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, "f", sep=" ")
                embeddings_index[word] = coefs
        print("In pretrained golve, found %s word vectors." % len(embeddings_index))
        embedding_dim = 100 

    ## WORD2VEC 
    else:
        import gensim.downloader

        if PRETRAINED == "word2vec":
            embeddings_index =  gensim.downloader.load('word2vec-google-news-300')
            
        if PRETRAINED == "glove_300":
            embeddings_index =  gensim.downloader.load('glove-wiki-gigaword-300')
        embedding_dim = len(embeddings_index["he"])

    # Prepare embedding matrix
    embedding_matrix = np.zeros((n_words, embedding_dim))
    for word, i in word_to_ix.items():
        if PRETRAINED == "GLOVE_V1":
            embedding_vector = embeddings_index.get(word.lower())
            if embedding_vector is not None:
                # Words not found in embedding index will be all-zeros.
                # This includes the representation for "padding" and "OOV"
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1
        else:
            try:
                embedding_matrix[i] = embedding_vector[word]
                hits += 1
            except:
                misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))

# ---------- Model -----------
input_ = Input(shape=(max_len,))
if PRETRAINED != "":
    model = Embedding(input_dim=n_words, output_dim=embedding_dim, 
                      embeddings_initializer=Constant(embedding_matrix), 
                      trainable=True, #since there are many missing words in the pretrained
                      input_length=max_len)(input_)
else:
    model = Embedding(input_dim=n_words, output_dim=embedding_dim, 
                      input_length=max_len)(input_)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
model = Model(input_, out)

# Create a callback that saves the model's weights
filepath = './checkpoints/BiLSTM300'
if PRETRAINED != "":
    filepath += "_" + PRETRAINED
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=filepath,
                                                 save_weights_only=True,
                                                 verbose=1)

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy", Precision(), Recall()])

history = model.fit(X_tr, y_tr, batch_size=32, epochs=5, 
                     verbose=0, callbacks=[TqdmCallback(verbose=2),cp_callback],
                     validation_data=(X_val, y_val)) 



Collecting gensim
  Downloading gensim-4.1.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: gensim
Successfully installed gensim-4.1.2
