# SETUP

In [None]:
!pip install datasets --quiet
!pip install transformers
!pip install datasets
!pip install huggingface_hub
!pip install seqeval 
!pip install evaluate --quiet
!pip install tf2crf
!pip install tensorflow_addons

!pip install keras-bert
!pip install bert-for-tf2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import tensorflow as tf
import keras
import tensorflow_datasets as tfds
import datasets
import json
import nltk

from os import listdir
from os.path import isfile, join

from tqdm.notebook import tqdm

from datasets import load_dataset, load_metric
from datasets import Dataset, DatasetDict


from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from transformers import TFBertModel, TFAutoModelForTokenClassification, TFAutoModel

from tensorflow.keras import Model, Input
#from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, LSTM, Dropout, Bidirectional, Layer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_addons as tfa
#from tensorflow_addons.layers import CRF
#from tensorflow_addons.text.crf_wrapper import CRFModelWrapper
from tf2crf import CRF
from tf2crf import ModelWithCRFLoss

from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from statistics import mean


drive.mount('/content/drive/', force_remount= True)
DATA_PATH = './drive/MyDrive/PubMed_Abstracts/standoff'
parsed_data_path = './drive/MyDrive/PubMed_Abstracts/parsed.jsonl'

Mounted at /content/drive/


In [None]:
# @title Parse Data for Transfer Learning (The original data was in .txt and tagged at each letter position, we parse it into the same JSON format of NCBI, GM and CDR dataset, and tagged for each word)

nltk.download('punkt')
DATA_PATH = './drive/MyDrive/PubMed_Abstracts/standoff'
parsed_data_path = './drive/MyDrive/PubMed_Abstracts/parsed.jsonl'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
np.random.seed(42)

In [None]:
# convert label-start-end into 3 columns for pandas dataframe
# b/c they are space delimited instead of tab delimited
def parse_label(x):
  x = x.strip().split(' ')
  label = ''
  start = -1
  end = -1
  if len(x) == 3 and x[1].isnumeric() and x[2].isnumeric():
    label = x[0].strip()
    start = int(x[1])
    end = int(x[2])
  return pd.Series({ 
    'label': label,
    'start': start,
    'end': end
  })

# load annotation files
def parse_annotation(path):
  annotation = pd.read_csv(path, sep='\t', names=[
      'key', 'label_start_end', 'word'
  ])
  tmp = annotation['label_start_end'].apply(parse_label)
  annotation = pd.concat([annotation, tmp], axis=1)
  annotation = annotation[['label', 'start', 'end', 'word']]
  # unify for GENE labels
  annotation['label'] = annotation['label']\
    .apply(lambda x : 'GENE' if x.startswith('Gene') else x)
  # keep only 'Chemical', 'Disease', 'GENE' labels 
  annotation = annotation[annotation['label'].isin([
      'Chemical', 'Disease', 'GENE',
  ])].reset_index(drop=True)
  return annotation

# make sure . , ? ! ; are independent tokens
# by padding them with additional space on the left
def tokenize(x):
  # return word_tokenize(x)
  for c in ['. ', ', ', '? ', '! ', '; ']:
    x = x.replace(c, f' {c}')
  for c in ['-']:
    x = x.replace(c, f' {c} ')
  return [ s.strip() for s in re.sub(' +', ' ', x).split(' ') ]


In [None]:
abstracts = sorted([f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)) and f.endswith('.txt')])
# the label dictionary used for the original 3 datasets
label2index = {
  'O': 0,
  'B-Disease': 1,
  'I-Disease': 2,
  'B-GENE': 3,
  'I-GENE': 4,
  'B-Chemical': 5,
  'I-Chemical': 6
}

data = []
for a in tqdm(abstracts):
  
#   a = abstracts[0]
  # paired abstract and annotation files share the same name
  abstract_path = f'{DATA_PATH}/{a}'
  annotation_path = f'{DATA_PATH}/{a[:-4] + ".ann"}'
  # check if annotation file exists
  # if not os.path.exists(annotation_path):
  #   print(f'{a} misses annotation file')
  #   continue
  with open(abstract_path, 'r', encoding='utf8') as f:
    abstract = f.read()
  # sort annotations by the starting position (at letter level) in descending order
  # such that we will replace tokens with tags from right to left
  # this ensures that the positions of the tags won't change by replacing
  annotation = parse_annotation(annotation_path).sort_values(by=['start', 'end'], ascending=False)
  # making sure the abstract is longer than the annotation
  max_end = annotation['end'].max()
  # if len(abstract) < max_end:
  #   print(f'ERROR: {a}: abstract is shorter than the annotation')
  #   continue

  # start replacing
  # initialize labels with abstract
  masked_abstract_with_tags = abstract
  # loop through the data frame
  start_ends = []
  for i, r in annotation.iterrows():
    start = r['start']
    end = r['end']
    
    ignore = False
    for s, e in start_ends:
      if start <= s <= end or start <= e <= end:
        ignore = True
        break
    if ignore: 
      continue
      
    extracted = abstract[start:end]
    # make sure that the extracted words are the same as in the annotation files
    if extracted != r['word']:
      print(f'ERROR: {a}: expected: {r["word"]} not equal to extracted: {extracted}')
      continue
    # generate tags
    # could extract multiple words, replace all the words with the same label 
    tags = [ f'<{r["label"]}>' ] * len(tokenize(extracted))
    masked_abstract_with_tags = masked_abstract_with_tags[:start] + ' '.join(tags) + masked_abstract_with_tags[end:]
    start_ends.append((start, end))
  
  tags = []
  last_token = ''
  for token in tokenize(masked_abstract_with_tags):
    if token in ['<Chemical>', '<Disease>', '<GENE>']:
      token = token[1:-1]
      if last_token == token:
        tags.append(label2index[f'I-{token}'])
      else:
        tags.append(label2index[f'B-{token}'])
    else:
      tags.append(0)
    last_token = token
    
  # remove the abstract ID
  tokens = [ t.strip() for t in tokenize(abstract) ][1:]
  tags = tags[1:]

  # separate abstract into sentences 
  partial_tokens = []
  partial_tags = []
  id = 0
  for token, tag in zip(tokens, tags):
    partial_tokens.append(token)
    partial_tags.append(tag)
    # separate by period
    if token == '.':
      data.append({
        'id': id,  
        'tokens': partial_tokens,
        'ner_tags': partial_tags
      })
      id += 1
      partial_tokens = []
      partial_tags = []
    # in case the last sentence in the abstract does not end with period
    # still capture the last sentence   
  if len(partial_tokens) > 0:
    data.append({
      'id': id,  
      'tokens': partial_tokens,
      'ner_tags': partial_tags
    })
  
#   if 'However, the present study indicated' in abstract:
#   if 'For example, at the other heparin-binding' in abstract:
#     break

  0%|          | 0/661 [00:00<?, ?it/s]

In [None]:
annotation.shape

(68, 4)

In [None]:
len(abstracts)

661

In [None]:
with open(parsed_data_path, 'w') as f:
    f.writelines([ json.dumps(d) for d in data ])

In [None]:
abstract_dataset = load_dataset("json", data_files=parsed_data_path, split='train')



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-209129b57c6e4736/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-209129b57c6e4736/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


In [None]:
# Split 70% train/ 15% test/ 15% validation

abstract_train_test = abstract_dataset.train_test_split(shuffle = True, seed = 200, test_size=0.3)
abstract_test_val = abstract_train_test['test'].train_test_split(shuffle = True, seed = 200, test_size=0.50)
abstract_train_test_val = DatasetDict({
    'train': abstract_train_test['train'],
    'test': abstract_test_val['test'],
    'validation': abstract_test_val['train']})

In [None]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(
                lambda x: [typ.feature.names[i] for i in x]
            )
    display(HTML(df.to_html()))

In [None]:
abstract_dataset = abstract_train_test_val

In [None]:
show_random_elements(abstract_dataset["train"])

Unnamed: 0,id,tokens,ner_tags
0,0,"[4, Discussion, Cardiovascular, diseases, are, closely, linked, to, hypertension, .]","[0, 0, 1, 2, 0, 0, 0, 0, 0, 0]"
1,2,"[Furthermore, ,, the, numbers, of, CD14⁺CD163⁺CD206⁺, M2, monocyte, not, only, increased, in, the, three, subgroups, of, IMN, but, also, shared, the, same, changes, in, trend, with, disease, progression, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,3,"[These, mice, display, a, complex, phenotype, ,, with, increased, activation, of, the, thiazide, -, sensitive, Na⁺, -, Cl−, cotransporter, (NCC), ,, and, polyuria, due, to, a, loss, of, aquaporin, -, 2, (AQP2), .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 4, 4, 0, 0]"
3,5,"[Adiponectin, can, up, -, regulate, the, PPARγ, expression, through, by, regulating, the, insulin, content, and, insulin, secretion, ,, and, decreased, levels, of, adiponectin, in, the, circulation, of, obese, individuals, may, be, directly, associated, with, the, β, -, cell, dysfunction, in, T2DM.[, ³⁷, ]]","[3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,0,"[In, contrast, to, prior, studies, ,, the, present, work, has, focused, on, the, epigenetic, regulatory, role, played, by, vitamin, C, in, DNA, demethylation, ,, and, how, it, may, pertain, to, cancer, treatment, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
5,10,"[In, 2012, ,, Paris, et, al, .]","[0, 0, 0, 0, 0, 0, 0]"
6,8,"[Similarly, ,, Yu, et, al, observed, a, gradual, increase, in, the, Fn, load, during, the, transition, from, adenoma, to, carcinoma, ¹⁸, ⁾, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0]"
7,14,"[For, instance, ,, both, TNF, -, α, and, IL, -, 1β, can, induce, miR, -, 146a, expression, 50, ,, while, IL, -, 17, can, suppress, miR, -, 23b, expression, in, RA, 40, .]","[0, 0, 0, 0, 3, 4, 4, 0, 3, 4, 4, 0, 0, 3, 4, 4, 0, 0, 0, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
8,0,"[Kim, [, 18, ], examined, whether, the, rate, of, PD, -, 1/PD, -, L1, positivity, differed, in, patients, with, different, types, of, malignant, tumors, .]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0]"
9,7,"[The, developmental, transcription, factor, IRF6, may, be, associated, with, cell, proliferation, ,, cancer, stem, cell, properties, and, chemotherapeutic, sensitivity, in, nasopharyngeal, carcinoma, (, 41, ), .]","[0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0]"


# Functions and Tokenizer

In [None]:
def get_padded_arr(inputs, max_length=180, pad_type='post', trunc_type='post'):
  """
    Returns a padded array from an input array, 
    padded to the max_length using pad_type and trunc_type.
    Uses pad_sequences() from tensorflow.keras.preprocessing.sequence.

    Parameters
    ----------
    inputs: nested list of ints
      input sequence of tokenized input_ids
    max_length: int
      maximum length to which to pad sequence
    pad_type: string
      padding type; ex: post, pre, etc.
    trunc_type: string
      truncation type; ex: post, pre, etc.

    Returns
    -------
    nested numpy array of ints
      numpy array of each numpy array of input_ids padded to max_length

    Example
    --------
    >>> get_padded_arr([[465, 1093, 669], [110, 549]], 5, 'post', 'post')
    array([[ 465, 1093,  669,    0,    0],
          [ 110,  549,    0,    0,    0]], dtype=int32)
    """
  
  padded_arr = pad_sequences(inputs, padding=pad_type, truncating=trunc_type, maxlen=max_length)
  input_arr = np.array([np.array(x) for x in padded_arr])
  return input_arr

In [None]:
def get_argmax_predictions(y_pred):
  """
    Returns an array of label predictions in the form of numpy arrays,
    from a vector of probability distributions output by softmax layer.

    Parameters
    ----------
    y_pred: nested list of floats
      vector of probability distributions created by softmax function

    Returns
    -------
    array of numpy array of ints
      array of predicted labels (numpy array) per example

    Example
    --------
    >>> get_argmax_predictions([[0.9, 0.05, 0.05], [0.04, 0.9, 0.06], 
                                [0.03, 0.07, 0.9], [0.9, 0.03, 0.07]])
    [0, 1, 2, 0]
    """

  predicted_labels = []
  for idx, input in enumerate(y_pred):
    input_preds = []
    for token in input:
      input_preds.append(np.argmax(token))
    predicted_labels.append(np.array(input_preds))

 # predicted_labels = np.array(predicted_labels)

  return predicted_labels

In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
# @title Map Labels To Tokenized Data (The Original Labels Were At Word-Level)
def bert_process(data, label_names, label2index):
  inputs = bert_tokenizer(data['tokens'], is_split_into_words=True, add_special_tokens=False)
  tags = []
  ids = []
  for i, labels in enumerate(data['ner_tags']):
    # len(word_id) = len(tokens)
    # word_id indicates which word the token belongs to
    word_ids = inputs.word_ids(i)
    label_ids = []
    for wid in word_ids:
      # set label to -100 for CLS and SEP tokens
      if wid is None: 
        label_ids.append(-100)
      else:
        # original label at word-level
        orig_label_index = labels[wid]  
        orig_label_name = label_names[orig_label_index]
        # grab the new unified label indexes
        new_label_index = label2index[orig_label_name]
        label_ids.append(new_label_index)
    tags.append(label_ids)
    # overwrite ids to ensure we have all the features defined above
    # cdr data doesn't have ids
    ids.append('0')
  inputs['id'] = ids
  # the new labels for model training
  inputs['labels'] = tags
  return inputs

biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
# @title Map Labels To Tokenized Data (The Original Labels Were At Word-Level)
def biobert_process(data, label_names, label2index):
  inputs = biobert_tokenizer(data['tokens'], is_split_into_words=True, add_special_tokens=False)
  tags = []
  ids = []
  for i, labels in enumerate(data['ner_tags']):
    # len(word_id) = len(tokens)
    # word_id indicates which word the token belongs to
    word_ids = inputs.word_ids(i)
    label_ids = []
    for wid in word_ids:
      # set label to -100 for CLS and SEP tokens
      if wid is None: 
        label_ids.append(-100)
      else:
        # original label at word-level
        orig_label_index = labels[wid]  
        orig_label_name = label_names[orig_label_index]
        # grab the new unified label indexes
        new_label_index = label2index[orig_label_name]
        label_ids.append(new_label_index)
    tags.append(label_ids)
    # overwrite ids to ensure we have all the features defined above
    # cdr data doesn't have ids
    ids.append('0')
  inputs['id'] = ids
  # the new labels for model training
  inputs['labels'] = tags
  return inputs

In [None]:
#Classification Report
"""
    Prints the classification report with micro, macro, and weighted
    precision, recall, and f1_score for label classes found in the set.

    Parameters
    ----------
    test_labels: nested list of integers
      nested array of true labels sourced from dataset

    test_attention_mask: nested list of integers
      nested array of attention mask values created by tokenizer

    preds: nested list of integers
      nested array of predicted labels generated from model.predict()

    label_names: list of strings
      names of labels at their respective indices

    Returns
    -------
    Prints classification report, we use weighted average in our paper
    to represent the results, but we have micro and macro precision, recall,
    and f1 scores.
    """

from tqdm.notebook import tqdm
from seqeval.metrics import classification_report

def print_classification_report(test_labels, test_attention_mask, preds, label_names):
  flatten_labels = []
  flatten_preds = []
  for i in tqdm(range(len(test_labels))):
    j = sum(test_attention_mask[i]).numpy() - 1

    flatten_labels.extend(test_labels[i][1:j])
    flatten_preds.extend(preds[i][1:j])
  flatten_labels = list(map(lambda x: label_names[x], flatten_labels))
  flatten_preds = list(map(lambda x: label_names[x], flatten_preds))

  for s in classification_report([flatten_labels], [flatten_preds], digits=4).split('\n'):
    print(s)

# Datasets for Model Testing

In [None]:
## NCBI Dataset ## 
## Biomed Inform. 2014 February ; 47: 1–10. doi:10.1016/j.jbi.2013.12.006. 
## The NCBI Disease corpus consists of 793 PubMed abstracts, 
## which are separated into training (593), development (100) and test (100) subsets
## It includes 6,892 disease mentions, which are mapped to 790 unique disease concepts

ncbi_dataset = load_dataset("ncbi_disease")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
## GENE MENTION DATASET ##
## Genome Biology 2008, 9(Suppl 2):S2 

## BioCreative II Gene Mention Recognition (BC2GM) Dataset contains data 
## where participants are asked to identify a gene mention in a sentence 
## by giving its start and end characters. The training set consists of a 
## set of sentences, and for each sentence a set of gene mentions (GENE annotations).
## Includes  15,000 annotated training sentences.
## 

gm_dataset = load_dataset("bc2gm_corpus")



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
## CHEMICAL DISEASE RELATIONS DATASET ##
## Database, Volume 2016, 2016, baw068, https://doi.org/10.1093/database/baw068

## The BioCreative V CDR task corpus is manually annotated for chemicals,
## diseases and chemical-induced disease (CID) relations. 
## It consists of 1500 PubMed articles with 4409 annotated chemicals, 
## 5818 diseases and 3116 chemical-disease interactions.

cdr_dataset = load_dataset("ghadeermobasher/BC5CDR-Chemical-Disease")



  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenization

In [None]:
# @title Create Unified Label Mapping Across Datasets
# rename "tags" to "ner_tags" for consistency
#if 'tags' in cdr_dataset['train'].features.keys():
# cdr_dataset = cdr_dataset.rename_column("tags", "ner_tags")

# get labels from each dataset

ncbi_label_names = ncbi_dataset["train"].features["ner_tags"].feature.names
gm_label_names = ['O', 'B-GENE', 'I-GENE'] #gm_dataset["train"].features["ner_tags"].feature.names    
cdr_label_names = ['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

# create an unified label-to-index mapping across datasets
# keys: unique labels, values: indexes
label2index = {}
id = 0
for n in ncbi_label_names + gm_label_names + cdr_label_names:
  if n not in label2index:
    label2index[n] = id
    id += 1

label_names = list(label2index.keys())
print(label2index)
print(label_names)

from datasets import Features, Value, Sequence, ClassLabel
# define wanted output columns from the mapping process
# this ensures that the tokenized datasets have the same features for concatenation 
features = Features({
  'id': Value(dtype='string', id=None),
  'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
  'ner_tags': Sequence(feature=ClassLabel(names=label_names, id=None), length=-1, id=None),
  'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
  'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
  'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
  'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)
})



{'O': 0, 'B-Disease': 1, 'I-Disease': 2, 'B-GENE': 3, 'I-GENE': 4, 'B-Chemical': 5, 'I-Chemical': 6}
['O', 'B-Disease', 'I-Disease', 'B-GENE', 'I-GENE', 'B-Chemical', 'I-Chemical']


In [None]:
#create tokenized datasets for training

bert_tokenized_datasets = { 
    name: data.map(
      bert_process, 
      features=features,
      batched=True,
      fn_kwargs={
        'label_names': label_names,
        'label2index': label2index,
      }
  ) for name, data, label_names in [
      ('abstract', abstract_dataset, label_names),
      ('ncbi', ncbi_dataset, label_names),
      #('gm', gm_dataset, label_names),
      #('cdr', cdr_dataset, label_names)
  ] 
}

biobert_tokenized_datasets = { 
    name: data.map(
      biobert_process, 
      features=features,
      batched=True,
      fn_kwargs={
        'label_names': label_names,
        'label2index': label2index,
      }
  ) for name, data, label_names in [
      ('abstract', abstract_dataset, label_names),
      ('ncbi', ncbi_dataset, label_names),
      #('gm', gm_dataset, label_names),
      #('cdr', cdr_dataset, label_names)
  ] 
}

  0%|          | 0/5 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]



  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]



# Pre-Process Data

In [None]:
print(max([len(x) for x in biobert_tokenized_datasets['abstract']['train']['input_ids']]))
print(max([len(x) for x in biobert_tokenized_datasets['abstract']['validation']['input_ids']]))
print(max([len(x) for x in biobert_tokenized_datasets['abstract']['test']['input_ids']]))

808
660
807


In [None]:
biobert_tokenized_datasets['abstract']['train']['input_ids'][0]

[2393, 1204, 118, 3087, 131, 8362, 1742, 14545, 1174, 2884]

In [None]:
biobert_tokenized_datasets['abstract']['train']['ner_tags'][0]

[0, 0, 0, 0, 0]

In [None]:
MAX_LENGTH = 808
NUM_LABELS = len(label2index) #7
#print(NUM_LABELS)

In [None]:
abstract_train_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['train']['input_ids'], MAX_LENGTH))
abstract_train_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['train']['attention_mask'], MAX_LENGTH))

abstract_test_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['test']['input_ids'], MAX_LENGTH))
abstract_test_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['test']['attention_mask'], MAX_LENGTH))

abstract_val_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['validation']['input_ids'], MAX_LENGTH))
abstract_val_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['abstract']['validation']['attention_mask'], MAX_LENGTH))

abstract_train_labels = get_padded_arr(biobert_tokenized_datasets['abstract']['train']['labels'], MAX_LENGTH)
abstract_test_labels = get_padded_arr(biobert_tokenized_datasets['abstract']['test']['labels'], MAX_LENGTH)
abstract_val_labels = get_padded_arr(biobert_tokenized_datasets['abstract']['validation']['labels'], MAX_LENGTH)

In [None]:
biobert_ncbi_train_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['train']['input_ids'], MAX_LENGTH))
biobert_ncbi_train_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['train']['attention_mask'], MAX_LENGTH))

biobert_ncbi_test_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['test']['input_ids'], MAX_LENGTH))
biobert_ncbi_test_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['test']['attention_mask'], MAX_LENGTH))

biobert_ncbi_val_input_ids = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['validation']['input_ids'], MAX_LENGTH))
biobert_ncbi_val_attention_mask = tf.convert_to_tensor(get_padded_arr(biobert_tokenized_datasets['ncbi']['validation']['attention_mask'], MAX_LENGTH))

biobert_ncbi_train_labels = get_padded_arr(biobert_tokenized_datasets['ncbi']['train']['labels'], MAX_LENGTH)
biobert_ncbi_test_labels = get_padded_arr(biobert_tokenized_datasets['ncbi']['test']['labels'], MAX_LENGTH)
biobert_ncbi_val_labels = get_padded_arr(biobert_tokenized_datasets['ncbi']['validation']['labels'], MAX_LENGTH)


# BioBERT

In [None]:
# VANILLA BIOBERT MODEL

def create_biobert_model(num_labels, max_length=180, dropout=0.3) -> tf.keras.Model:
    """
    Create a plain BioBERT model that takes in input_ids, attention_mask
    and will output a softmax probability distribution over a set of num_labels
    number of possible labels.

    Parameters
    ----------
    num_labels: integer
      the number of classes to compute the probabilities for
    max_length: int
      maximum length of inputs
    dropout: float
      dropout rate
    
    Returns
    -------
    a custom BioBERT model built on the pretrained biobert-base-cased-v1.2 model
    for multi-class classification
    
    """
    biobert_model = TFAutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=num_labels, from_pt=True)

    input_ids = Input(shape=(max_length,), dtype=tf.int64, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask')

    biobert_inputs = {'input_ids': input_ids,
                   'attention_mask': attention_mask}      

    biobert_output = biobert_model(biobert_inputs)

    last_hidden_state = biobert_output[0]

    embedding = tf.keras.layers.Dropout(dropout)(last_hidden_state)

    output = tf.keras.layers.Dense(num_labels, activation='softmax')(embedding)

    model = tf.keras.models.Model(inputs = [input_ids, attention_mask], outputs = output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate= 5e-5),
                       loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
                        metrics=['accuracy'])

    return model

In [None]:
ncbi_biobert_model = create_biobert_model(NUM_LABELS, MAX_LENGTH)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint

In [None]:
# Train with NCBI Disease train/ validation dataset

history = ncbi_biobert_model.fit([biobert_ncbi_train_input_ids, biobert_ncbi_train_attention_mask], 
                                          biobert_ncbi_train_labels, batch_size=8, epochs=5, 
                                          validation_data=([biobert_ncbi_val_input_ids, biobert_ncbi_val_attention_mask], biobert_ncbi_val_labels))


Epoch 1/5


  return dispatch_target(*args, **kwargs)


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#loss, accuracy = ncbi_biobert_model.evaluate([biobert_ncbi_test_input_ids, biobert_ncbi_test_attention_mask], biobert_ncbi_test_labels, verbose=0)

ncbi_biobert_predictions_ncbi_only = ncbi_biobert_model.predict([biobert_ncbi_test_input_ids, biobert_ncbi_test_attention_mask])



In [None]:
ncbi_biobert_ncbi_only_preds_list = get_argmax_predictions(ncbi_biobert_predictions_ncbi_only)

In [None]:
print_classification_report(biobert_ncbi_test_labels, biobert_ncbi_test_attention_mask, ncbi_biobert_ncbi_only_preds_list, label_names)

  0%|          | 0/941 [00:00<?, ?it/s]

              precision    recall  f1-score   support

     Disease     0.8076    0.9002    0.8514      2294

   micro avg     0.8076    0.9002    0.8514      2294
   macro avg     0.8076    0.9002    0.8514      2294
weighted avg     0.8076    0.9002    0.8514      2294



In [None]:
# Train with full train/ validation dataset

history = ncbi_biobert_model.fit([abstract_train_input_ids, abstract_train_attention_mask], 
                                          abstract_train_labels, batch_size=8, epochs=5, 
                                          validation_data=([abstract_val_input_ids, abstract_val_attention_mask], abstract_val_labels))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
#loss, accuracy = ncbi_biobert_model.evaluate([biobert_ncbi_test_input_ids, biobert_ncbi_test_attention_mask], biobert_ncbi_test_labels, verbose=0)

ncbi_biobert_abstract_predictions_2 = ncbi_biobert_model.predict([abstract_test_input_ids, abstract_test_attention_mask])



In [None]:
ncbi_biobert_abstract_preds_list_2 = get_argmax_predictions(ncbi_biobert_abstract_predictions_2)

In [None]:
print_classification_report(biobert_ncbi_test_labels, biobert_ncbi_test_attention_mask, ncbi_biobert_abstract_preds_list_2, label_names)

  0%|          | 0/1062 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    Chemical     0.6942    0.7825    0.7357       731
     Disease     0.8379    0.8921    0.8642      2318
        GENE     0.8488    0.8657    0.8572      3768

   micro avg     0.8272    0.8658    0.8460      6817
   macro avg     0.7936    0.8468    0.8190      6817
weighted avg     0.8285    0.8658    0.8465      6817

