In [1]:
!pip install transformers==3.1.0 seqeval[gpu]



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification


DATASET_DIR = Path("/content/drive/MyDrive/Datasets/litcoin-nlp-challenge/")
ABSTRACTS_TEST_CSV = DATASET_DIR / "abstracts_test.csv"
ABSTRACTS_TRAIN_CSV = DATASET_DIR / "abstracts_train.csv"
ENTITIES_TRAIN_CSV = DATASET_DIR / "entities_train.csv"
RELATIONS_TRAIN_CSV = DATASET_DIR / "relations_train.csv"
SUBMISSION_EXAMPLE_CSV = DATASET_DIR / "submission_example.csv"


In [4]:
abstracts_train_df = pd.read_csv(ABSTRACTS_TRAIN_CSV, sep="\t")
abstracts_train_df.head()

Unnamed: 0,abstract_id,title,abstract
0,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...
1,1671881,Two distinct mutations at a single BamHI site ...,Classical phenylketonuria is an autosomal rece...
2,1848636,Debrisoquine phenotype and the pharmacokinetic...,The metabolism of the cardioselective beta-blo...
3,2422478,Midline B3 serotonin nerves in rat medulla are...,Previous experiments in this laboratory have s...
4,2491010,Molecular and phenotypic analysis of patients ...,Eighty unrelated individuals with Duchenne mus...


In [5]:
abstracts_test_df = pd.read_csv(ABSTRACTS_TEST_CSV, sep="\t")
abstracts_test_df.head()

Unnamed: 0,abstract_id,title,abstract
0,1711760,Delayed institution of hypertension during foc...,The effect of induced hypertension instituted ...
1,6086495,Localisation of the Becker muscular dystrophy ...,A linkage study in 30 Becker muscular dystroph...
2,7018927,Pituitary response to luteinizing hormone-rele...,The effects of a 6-hour infusion with haloperi...
3,7811247,X-linked adrenoleukodystrophy (ALD): a novel m...,Fragments of the adrenoleukodystrophy (ALD) cD...
4,8944024,Detection of heterozygous mutations in BRCA1 u...,The ability to scan a large gene rapidly and a...


In [6]:
entities_train_df = pd.read_csv(ENTITIES_TRAIN_CSV, sep="\t")
entities_train_df.head()

Unnamed: 0,id,abstract_id,offset_start,offset_finish,type,mention,entity_ids
0,0,1353340,11,39,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966
1,1,1353340,111,126,GeneOrGeneProduct,arylsulfatase A,410
2,2,1353340,128,132,GeneOrGeneProduct,ARSA,410
3,3,1353340,159,187,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966
4,4,1353340,189,192,DiseaseOrPhenotypicFeature,MLD,D007966


In [7]:
train_df = pd.merge(abstracts_train_df, entities_train_df, on="abstract_id")
train_df["full_text"] = train_df["title"] + " " + train_df["abstract"]
train_df.head()

Unnamed: 0,abstract_id,title,abstract,id,offset_start,offset_finish,type,mention,entity_ids,full_text
0,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...,0,11,39,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966,Late-onset metachromatic leukodystrophy: molec...
1,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...,1,111,126,GeneOrGeneProduct,arylsulfatase A,410,Late-onset metachromatic leukodystrophy: molec...
2,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...,2,128,132,GeneOrGeneProduct,ARSA,410,Late-onset metachromatic leukodystrophy: molec...
3,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...,3,159,187,DiseaseOrPhenotypicFeature,metachromatic leukodystrophy,D007966,Late-onset metachromatic leukodystrophy: molec...
4,1353340,Late-onset metachromatic leukodystrophy: molec...,We report on a new allele at the arylsulfatase...,4,189,192,DiseaseOrPhenotypicFeature,MLD,D007966,Late-onset metachromatic leukodystrophy: molec...


In [8]:
from string import punctuation


def iter_word_offsets(input_string: str):
    start = 0
    for word in input_string.split():
        c = word[0] #first character
        start = input_string.find(c, start)
        end = start + len(word) - 1
        yield start, end, word
        start = end + 2


def transform_data(df: pd.DataFrame):
    train_data = []
    
    for abstract_id, frame in df.groupby(["abstract_id"]):
        text = list(frame["full_text"])[0]
        tag_sequence = []
        offset_iterator = zip(frame["type"], frame["offset_start"], frame["offset_finish"])
        current_offset = 0

        offset_iterator = iter_word_offsets(text)
        continue_previous_tag = False
        n_continuations = 0
        previous_tag = ""
        for offset_start, offset_finish, word in offset_iterator:
            if n_continuations > 0:
                tag_sequence.append(f"I-{previous_tag}")
                n_continuations -= 1
                continue
            else:
                continue_previous_tag = False

            if not continue_previous_tag:
                actual_offset_start = offset_start
                for char in word:
                    if char in punctuation:
                        actual_offset_start += 1
                inner_entities = frame[frame["offset_start"] == actual_offset_start]
                if not inner_entities.empty:
                    entity_text = list(inner_entities["mention"])[0]
                    entity_type = list(inner_entities["type"])[0]
                    entity_tokens = entity_text.split()
                    tag_sequence.append(f"B-{entity_type}")

                    if len(entity_tokens) > 1:
                        previous_tag = entity_type
                        continue_previous_tag = True
                        n_continuations = len(entity_tokens) - 1

                else:
                    tag_sequence.append("O")
        train_data.append({"text": text, "tags": (",").join(tag_sequence)})
    
    bert_df = pd.DataFrame(train_data)
    return bert_df 

bert_train_df = transform_data(train_df)
bert_train_df.head()

Unnamed: 0,text,tags
0,Late-onset metachromatic leukodystrophy: molec...,"O,B-DiseaseOrPhenotypicFeature,I-DiseaseOrPhen..."
1,Two distinct mutations at a single BamHI site ...,"O,O,O,O,O,O,O,O,O,O,B-DiseaseOrPhenotypicFeatu..."
2,Debrisoquine phenotype and the pharmacokinetic...,"B-ChemicalEntity,O,O,O,O,O,O,O,O,O,B-ChemicalE..."
3,Midline B3 serotonin nerves in rat medulla are...,"O,O,B-ChemicalEntity,O,O,B-OrganismTaxon,O,O,O..."
4,Molecular and phenotypic analysis of patients ...,"O,O,O,O,O,B-OrganismTaxon,O,O,O,O,O,O,O,O,B-Ge..."


In [9]:
original_tags = list(train_df["type"].unique())
bert_b_tags = [f"B-{tag}" for tag in original_tags]
bert_i_tags = [f"I-{tag}" for tag in original_tags]
bert_tags = ["O"] + bert_b_tags + bert_i_tags

labels_to_ids = {k: v for v, k in enumerate(bert_tags)}
ids_to_labels = {v: k for v, k in enumerate(bert_tags)}
labels_to_ids

{'B-CellLine': 6,
 'B-ChemicalEntity': 5,
 'B-DiseaseOrPhenotypicFeature': 1,
 'B-GeneOrGeneProduct': 2,
 'B-OrganismTaxon': 4,
 'B-SequenceVariant': 3,
 'I-CellLine': 12,
 'I-ChemicalEntity': 11,
 'I-DiseaseOrPhenotypicFeature': 7,
 'I-GeneOrGeneProduct': 8,
 'I-OrganismTaxon': 10,
 'I-SequenceVariant': 9,
 'O': 0}

In [10]:
MAX_LEN = 256
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [11]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.text[index].strip().split()
        word_labels = self.data.tags[index].split(",")

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True, 
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            try:
                encoded_labels[idx] = labels[i]
                i += 1
            except IndexError as e:
                print(i)
                print(encoded_labels)
                print(labels)
                raise

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [12]:
train_size = 0.8
train_dataset = bert_train_df.sample(frac=train_size,random_state=200)
test_dataset = bert_train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(bert_train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (400, 2)
TRAIN Dataset: (320, 2)
TEST Dataset: (80, 2)


In [13]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [14]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cpu


In [15]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [16]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.6290, grad_fn=<NllLossBackward0>)

In [17]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 256, 13])

In [18]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [19]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [20]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}/{EPOCHS}")
    train(epoch)

Training epoch: 1/3
Training loss per 100 training steps: 2.6832621097564697
Training loss epoch: 0.8284413486719131
Training accuracy epoch: 0.8215701690915521
Training epoch: 2/3
Training loss per 100 training steps: 0.5662605166435242
Training loss epoch: 0.4603139337152243
Training accuracy epoch: 0.8680545748191685
Training epoch: 3/3
Training loss per 100 training steps: 0.31060683727264404
Training loss epoch: 0.3138920137658715
Training accuracy epoch: 0.9073296404247022


In [21]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [22]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.2074960619211197
Validation Loss: 0.2735874764621258
Validation Accuracy: 0.9151736460754683


In [23]:
from seqeval.metrics import classification_report

print(classification_report(labels, predictions))

TypeError: ignored

In [25]:
abstracts_test_df["full_text"] = abstracts_test_df["title"] + " " + abstracts_test_df["abstract"]
abstracts_test_df.head()

Unnamed: 0,abstract_id,title,abstract,full_text
0,1711760,Delayed institution of hypertension during foc...,The effect of induced hypertension instituted ...,Delayed institution of hypertension during foc...
1,6086495,Localisation of the Becker muscular dystrophy ...,A linkage study in 30 Becker muscular dystroph...,Localisation of the Becker muscular dystrophy ...
2,7018927,Pituitary response to luteinizing hormone-rele...,The effects of a 6-hour infusion with haloperi...,Pituitary response to luteinizing hormone-rele...
3,7811247,X-linked adrenoleukodystrophy (ALD): a novel m...,Fragments of the adrenoleukodystrophy (ALD) cD...,X-linked adrenoleukodystrophy (ALD): a novel m...
4,8944024,Detection of heterozygous mutations in BRCA1 u...,The ability to scan a large gene rapidly and a...,Detection of heterozygous mutations in BRCA1 u...


In [33]:
def predict(text):
    full_prediction = []
    split_words = list(iter_word_offsets(text))
    offsets = [(start, end) for start, end, _ in split_words]
    words = [word for _, _, word in split_words]

    inputs = tokenizer(
        words,
        is_pretokenized=True, 
        return_offsets_mapping=True, 
        padding='max_length', 
        truncation=True, 
        max_length=MAX_LEN,
        return_tensors="pt"
    )

    # move to gpu
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)
    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]

    zipped_args = zip(
        ids.squeeze().tolist(),
        tokens,
        token_predictions,
        inputs["offset_mapping"].squeeze().tolist(),
    )

    predicted_results = {}
    for token_to_word_idx, (t_idx, t, t_pred, t_offsets) in enumerate(zipped_args):
        if not t_offsets[0] == t_offsets[1] == 0:
            if t_pred != "O":
                word_id = inputs.token_to_word(token_to_word_idx)
                # print(f"Found {t} ({word_id})")
                predicted_results[word_id] = t_pred[2:]


    # print(predicted_results)
    # print(list(zip(offsets, words)))
    for k, v in predicted_results.items():
        try:
            yield {"offset_start": offsets[k][0], "offset_finish": offsets[k][1], "type": v}
        except:
            pass

submission_preds = []
for idx, row in abstracts_test_df.iterrows():
    for pred_row in predict(row["full_text"]):
        pred_row["abstract_id"] = row["abstract_id"]
        submission_preds.append(pred_row)

preds_df = pd.DataFrame(submission_preds)
preds_df
# predict(abstracts_test_df.iloc[0]["full_text"])
# len(abstracts_test_df.iloc[0]["full_text"].split())

Unnamed: 0,offset_start,offset_finish,type,abstract_id
0,23,34,DiseaseOrPhenotypicFeature,1711760
1,43,47,DiseaseOrPhenotypicFeature,1711760
2,49,56,DiseaseOrPhenotypicFeature,1711760
3,58,66,DiseaseOrPhenotypicFeature,1711760
4,84,89,DiseaseOrPhenotypicFeature,1711760
...,...,...,...,...
1540,443,450,DiseaseOrPhenotypicFeature,30442153
1541,452,453,DiseaseOrPhenotypicFeature,30442153
1542,595,600,OrganismTaxon,30442153
1543,632,637,OrganismTaxon,30442153


In [34]:
preds_df = preds_df[["abstract_id", "offset_start", "offset_finish", "type"]]
preds_df.to_csv("preds.csv", sep="\t", index_label="id")

In [35]:
preds_loaded_df = pd.read_csv("preds.csv", sep="\t")
preds_loaded_df

Unnamed: 0,id,abstract_id,offset_start,offset_finish,type
0,0,1711760,23,34,DiseaseOrPhenotypicFeature
1,1,1711760,43,47,DiseaseOrPhenotypicFeature
2,2,1711760,49,56,DiseaseOrPhenotypicFeature
3,3,1711760,58,66,DiseaseOrPhenotypicFeature
4,4,1711760,84,89,DiseaseOrPhenotypicFeature
...,...,...,...,...,...
1540,1540,30442153,443,450,DiseaseOrPhenotypicFeature
1541,1541,30442153,452,453,DiseaseOrPhenotypicFeature
1542,1542,30442153,595,600,OrganismTaxon
1543,1543,30442153,632,637,OrganismTaxon
