In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
import nltk
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import *
from transformers import RobertaTokenizer, RobertaForTokenClassification, AlbertTokenizer
from transformers import AutoTokenizer, AutoModel
import transformers
from transformers import get_linear_schedule_with_warmup
from transformers import BertForTokenClassification, AdamW, AlbertForTokenClassification, DistilBertForTokenClassification, DistilBertTokenizer
from seqeval.metrics import f1_score, accuracy_score
import os

In [2]:
data = pd.read_csv("./data/labeled_dfs_all.csv")

In [3]:
data.columns = ['paper_id',"word", "tag", "sentence_id"]
data['word'] = data['word'].astype(str)
data['POS'] = data['word'].apply(lambda x: nltk.pos_tag([x])[0][1])
data = data[data['tag'].isin(['O', 'B-software', 'B-version', 'I-version','I-software'])]

In [8]:
data.shape

(361995, 5)

In [9]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [11]:
getter = SentenceGetter(data)

In [13]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]

['Behavioural',
 'analysis',
 'was',
 'performed',
 'to',
 'assess',
 'the',
 'development',
 'of',
 'tactile',
 'allodynia',
 'in',
 'mice.']

In [16]:
labels = [[s[2] for s in sentence] for sentence in getter.sentences]

In [17]:
tag_values = list(set(data["tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [19]:
tag2idx

{'I-version': 0,
 'B-version': 1,
 'O': 2,
 'I-software': 3,
 'B-software': 4,
 'PAD': 5}

In [20]:
tag2name={tag2idx[key] : key for key in tag2idx.keys()}

### Finetune SciBERT

In [25]:
MAX_LEN = 215
bs = 32

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_cased', do_lower_case=False)
model = BertForTokenClassification.from_pretrained('allenai/scibert_scivocab_cased', num_labels=len(tag2idx))

In [56]:
def tokenize_label_sentence(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * n_subwords)        
    return tokenized_sentence, labels

In [57]:
tokenized_label_text = [
    tokenize_label_sentence(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [58]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_label_text]
labels = [token_label_pair[1] for token_label_pair in tokenized_label_text]

In [59]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [60]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [61]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [62]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [63]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [64]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [68]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [69]:
epochs = 5
max_grad_norm = 1.0

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

### Training

In [72]:
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()
        
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))
    loss_values.append(avg_train_loss)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_input_mask, 
                            labels=b_labels)
            
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print()

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Average train loss: 0.04543473568646216


Epoch:  20%|██        | 1/5 [5:45:22<23:01:28, 20722.25s/it]

Validation loss: 0.017523886758664793
Validation Accuracy: 0.9947196677543756
Validation F1-Score: 0.8985167837626853

Average train loss: 0.01287587856365908


Epoch:  40%|████      | 2/5 [11:27:24<17:13:06, 20662.27s/it]

Validation loss: 0.01638697526650503
Validation Accuracy: 0.9954909522396915
Validation F1-Score: 0.912975912975913

Average train loss: 0.005892529607007444
Validation loss: 0.018631886810974472
Validation Accuracy: 0.9959655888460397


Epoch:  60%|██████    | 3/5 [17:18:10<11:32:34, 20777.28s/it]

Validation F1-Score: 0.9214618973561431

Average train loss: 0.0032527646533686227


Epoch:  80%|████████  | 4/5 [23:24:13<5:52:13, 21133.19s/it] 

Validation loss: 0.018779239163797517
Validation Accuracy: 0.9958469296944527
Validation F1-Score: 0.9185642609031263

Average train loss: 0.00199385326450725
Validation loss: 0.019842711591098287
Validation Accuracy: 0.9959062592702462


Epoch: 100%|██████████| 5/5 [29:09:09<00:00, 20989.95s/it]  

Validation F1-Score: 0.9221232080588919






### Save the model

In [74]:
model_out_address = './models/scibert_software_sent'

if not os.path.exists(model_out_address):
    os.makedirs(model_out_address) 

In [76]:
model_to_save = model.module if hasattr(model, 'module') else model 

In [79]:
output_model_file = os.path.join(model_out_address, "pytorch_model.bin")
output_config_file = os.path.join(model_out_address, "config.json")

In [82]:
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(model_out_address)

('/Users/iwilliams/Desktop/Models/bert_out_model/scibert_software_sent/vocab.txt',)

### Performace metrics

In [84]:
model.eval();

In [86]:
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
y_true = []
y_pred = []

print("***** Running evaluation *****")
print("  Num examples ={}".format(len(val_inputs)))
print("  Batch size = {}".format(bs))
for step, batch in enumerate(valid_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, label_ids = batch
    with torch.no_grad():
        outputs = model(input_ids,
                        attention_mask=input_mask,)
        logits = outputs[0]
    logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2)
    logits = logits.detach().cpu().numpy()
    label_ids = label_ids.to('cpu').numpy()
    input_mask = input_mask.to('cpu').numpy()
    for i,mask in enumerate(input_mask):
        temp_1 = []
        temp_2 = []
        
        for j, m in enumerate(mask):
            if m:
                if tag2name[label_ids[i][j]] != "X" and tag2name[label_ids[i][j]] != "[CLS]" and tag2name[label_ids[i][j]] != "[SEP]" : 
                    temp_1.append(tag2name[label_ids[i][j]])
                    temp_2.append(tag2name[logits[i][j]])
            else:
                break
        y_true.append(temp_1)
        y_pred.append(temp_2)

report = classification_report(y_true, y_pred,digits=4)

***** Running evaluation *****
  Num examples =1754
  Batch size = 32
f1 socre: 0.922123
Accuracy score: 0.995906


In [87]:
print("F1 socre: %f"%(f1_score(y_true, y_pred)))
print(report)

f1 socre: 0.922123
Accuracy score: 0.995906
           precision    recall  f1-score   support

 software     0.9014    0.9343    0.9176       959
  version     0.9216    0.9515    0.9363       309

micro avg     0.9063    0.9385    0.9221      1268
macro avg     0.9063    0.9385    0.9221      1268

