In [1]:
import pandas as pd
import numpy as np
import random
import json
import csv
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification



In [2]:
# CSV file that the rest of the notebook scores the model against.
data_fname = "./datasets/valid_new.csv"
test_df = pd.read_csv(filepath_or_buffer=data_fname)

In [3]:
# Show the freshly loaded frame as this cell's output.
test_df

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_2_mention,type
0,0,the pharmacokinetics of tolbutamide in all vin...,tolbutamide,vinegar-baked radix bupleuri,NEGATIVE
1,1,"wever, treatment of radix bupleuri decreased t...",radix bupleuri,tolbutamide,POSITIVE
2,2,culated. results: the auc and t1/2 of midazola...,midazolam,radix bupleuri,POSITIVE
3,3,culated. results: the auc and t1/2 of midazola...,midazolam,vinegar-baked radix bupleuri,POSITIVE
4,4,culated. results: the auc and t1/2 of midazola...,dextromethorphan,radix bupleuri,POSITIVE
5,5,culated. results: the auc and t1/2 of midazola...,dextromethorphan,vinegar-baked radix bupleuri,POSITIVE
6,6,culated. results: the auc and t1/2 of midazola...,chlorzoxazone,radix bupleuri,POSITIVE
7,7,culated. results: the auc and t1/2 of midazola...,chlorzoxazone,vinegar-baked radix bupleuri,POSITIVE
8,8,the reason of different therapeutic effects of...,radix bupleuri,cyp2c9,POSITIVE
9,9,the reason of different therapeutic effects of...,vinegar-baked radix bupleuri,cyp2c19,POSITIVE


In [17]:
# Relation label name -> class index used by the classification head.
label_map_dic = dict(
    POSITIVE=0,
    SPECULATIVE=1,
    NEGATIVE=2,
)

# Inverse lookup: class index -> relation label name.
embd_map_dic = {class_idx: label_name for label_name, class_idx in label_map_dic.items()}

In [18]:
# Show the index -> label name mapping built from label_map_dic.
embd_map_dic

{0: 'POSITIVE', 1: 'SPECULATIVE', 2: 'NEGATIVE'}

In [19]:
# Directory passed to from_pretrained() for both the tokenizer and the model.
model_path = "./output_biobert/models/bert_model_epoch5/"

In [20]:
# Load the tokenizer (input text is not lower-cased) and the sequence
# classifier from the same checkpoint directory; the classification head gets
# one output per entry in label_map_dic.
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label_map_dic),
)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device);

In [21]:
def classify_relation(row, max_length=128):
    """Predict the relation label for one row of the evaluation frame.

    The sentence and the two entity mentions are joined with literal
    ``[SEP]`` markers into one string, encoded with the module-level
    ``tokenizer`` and scored with the module-level ``model`` on ``device``.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) providing the string
            fields ``'sentences'``, ``'entity_1_mention'`` and
            ``'entity_2_mention'``.
        max_length: Maximum number of token ids fed to the model; longer
            inputs are truncated. Defaults to 128, the value that was
            previously hard-coded.

    Returns:
        The label name that ``embd_map_dic`` associates with the
        highest-scoring class index.
    """
    text = '[SEP]'.join(
        [row['sentences'], row['entity_1_mention'], row['entity_2_mention']]
    )

    # truncation=True makes the cut-off explicit. It selects the same
    # 'longest_first' strategy the tokenizer used to fall back to, but without
    # emitting a warning on every call.
    # NOTE(review): the entity mentions sit at the end of `text`, so they are
    # presumably the first tokens lost when a long sentence is truncated --
    # confirm this matches how the model was fine-tuned.
    token_ids = tokenizer.encode(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
    )
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  # batch of 1

    # Inference needs no labels: without them the model computes no loss and
    # the logits are the first element of its output.
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs[0]

    # Cast to a plain int so the lookup does not depend on a NumPy scalar key.
    predicted_index = int(np.argmax(logits.cpu().numpy(), axis=1)[0])
    return embd_map_dic[predicted_index]

In [22]:
# Classify every row (axis=1) and keep the predicted label next to the gold one.
test_df['predicted_type'] = test_df.apply(classify_relation, axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
# Show the frame again, now including the predicted_type column next to type.
test_df

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_2_mention,type,predicted_type
0,0,the pharmacokinetics of tolbutamide in all vin...,tolbutamide,vinegar-baked radix bupleuri,NEGATIVE,POSITIVE
1,1,"wever, treatment of radix bupleuri decreased t...",radix bupleuri,tolbutamide,POSITIVE,POSITIVE
2,2,culated. results: the auc and t1/2 of midazola...,midazolam,radix bupleuri,POSITIVE,SPECULATIVE
3,3,culated. results: the auc and t1/2 of midazola...,midazolam,vinegar-baked radix bupleuri,POSITIVE,SPECULATIVE
4,4,culated. results: the auc and t1/2 of midazola...,dextromethorphan,radix bupleuri,POSITIVE,SPECULATIVE
5,5,culated. results: the auc and t1/2 of midazola...,dextromethorphan,vinegar-baked radix bupleuri,POSITIVE,SPECULATIVE
6,6,culated. results: the auc and t1/2 of midazola...,chlorzoxazone,radix bupleuri,POSITIVE,SPECULATIVE
7,7,culated. results: the auc and t1/2 of midazola...,chlorzoxazone,vinegar-baked radix bupleuri,POSITIVE,SPECULATIVE
8,8,the reason of different therapeutic effects of...,radix bupleuri,cyp2c9,POSITIVE,POSITIVE
9,9,the reason of different therapeutic effects of...,vinegar-baked radix bupleuri,cyp2c19,POSITIVE,POSITIVE


In [28]:
# For each gold label, the number of rows whose prediction matches that label.
true_vals = {
    label: len(
        test_df[
            (test_df['predicted_type'].eq(test_df['type']))
            & (test_df['type'].eq(label))
        ]
    )
    for label in label_map_dic
}
true_vals

{'POSITIVE': 21, 'SPECULATIVE': 7, 'NEGATIVE': 2}

In [31]:
# For each gold label, the number of rows whose prediction differs from it.
false_vals = {
    label: len(
        test_df[
            (test_df['predicted_type'].ne(test_df['type']))
            & (test_df['type'].eq(label))
        ]
    )
    for label in label_map_dic
}
false_vals

{'POSITIVE': 7, 'SPECULATIVE': 2, 'NEGATIVE': 2}

In [33]:
# Per-label accuracy in percent: of the rows whose gold label is `label`, the
# share that was predicted correctly (this is per-class recall).
# A label with no gold rows has nothing to score, so it is reported as NaN
# rather than raising ZeroDivisionError.

accuracy = {
    label: (
        (true_vals[label] / (true_vals[label] + false_vals[label])) * 100
        if (true_vals[label] + false_vals[label]) > 0
        else float('nan')
    )
    for label in label_map_dic.keys()
}

accuracy

{'POSITIVE': 75.0, 'SPECULATIVE': 77.77777777777779, 'NEGATIVE': 50.0}

In [37]:
# Overall accuracy in percent: correct predictions over all scored rows.
total_accuracy = (
    sum(true_vals.values())
    / sum(true_vals[label] + false_vals[label] for label in true_vals)
    * 100
)

In [38]:
# Show the overall accuracy (percent) computed from true_vals and false_vals.
total_accuracy

73.17073170731707