In [1]:
import pandas as pd
import numpy as np
import random
import json
import csv
import itertools
import torch
from transformers import BertTokenizer, BertForSequenceClassification



In [2]:
# Path of the labelled CSV evaluated in this notebook (relative to the
# notebook's working directory; the file name suggests a validation split).
data_fname = './datasets/valid_new.csv'
# Load it into a DataFrame. Later cells read the 'sentences',
# 'entity_1_mention', 'entity_2_mention' and 'type' columns.
test_df = pd.read_csv(data_fname)

In [3]:
test_df

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_1_label,entity_2_mention,entity_2_label,type
0,0,the pharmacokinetics of tolbutamide in all vin...,tolbutamide,CHEMICAL,vinegar-baked radix bupleuri,HERB,NEGATIVE
1,1,the pharmacokinetics of tolbutamide in all vin...,radix bupleuri,HERB,tolbutamide,CHEMICAL,POSITIVE
2,2,the plasma concentrations of the six probes we...,midazolam,CHEMICAL,radix bupleuri,HERB,POSITIVE
3,3,the plasma concentrations of the six probes we...,midazolam,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE
4,4,the plasma concentrations of the six probes we...,dextromethorphan,CHEMICAL,radix bupleuri,HERB,POSITIVE
5,5,the plasma concentrations of the six probes we...,dextromethorphan,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE
6,6,the plasma concentrations of the six probes we...,chlorzoxazone,CHEMICAL,radix bupleuri,HERB,POSITIVE
7,7,the plasma concentrations of the six probes we...,chlorzoxazone,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE
8,8,the reason of different therapeutic effects of...,radix bupleuri,HERB,cyp2c9,CHEMICAL,POSITIVE
9,9,the reason of different therapeutic effects of...,vinegar-baked radix bupleuri,HERB,cyp2c19,CHEMICAL,POSITIVE


In [4]:
# Relation label name -> integer class index (order defines the class ids).
label_map_dic = {
    name: idx
    for idx, name in enumerate(("POSITIVE", "SPECULATIVE", "NEGATIVE"))
}

# Inverse lookup: integer class index -> relation label name.
embd_map_dic = {idx: name for name, idx in label_map_dic.items()}

In [5]:
embd_map_dic

{0: 'POSITIVE', 1: 'SPECULATIVE', 2: 'NEGATIVE'}

In [6]:
# Directory handed to `from_pretrained` for both the tokenizer and the
# classifier (path suggests a BioBERT checkpoint saved after epoch 5 — confirm).
model_path = './output_biobert/models/bert_model_epoch5/'

In [7]:
# Load the BERT tokenizer from `model_path`; do_lower_case=False asks the
# tokenizer not to lowercase the input text.
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)
# Load the sequence classifier from the same directory, with one output
# class per entry in `label_map_dic`.
model = BertForSequenceClassification.from_pretrained(model_path,num_labels=len(label_map_dic))
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Send the model to the device; the trailing ';' suppresses the cell's repr output.
model.to(device);

In [8]:
def classify_relation(row, max_length=128):
    """Predict the relation label for one sentence / entity-pair row.

    The model input is the sentence followed by the two entity mentions,
    joined with literal '[SEP]' markers. It is encoded with the module-level
    ``tokenizer`` and scored by the module-level ``model`` on ``device``.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide string values under 'sentences', 'entity_1_mention'
        and 'entity_2_mention'.
    max_length : int, optional
        Maximum number of token ids fed to the model; longer inputs are
        truncated. Defaults to 128, the value that used to be hard-coded.

    Returns
    -------
    str
        The label name that ``embd_map_dic`` maps the highest-scoring class
        index to.
    """
    text = row['sentences'] + '[SEP]' + row['entity_1_mention'] + '[SEP]' + row['entity_2_mention']

    # truncation=True makes the cut to `max_length` explicit. Without it the
    # tokenizer emits a warning and falls back to the same 'longest_first'
    # strategy, so predictions are unchanged.
    # NOTE(review): the entity mentions sit at the end of `text`, so very long
    # sentences may lose them to truncation — confirm this matches training.
    token_ids = tokenizer.encode(text,
                                 add_special_tokens=True,
                                 max_length=max_length,
                                 truncation=True)
    input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)  # Batch size 1

    with torch.no_grad():
        # No labels are passed at inference time: this avoids computing a
        # meaningless loss against a dummy target, and makes the logits the
        # first element of the model output.
        outputs = model(input_ids)
    logits = outputs[0]

    # Index of the highest-scoring class for the single item in the batch.
    result = int(np.argmax(logits.cpu().numpy(), axis=1)[0])

    return embd_map_dic[result]

In [9]:
# Run the classifier once per row and store the predicted label next to the
# gold 'type' column.
test_df['predicted_type'] = test_df.apply(classify_relation, axis=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
test_df

Unnamed: 0.1,Unnamed: 0,sentences,entity_1_mention,entity_1_label,entity_2_mention,entity_2_label,type,predicted_type
0,0,the pharmacokinetics of tolbutamide in all vin...,tolbutamide,CHEMICAL,vinegar-baked radix bupleuri,HERB,NEGATIVE,POSITIVE
1,1,the pharmacokinetics of tolbutamide in all vin...,radix bupleuri,HERB,tolbutamide,CHEMICAL,POSITIVE,POSITIVE
2,2,the plasma concentrations of the six probes we...,midazolam,CHEMICAL,radix bupleuri,HERB,POSITIVE,SPECULATIVE
3,3,the plasma concentrations of the six probes we...,midazolam,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE,SPECULATIVE
4,4,the plasma concentrations of the six probes we...,dextromethorphan,CHEMICAL,radix bupleuri,HERB,POSITIVE,SPECULATIVE
5,5,the plasma concentrations of the six probes we...,dextromethorphan,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE,SPECULATIVE
6,6,the plasma concentrations of the six probes we...,chlorzoxazone,CHEMICAL,radix bupleuri,HERB,POSITIVE,SPECULATIVE
7,7,the plasma concentrations of the six probes we...,chlorzoxazone,CHEMICAL,vinegar-baked radix bupleuri,HERB,POSITIVE,SPECULATIVE
8,8,the reason of different therapeutic effects of...,radix bupleuri,HERB,cyp2c9,CHEMICAL,POSITIVE,POSITIVE
9,9,the reason of different therapeutic effects of...,vinegar-baked radix bupleuri,HERB,cyp2c19,CHEMICAL,POSITIVE,POSITIVE


In [11]:
# Per gold label: number of rows whose prediction matches that gold label.
true_vals = {
    label: int(
        ((test_df['type'] == label) & (test_df['predicted_type'] == label)).sum()
    )
    for label in label_map_dic
}
true_vals

{'POSITIVE': 21, 'SPECULATIVE': 7, 'NEGATIVE': 2}

In [12]:
# Per gold label: number of rows with that gold label that were predicted
# as something else.
false_vals = {
    label: int(
        ((test_df['type'] == label) & (test_df['predicted_type'] != label)).sum()
    )
    for label in label_map_dic
}
false_vals

{'POSITIVE': 7, 'SPECULATIVE': 2, 'NEGATIVE': 2}

In [13]:
# Per-class accuracy in percent: of the rows whose gold label is `label`,
# the share that were predicted correctly (i.e. per-class recall).
# A label with no gold rows gets NaN instead of raising ZeroDivisionError.

accuracy = {
    label: (
        (true_vals[label] / (true_vals[label] + false_vals[label])) * 100
        if (true_vals[label] + false_vals[label]) > 0
        else float('nan')
    )
    for label in label_map_dic.keys()
}

accuracy

{'POSITIVE': 75.0, 'SPECULATIVE': 77.77777777777779, 'NEGATIVE': 50.0}

In [14]:
# Overall accuracy in percent: correct predictions over all counted rows.
n_correct = sum(true_vals.values())
n_total = n_correct + sum(false_vals.values())
total_accuracy = n_correct / n_total * 100

In [15]:
total_accuracy

73.17073170731707