In [1]:
# import onto
import torch
import pandas as pd 
import torch
from torch.utils.data import Dataset,DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import os 
import numpy as np 
import torch.nn as nn
from itertools import combinations
from sklearn.preprocessing import LabelEncoder


In [15]:
def evaluate(net, criterion, dataloader,max_iter=1000):
    """Compute mean accuracy and mean loss of `net` over (part of) a dataloader.

    Args:
        net: model called as `net(tokens)` and returning one logit per sample.
        criterion: loss called as `criterion(logits.squeeze(-1), labels.float())`.
        dataloader: iterable yielding `(tokens, labels)` pairs, where `tokens`
            is a mapping of tensors.
        max_iter: maximum number of batches to score (at least one batch is
            always scored).

    Returns:
        Tuple `(mean_accuracy, mean_loss)`, both averaged over the batches seen.

    Raises:
        ValueError: if the dataloader yields no batches.
    """
    # Follow the model's own device instead of assuming CUDA is present.
    try:
        device = next(net.parameters()).device
    except StopIteration:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Remember the current mode: this function is called in the middle of
    # training and must not leave dropout disabled afterwards.
    was_training = net.training
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    try:
        with torch.no_grad():
            for tokens, labels in dataloader:
                # Drop only the singleton axis 1 of 3-D inputs. A bare
                # squeeze() would also remove the batch axis whenever a batch
                # holds a single sample. A fresh dict is built so the
                # caller's batch is left untouched.
                tokens = {
                    key: (value.squeeze(1) if value.dim() > 2 else value).to(device)
                    for key, value in tokens.items()
                }
                labels = labels.to(device)

                logits = net(tokens)
                mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
                mean_acc += get_accuracy_from_logits(logits, labels)
                count += 1
                # ">=" so that exactly max_iter batches are scored.
                if count >= max_iter:
                    break
    finally:
        net.train(was_training)

    if count == 0:
        raise ValueError("evaluate() received a dataloader that yielded no batches")

    return mean_acc / count, mean_loss / count

In [3]:
## Read in Alvaro's dataset: two CSV files, loaded into pos_samples and neg_samples.
# NOTE(review): later cells use the columns txt1, txt2 and score of the combined frame.

pos_samples = pd.read_csv('HPO-Terms-Dataset/HPO-Terms-Dataset/terms-poss.csv')
neg_samples = pd.read_csv('HPO-Terms-Dataset/HPO-Terms-Dataset/terms-negs.csv')


In [4]:
# Seed the global RNGs before the first stochastic step so that a fresh
# "Restart & Run All" reproduces the same shuffle, split and training run.
# pandas' `.sample` without `random_state` draws from NumPy's global RNG;
# PyTorch's global RNG drives weight initialisation and DataLoader shuffling.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Stack positive and negative pairs, then shuffle the rows (frac=1 keeps all of them).
dataset = pd.concat([pos_samples,neg_samples],axis=0,sort=False).sample(frac=1)

In [5]:
# Boolean label: True where the pair's score is strictly positive.
dataset['target'] = dataset['score'].gt(0)

In [6]:
# Use the column names that the Dataset class reads.
dataset = dataset.rename(columns={'txt1': 'names_left', 'txt2': 'names_right'})

In [7]:
# Create the train and validation sets: shuffle, then cut at the 80% mark.

dataset = dataset.sample(frac=1)
dataset = dataset.reset_index(drop=True)

n_train_samples = int(len(dataset) * 0.8)
train_data, valid_data = dataset.iloc[:n_train_samples], dataset.iloc[n_train_samples:]

In [8]:
## Dataset class that tokenizes sentence pairs for training/classification

class hpoDataset(Dataset):
    """Sentence-pair dataset backed by a pandas DataFrame.

    Each item is `(tokens, label)`: `tokens` is the tokenizer's encoding of the
    `names_left` / `names_right` pair, padded or truncated to `max_length`,
    and `label` is the row's `target` value.

    Args:
        dataset: DataFrame with the columns `names_left`, `names_right` and
            `target`. Its index is reset, so positions 0..len-1 are valid.
        max_length: sequence length every pair is padded/truncated to.
        tokenizer: optional ready-made tokenizer exposing `encode_plus`. Pass
            one to share a single tokenizer between several datasets; when
            omitted, one is loaded from `model_name`.
        model_name: checkpoint name given to `AutoTokenizer.from_pretrained`
            when no tokenizer is supplied.

    Raises:
        KeyError: if a required column is missing from `dataset`.
    """

    REQUIRED_COLUMNS = ('names_left', 'names_right', 'target')

    def __init__(self,dataset,max_length=20, tokenizer=None,
                 model_name="emilyalsentzer/Bio_ClinicalBERT"):
        # Fail at construction time rather than on the first item access.
        missing = [col for col in self.REQUIRED_COLUMNS if col not in dataset.columns]
        if missing:
            raise KeyError("dataset is missing required column(s): {}".format(missing))

        self.df = dataset.reset_index(drop=True)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer = tokenizer
        self.maxlen = max_length

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.df)

    def __getitem__(self, index):
        """Return `(tokens, label)` for the row at position `index`."""
        # Materialise the row once instead of once per column.
        row = self.df.iloc[index]

        # Encode both sentences as one pair input, fixed to self.maxlen tokens.
        # NOTE(review): with return_tensors='pt' each tensor presumably carries
        # a leading axis of size 1, which the training loop squeezes away.
        tokens = self.tokenizer.encode_plus(
            row['names_left'],
            row['names_right'],
            truncation=True,
            return_tensors='pt',
            padding='max_length',
            max_length=self.maxlen,
        )

        return tokens, row['target']


In [9]:
# Wrap the training split in a Dataset and a shuffling DataLoader (batches of 30).
train_set = hpoDataset(train_data)
train_loader = DataLoader(train_set, batch_size = 30, num_workers = 0,shuffle=True)

# Same for the validation split. It is shuffled too, so a capped evaluation
# sees a different subset of batches on each call.
valid_set = hpoDataset(valid_data)
valid_loader = DataLoader(valid_set, batch_size = 30, num_workers = 0,shuffle=True)

In [10]:
class HpoSimilarity(nn.Module):
    """BERT encoder followed by a single-logit linear head for sentence pairs.

    Args:
        freeze_bert: when True, the encoder's parameters are excluded from
            gradient updates and only the linear head is trained.
        device: device to place the whole module on. Defaults to `'cuda'`
            when CUDA is available, otherwise `'cpu'`.
        model_name: checkpoint name given to `AutoModel.from_pretrained`.
    """

    def __init__(self, freeze_bert = False, device=None,
                 model_name='emilyalsentzer/Bio_ClinicalBERT'):
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Pretrained encoder
        self.bert_layer = AutoModel.from_pretrained(model_name)

        # Optionally freeze the encoder
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification head; its input width is read from the encoder's
        # config rather than hard-coded to 768.
        self.cls_layer = nn.Linear(self.bert_layer.config.hidden_size, 1)

        # Move encoder and head together so they always share a device.
        self.to(device)

    def forward(self, tokens):
        """Return one raw logit per sample, shape `(batch, 1)`.

        Args:
            tokens: mapping of keyword tensors for the encoder (unpacked with
                `**`), already on the module's device.
        """
        # Index the output instead of unpacking it into two names: the
        # encoder may return a ModelOutput object or a tuple with more than
        # two entries, and `[0]` is the sequence of hidden states in both.
        outputs = self.bert_layer(**tokens)
        cont_reps = outputs[0]

        # Representation at the first position, i.e. the [CLS] token
        cls_rep = cont_reps[:, 0]

        # The head shares the encoder's device, so no extra transfer is needed.
        logits = self.cls_layer(cls_rep)

        return logits


In [11]:
# Build the classifier with default arguments (the BERT weights stay trainable).
net = HpoSimilarity()

import torch.nn as nn
import torch.optim as optim

# BCEWithLogitsLoss takes raw logits, matching the model's un-activated output.
criterion = nn.BCEWithLogitsLoss()
# Adam over all of the network's parameters, learning rate 2e-5.
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [12]:
def get_accuracy_from_logits(logits, labels):
    """Fraction of samples whose thresholded prediction equals its label.

    Logits go through a sigmoid, are cut at 0.5 to give 0/1 predictions, and
    are compared element-wise with `labels`. Returns a scalar tensor.
    """
    # The axis added by unsqueeze is removed again by the bare squeeze().
    predicted = torch.sigmoid(logits.unsqueeze(-1)).gt(0.5).long().squeeze()
    hits = predicted.eq(labels)
    return hits.float().mean()

In [17]:
def train(net, criterion, opti, train_loader, val_loader, epochs=1,
          log_every=1000, eval_every=5000, eval_max_iter=1000,
          checkpoint_path='d:/tmp/fine_tune_sentence_pair_AM{}.dat'):
    """Fine-tune `net`, validating periodically and stopping on no improvement.

    Args:
        net: model called as `net(tokens)` and returning one logit per sample.
        criterion: loss called as `criterion(logits.squeeze(-1), labels.float())`.
        opti: optimizer over the parameters of `net`.
        train_loader: iterable of `(tokens, labels)` training batches, where
            `tokens` is a mapping of tensors.
        val_loader: iterable of validation batches passed to `evaluate`.
        epochs: number of passes over `train_loader`.
        log_every: print the current batch's loss/accuracy every this many
            iterations.
        eval_every: run validation every this many iterations.
        eval_max_iter: cap on validation batches per validation run.
        checkpoint_path: format string for checkpoint files; `{}` receives the
            zero-based iteration index.

    Returns:
        None. Training ends when the epochs are exhausted or as soon as a
        validation run fails to beat the best validation accuracy so far.
    """
    # Follow the model's own device instead of assuming CUDA is present.
    try:
        device = next(net.parameters()).device
    except StopIteration:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    best_acc = 0
    for ep in range(epochs):
        # Make sure dropout etc. are active, whatever mode the caller left.
        net.train()

        for it, (tokens, labels) in enumerate(train_loader):
            # Clear gradients
            opti.zero_grad()

            # Drop only the singleton axis 1 of 3-D inputs. A bare squeeze()
            # would also remove the batch axis whenever a batch holds a
            # single sample (e.g. the last, partial batch).
            tokens = {
                key: (value.squeeze(1) if value.dim() > 2 else value).to(device)
                for key, value in tokens.items()
            }
            labels = labels.to(device)

            # Forward pass and loss
            logits = net(tokens)
            loss = criterion(logits.squeeze(-1), labels.float())

            # Backpropagation and optimization step
            loss.backward()
            opti.step()

            if (it + 1) % log_every == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))

            if (it + 1) % eval_every == 0:
                val_acc, val_loss = evaluate(net, criterion, val_loader, eval_max_iter)
                # evaluate() switches the model to eval mode; switch back so
                # the remaining iterations train with dropout enabled.
                net.train()
                print("{} iteration completed! Validation Accuracy : {}, Validation Loss : {}".format(it, val_acc, val_loss))
                if val_acc>best_acc:
                    best_acc = val_acc
                    target = checkpoint_path.format(it)
                    # Create the folder first so a missing directory does not
                    # discard thousands of completed training iterations.
                    folder = os.path.dirname(target)
                    if folder:
                        os.makedirs(folder, exist_ok=True)
                    torch.save(net.state_dict(), target)
                else:
                    # Early stop: validation accuracy did not improve.
                    return
                

In [18]:
# Fine-tune the model; train() prints its own progress and validation lines.
train(net, criterion, opti, train_loader, valid_loader)

Iteration 1000 of epoch 1 complete. Loss : 0.17828981578350067 Accuracy : 0.9000000357627869
Iteration 2000 of epoch 1 complete. Loss : 0.0674898624420166 Accuracy : 0.9666666984558105
Iteration 3000 of epoch 1 complete. Loss : 0.12143027037382126 Accuracy : 0.9666666984558105
Iteration 4000 of epoch 1 complete. Loss : 0.25577622652053833 Accuracy : 0.9000000357627869
Iteration 5000 of epoch 1 complete. Loss : 0.03989880159497261 Accuracy : 0.9666666984558105
4999 iteration completed! Validation Accuracy : 0.9760932922363281, Validation Loss : 0.06357457403272211
Iteration 6000 of epoch 1 complete. Loss : 0.25289568305015564 Accuracy : 0.9000000357627869
Iteration 7000 of epoch 1 complete. Loss : 0.033161234110593796 Accuracy : 1.0
Iteration 8000 of epoch 1 complete. Loss : 0.1032642126083374 Accuracy : 0.9666666984558105
Iteration 9000 of epoch 1 complete. Loss : 0.10855235904455185 Accuracy : 0.9666666984558105
Iteration 10000 of epoch 1 complete. Loss : 0.0322815477848053 Accuracy :

In [None]:
# Final validation pass with a very large batch cap.
# NOTE(review): 1000000 presumably exceeds the number of validation batches, so the whole split is scored; confirm.
val_acc, val_loss=evaluate(net, criterion, valid_loader,1000000)


In [20]:
# Report the metrics returned by the final evaluate() call.
print("Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))


Validation Accuracy : 0.9912647008895874, Validation Loss : 0.024122397497957235
