In [1]:
from transformers import BertTokenizer, BertModel, AlbertTokenizerFast, AlbertModel
import torch
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_bert_tokenizer_and_model(tokenizer_path, model_path):
    """Load a BERT tokenizer and a BERT model and return them as a pair.

    Args:
        tokenizer_path: Passed unchanged to ``BertTokenizer.from_pretrained``.
        model_path: Passed unchanged to ``BertModel.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is loaded first, then
        the model.
    """
    return (
        BertTokenizer.from_pretrained(tokenizer_path),
        BertModel.from_pretrained(model_path),
    )

def get_albert_tokenizer_and_model(tokenizer_path, model_path):
    """Load an ALBERT (fast) tokenizer and an ALBERT model and return them as a pair.

    Args:
        tokenizer_path: Passed unchanged to ``AlbertTokenizerFast.from_pretrained``.
        model_path: Passed unchanged to ``AlbertModel.from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple. The tokenizer is loaded first, then
        the model.
    """
    return (
        AlbertTokenizerFast.from_pretrained(tokenizer_path),
        AlbertModel.from_pretrained(model_path),
    )

In [3]:
def get_embedding(tokenizer, model, text):
    """Tokenize ``text``, run ``model`` on it, and return the final hidden states.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors='pt', padding=True, truncation=True)``;
            its result is unpacked as keyword arguments into ``model``.
        model: Callable whose result exposes a ``last_hidden_state`` attribute.
        text: Input forwarded to ``tokenizer`` unchanged.

    Returns:
        ``last_hidden_state`` of the model output, a tensor of shape
        ``[batch_size, sequence_length, hidden_size]``. The whole tensor is
        returned; a caller that wants only the first-token ([CLS]) vector,
        often used as a sentence-level representation, slices ``[:, 0, :]``
        on the result.
    """
    encoded = tokenizer(text, truncation=True, padding=True, return_tensors='pt')

    # Inference only: skip gradient tracking while running the model.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return hidden_states


In [4]:
# Checkpoint identifiers handed to `from_pretrained` when the tokenizer/model
# pairs are loaded: one for the ALBERT pair, one for the BERT pair.
math_albert_path = "AnReu/albert-for-arqmath-3"
desc_bert_path = "bert-base-uncased"

In [5]:
# Load both tokenizer/model pairs. In each case the same checkpoint id
# supplies the tokenizer and the model weights.
mtokenizer, mmodel = get_albert_tokenizer_and_model(
    tokenizer_path=math_albert_path,
    model_path=math_albert_path,
)
dtokenizer, dmodel = get_bert_tokenizer_and_model(
    tokenizer_path=desc_bert_path,
    model_path=desc_bert_path,
)

Some weights of the model checkpoint at AnReu/albert-for-arqmath-3 were not used when initializing AlbertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Two reaction strings sharing the same left-hand species; they are embedded
# and compared in the cells below.
reactions = [
    "[Blood-Lymph].CTLA4_mabB <-> Lymph_Node.CTLA4_mab",
    "[Blood-Lymph].CTLA4_mabB <-> Peripheral.CTLA4_mabP_leaky",
]

In [7]:
# Embed the first two reactions with the BERT pair and keep only the
# first-token ([CLS]) vector of each, i.e. index 0 along the sequence axis.
emb1, emb2 = (
    get_embedding(dtokenizer, dmodel, reactions[idx])[:, 0, :]
    for idx in (0, 1)
)

In [8]:
# Cosine similarity of the two first-token embeddings, taken along dim=1
# (the function's default axis, spelled out here for clarity).
F.cosine_similarity(emb1, emb2, dim=1)

tensor([0.9652])