In [1]:
from transformers import AutoTokenizer, AutoModel 
import torch
import numpy as np
#from sklearn.metrics.pairwise import cosine_similarity
#from sentence_transformers import util

# Checkpoint shared by the tokenizer and the model. Keeping the id in one place
# guarantees both are always loaded from the same checkpoint.
# Alternative checkpoint tried earlier: "sentence-transformers/bert-base-nli-cls-token"
SBERT_MODEL_NAME = "sentence-transformers/bert-base-nli-mean-tokens"

sbert_tokenizer = AutoTokenizer.from_pretrained(SBERT_MODEL_NAME)
sbert_model = AutoModel.from_pretrained(SBERT_MODEL_NAME)

In [2]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings at the positions selected by a mask.

    Parameters:
    model_output: indexable whose first element holds the token embeddings
        (in this notebook a (batch, tokens, hidden) tensor).
    attention_mask: tensor with one entry per token; it is broadcast over the
        hidden dimension and cast to float, so 0/1 or boolean masks both work.
    Returns:
    tensor with the token dimension (dim 1) reduced: the masked sum divided by
    the mask sum, where the divisor is clamped to at least 1e-9 so an all-zero
    mask yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    # Give the mask the same shape as the embeddings so it can weight them.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_total / weight_total

In [33]:
### TASHA:
def _phrase_token_mask(phrase, sentence_ids, sentence_attention, tokenizer):
    """Keep `sentence_attention` only at positions whose token id occurs in `phrase`."""
    # Drop the first and last ids that the tokenizer puts around the phrase.
    phrase_ids = tokenizer(phrase)["input_ids"][1:-1]
    matched = torch.zeros_like(sentence_ids, dtype=torch.bool)
    # OR together one equality mask per phrase token, for any number of tokens.
    for token_id in phrase_ids:
        matched |= sentence_ids.eq(token_id)
    # e.g. attention [1,1,1,1,1] and matched [F,F,T,T,F] -> [0,0,1,1,0]
    return sentence_attention * matched.to(sentence_attention.dtype)


def sbert_phrase_sim(phrase1, phrase2, encoded_input, sbert_tokenizer, sbert_model):
    """
    Cosine similarity between two phrases, each embedded in the context of its sentence.

    Both sentences are run through the model once; each phrase embedding is the
    mean of the sentence's token embeddings restricted to the positions whose
    token id also appears in the tokenized phrase. Matching is per token id, so
    every occurrence of a phrase token in the sentence is included.

    Parameters:
    phrase1: str phrase from the first amr
    phrase2: str phrase from the second amr
    encoded_input: transformers.tokenization_utils_base.BatchEncoding; both sentences([sent1, sent2]) encoded by a tokenizer.
        Row 0 is searched for phrase1 and row 1 for phrase2.
    sbert_tokenizer: callable used to tokenize the phrases; its result must expose "input_ids".
    sbert_model: callable invoked as sbert_model(**encoded_input); element 0 of its result is mean-pooled.
    Returns:
    cosine_score: float; cosine similarity between the two phrase embeddings.
        A phrase with no matching token gets an all-zero embedding.
    """
    input_ids = encoded_input["input_ids"]
    attention = encoded_input["attention_mask"]

    # One restricted attention row per sentence, each built from that sentence's
    # own ids and attention mask.
    phrase_attention = torch.stack([
        _phrase_token_mask(phrase1, input_ids[0], attention[0], sbert_tokenizer),
        _phrase_token_mask(phrase2, input_ids[1], attention[1], sbert_tokenizer),
    ])

    # Inference only: no gradients needed.
    with torch.no_grad():
        model_output = sbert_model(**encoded_input)

    phrase_embeddings = mean_pooling(model_output, phrase_attention)
    cosine = torch.nn.functional.cosine_similarity(
        phrase_embeddings[0], phrase_embeddings[1], dim=0
    )
    return cosine.item()


In [3]:
### DENIS:

def x_in_y(query, base):
    """Find the first contiguous occurrence of `query` inside `base`.

    Parameters:
    query: sequence to look for; a value without a length is wrapped into a
        one-element container of the same type as `base`.
    base: sliceable sequence that is searched.
    Returns:
    list of the consecutive indices in `base` covered by the first match, or
    False when there is no match. An empty `query` matches at index 0 of a
    non-empty `base` and therefore yields an empty list.
    """
    try:
        width = len(query)
    except TypeError:
        # Scalar query: compare it as a single-element sequence.
        width = 1
        query = type(base)((query,))

    start = next(
        (pos for pos in range(len(base)) if base[pos:pos + width] == query),
        None,
    )
    if start is None:
        return False
    return list(range(start, start + width))

def get_new_attention_mask(phrase1, phrase2, encoded_input, sbert_tokenizer):
    """
    Build a boolean mask that selects each phrase's tokens inside its sentence.

    Parameters:
    phrase1: str phrase from the first amr
    phrase2: str phrase from the second amr
    encoded_input: transformers.tokenization_utils_base.BatchEncoding; both sentences([sent1, sent2]) encoded by a tokenizer.
    sbert_tokenizer: callable used to tokenize the phrases; its result must expose 'input_ids'.
    Returns:
    new_attention: bool tensor shaped like encoded_input['input_ids']; row 0 is True
        on the first contiguous occurrence of phrase1's ids in sentence 1, row 1
        likewise for phrase2 in sentence 2.
        NOTE: when either phrase is not found (or tokenizes to nothing) the plain
        integer 0 is returned instead of a tensor, so callers must check for it.
    """
    # Tokenize both phrases and drop the first and last ids around each one.
    phrase_ids = [sbert_tokenizer(text)['input_ids'][1:-1] for text in (phrase1, phrase2)]

    input_ids = encoded_input['input_ids']
    sentence_ids = [input_ids[row].tolist() for row in (0, 1)]

    # Index span of each phrase inside its own sentence (False / [] when absent).
    spans = [x_in_y(ids, sent) for ids, sent in zip(phrase_ids, sentence_ids)]
    if not all(spans):
        return 0

    phrase_mask = torch.zeros_like(input_ids, dtype=torch.bool)
    for row, span in enumerate(spans):
        phrase_mask[row, torch.tensor(span)] = True
    return phrase_mask
    

def sbert_sim(model_output, phrase1, phrase2, encoded_input=None, tokenizer=None):
    """
    Cosine similarity between two phrases, pooled from an existing model output.

    Parameters:
    model_output: result of running the model on both sentences; element 0 holds the token embeddings.
    phrase1: str phrase from the first sentence
    phrase2: str phrase from the second sentence
    encoded_input: tokenizer output for both sentences; defaults to the notebook-level `encoded_ids`.
    tokenizer: tokenizer used for the phrases; defaults to the notebook-level `sbert_tokenizer`.
    Returns:
    float; cosine similarity of the two phrase embeddings, or 0.0 when either
    phrase cannot be located in its sentence.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if encoded_input is None:
        encoded_input = encoded_ids
    if tokenizer is None:
        tokenizer = sbert_tokenizer

    phrase_mask = get_new_attention_mask(phrase1, phrase2, encoded_input, tokenizer)
    # get_new_attention_mask signals "phrase not found" with a plain 0 instead of
    # a tensor; pooling with that would raise, so report zero similarity.
    if not torch.is_tensor(phrase_mask):
        return 0.0

    phrase_embeddings = mean_pooling(model_output, phrase_mask)
    # Computed directly so the function does not depend on a `cos` global
    # being defined in some other cell.
    return torch.nn.functional.cosine_similarity(
        phrase_embeddings[0], phrase_embeddings[1], dim=0
    ).item()
    

In [4]:
# Example inputs: two sentences plus the phrase to compare inside each one.
# Alternative examples (uncomment one group to try it):
# sentences = ["Chinese lunar rover lands on moon","China lands robot rover on moon"]
# phrase1 = "Chinese lunar rover"
# phrase2 = "robot"

# sentences = ["a young cat sprints", "a kitten runs"]
# phrase1 = "young cat"
# phrase2 = "kitten"

cos = torch.nn.CosineSimilarity(0)  # cosine similarity along dim 0
sentences = ["we eat french fries", "we eat chips"]
phrase1 = "french fries"
phrase2 = "chips"

encoded_ids = sbert_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Inference only: run the forward pass without building an autograd graph.
with torch.no_grad():
    phrase_model_output = sbert_model(**encoded_ids)

print(sbert_sim(phrase_model_output, phrase1, phrase2))

0.6100732684135437


In [13]:
# Sentences we want sentence embeddings for
sentences = ['dog walks', 'cat', 'kitten runs']

# Tokenize sentences with the tokenizer loaded at the top of the notebook
# (the bare names `tokenizer` / `model` are not defined anywhere in this notebook).
encoded_input = sbert_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = sbert_model(**encoded_input)

# Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
print(sentence_embeddings.shape)
print(encoded_input['attention_mask'])

torch.Size([3, 768])
tensor([[1, 1, 1, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 1]])


In [19]:
# Display the full tokenizer output (input_ids, token_type_ids, attention_mask).
encoded_input

{'input_ids': tensor([[  101,  3899,  7365,   102],
        [  101,  4937,   102,     0],
        [  101, 18401,  3216,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0],
        [1, 1, 1, 1]])}

In [21]:
# Stack the first five components of sentence 0's embedding with itself;
# both operands are the same slice, so the two displayed rows are identical.
torch.stack((sentence_embeddings[0, :5], 
            sentence_embeddings[0, :5]))

tensor([[ 0.6995,  0.2457,  1.4855, -0.4822, -0.4247],
        [ 0.6995,  0.2457,  1.4855, -0.4822, -0.4247]])

In [6]:
# Size of the model's first output (the token embeddings) for the batch in encoded_ids.
sbert_model(**encoded_ids)[0].size()

torch.Size([2, 6, 768])