In [1]:
from transformers import BertTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

In [4]:
data = pd.read_csv('testing_data.csv')

In [5]:
def tokenize(texts):
    tokens = {'input_ids': [], 'attention_mask': []}
    
    for text in texts:
        new_tokens = tokenizer.encode_plus(text, max_length=128,
                                           truncation=True, padding='max_length',
                                           return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    
    tokens['input_ids'] = torch.stack(tokens['input_ids'])
    tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
    
    return tokens 

In [6]:
def mean_pool(tokens, embeddings):
    attention_mask = tokens['attention_mask']
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings = embeddings * mask

    summed = torch.sum(masked_embeddings, 1)
    summed_mask = torch.clamp(mask.sum(1), min=1e-9)

    mean_pooled = summed / summed_mask
    
    return mean_pooled

In [7]:
def test(data):
    sim_arr = []
    
    text1 = data.text1
    text2 = data.text2 
    
    for i in range(len(data)):
        tokens = tokenize([text1[i], text2[i]])
        
        outputs = model(**tokens)
        embeddings = outputs.last_hidden_state
        
        mean_pooled = mean_pool(tokens, embeddings)
        mean_pooled = mean_pooled.detach().numpy()
        
        cos_sim = cosine_similarity([mean_pooled[0]], [mean_pooled[1]])
        
        sim_arr.append(cos_sim)
    
    return sim_arr

In [8]:
similarity_value = test(data)

In [9]:
output_ans = []

for i in similarity_value:
    value = i.flatten()[0]
    print(value)
    if value >= 0.91:
        output_ans.append(1)
    else:
        output_ans.append(0)

0.9260255
0.7161503
0.9502964
0.90253365
0.9898264
0.8316078
0.7744622
0.83685017
0.24968006
0.69243103
0.716396
0.8748588
0.9388281
0.95143384


In [10]:
print('roc_auc_score: ', roc_auc_score(data.ans, output_ans))

roc_auc_score:  0.7142857142857144
