In [1]:
import pandas as pd

# Load the records table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/eric_records.csv")

In [12]:
# Export only the "description" column (kept as a one-column DataFrame) to CSV;
# presumably the input for a masked-language-model step, judging by the filename.
# NOTE(review): to_csv writes the row index as an unnamed first column by default;
# pass index=False if the downstream reader expects the text column only -- confirm.
description_frame = df.loc[:, ["description"]]
description_frame.to_csv("eric_for_mlm.csv")

In [104]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, DistilBertModel

# One checkpoint identifier shared by both loads, so the tokenizer vocabulary
# and the encoder weights always come from the same pretrained model.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# DistilBertModel is loaded here rather than a masked-LM model, because the
# cells below read its `last_hidden_state` token embeddings.
model = DistilBertModel.from_pretrained(MODEL_NAME)

In [110]:
# A short, query-style sentence.
document = "I need papers about irrespective of language and cultural background."
# A longer abstract-style passage to compare the query against.
document2 = """This paper presents arguments in favor of and against formalistic and realistic approaches in teaching/learning English in an English-speaking country. Within the formalism approach, the purpose of language teaching is to inculcate an intuitive grasp of the system of the target language. The ability to manipulate language rules is of prime consideration, with the ability to use language appropriately of secondary concern. Realism refers to an approach where the ability to use language in natural situations is the prime concern. A command of language forms is of interest only as it contributes to the ability to operate effectively in real-life language situations. It is suggested that language programs combine formalism (since linguistic forms must be learned through practice) and  realism (since language is an activity related to human interaction and the exchange of information). (Author/JK)."""

# Encode both texts with identical options: PyTorch tensors out, and
# truncation enabled so over-long input is cut rather than passed through.
encode_options = {"return_tensors": "pt", "truncation": True}
inputs1 = tokenizer(document, **encode_options)
inputs2 = tokenizer(document2, **encode_options)

In [111]:
import torch
# Put the model in evaluation mode before running inference.
model.eval()

# Forward passes only; no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

# Per-token embeddings from the last hidden layer of each forward pass.
token_embeddings1 = outputs1.last_hidden_state
token_embeddings2 = outputs2.last_hidden_state

In [112]:
def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        token_embeddings: Tensor of shape (batch, seq_len, hidden), e.g. the
            model's ``last_hidden_state``.
        attention_mask: Tensor of shape (batch, seq_len) where non-zero entries
            mark positions that should count towards the mean.

    Returns:
        Tensor of shape (batch, hidden): one mean vector per sequence.
    """
    # Broadcast the mask across the hidden dimension so that masked-out
    # positions contribute zero to the sum below.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp the token count so a fully masked row cannot divide by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


# Pool each document's token embeddings into a single fixed-size vector.
attention_mask1 = inputs1['attention_mask']
attention_mask2 = inputs2['attention_mask']
average_embeddings1 = mean_pool(token_embeddings1, attention_mask1)
average_embeddings2 = mean_pool(token_embeddings2, attention_mask2)

In [113]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the two mean-pooled document vectors. Each input
# holds a single row, so the result is a 1x1 matrix.
cos_sim = cosine_similarity(X=average_embeddings1, Y=average_embeddings2)

In [114]:
# Bare expression: renders the similarity matrix as this cell's output.
cos_sim

array([[0.77956605]], dtype=float32)