In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import torch.nn.functional as F

In [None]:
# Checkpoint names handed to AutoTokenizer / AutoModel.from_pretrained below.
# Note that `bert` points at a MiniLM sentence-transformers checkpoint.
bert = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
scibert = "allenai/scibert_scivocab_uncased"
specter = "allenai/specter"
sbert = "Muennighoff/SBERT-base-nli-v2"

In [None]:
# Tokenize one sample sentence with the `sbert` tokenizer and show the result.
tokenizer = AutoTokenizer.from_pretrained(sbert)
tokens = tokenizer(
    "Hello World",
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)
print(tokens)

In [None]:
# Display the `vocab_size` attribute of the tokenizer loaded above.
tokenizer.vocab_size

In [None]:
# Shape of the input-id tensor; the next cell indexes it as [0, i], i.e. axis 1
# runs over token positions of the single encoded sentence.
tokens['input_ids'].shape

In [None]:
# Walk over the first (only) sequence and print position, token id and decoded text.
for i, token_id in enumerate(tokens['input_ids'][0]):
    print(i, token_id, tokenizer.decode(token_id))

In [None]:
#Encode text
# Loaded (tokenizer, model) pairs keyed by model name. encode() is called once
# per sentence per model in this notebook, so without this cache every call
# would load the tokenizer and the model weights again.
_ENCODER_CACHE = {}


def _load_encoder(modelname):
    """Return the (tokenizer, model) pair for `modelname`, loading it on first use."""
    if modelname not in _ENCODER_CACHE:
        _ENCODER_CACHE[modelname] = (
            AutoTokenizer.from_pretrained(modelname),
            AutoModel.from_pretrained(modelname),
        )
    return _ENCODER_CACHE[modelname]


def _mean_pooling(model_output, attention_mask):
    """Average the token embeddings, counting only positions where the mask is set."""
    token_embeddings = model_output.last_hidden_state
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp keeps the division finite if a row's mask sums to zero.
    counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed / counts


def encode(texts, modelname, strat='pool'):
    """Embed `texts` with the transformer checkpoint `modelname`.

    Args:
        texts: A string or list of strings, passed straight to the tokenizer
            (padded, truncated to at most 512 tokens).
        modelname: Name given to AutoTokenizer / AutoModel.from_pretrained.
        strat: Pooling strategy. 'pool' averages the last hidden states over
            the attention mask; 'cls' takes the hidden state at position 0.

    Returns:
        The L2-normalised embeddings as a torch tensor. A leading dimension of
        size 1 (e.g. a single input string) is squeezed away.

    Raises:
        ValueError: If `strat` is neither 'pool' nor 'cls'.
    """
    # Validate first: previously an unknown strategy only failed after the
    # (expensive) model load, with an UnboundLocalError on `embeddings`.
    if strat not in ('pool', 'cls'):
        raise ValueError(f"Unknown pooling strategy {strat!r}; expected 'pool' or 'cls'")

    tokenizer, model = _load_encoder(modelname)
    # Tokenize sentences
    encoded_input = tokenizer(texts, padding=True, truncation=True,
                              return_tensors='pt', max_length=512)
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)
    # Perform pooling
    if strat == 'pool':
        embeddings = _mean_pooling(model_output, encoded_input['attention_mask'])
    else:  # strat == 'cls'
        embeddings = model_output.last_hidden_state[:, 0, :]
    # Normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings.squeeze(0)

In [None]:
def cosine_similarity(vector1, vector2, use_torch=True):
    """Cosine similarity of two 1-D vectors.

    With `use_torch=True` (default) the computation uses torch.dot / torch.norm
    and the result is a torch tensor; otherwise numpy's dot / linalg.norm are
    used. No guard is applied for zero-length vectors.
    """
    # Pick the backend once; the formula itself is identical for both.
    if use_torch:
        dot, norm = torch.dot, torch.norm
    else:
        dot, norm = np.dot, np.linalg.norm
    return dot(vector1, vector2) / (norm(vector1) * norm(vector2))

In [None]:
# Everyday sentences used for the first similarity comparison.
senL = [
    'Hello World',
    'Great day today',
    'Hi there and greetings',
]

In [None]:
# Sanity check: embed the first sentence with the `specter` checkpoint using
# the 'cls' strategy (hidden state at position 0) and display the vector.
encode(senL[0],specter,strat='cls')

In [None]:
# Toggle for the optional Gemini embeddings.
# NOTE(review): `encode_gemini` is defined further down in this notebook than
# the cells that call it, so setting this to True requires running that
# definition (and the Anvil connection cell) first.
USE_GEMINI=False

In [None]:
# Embed every sentence in `senL` with each encoder, using mean pooling.
genL = senL
strat = 'pool'
bert_embeddingsL, sbert_embeddingsL = [], []
scibert_embeddingsL, specter_embeddingsL = [], []
if USE_GEMINI:
    gemini_embeddingsL = []
for s in genL:
    # Encoders are invoked in the order bert, sbert, scibert, specter.
    for model_id, bucket in (
        (bert, bert_embeddingsL),
        (sbert, sbert_embeddingsL),
        (scibert, scibert_embeddingsL),
        (specter, specter_embeddingsL),
    ):
        bucket.append(encode(s, model_id, strat))
    if USE_GEMINI:
        gemini_embeddingsL.append(encode_gemini(s))

In [None]:
# Pairwise cosine similarities between the embeddings of each encoder.
# Despite the `*_distances` names these hold similarities (cosine_similarity
# results). Only the lower triangle including the diagonal (j <= i) is filled;
# the upper triangle stays 0.
# The matrix size follows the number of embedded texts instead of a fixed 3.
n_texts = len(bert_embeddingsL)
bert_distances = np.zeros((n_texts, n_texts))
sbert_distances = np.zeros((n_texts, n_texts))
scibert_distances = np.zeros((n_texts, n_texts))
specter_distances = np.zeros((n_texts, n_texts))
if USE_GEMINI:
    gemini_distances = np.zeros((n_texts, n_texts))
for i in range(n_texts):
    for j in range(i + 1):
        bert_distances[i, j] = cosine_similarity(bert_embeddingsL[i], bert_embeddingsL[j])
        sbert_distances[i, j] = cosine_similarity(sbert_embeddingsL[i], sbert_embeddingsL[j])
        scibert_distances[i, j] = cosine_similarity(scibert_embeddingsL[i], scibert_embeddingsL[j])
        specter_distances[i, j] = cosine_similarity(specter_embeddingsL[i], specter_embeddingsL[j])
        if USE_GEMINI:
            gemini_distances[i, j] = cosine_similarity(gemini_embeddingsL[i], gemini_embeddingsL[j])

In [None]:
# Lower-triangular cosine-similarity matrix for the `bert` (MiniLM) embeddings.
bert_distances

In [None]:
# Lower-triangular cosine-similarity matrix for the `sbert` embeddings.
sbert_distances

In [None]:
# Lower-triangular cosine-similarity matrix for the `scibert` embeddings.
scibert_distances

In [None]:
# Lower-triangular cosine-similarity matrix for the `specter` embeddings.
specter_distances

In [None]:
# The Gemini matrix only exists when the flag was on while computing similarities.
if USE_GEMINI:
    print(gemini_distances)

In [None]:
# Technical terms used for the second similarity comparison.
aiL = [
    'Deep Learning',
    'Artificial Intelligence',
    'Programming Languages',
]

In [None]:
# Embed every phrase in `aiL` with each encoder, using mean pooling.
# This rebinds the same `*_embeddingsL` names used for the previous sentence list.
genL = aiL
strat = 'pool'
bert_embeddingsL, sbert_embeddingsL = [], []
scibert_embeddingsL, specter_embeddingsL = [], []
if USE_GEMINI:
    gemini_embeddingsL = []
for s in genL:
    # Encoders are invoked in the order bert, sbert, scibert, specter.
    for model_id, bucket in (
        (bert, bert_embeddingsL),
        (sbert, sbert_embeddingsL),
        (scibert, scibert_embeddingsL),
        (specter, specter_embeddingsL),
    ):
        bucket.append(encode(s, model_id, strat))
    if USE_GEMINI:
        gemini_embeddingsL.append(encode_gemini(s))

In [None]:
# Pairwise cosine similarities between the current embeddings of each encoder.
# Despite the `*_distances` names these hold similarities (cosine_similarity
# results). Only the lower triangle including the diagonal (j <= i) is filled;
# the upper triangle stays 0.
# The matrix size follows the number of embedded texts instead of a fixed 3.
n_texts = len(bert_embeddingsL)
bert_distances = np.zeros((n_texts, n_texts))
sbert_distances = np.zeros((n_texts, n_texts))
scibert_distances = np.zeros((n_texts, n_texts))
specter_distances = np.zeros((n_texts, n_texts))
if USE_GEMINI:
    gemini_distances = np.zeros((n_texts, n_texts))
for i in range(n_texts):
    for j in range(i + 1):
        bert_distances[i, j] = cosine_similarity(bert_embeddingsL[i], bert_embeddingsL[j])
        sbert_distances[i, j] = cosine_similarity(sbert_embeddingsL[i], sbert_embeddingsL[j])
        scibert_distances[i, j] = cosine_similarity(scibert_embeddingsL[i], scibert_embeddingsL[j])
        specter_distances[i, j] = cosine_similarity(specter_embeddingsL[i], specter_embeddingsL[j])
        if USE_GEMINI:
            gemini_distances[i, j] = cosine_similarity(gemini_embeddingsL[i], gemini_embeddingsL[j])

In [None]:
# Lower-triangular cosine-similarity matrix for the `bert` (MiniLM) embeddings.
bert_distances

In [None]:
# Lower-triangular cosine-similarity matrix for the `scibert` embeddings.
scibert_distances

In [None]:
# Lower-triangular cosine-similarity matrix for the `specter` embeddings.
specter_distances

In [None]:
# The Gemini matrix only exists when the flag was on while computing similarities.
if USE_GEMINI:
    print(gemini_distances)

In [None]:
# Open the Anvil server connection used by `encode_gemini` below.
# NOTE(review): this cell sits after the cells that call `encode_gemini` when
# USE_GEMINI is True; on a fresh kernel it has to be run before them.
import anvil.server
import import_ipynb  # presumably lets `mykeys` be imported from a notebook — TODO confirm
from mykeys import GEMENIKEY,SERVERKEY
anvil.server.connect(SERVERKEY)

### Gemini Encoding

In [None]:
def encode_gemini(text):
    """Embed `text` through the remote 'encode_gemini' Anvil function.

    The text and GEMENIKEY are sent via anvil.server.call and whatever comes
    back is wrapped in a torch tensor.
    """
    remote_result = anvil.server.call('encode_gemini', text, GEMENIKEY)
    return torch.tensor(remote_result)

In [None]:
# Smoke test of the remote embedding call (needs the Anvil connection above).
encode_gemini('Hello World')

In [None]:
# Alternative kept for reference (disabled): call the Gemini embedding API
# directly through google.generativeai instead of going through Anvil.
# import google.generativeai as genai
# from mykeys import GEMENIKEY
# genai.configure(api_key=GEMENIKEY)
# def encode_gemini(text):
#     result = genai.embed_content(
#     model="models/embedding-001",
#     content=text,
#     task_type="retrieval_document",
#     title="Embedding of single string")
#     return result['embedding']