# imports

In [10]:
import torch
from transformers import AutoModel, AutoTokenizer
from scipy.spatial.distance import cosine
import pandas as pd
import gensim
import nltk
import sys
import gensim.utils as gensimUtils

# Get our models - The package will take care of downloading the models automatically
# For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-125M-weightedmean-nli-bitfit")
model = AutoModel.from_pretrained("Muennighoff/SGPT-125M-weightedmean-nli-bitfit")
# Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
model.eval()

# Tokenize input 

queries = [
    "I'm searching for a planet not too far from Earth.",
]

docs = [
    "Neptune is the eighth and farthest-known Solar planet from the Sun. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, slightly more massive than its near-twin Uranus.",
    "TRAPPIST-1d, also designated as 2MASS J23062928-0502285 d, is a small exoplanet (about 30% the mass of the earth), which orbits on the inner edge of the habitable zone of the ultracool dwarf star TRAPPIST-1 approximately 40 light-years (12.1 parsecs, or nearly 3.7336×1014 km) away from Earth in the constellation of Aquarius.",
    "A harsh desert world orbiting twin suns in the galaxy’s Outer Rim, Tatooine is a lawless place ruled by Hutt gangsters. Many settlers scratch out a living on moisture farms, while spaceport cities such as Mos Eisley and Mos Espa serve as home base for smugglers, criminals, and other rogues.",
]

texts = [
    "deep learning",
    "artificial intelligence",
    "deep diving",
    "artificial snow",
]

# Asymmetric Semantic search Bi-Encoder

In [14]:
SPECB_QUE_BOS = tokenizer.encode("[", add_special_tokens=False)[0]
SPECB_QUE_EOS = tokenizer.encode("]", add_special_tokens=False)[0]

SPECB_DOC_BOS = tokenizer.encode("{", add_special_tokens=False)[0]
SPECB_DOC_EOS = tokenizer.encode("}", add_special_tokens=False)[0]


def tokenize_with_specb(texts, is_query):
    # Tokenize without padding
    batch_tokens = tokenizer(texts, padding=False, truncation=True)   
    # Add special brackets & pay attention to them
    for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
        if is_query:
            seq.insert(0, SPECB_QUE_BOS)
            seq.append(SPECB_QUE_EOS)
        else:
            seq.insert(0, SPECB_DOC_BOS)
            seq.append(SPECB_DOC_EOS)
        att.insert(0, 1)
        att.append(1)
    # Add padding
    batch_tokens = tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
    return batch_tokens

def get_weightedmean_embedding(batch_tokens, model):
    # Get the embeddings
    with torch.no_grad():
        # Get hidden state of shape [bs, seq_len, hid_dim]
        last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state

    # Get weights of shape [bs, seq_len, hid_dim]
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float().to(last_hidden_state.device)
    )

    # Get attn mask of shape [bs, seq_len, hid_dim]
    input_mask_expanded = (
        batch_tokens["attention_mask"]
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
    )

    # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings


SAAS_embeddings = get_weightedmean_embedding(tokenize_with_specb(SAlist, is_query=True), model)
GAAS_embeddings = get_weightedmean_embedding(tokenize_with_specb(GAlist, is_query=False), model)

# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(SAAS_embeddings[0], GAAS_embeddings[0])
cosine_sim_0_2 = 1 - cosine(SAAS_embeddings[1], GAAS_embeddings[0])
cosine_sim_0_3 = 1 - cosine(SAAS_embeddings[2], GAAS_embeddings[0])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (SAlist[0], GAlist[0][:20] + "...", cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (SAlist[1], GAlist[0][:20] + "...", cosine_sim_0_2))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (SAlist[2], GAlist[0][:20] + "...", cosine_sim_0_3))

Cosine similarity between "Rheostat is a resistor which can be adjusted to maintain the flow and when resistance " and " It would stay the s..." is: 0.266
Cosine similarity between "the voltmeter would get a lower reading depending on how high the rheostat resistance is which might raise or lower the amount of A (amps) the circuit gives." and " It would stay the s..." is: 0.188
Cosine similarity between "The reading would be lower." and " It would stay the s..." is: 0.321


#  response dataset

In [2]:
sys.argv = [""]

ET=pd.read_csv('ETrespclean3.csv', encoding='latin1') #get data from student responses set
df = pd.DataFrame(ET)    #set ET as dataframe
dfIdeal = pd.DataFrame({'Gsentences': ET.GA}) #define ideal answers for tokenization (good answers/Gans).
dfIdeal['tokenized_sents'] = dfIdeal.apply(lambda row: nltk.word_tokenize(row['Gsentences']), axis=1)
dfStudent = pd.DataFrame({'Ssentences': ET.SA}) #define student answers for tokenization.
dfStudent['tokenized_sents'] = dfStudent.apply(lambda row: nltk.word_tokenize(row['Ssentences']), axis=1)
Gans = dfIdeal['tokenized_sents'] ##renaming the 2 tokenized sent sets for ease.
Sans = dfStudent['tokenized_sents']
GansDict = (dfIdeal['tokenized_sents'].to_dict)  #dictionary for tokenized sents
SansDict = (dfStudent['tokenized_sents'].to_dict)


tokenizedGans = [(gensimUtils.simple_preprocess(i, deacc=True, min_len=1, max_len=14)) for i in ET.GA] #tokenize Gans and Sans for use in w2v, w2vB, and D2V models matching(LSA allows for unkown terms in tokenized strings. these other models do not.)
tokenizedSans = [(gensimUtils.simple_preprocess(i, deacc=True, min_len=1, max_len=14)) for i in ET.SA]

In [4]:
torch.__version__

'1.12.0'

In [3]:
GAlist = []
for i in ET.GA[0:100]:
    GAlist.append(i)
    
SAlist = []
for i in ET.SA[0:100]:
    SAlist.append(i)

# sentence embeddings, attention masks, weighting and mean pooling for ideal answers (GA)

In [4]:
batch_tokensGA = tokenizer(GAlist, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    # Get hidden state of shape [bs, seq_len, hid_dim]
    last_hidden_stateGA = model(**batch_tokensGA, output_hidden_states=True, return_dict=True).last_hidden_state

# Get weights of shape [bs, seq_len, hid_dim]
weightsGA = (
    torch.arange(start=1, end=last_hidden_stateGA.shape[1] + 1)
    .unsqueeze(0)
    .unsqueeze(-1)
    .expand(last_hidden_stateGA.size())
    .float().to(last_hidden_stateGA.device)
)

# Get attn mask of shape [bs, seq_len, hid_dim]
input_mask_expandedGA = (
    batch_tokensGA["attention_mask"]
    .unsqueeze(-1)
    .expand(last_hidden_stateGA.size())
    .float()
)

# encodings for ideal responses

In [5]:
# Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
sum_embeddingsGA2 = torch.sum(last_hidden_stateGA * input_mask_expandedGA * weightsGA, dim=1)
sum_maskGA = torch.sum(input_mask_expandedGA * weightsGA, dim=1)

embeddingsGA = sum_embeddingsGA2 / sum_maskGA
print(embeddingsGA)

tensor([[ 0.5642,  1.0927, -0.2866,  ..., -0.1318,  1.1017,  1.3367],
        [ 0.5642,  1.0927, -0.2866,  ..., -0.1318,  1.1017,  1.3367],
        [ 0.5642,  1.0927, -0.2866,  ..., -0.1318,  1.1017,  1.3367],
        ...,
        [-0.7614, -0.5094,  0.6846,  ...,  1.2404, -0.1497, -1.2174],
        [-0.7614, -0.5094,  0.6846,  ...,  1.2404, -0.1497, -1.2174],
        [-0.7614, -0.5094,  0.6846,  ...,  1.2404, -0.1497, -1.2174]])


# sentence embeddings, attention masks, weighting and mean pooling for user responses (SA)

In [6]:
batch_tokensSA = tokenizer(SAlist, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings
with torch.no_grad():
    # Get hidden state of shape [bs, seq_len, hid_dim]
    last_hidden_stateSA = model(**batch_tokensSA, output_hidden_states=True, return_dict=True).last_hidden_state

# Get weights of shape [bs, seq_len, hid_dim]
weightsSA = (
    torch.arange(start=1, end=last_hidden_stateSA.shape[1] + 1)
    .unsqueeze(0)
    .unsqueeze(-1)
    .expand(last_hidden_stateSA.size())
    .float().to(last_hidden_stateSA.device)
)

# Get attn mask of shape [bs, seq_len, hid_dim]
input_mask_expandedSA = (
    batch_tokensSA["attention_mask"]
    .unsqueeze(-1)
    .expand(last_hidden_stateSA.size())
    .float()
)

# encodings for user responses

In [7]:
# Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
sum_embeddingsSA2 = torch.sum(last_hidden_stateSA * input_mask_expandedSA * weightsSA, dim=1)
sum_maskSA = torch.sum(input_mask_expandedSA * weightsSA, dim=1)

embeddingsSA = sum_embeddingsSA2 / sum_maskSA

In [9]:
# Calculate cosine similarities
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddingsSA[0], embeddingsGA[0])
cosine_sim_0_2 = 1 - cosine(embeddingsSA[1], embeddingsGA[1])
cosine_sim_0_3 = 1 - cosine(embeddingsSA[2], embeddingsGA[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f", (cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f", (cosine_sim_0_2))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" ,(cosine_sim_0_3))

Cosine similarity between "%s" and "%s" is: %.3f 0.2322816550731659
Cosine similarity between "%s" and "%s" is: %.3f 0.1253511607646942
Cosine similarity between "%s" and "%s" is: %.3f 0.26908594369888306


In [43]:
coslist = []
i = 0
for i in embeddingsSA:
    cossim = 1 - cosine(embeddingsSA[0], embeddingsGA[0])
    i += 1
    coslist.append(cossim)
    

In [45]:
def merge(embeddingsGA, embeddingsSA): 
      
    merged_list = [(embeddingsGA[i], embeddingsSA[i]) for i in range(0, len(embeddingsGA))] 
    return merged_list 

embeddingstuple = merge(embeddingsGA, embeddingsSA)           ##Very important. Used for Judges either/or as well as LSA/RegEx combination thresholds.
Combothresh = []
for value in embeddingstuple:
    if value[0] or value[1] == 1:
        Combothresh.append(1)
    else:
        Combothresh.append(0)    

In [54]:
print(1 - cosine(embeddingstuple[0], embeddingstuple[0]))


<class 'AttributeError'>: 'torch.dtype' object has no attribute 'type'

In [28]:
print(coslist)

[0.2322816550731659, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611530304, 0.18029339611