In [None]:
'''
 * Copyright 2023 QuickAns
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 '''

In [None]:
import pandas as pd
# Input table of questions and answers; the columns read later in this notebook
# are "Question", "Ground Truth Answer", "QuickAns answer" and "GPT 3.5 answer".
SCORES_CSV_PATH = "/kaggle/input/quickans-answers/jaccard_scores.csv"
df = pd.read_csv(SCORES_CSV_PATH)
df.head()

In [None]:
# Keep only rows where both systems produced an answer; a missing answer
# cannot be scored. The masks are built once and applied in a single
# vectorized selection instead of dropping rows one at a time while iterating
# over the same frame.
quickans_null_map = df["QuickAns answer"].isnull()
chatgpt_null_map = df["GPT 3.5 answer"].isnull()
# Original index labels are preserved. .copy() makes the result an independent
# frame so that later column assignments do not act on a slice of the old one.
df = df.loc[~(quickans_null_map | chatgpt_null_map)].copy()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Tokenizer and encoder are loaded from the same checkpoint identifier.
MPNET_CHECKPOINT = 'sentence-transformers/all-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(MPNET_CHECKPOINT)
model = AutoModel.from_pretrained(MPNET_CHECKPOINT)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (presumably shaped (batch, tokens, hidden) --
            confirm against the model in use).
        attention_mask: Mask tensor; a trailing axis is added and it is
            expanded to the embeddings' shape, so positions with value 0
            contribute nothing to the average.

    Returns:
        Tensor of the masked sums divided by the per-row mask totals. The
        divisor is clamped to at least 1e-9, so a fully masked row yields
        zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding axis and cast to float so it can
    # be used as a multiplicative weight.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


def get_similarity_mpnet(sentence1, sentence2):
    """Return the cosine similarity between the embeddings of two sentences.

    Both sentences are tokenized together as one padded, truncated batch, run
    through the module-level ``model`` without gradient tracking, mean-pooled
    with ``mean_pooling`` and L2-normalized before being compared.

    Args:
        sentence1: First text to embed.
        sentence2: Second text to embed.

    Returns:
        The cosine similarity as a Python float.
    """
    batch = tokenizer(
        [sentence1, sentence2],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**batch)

    pooled = mean_pooling(outputs, batch['attention_mask'])
    unit_vectors = F.normalize(pooled, p=2, dim=1)

    # Row 0 is sentence1 and row 1 is sentence2; slicing keeps the leading
    # axis so cosine_similarity compares along dim 1.
    first, second = unit_vectors[0:1], unit_vectors[1:2]
    score = F.cosine_similarity(first, second, dim=1)
    return float(score[0])
# Sanity check: a sentence compared with itself should score approximately 1.0.
print(get_similarity_mpnet("hello world", "hello world"))

In [None]:
# Score both systems' answers against the same reference text
# (the question followed by its ground-truth answer).
qa_scores = []
chatgpt_scores = []
answer_rows = zip(
    df["Question"],
    df["Ground Truth Answer"],
    df["QuickAns answer"],
    df["GPT 3.5 answer"],
)
for question, ground_truth, quickans_answer, chatgpt_answer in answer_rows:
    # Join with a space so the last word of the question and the first word
    # of the answer are not fused into a single word before tokenization.
    reference_text = question + " " + ground_truth
    qa_scores.append(get_similarity_mpnet(reference_text, quickans_answer))
    chatgpt_scores.append(get_similarity_mpnet(reference_text, chatgpt_answer))

# Assign each result column in one step (values are positional, in row order)
# instead of writing one cell at a time inside the loop.
df["Sentence Sim QA"] = qa_scores
df["Sentence Sim ChatGPT"] = chatgpt_scores

In [None]:
# Display the frame for inspection.
df

In [None]:
import numpy as np
from scipy.stats import ttest_ind

def calculate_scores(score_name, set1_scores, set2_scores):
    """Run a two-sample t-test on two sets of scores and print the result.

    Args:
        score_name: Label inserted into the printed header line.
        set1_scores: Sized collection of scores for the first system.
        set2_scores: Sized collection of scores for the second system; must
            contain the same number of scores as ``set1_scores``.

    Raises:
        ValueError: If the two sets contain a different number of scores.
    """
    if len(set1_scores) != len(set2_scores):
        # Raise rather than call exit(): exiting would terminate the whole
        # interpreter/kernel session and cannot be handled by the caller.
        raise ValueError(
            "different number of scores in the two sets ({} vs {})".format(
                len(set1_scores), len(set2_scores)
            )
        )

    # NOTE(review): ttest_ind treats the two sets as independent samples. If
    # the scores are paired row-by-row (same question scored for both
    # systems), scipy.stats.ttest_rel may be the appropriate test -- confirm
    # before changing, since it alters the reported statistics.
    t_stat, p_value = ttest_ind(set1_scores, set2_scores)

    # Print the results
    print("Stats for {} Scores:".format(score_name))
    print("T-test statistic: ", t_stat)
    print("P-value: ", p_value)

In [None]:
# Sentence-Similarity Scores: pull both score columns out as NumPy arrays and
# report the t-test between them.
set1_scores, set2_scores = (
    np.array(df[column])
    for column in ("Sentence Sim QA", "Sentence Sim ChatGPT")
)
calculate_scores("Sentence Similarity", set1_scores, set2_scores)

In [None]:
# Write the frame to mpnet_scores.csv in the current working directory.
# With to_csv's defaults the index is written as the first column.
df.to_csv("mpnet_scores.csv")