In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import tensorflow_hub as hub
path_indoor_distance = "Datasets/34B-IndoorDistance.csv"
path_indoor_nodistance = "Datasets/34B-IndoorNoDistance.csv"

nlp = spacy.load("en_core_web_lg")

module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

2024-09-29 21:59:43.404946: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [2]:
def load_data(file_path):
    t = pd.read_csv(file_path)
    return t

In [3]:
def similarity_spacy(t):
    r = pd.DataFrame(columns=['Name', 'Similarity_1', 'Similarity_2', 'Similarity_3', 'Average'])
    r["Name"] = t["Name"]
    r['Similarity_1'] = t.apply(lambda row: nlp(row['Answer_1']).similarity(nlp(row['Expected_answer'])), axis=1)
    r['Similarity_2'] = t.apply(lambda row: nlp(row['Answer_2']).similarity(nlp(row['Expected_answer'])), axis=1)
    r['Similarity_3'] = t.apply(lambda row: nlp(row['Answer_3']).similarity(nlp(row['Expected_answer'])), axis=1)
    r['Average'] = r[['Similarity_1','Similarity_2','Similarity_3']].mean(axis=1)
    
    return r

In [4]:
def similarity_bows(t):
    r = pd.DataFrame(columns=['Name', 'Similarity_1', 'Similarity_2', 'Similarity_3', "Average"])
    for index, row in t.iterrows():
        sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
        expected_answer = row['Expected_answer']
        
        vectorizer = CountVectorizer()
        sentence_vectors = vectorizer.fit_transform(sentences)
        query_vector = vectorizer.transform([expected_answer])
    
        similarities = cosine_similarity(query_vector, sentence_vectors)
        r.loc[index] = [row["Name"], similarities[0][0], similarities[0][1], similarities[0][2], my_mean(similarities)]
    return r
  
def my_mean(s):
    return (s[0][0] + s[0][1] + s[0][2])/3 

In [5]:
def similarity_use(t):
    r = pd.DataFrame(columns=['Name', 'Similarity_1', 'Similarity_2', 'Similarity_3', "Average"])
    for index, row in t.iterrows():
        sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
        query = row['Expected_answer']
        
        sentence_embeddings = model(sentences)
        query_embedding = model([query])
    
        similarities = cosine_similarity(query_embedding, sentence_embeddings)
        r.loc[index] = [row["Name"], similarities[0][0], similarities[0][1], similarities[0][2], my_mean(similarities)]
    return r

In [6]:
def measure_nearest_neighbors(t, k=3):
  results = []
  for index, row in t.iterrows():
    sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
    expected_answer = row['Expected_answer']
    answers_embeddings = [nlp(sentence).vector for sentence in sentences]
    answers_embeddings = np.array(answers_embeddings)
    query_embedding = nlp(expected_answer).vector.reshape(1, -1)  # Reshape for compatibility with sklearn
    
    nn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
    nn_model.fit(answers_embeddings)
    distances, indices = nn_model.kneighbors(query_embedding)
    nearest_neighbors = []
    for i, j in enumerate(indices[0]):
      nearest_neighbors.append({"text": sentences[j], "distance": distances[0][i]})

    results.append({"expected_answer": expected_answer, "nearest_neighbors": nearest_neighbors})

  return results

In [7]:
def avg_table(t1, t2, t3):
    results = pd.DataFrame(columns=['Name', 'Avg_Spacy', 'Avg_BagofWords', 'Avg_USE'])
    results['Name'] = t1['Name']
    results['Avg_Spacy'] = t1['Average']
    results['Avg_BagofWords'] = t2['Average']
    results['Avg_USE'] = t3['Average']
    return results

### With distance measures

In [8]:
d1 = load_data(path_indoor_distance)

In [9]:
results_spacy_d1 = similarity_spacy(d1)
result_nn_d1 = measure_nearest_neighbors(d1, k=3)

results_spacy_d1

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IYD_0,0.914416,0.900761,0.894069,0.903082
1,IYD_1,0.865183,0.832897,0.865183,0.854421
2,IYD_2,0.780235,0.7609,0.761194,0.767443
3,IYD_3,0.840144,0.797367,0.7972,0.81157
4,IYD_4,0.676684,0.631053,0.631053,0.646263
5,IYD_5,0.923828,0.923122,0.927145,0.924698
6,IYD_P0,0.847347,0.866381,0.864306,0.859345
7,IYD_P1,0.867676,0.91571,0.945028,0.909471
8,IYD_P2,0.855707,0.871383,0.860741,0.86261


In [10]:
results_bow_d1 = similarity_bows(d1)
results_bow_d1

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IYD_0,0.745356,0.638442,0.68313,0.688976
1,IYD_1,0.721688,0.547723,0.721688,0.663699
2,IYD_2,0.63901,0.447214,0.440959,0.509061
3,IYD_3,0.471405,0.455733,0.433013,0.453383
4,IYD_4,0.478091,0.596285,0.596285,0.556887
5,IYD_5,0.696873,0.67082,0.680545,0.682746
6,IYD_P0,0.447214,0.626099,0.596285,0.556532
7,IYD_P1,0.545545,0.666667,0.799305,0.670506
8,IYD_P2,0.631614,0.654654,0.720577,0.668948


In [11]:
results_use_d1 = similarity_use(d1)
results_use_d1

2024-09-29 21:59:44.537987: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]


Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IYD_0,0.832208,0.647445,0.51862,0.666091
1,IYD_1,0.517346,0.343775,0.517346,0.459489
2,IYD_2,0.449334,0.276377,0.273994,0.333235
3,IYD_3,0.227435,0.142912,0.130478,0.166942
4,IYD_4,0.431811,0.515158,0.515158,0.487376
5,IYD_5,0.628319,0.718385,0.717151,0.687951
6,IYD_P0,0.390465,0.345548,0.290956,0.342323
7,IYD_P1,0.401153,0.444841,0.431075,0.42569
8,IYD_P2,0.258282,0.296439,0.258534,0.271085


In [12]:
results_d1 = avg_table(results_spacy_d1, results_bow_d1, results_use_d1)

### Without distance

In [13]:
d2 = load_data(path_indoor_nodistance)

In [14]:
results_spacy_d2 = similarity_spacy(d2)
results_spacy_d2 

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IND_0,0.890926,0.926421,0.951255,0.922867
1,IND_1,0.885296,0.885296,0.885296,0.885296
2,IND_2,0.829627,0.786157,0.786157,0.800647
3,IND_3,0.919669,0.920899,0.923163,0.921243
4,IND_4,0.85418,0.867223,0.85384,0.858414
5,IND_5,0.698333,0.620809,0.635317,0.651487
6,IND_6,0.840529,0.869521,0.869296,0.859782
7,IND_7,0.892384,0.933643,0.888126,0.904718
8,IND_8,0.664143,0.641362,0.581787,0.629097
9,IND_9,0.856637,0.844569,0.813488,0.838231


In [15]:
results_bow_d2 = similarity_bows(d2)
results_bow_d2

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IND_0,0.376889,0.464207,0.703526,0.514874
1,IND_1,0.80904,0.80904,0.80904,0.80904
2,IND_2,0.408248,0.182574,0.182574,0.257799
3,IND_3,0.74162,0.638285,0.650444,0.676783
4,IND_4,0.590879,0.734968,0.738549,0.688132
5,IND_5,0.534522,0.0,0.0,0.178174
6,IND_6,0.646762,0.568535,0.656532,0.623943
7,IND_7,0.657376,0.710047,0.65938,0.675601
8,IND_8,0.453743,0.566947,0.303046,0.441245
9,IND_9,0.410997,0.5,0.306186,0.405728


In [16]:
result_use_d2= similarity_use(d2)
result_use_d2

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,IND_0,0.248844,0.330942,0.531964,0.370583
1,IND_1,0.807655,0.807655,0.807655,0.807655
2,IND_2,0.256058,0.150949,0.150949,0.185985
3,IND_3,0.286582,0.323812,0.365089,0.325161
4,IND_4,0.473404,0.363138,0.522413,0.452985
5,IND_5,0.411324,0.266929,0.296434,0.324896
6,IND_6,0.336866,0.384397,0.394776,0.372013
7,IND_7,0.690576,0.772437,0.619606,0.694206
8,IND_8,0.447991,0.682023,0.394048,0.508021
9,IND_9,0.442361,0.397127,0.377499,0.405662


In [18]:
results_d2 = avg_table(results_spacy_d2, results_bow_d2, result_use_d2)
results_d2

Unnamed: 0,Name,Avg_Spacy,Avg_BagofWords,Avg_USE
0,IND_0,0.922867,0.514874,0.370583
1,IND_1,0.885296,0.80904,0.807655
2,IND_2,0.800647,0.257799,0.185985
3,IND_3,0.921243,0.676783,0.325161
4,IND_4,0.858414,0.688132,0.452985
5,IND_5,0.651487,0.178174,0.324896
6,IND_6,0.859782,0.623943,0.372013
7,IND_7,0.904718,0.675601,0.694206
8,IND_8,0.629097,0.441245,0.508021
9,IND_9,0.838231,0.405728,0.405662
