In [1]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import tensorflow_hub as hub

# Relative paths of the two CSV datasets that are read later with load_data().
path_outdoor_distance = "Datasets/34B-OutdoorDistance.csv"
path_outdoor_nodistance = "Datasets/34B-OutdoorNoDistance.csv"

# spaCy pipeline shared by similarity_spacy() and measure_nearest_neighbors().
nlp = spacy.load("en_core_web_lg")

# Universal Sentence Encoder (v4) fetched from TF Hub; called by similarity_use().
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)

2024-09-29 21:59:38.364410: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [2]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

In [3]:
def similarity_spacy(t):
    """Score each row's three answers against its expected answer with spaCy.

    Parameters
    ----------
    t : pandas.DataFrame
        Must provide the columns ``Name``, ``Answer_1``, ``Answer_2``,
        ``Answer_3`` and ``Expected_answer``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``t`` with the columns ``Name``, ``Similarity_1``,
        ``Similarity_2``, ``Similarity_3`` (the ``similarity`` of each parsed
        answer against the parsed expected answer) and ``Average`` (row-wise
        mean of the three similarity columns).
    """
    column_pairs = (
        ('Answer_1', 'Similarity_1'),
        ('Answer_2', 'Similarity_2'),
        ('Answer_3', 'Similarity_3'),
    )

    # Parse every expected answer exactly once and reuse the parsed object for
    # all three comparisons instead of re-running the pipeline per column.
    expected_docs = [nlp(text) for text in t['Expected_answer']]

    r = pd.DataFrame({'Name': t['Name']})
    for answer_column, similarity_column in column_pairs:
        scores = [
            nlp(answer).similarity(expected_doc)
            for answer, expected_doc in zip(t[answer_column], expected_docs)
        ]
        # Explicit index/dtype keeps the column aligned with ``t`` and numeric
        # even when ``t`` has no rows.
        r[similarity_column] = pd.Series(scores, index=t.index, dtype=float)

    r['Average'] = r[['Similarity_1', 'Similarity_2', 'Similarity_3']].mean(axis=1)
    return r

In [4]:
def similarity_bows(t):
    """Score each row's three answers against its expected answer with bag-of-words.

    For every row a fresh ``CountVectorizer`` is fitted on the three answers,
    the expected answer is projected onto that vocabulary, and the cosine
    similarity between the expected answer and each answer is recorded.

    NOTE(review): because the vocabulary is fitted on the answers only, words
    that occur solely in the expected answer are dropped by ``transform`` and
    therefore do not lower the score. Confirm this is the intended metric.

    Parameters
    ----------
    t : pandas.DataFrame
        Must provide the columns ``Name``, ``Answer_1``, ``Answer_2``,
        ``Answer_3`` and ``Expected_answer``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``t`` with the columns ``Name``, ``Similarity_1``,
        ``Similarity_2``, ``Similarity_3`` and ``Average``.
    """
    result_columns = ['Name', 'Similarity_1', 'Similarity_2', 'Similarity_3', "Average"]

    # Collect plain rows and build the frame once: growing a DataFrame through
    # ``r.loc[index] = ...`` is slow and silently overwrites rows that share an
    # index label.
    rows = []
    for _, row in t.iterrows():
        sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
        expected_answer = row['Expected_answer']

        vectorizer = CountVectorizer()
        sentence_vectors = vectorizer.fit_transform(sentences)
        query_vector = vectorizer.transform([expected_answer])

        # Shape (1, 3): one query row against the three answers.
        similarities = cosine_similarity(query_vector, sentence_vectors)
        rows.append([
            row["Name"],
            similarities[0][0],
            similarities[0][1],
            similarities[0][2],
            my_mean(similarities),
        ])

    return pd.DataFrame(rows, columns=result_columns, index=t.index)
  
def my_mean(s):
    """Return the arithmetic mean of the first three entries of ``s[0]``."""
    first_row = s[0]
    total = first_row[0] + first_row[1] + first_row[2]
    return total / 3

In [5]:
def similarity_use(t):
    """Score each row's three answers against its expected answer with the USE model.

    For every row the three answers and the expected answer are embedded with
    the module-level ``model`` and compared by cosine similarity.

    Parameters
    ----------
    t : pandas.DataFrame
        Must provide the columns ``Name``, ``Answer_1``, ``Answer_2``,
        ``Answer_3`` and ``Expected_answer``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``t`` with the columns ``Name``, ``Similarity_1``,
        ``Similarity_2``, ``Similarity_3`` and ``Average``.
    """
    result_columns = ['Name', 'Similarity_1', 'Similarity_2', 'Similarity_3', "Average"]

    # Collect plain rows and build the frame once: growing a DataFrame through
    # ``r.loc[index] = ...`` is slow and silently overwrites rows that share an
    # index label.
    rows = []
    for _, row in t.iterrows():
        sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
        query = row['Expected_answer']

        sentence_embeddings = model(sentences)
        query_embedding = model([query])

        # Shape (1, 3): one query row against the three answers.
        similarities = cosine_similarity(query_embedding, sentence_embeddings)
        rows.append([
            row["Name"],
            similarities[0][0],
            similarities[0][1],
            similarities[0][2],
            my_mean(similarities),
        ])

    return pd.DataFrame(rows, columns=result_columns, index=t.index)

In [6]:
def measure_nearest_neighbors(t, k=3):
    """Rank each row's three answers by cosine distance to the expected answer.

    Every answer is embedded with the spaCy pipeline's ``.vector``; a
    ``NearestNeighbors`` model with the cosine metric is fitted on those three
    embeddings and queried with the embedding of the expected answer.

    Parameters
    ----------
    t : pandas.DataFrame
        Must provide the columns ``Answer_1``, ``Answer_2``, ``Answer_3`` and
        ``Expected_answer``.
    k : int, default 3
        Number of neighbours to report per row. Values larger than the number
        of candidate answers (three) are clamped to three.

    Returns
    -------
    list of dict
        One entry per row of ``t``:
        ``{"expected_answer": str, "nearest_neighbors": [{"text", "distance"}, ...]}``
        with neighbours in the order returned by ``kneighbors``.
    """
    results = []
    for _, row in t.iterrows():
        sentences = [row['Answer_1'], row['Answer_2'], row['Answer_3']]
        expected_answer = row['Expected_answer']

        answers_embeddings = np.array([nlp(sentence).vector for sentence in sentences])

        # kneighbors() raises ValueError when asked for more neighbours than
        # fitted samples, so never request more than the answers available.
        nn_model = NearestNeighbors(n_neighbors=min(k, len(sentences)), metric='cosine')
        nn_model.fit(answers_embeddings)

        # sklearn expects a 2-D query: one row, n_features columns.
        query_embedding = nlp(expected_answer).vector.reshape(1, -1)
        distances, indices = nn_model.kneighbors(query_embedding)

        nearest_neighbors = [
            {"text": sentences[j], "distance": distances[0][i]}
            for i, j in enumerate(indices[0])
        ]
        results.append({"expected_answer": expected_answer, "nearest_neighbors": nearest_neighbors})

    return results

In [7]:
def avg_table(t1, t2, t3):
    """Combine the ``Average`` column of three result tables side by side.

    ``t1`` supplies the ``Name`` column and the row index; the ``Average``
    columns of ``t1``, ``t2`` and ``t3`` become ``Avg_Spacy``,
    ``Avg_BagofWords`` and ``Avg_USE`` respectively, aligned on index labels.
    """
    averages = {
        'Avg_Spacy': t1['Average'],
        'Avg_BagofWords': t2['Average'],
        'Avg_USE': t3['Average'],
    }
    # ``assign`` returns a new frame, so none of the inputs is modified.
    return t1[['Name']].assign(**averages)

### Without distance (`34B-OutdoorNoDistance.csv`, loaded as `d1`)

In [8]:
# d1 is read from the "NoDistance" CSV path, d2 from the "Distance" CSV path.
d1 = load_data(path_outdoor_nodistance)
d2 = load_data(path_outdoor_distance)

In [9]:
# spaCy similarity scores for d1; the bare name on the last line displays the table.
results_spacy_d1 = similarity_spacy(d1)
results_spacy_d1

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OND_0,0.944191,0.924147,0.924147,0.930828
1,OND_1,0.842313,0.879678,0.879678,0.867223
2,OND_2,0.857047,0.918308,0.907306,0.89422
3,OND_3,0.885182,0.865397,0.872507,0.874362
4,OND_4,0.914774,0.903561,0.931131,0.916489
5,OND_5,0.862355,0.871935,0.892875,0.875722
6,OND_6,0.937252,0.873307,0.815544,0.875368
7,OND_7,0.879813,0.900937,0.915529,0.898759
8,OND_8,0.830025,0.857743,0.830025,0.839265
9,OND_P0,0.876868,0.898244,0.813877,0.862996


In [10]:
# Bag-of-words cosine similarity scores for d1; the last line displays the table.
results_bows_d1 = similarity_bows(d1)
results_bows_d1

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OND_0,0.797017,0.719874,0.719874,0.745588
1,OND_1,0.516398,0.235702,0.235702,0.329267
2,OND_2,0.451848,0.571449,0.657267,0.560188
3,OND_3,0.237915,0.210819,0.147844,0.198859
4,OND_4,0.534522,0.356348,0.46291,0.45126
5,OND_5,0.593442,0.502625,0.63901,0.578359
6,OND_6,0.816497,0.57735,0.408248,0.600698
7,OND_7,0.40452,0.47194,0.552052,0.476171
8,OND_8,0.720082,0.608581,0.720082,0.682915
9,OND_P0,0.668153,0.5547,0.588348,0.603734


In [11]:
# Universal Sentence Encoder similarity scores for d1; the last line displays the table.
results_use_d1 = similarity_use(d1)
results_use_d1

2024-09-29 21:59:39.730559: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]


Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OND_0,0.709798,0.75423,0.75423,0.739419
1,OND_1,0.278739,0.178155,0.178155,0.211683
2,OND_2,0.264729,0.266867,0.255198,0.262265
3,OND_3,0.223591,0.175391,0.175423,0.191468
4,OND_4,0.754342,0.27517,0.356111,0.461874
5,OND_5,0.572919,0.309167,0.720468,0.534185
6,OND_6,0.674294,0.376682,0.196033,0.41567
7,OND_7,0.289239,0.316569,0.313782,0.30653
8,OND_8,0.638865,0.416002,0.638865,0.564577
9,OND_P0,0.565857,0.276646,0.425211,0.422571


In [12]:
# Per-row averages of the three methods for d1, side by side.
results_d1 = avg_table(results_spacy_d1, results_bows_d1, results_use_d1)
results_d1

Unnamed: 0,Name,Avg_Spacy,Avg_BagofWords,Avg_USE
0,OND_0,0.930828,0.745588,0.739419
1,OND_1,0.867223,0.329267,0.211683
2,OND_2,0.89422,0.560188,0.262265
3,OND_3,0.874362,0.198859,0.191468
4,OND_4,0.916489,0.45126,0.461874
5,OND_5,0.875722,0.578359,0.534185
6,OND_6,0.875368,0.600698,0.41567
7,OND_7,0.898759,0.476171,0.30653
8,OND_8,0.839265,0.682915,0.564577
9,OND_P0,0.862996,0.603734,0.422571


### With distance (`34B-OutdoorDistance.csv`, loaded as `d2`)

In [13]:
# spaCy similarity scores for d2; the bare name on the last line displays the table.
results_spacy_d2 = similarity_spacy(d2)
results_spacy_d2

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OYP_P0,0.798659,0.846425,0.845775,0.830286
1,OYP_P1,0.818225,0.846501,0.818796,0.827841
2,OYP_P2,0.942317,0.9505,0.930817,0.941212
3,OYP_P3,0.910671,0.892425,0.910671,0.904589
4,OYP_P4,0.807594,0.901593,0.871756,0.860315
5,OYP_P5,0.824872,0.832845,0.839278,0.832331
6,OYP_P6,0.739066,0.754408,0.778938,0.757471
7,OYP_P7,0.913274,0.839062,0.913274,0.888536
8,OYP_P8,0.838072,0.873886,0.893424,0.868461
9,OYP_P9,0.911268,0.842585,0.905473,0.886442


In [14]:
# Bag-of-words cosine similarity scores for d2; the last line displays the table.
# NOTE(review): this variable is spelled `results_bow_d2`, while the d1
# counterpart is `results_bows_d1`; the avg_table call for d2 uses this spelling.
results_bow_d2 = similarity_bows(d2)
results_bow_d2

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OYP_P0,0.402911,0.374241,0.330791,0.369314
1,OYP_P1,0.39736,0.471405,0.351763,0.406842
2,OYP_P2,0.823529,0.800853,0.761243,0.795209
3,OYP_P3,0.534522,0.322749,0.534522,0.463931
4,OYP_P4,0.338062,0.433861,0.447214,0.406379
5,OYP_P5,0.273861,0.436436,0.361158,0.357152
6,OYP_P6,0.267261,0.288675,0.392232,0.316056
7,OYP_P7,0.755929,0.534522,0.755929,0.682127
8,OYP_P8,0.527046,0.496139,0.533333,0.51884
9,OYP_P9,0.705907,0.57735,0.709795,0.664351


In [15]:
# Universal Sentence Encoder similarity scores for d2; the last line displays the table.
results_use_d2 = similarity_use(d2)
results_use_d2

Unnamed: 0,Name,Similarity_1,Similarity_2,Similarity_3,Average
0,OYP_P0,0.364791,0.35162,0.443353,0.386588
1,OYP_P1,0.434188,0.402291,0.394317,0.410265
2,OYP_P2,0.582257,0.580761,0.490684,0.551234
3,OYP_P3,0.347259,0.186317,0.347259,0.293612
4,OYP_P4,0.313325,0.391111,0.55016,0.418199
5,OYP_P5,0.267504,0.31507,0.348559,0.310378
6,OYP_P6,0.534463,0.494818,0.555798,0.528359
7,OYP_P7,0.555793,0.410495,0.555793,0.50736
8,OYP_P8,0.454529,0.327129,0.498091,0.426583
9,OYP_P9,0.269755,0.205213,0.259812,0.244927


In [16]:
# Per-row averages of the three methods for d2, side by side.
results_d2 = avg_table(results_spacy_d2, results_bow_d2, results_use_d2)
results_d2

Unnamed: 0,Name,Avg_Spacy,Avg_BagofWords,Avg_USE
0,OYP_P0,0.830286,0.369314,0.386588
1,OYP_P1,0.827841,0.406842,0.410265
2,OYP_P2,0.941212,0.795209,0.551234
3,OYP_P3,0.904589,0.463931,0.293612
4,OYP_P4,0.860315,0.406379,0.418199
5,OYP_P5,0.832331,0.357152,0.310378
6,OYP_P6,0.757471,0.316056,0.528359
7,OYP_P7,0.888536,0.682127,0.50736
8,OYP_P8,0.868461,0.51884,0.426583
9,OYP_P9,0.886442,0.664351,0.244927
