This notebook explores how to measure text similarity between two lists of texts.

In [1]:
import math
# Load the EB KG dataframe (with precomputed embeddings) from a JSON file written with orient="index".
# Later cells read its "embedding", "term_name" and "term_uri" columns.
import pandas as pd
eb_kg_df = pd.read_json("../eb_kg_hq_with_embeddings_dataframe", orient="index")

Using the sklearn library

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the 'all-mpnet-base-v2' sentence-transformers model.
# NOTE(review): `model` is not referenced again in the visible cells; presumably it is the model
# that produced the stored embeddings and would encode new query text — confirm, or drop this load.
model = SentenceTransformer('all-mpnet-base-v2')
def calculating_similarity_sklearn(query_embedding, text_embeddings, top_n=20, offset=0):
    """Rank `text_embeddings` by cosine similarity to `query_embedding`.

    Args:
        query_embedding: A single embedding vector.
        text_embeddings: A sequence of embedding vectors to compare against.
        top_n: Maximum number of results to return.
        offset: Number of best-scoring entries to skip before collecting results.

    Returns:
        A list of dicts ordered from most to least similar. Each dict holds
        "index" (the position of the entry in `text_embeddings`) and "score"
        (its cosine similarity to the query).
    """
    score_matrix = cosine_similarity([query_embedding], text_embeddings)
    # argsort is ascending, so the best matches sit at the end of the row.
    ascending_order = score_matrix.argsort()[0]
    scores = score_matrix[0]

    # Walk backwards from the end: skip `offset` entries, then take `top_n`.
    first = -1 - offset
    stop = first - top_n
    return [
        {"index": position, "score": scores[position]}
        for position in ascending_order[first:stop:-1]
    ]

In [4]:
# Collect the "embedding" column into a plain Python list, one entry per dataframe row.
text_embeddings = eb_kg_df["embedding"].values.tolist()

In [5]:
# Use the first stored embedding as the query vector.
# Because the query is itself part of text_embeddings, it is expected to be its own best match.
query_embedding = text_embeddings[0]

In [7]:
# Display the query vector for inspection.
query_embedding

[0.0667808577,
 -0.1191483587,
 0.0258066095,
 -0.0154051194,
 0.0177952293,
 0.005992443300000001,
 0.0086399661,
 0.0618461259,
 -0.0033398916,
 0.005896432300000001,
 0.014627636400000001,
 0.0438207388,
 0.0130106201,
 -0.08083358410000001,
 0.0270166956,
 -0.0680485591,
 -0.0445473716,
 0.0502884388,
 0.029745051600000003,
 0.002854231,
 -0.0256614294,
 0.038261733900000004,
 -0.010777960500000001,
 0.0328972042,
 -0.0182807855,
 -0.0265123285,
 0.0293891933,
 0.028690232000000003,
 -0.063238658,
 -0.0045688478,
 -0.0137779284,
 -0.0777471587,
 -0.0070252493000000004,
 -0.0108974073,
 1.9242e-06,
 -0.0074232947,
 0.0314941928,
 0.0012543703,
 0.0154032484,
 -0.0287385751,
 0.014461278000000001,
 -0.0274059866,
 0.0502606928,
 -0.005175781,
 0.007666277700000001,
 -7.57975e-05,
 -0.0391709097,
 -0.0277074073,
 -0.0332484059,
 0.0609931946,
 -0.0010288119,
 -0.0453827381,
 -0.0085687572,
 -0.0302264057,
 0.022930976000000002,
 -0.0363037474,
 0.0101896971,
 0.0089511834,
 -0.0259510

In [26]:
# Rank every stored embedding against the query. offset=1 skips the top hit,
# presumably the query's own row, since the query was taken from text_embeddings.
similarities = calculating_similarity_sklearn(query_embedding, text_embeddings, offset=1)

In [27]:
# Print the term name and cosine score of each ranked result.
for similarity in similarities:
    row_label = similarity["index"]
    term_name = eb_kg_df.loc[row_label, "term_name"]
    print(f"term name: {term_name}, score: {similarity['score']}")

term name: JALEMUS, score: 0.9174686934632075
term name: JALEMUS, score: 0.8772943130567594
term name: JALEMUS, score: 0.8475532906894628
term name: JALEMUS, score: 0.8148748575742857
term name: MONODY, score: 0.7130271120678224
term name: EPICEDIUM, score: 0.6476358814635169
term name: ELEGI, score: 0.6218768102182376
term name: MONODY, score: 0.6181812197105039
term name: MONODY, score: 0.6161303561791671
term name: EPICEDIUM, score: 0.6117035352533822
term name: EPICEDIUM, score: 0.6080285260537095
term name: EPICEDIUM, score: 0.6080284931314845
term name: MONODY, score: 0.6065482571235987
term name: MONODY, score: 0.588152880069543
term name: GENETHLIA, score: 0.5872355263579042
term name: GENETHLIA, score: 0.5872355176553301
term name: ELEGIAC, score: 0.5835353365703346
term name: ELEGIAC, score: 0.5835353365703346
term name: MOURNING, score: 0.5776497152776886
term name: MIMESIS, score: 0.5497770164934734


In [10]:
import math
def manual_cos_similarity(vector1, vector2):
    """Compute the cosine similarity of two equal-length numeric vectors.

    Args:
        vector1: A sized sequence of numbers.
        vector2: A sized sequence of numbers with the same length as `vector1`.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. Returns 0.0 when either vector has zero norm
        (including empty vectors), where the ratio is undefined.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    # Without this check a longer vector2 would give a truncated dot product
    # combined with a full-length norm, i.e. a silently inconsistent score.
    if len(vector1) != len(vector2):
        raise ValueError(
            f"vectors must have the same length, got {len(vector1)} and {len(vector2)}"
        )

    dot_product = sum(a * b for a, b in zip(vector1, vector2))
    norm1 = math.sqrt(sum(a * a for a in vector1))
    norm2 = math.sqrt(sum(b * b for b in vector2))

    # Avoid dividing by zero for all-zero or empty vectors.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (norm1 * norm2)

In [11]:
# Cross-check with the hand-written implementation: query vector vs. the second stored embedding.
manual_cos_result = manual_cos_similarity(query_embedding, text_embeddings[1])

In [12]:
# Display the manually computed cosine similarity.
manual_cos_result

0.2287541277754158

Using KNN search from Elasticsearch (implemented below as an exact `script_score` query that uses `cosineSimilarity`)

In [2]:
import os

from elasticsearch import Elasticsearch

# Connection settings are read from the environment so that no credential is
# stored in the notebook. ES_URL is optional and falls back to the cluster
# endpoint used so far; ES_API_KEY is required.
# NOTE(review): the API key that used to be hardcoded in this cell was committed
# with the notebook and should be revoked and replaced.
es_url = os.environ.get(
    "ES_URL",
    "https://83a1253d6aac48278867d36eed60b642.us-central1.gcp.cloud.es.io:443",
)
es_api_key = os.environ.get("ES_API_KEY")
if not es_api_key:
    raise RuntimeError(
        "Set the ES_API_KEY environment variable before running this cell."
    )

client = Elasticsearch(es_url, api_key=es_api_key)

# Name of the Elasticsearch index queried by this notebook.
eb_index = "eb_with_embeddings"



In [11]:
def es_knn_search(query_embedding, top_n=20, offset=0):
    """Score every document in `eb_index` against `query_embedding` via Elasticsearch.

    The search runs a `script_score` query over `match_all`, scoring each
    document with `cosineSimilarity` on its 'embedding' field plus 1.0, so a
    caller must subtract 1 from `_score` to recover the cosine similarity.

    Args:
        query_embedding: The query vector passed to the script as `queryVector`.
        top_n: Number of hits to request ("size").
        offset: Number of leading hits to skip ("from").

    Returns:
        The response returned by `client.search`; each hit's `_source` is
        limited to the "name" field.
    """
    cosine_script = {
        "source": "cosineSimilarity(params.queryVector, 'embedding') + 1.0",
        "params": {"queryVector": query_embedding},
    }
    scored_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": cosine_script,
        }
    }
    search_body = {
        "size": top_n,
        "from": offset,
        "_source": ["name"],
        "query": scored_query,
    }
    return client.search(index=eb_index, body=search_body)

In [37]:
# Run the same query through Elasticsearch, again skipping the first hit with offset=1.
response = es_knn_search(query_embedding, offset=1)

In [38]:
# Display the raw Elasticsearch response.
response

ObjectApiResponse({'took': 370, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 10000, 'relation': 'gte'}, 'max_score': 2.0, 'hits': [{'_index': 'eb_with_embeddings', '_id': 'https://w3id.org/hto/ArticleTermRecord/9910796273804340_192693199_1475160240_0', '_score': 1.9174685, '_source': {'name': 'JALEMUS'}}, {'_index': 'eb_with_embeddings', '_id': 'https://w3id.org/hto/ArticleTermRecord/997902543804341_149982181_1475160240_0', '_score': 1.8772943, '_source': {'name': 'JALEMUS'}}, {'_index': 'eb_with_embeddings', '_id': 'https://w3id.org/hto/ArticleTermRecord/9910796233804340_191679019_1475160240_0', '_score': 1.8475533, '_source': {'name': 'JALEMUS'}}, {'_index': 'eb_with_embeddings', '_id': 'https://w3id.org/hto/ArticleTermRecord/9910796253804340_192984257_1475160240_0', '_score': 1.8148748, '_source': {'name': 'JALEMUS'}}, {'_index': 'eb_with_embeddings', '_id': 'https://w3id.org/hto/ArticleTermRecord/99227765380434

Compare the results from the Elasticsearch KNN search with the cosine similarities from sklearn, position by position

In [42]:
import math
# Walk both rankings in parallel and report every position where the term URI,
# the term name, or the score disagrees.
for index, sklearn_similarity in enumerate(similarities):
    row_label = sklearn_similarity["index"]
    sklearn_term_name = eb_kg_df.loc[row_label, "term_name"]
    sklearn_score = sklearn_similarity["score"]
    sklearn_term_uri = eb_kg_df.loc[row_label, "term_uri"]

    es_hit = response["hits"]["hits"][index]
    es_knn_term_name = es_hit["_source"]["name"]
    es_knn_term_uri = es_hit["_id"]
    # The Elasticsearch script adds 1.0 to the cosine similarity; undo that shift here.
    es_knn_score = es_hit["_score"] - 1

    results_match = (
        sklearn_term_uri == es_knn_term_uri
        and sklearn_term_name == es_knn_term_name
        and math.isclose(sklearn_score, es_knn_score, rel_tol=0.000001)
    )
    if not results_match:
        print(f"result does not match! sklearn result: {sklearn_term_name}, {sklearn_score}, {sklearn_term_uri} | knn search result: {es_knn_term_name}, {es_knn_score}, {es_knn_term_uri}")


result does not match! sklearn result: ELEGIAC, 0.5835353365703346 | knn search result: ELEGIAC, 0.5835353999999999
result does not match! sklearn result: ELEGIAC, 0.5835353365703346 | knn search result: ELEGIAC, 0.5835353999999999
