In [None]:
%%sh
pip install opensearch-py==2.4.2
pip install boto3==1.34.29
pip install sentence-transformers==2.2.2

In [None]:
import pandas as pd

# Load the pre-selected list of known queries that have already been classified.
# NOTE: a previously used alternative (now disabled) selected rows of df_queries
# with CTR > 0.35 and Buscas > 1000 and printed the first five of them.

file_path = 'queries.csv'
columns = ['Termo', 'Categoria']

# Read only the two columns this notebook works with.
df_queries = pd.read_csv(file_path, usecols=columns)

# Discard every row that has a missing value in any of the loaded columns.
df_top = df_queries.dropna(how='any')

# Preview the first five remaining rows.
print(df_top.head())

In [None]:
from opensearchpy import OpenSearch

# Endpoint used when no other cluster URL is passed to get_client().
CLUSTER_URL = 'http://localhost:9200'


def get_client(cluster_url: str = CLUSTER_URL) -> "OpenSearch":
    """Build an OpenSearch client for a single cluster endpoint.

    Args:
        cluster_url: URL of the cluster to connect to. Defaults to CLUSTER_URL.

    Returns:
        An OpenSearch client configured with ``cluster_url`` as its only host
        and with certificate verification switched off.
    """
    return OpenSearch(hosts=[cluster_url], verify_certs=False)


# Shared client instance used by the cells below.
client = get_client()


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for both the stored queries and the search text.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

# Encode a one-sentence batch and read the vector width from the second axis
# of the resulting (sentences, dimension) array.
EMBEDDING_DIM = model.encode(["Sample sentence"]).shape[1]

In [None]:
# Show the embedding width that was measured from the model in the previous cell.
print(EMBEDDING_DIM)

In [None]:
# Name of the index that stores the classified queries and their embeddings.
index_name = "queries"

# Method block of the vector field: algorithm, distance space, engine and
# build-time parameters.
hnsw_method = {
    "name": "hnsw",
    "space_type": "cosinesimil",
    "engine": "nmslib",
    "parameters": {
        "ef_construction": 128,
        "m": 24,
    },
}

# Full request body (settings + mappings) passed when the index is created.
index_body = {
    "settings": {
        "index": {
            "knn": True,
            "knn.algo_param.ef_search": 100,
        },
    },
    "mappings": {
        "properties": {
            # Vector field that receives one embedding per document; its
            # dimension must match the width produced by the model.
            "embedding": {
                "type": "knn_vector",
                "dimension": EMBEDDING_DIM,
                "method": hnsw_method,
            },
        },
    },
}

In [None]:
# Create the index only when it does not exist yet, so that re-running the
# notebook (Restart & Run All) does not fail because the index already exists.
# NOTE: an existing index keeps its current settings and mappings; delete it
# manually first if index_body has changed since it was created.
response = None
if client.indices.exists(index=index_name):
    print(f"Index '{index_name}' already exists; skipping creation.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print(response)

In [None]:
import pandas as pd

from opensearchpy import helpers

# Encode every query term in one call: the sentence-transformer model takes a
# list of texts and returns one embedding per text, in the same order.
terms = df_top['Termo'].tolist()
embeddings = model.encode(terms)

# One bulk action per row. The DataFrame index label is stored in the "id"
# field and, as a string, also used as the document id, so re-running this
# cell overwrites documents instead of duplicating them.
actions = [
    {
        "_index": index_name,
        "_id": str(row_id),
        "_source": {
            "id": row_id,
            "term": term,
            "ctg": category,
            # Plain Python floats keep the payload JSON-serialisable.
            "embedding": vector.tolist(),
        },
    }
    for row_id, term, category, vector in zip(
        df_top.index, terms, df_top['Categoria'], embeddings
    )
]

# Send the documents in bulk rather than one request (and one forced refresh)
# per row. With its default settings helpers.bulk raises if a document fails.
indexed_count, bulk_errors = helpers.bulk(client, actions)

# Refresh once at the end so the new documents are searchable in the next cell.
client.indices.refresh(index=index_name)

print(f"Indexed {indexed_count} of {len(actions)} queries into '{index_name}'.")


In [None]:
""" Example query text """
user_query = "zflip5 samsung usado"

# Number of nearest neighbours to retrieve and display.
TOP_K = 5

# Embed the query with the same model that produced the indexed embeddings.
query_embedding = model.encode(user_query)

query_body = {
    # Without an explicit size the search falls back to the default page size,
    # which can return more than TOP_K hits when several shards or segments
    # each contribute their own k neighbours.
    "size": TOP_K,
    "query": {
        "knn": {
            "embedding": {
                # Plain list of floats so the request body is JSON-serialisable.
                "vector": query_embedding.tolist(),
                "k": TOP_K,
            }
        }
    },
    # Skip the full _source (it contains the embedding) and ask only for the
    # fields that are printed below.
    "_source": False,
    "fields": ["id", "term", "ctg"],
}

results = client.search(body=query_body, index=index_name)

# Values requested through "fields" come back as lists, hence the [0].
for rank, hit in enumerate(results["hits"]["hits"], start=1):
    hit_fields = hit["fields"]
    matched_term = hit_fields["term"][0]
    matched_ctg = hit_fields["ctg"][0]
    print(f"{rank}. Query: {matched_term}, ctg: {matched_ctg}, score: {hit['_score']}")