In [1]:
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

import psycopg2
from psycopg2 import sql
import pandas as pd



  from tqdm.autonotebook import tqdm, trange


In [2]:
def get_db_connection():
    return psycopg2.connect(
        dbname="banksa",
        user="root",
        password="root",
        host="localhost",
        port="5432"
    )

In [3]:
def get_knowledge_base_not_indexed():
    cur = None
    conn = None
    try:
        conn = get_db_connection()
        cur = conn.cursor()

        query = sql.SQL("select * from v_knowledge_base where indexed = false")
        
        cur.execute(query)

        column_names = [desc[0] for desc in cur.description]
        results = cur.fetchall()

        df = pd.DataFrame(results, columns=column_names)

        return df
    except Exception as e:
        print(f"An error ocurred: {str(e)}")
        return pd.DataFrame()
    finally:
        if cur:
            cur.close()
        if conn:
            conn.close()

In [5]:
df_knowledge_base = get_knowledge_base_not_indexed()

In [6]:
df_knowledge_base = df_knowledge_base[['id', 'description', 'category']]
df_knowledge_base

Unnamed: 0,id,description,category
0,54,SMASH BURGUER,RESTAURANT
1,57,AMAZON,ONLINE SHOPPING
2,60,LA SIRENA VENEZUELA MC,SUPERMARKET
3,62,COFFEE SHOP ONE PLACE,COFFEE SHOP
4,66,Amazon.com*RV0SD8QA2,ONLINE SHOPPING
5,59,ALTICE DEBITO DIRECTO,SERVICE PAYMENT
6,47,PLAZA CENTRAL CIN MEG,CINEMA
7,65,JAGI CAPS MEGACENTRO,SUPERMARKET
8,80,TACOS DEL SOL PIANTINI,RESTAURANT
9,67,PLAZA LAMA 27 DE FEB,SUPERMARKET


In [7]:
model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)



In [31]:
cats = df_knowledge_base.to_dict('records')

In [32]:
cats

[{'id': 54, 'description': 'SMASH BURGUER', 'category': 'RESTAURANT'},
 {'id': 57, 'description': 'AMAZON', 'category': 'ONLINE SHOPPING'},
 {'id': 60,
  'description': 'LA SIRENA VENEZUELA MC',
  'category': 'SUPERMARKET'},
 {'id': 62, 'description': 'COFFEE SHOP ONE PLACE', 'category': 'COFFEE SHOP'},
 {'id': 66,
  'description': 'Amazon.com*RV0SD8QA2',
  'category': 'ONLINE SHOPPING'},
 {'id': 59,
  'description': 'ALTICE DEBITO DIRECTO',
  'category': 'SERVICE PAYMENT'},
 {'id': 47, 'description': 'PLAZA CENTRAL CIN  MEG', 'category': 'CINEMA'},
 {'id': 65, 'description': 'JAGI CAPS MEGACENTRO', 'category': 'SUPERMARKET'},
 {'id': 80, 'description': 'TACOS DEL SOL PIANTINI', 'category': 'RESTAURANT'},
 {'id': 67, 'description': 'PLAZA LAMA 27 DE FEB', 'category': 'SUPERMARKET'},
 {'id': 48, 'description': 'PETRONAN LOS HERMANOS', 'category': 'GAS'},
 {'id': 69, 'description': 'PAGO ONLINE', 'category': 'PAYMENT'},
 {'id': 85, 'description': 'NETFLIX.COM', 'category': 'SUBSCRIPTION'

In [33]:
embeddings = []

for cat in tqdm(cats):
    description = cat['description']
    category = cat['category']
    cat_text = f'{description} {category}'
    embedding = embedding_model.encode(cat_text)
    embeddings.append(embedding)

  0%|          | 0/42 [00:00<?, ?it/s]

In [34]:
import numpy as np

X = np.array(embeddings)
X.shape

(42, 384)

In [35]:
to_predict = 'LA SIRENA'
encoding = embedding_model.encode(to_predict)
encoding[0]

np.float32(0.06174844)

In [36]:
v = encoding
scores = X.dot(v)
scores.max()

np.float32(0.66853565)

In [37]:
len(v)

384

In [38]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

In [39]:
search_engine = VectorSearchEngine(documents=cats, embeddings=X)
search_engine.search(v, num_results=2)

[{'id': 60,
  'description': 'LA SIRENA VENEZUELA MC',
  'category': 'SUPERMARKET'},
 {'id': 84,
  'description': 'LA SIRENA VENEZUELA MC',
  'category': 'SUPERMARKET'}]

In [8]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://127.0.0.1:9200', request_timeout=500) 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "description": {"type": "text"},
            "category": {"type": "text"},
            "description_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "categories"

In [9]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x725076f32f00>: Failed to establish a new connection: [Errno 111] Connection refused))

In [10]:
if not es_client.indices.exists(index=index_name):
    print('creating index')
    es_client.indices.create(index=index_name, body=index_settings)
    print('index created')
    print(es_client.indices.get_settings(index=index_name))

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x725076a486b0>: Failed to establish a new connection: [Errno 111] Connection refused))

In [46]:
ground_truth = cats

In [47]:
ground_truth[0]

{'id': 54, 'description': 'SMASH BURGUER', 'category': 'RESTAURANT'}

In [48]:
# create embeddings
for cat in tqdm(ground_truth):
    description = cat['description']
    cat['description_vector'] = embedding_model.encode(description)

  0%|          | 0/42 [00:00<?, ?it/s]

In [53]:
ground_truth[0]

{'id': 54,
 'description': 'SMASH BURGUER',
 'category': 'RESTAURANT',
 'description_vector': array([ 1.34226568e-02,  7.57203437e-04, -2.51709353e-02,  5.45753399e-03,
        -8.82995054e-02,  1.19551226e-01,  5.65344505e-02,  5.12149511e-03,
         3.80637147e-03, -1.28104994e-02, -3.93640175e-02, -9.94731709e-02,
        -3.17534688e-03,  1.94332679e-03,  9.28765722e-03,  6.98426664e-02,
        -2.09480844e-04,  3.35071459e-02, -2.14301180e-02, -9.20342281e-03,
        -5.92158958e-02,  9.19533428e-03,  3.98660973e-02,  7.58114755e-02,
        -2.34276466e-02, -1.92669481e-02,  2.61279033e-03,  3.45828459e-02,
        -3.10578290e-02, -1.00138925e-01,  4.13720869e-02, -4.74197045e-02,
         5.84138557e-02, -5.81344124e-03, -4.35380712e-02, -3.87630500e-02,
         1.65780149e-02,  1.79668758e-02, -9.83983185e-03,  4.69177403e-02,
        -5.45436330e-02, -3.25366370e-02,  3.12766968e-03,  7.36454828e-03,
         3.35040912e-02, -4.58420403e-02, -2.65942067e-02,  6.64915293e

In [50]:
len(ground_truth[0]['description_vector'])

384

In [61]:
for cat in tqdm(ground_truth):
    es_client.index(index=index_name, document=cat)

  0%|          | 0/42 [00:00<?, ?it/s]

In [66]:
def elastic_search_knn(field, vector):
    knn = {
        "field": field,
        "query_vector": vector,
        "k": 2,
        "num_candidates": 10000,
    }

    search_query = {
        "knn": knn,
        "_source": ["id", "description", "category"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    print(es_results['hits']['hits'])
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [67]:
elastic_search_knn('description_vector', v)

[{'_index': 'categories', '_id': 'XW9D1JEBfxhp_4C2fpH9', '_score': 0.91291916, '_source': {'description': 'LA SIRENA VENEZUELA MC', 'id': 60, 'category': 'SUPERMARKET'}}, {'_index': 'categories', '_id': 'fm9D1JEBfxhp_4C2gpG3', '_score': 0.91291916, '_source': {'description': 'LA SIRENA VENEZUELA MC', 'id': 84, 'category': 'SUPERMARKET'}}]


[{'description': 'LA SIRENA VENEZUELA MC',
  'id': 60,
  'category': 'SUPERMARKET'},
 {'description': 'LA SIRENA VENEZUELA MC',
  'id': 84,
  'category': 'SUPERMARKET'}]