In [None]:
from typing import Any, List, Dict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from db_connections.pg_sql import session, Ticket
import numpy as np


In [None]:
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
EMBEDDING_DIM = 768
N_CLUSTERS = 2

In [30]:
sample_tickets = [
    {"ticket_id": "1", "text": "Cannot connect to the company VPN, it keeps saying authentication failed.", "category": "Network"},
    {"ticket_id": "2", "text": "My Microsoft Outlook is not opening, it crashes on startup.", "category": "Software"},
    {"ticket_id": "3", "text": "The new laptop I received won't boot up, it shows a black screen.", "category": "Hardware"},
    {"ticket_id": "4", "text": "I'm unable to access the shared drive, it says permission denied.", "category": "Network"},
    {"ticket_id": "5", "text": "Excel is freezing frequently when I work with large spreadsheets.", "category": "Software"},
    {"ticket_id": "6", "text": "The printer on the 3rd floor is not working, documents are stuck in queue.", "category": "Hardware"},
    {"ticket_id": "7", "text": "How do I reset my domain password?", "category": "Account"},
]

In [41]:
sample_tickets = [
    {"ticket_id": "1", "text": "Cannot connect to the company VPN, it keeps saying authentication failed.", "category": "Network"},
    {"ticket_id": "2", "text": "My Microsoft Outlook is not opening, it crashes on startup.", "category": "Software"},
]

In [None]:
def process_and_store_to_pgvector(tickets: List[Dict[str, Any]], model: SentenceTransformer, kmeans: KMeans):
    texts = [ticket['text'] for ticket in tickets]
    embeddings = model.encode(texts, show_progress_bar=True)
    print(f"embedding shPE: {embeddings.shape}")
    cluster_ids = kmeans.fit_predict(embeddings)

    for i, ticket in enumerate(tickets):
        embedding = np.array(embeddings[i], dtype=np.float32)
        new_ticket = Ticket(
            ticket_id=ticket['ticket_id'],
            text=ticket['text'],
            embedding=embedding
        )
        
        session.add(new_ticket)

    session.commit()
    print(f"Inserted {len(tickets)} tickets into PostgreSQL with pgvector.")

In [32]:

def query_similar_from_pgvector(query: str, model: SentenceTransformer, top_k: int = 3):
    query_vector = np.array(model.encode([query])[0], dtype=np.float32)

    sql = f"""
    SELECT id, ticket_id, text,
           embedding <#> cube(:query_embedding) as distance
    FROM tickets
    ORDER BY embedding <#> cube(:query_embedding)
    LIMIT :top_k;
    """

    result = session.execute(
        sql,
        {"query_embedding": query_vector.tolist(), "top_k": top_k}
    )

    print("Search Results:")
    for row in result:
        print(f"ID: {row.id}, Ticket ID: {row.ticket_id}, Text: {row.text}, Distance: {row.distance:.4f}")


In [48]:
sbert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
kmeans = KMeans(n_clusters=1, random_state=42, n_init='auto')
process_and_store_to_pgvector(sample_tickets, sbert_model, kmeans)

Batches: 100%|██████████| 1/1 [00:00<00:00, 40.87it/s]

embedding shPE: (2, 384)





PendingRollbackError: This Session's transaction has been rolled back due to a previous exception during flush. To begin a new transaction with this Session, first issue Session.rollback(). Original exception was: (builtins.ValueError) expected 768 dimensions, not 384
[SQL: INSERT INTO tickets (ticket_id, text, embedding) SELECT p0::VARCHAR, p1::VARCHAR, p2::VECTOR(768) FROM (VALUES (%(ticket_id)s, %(text)s, %(embedding)s)) AS imp_sen(p0, p1, p2, sen_counter) ORDER BY sen_counter RETURNING tickets.id, tickets.id AS id__1]
[parameters: [{'embedding': array([-6.63655475e-02,  3.42290588e-02,  1.24685112e-02, -2.55581066e-02,
       -4.58992943e-02,  2.31425129e-02,  3.60830836e-02, -3. ... (7039 characters truncated) ... 45e-02,  4.56833746e-03],
      dtype=float32), 'text': 'Cannot connect to the company VPN, it keeps saying authentication failed.', 'ticket_id': '1'}, {'embedding': array([ 3.21427062e-02, -4.40553539e-02,  2.69629080e-02,  1.07551329e-02,
        1.67914126e-02, -3.76561992e-02, -3.93719375e-02,  4. ... (7025 characters truncated) ... -02,  2.12958250e-02, -3.29278223e-02],
      dtype=float32), 'text': 'My Microsoft Outlook is not opening, it crashes on startup.', 'ticket_id': '2'}, {'embedding': array([-1.92928594e-02, -2.24530920e-02,  3.24990181e-03, -3.92881781e-02,
        6.01377450e-02, -2.12627649e-03, -6.90801889e-02, -2. ... (7031 characters truncated) ... 5.04368730e-03, -7.34494580e-03],
      dtype=float32), 'text': "The new laptop I received won't boot up, it shows a black screen.", 'ticket_id': '3'}, {'embedding': array([ 1.85625274e-02,  5.95447794e-02, -8.71186319e-04, -1.81658249e-02,
       -2.79050320e-02,  5.88835329e-02, -7.10095093e-02, -2. ... (7031 characters truncated) ... 8.74158293e-02,  7.05249161e-02],
      dtype=float32), 'text': "I'm unable to access the shared drive, it says permission denied.", 'ticket_id': '4'}, {'embedding': array([ 9.29281395e-03,  1.54157476e-02, -5.10715730e-02, -1.08511522e-02,
        1.29002603e-02, -4.73905951e-02, -1.28359303e-01,  3. ... (7031 characters truncated) ... 6.66601732e-02,  3.39323357e-02],
      dtype=float32), 'text': 'Excel is freezing frequently when I work with large spreadsheets.', 'ticket_id': '5'}, {'embedding': array([-4.92093898e-02, -2.78321560e-02, -8.86119455e-02,  2.39728019e-02,
       -8.28019343e-04, -4.95178290e-02, -7.22196698e-02, -5. ... (7040 characters truncated) ... 6e-02,  3.13547179e-02],
      dtype=float32), 'text': 'The printer on the 3rd floor is not working, documents are stuck in queue.', 'ticket_id': '6'}, {'embedding': array([ 6.32430008e-03, -6.00010566e-02, -4.80055101e-02, -8.23722258e-02,
       -4.09136489e-02,  8.35997111e-04, -4.75735664e-02, -4. ... (7000 characters truncated) ... 1319359e-02,  5.71280122e-02, -3.80178951e-02, -3.89298834e-02],
      dtype=float32), 'text': 'How do I reset my domain password?', 'ticket_id': '7'}]] (Background on this error at: https://sqlalche.me/e/20/7s2a)

In [42]:
texts = [ticket['text'] for ticket in sample_tickets]
embeddings = sbert_model.encode(texts, show_progress_bar=True)
cluster_ids = kmeans.fit_predict(embeddings)

Batches: 100%|██████████| 1/1 [00:00<00:00, 48.27it/s]


In [46]:
embeddings.shape

(2, 384)