In [22]:

import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np


In [23]:
device = torch.device("mps") if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")


Using device: mps


In [24]:
# Load E5 model and tokenizer
model_name = "intfloat/e5-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)


In [25]:
# Function to generate embeddings
def get_embedding(text):
    """Generate dense vector for input text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Move back to CPU for FAISS


In [26]:
# Sample IT logs
# documents = [
#     "passage: Server crashed due to high memory usage.",
#     "passage: Network outage detected in data center.",
#     "passage: Disk failure reported on node 7.",
#     "passage: Application error after software update."
# ]
queries = ["I love dogs"]
documents = ["I love BGE", "I love animals is a joke"]

# Convert documents to embeddings
doc_embeddings = np.array([get_embedding(doc) for doc in documents])

# ðŸ”¥ Fix: Ensure correct shape (num_docs, embedding_dim)
doc_embeddings = np.asarray(doc_embeddings, dtype="float32").reshape(len(documents), -1)

# Build FAISS index
index = faiss.IndexFlatL2(doc_embeddings.shape[1])  # L2 distance
index.add(doc_embeddings)

# Search function with similarity scores
def search(query, top_k=2):
    query_embedding = np.array([get_embedding(f"query: {query}")]).astype("float32")
    query_embedding = query_embedding.reshape(1, -1)  # Ensure correct shape

    distances, indices = index.search(query_embedding, top_k)

    # Convert distances to similarity scores
    similarity_scores = 1 / (1 + distances)

    results = [(documents[i], similarity_scores[0][rank]) for rank, i in enumerate(indices[0])]
    return results

# Test retrieval
query = "I love dogs"
results = search(query)

# Print results with similarity scores
for doc, score in results:
    print(f"Document: {doc} | Similarity Score: {score:.4f}")


Document: I love BGE | Similarity Score: 0.0140
Document: I love animals is a joke | Similarity Score: 0.0090


In [27]:


def get_embedding(text):
    """Generate embeddings using mean pooling instead of CLS token."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Mean pooling (better for long passages)
    attention_mask = inputs['attention_mask']
    token_embeddings = outputs.last_hidden_state  # Shape: (batch, seq_len, hidden_dim)
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape)
    
    sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
    sum_mask = mask_expanded.sum(dim=1)
    embedding = sum_embeddings / sum_mask  # Mean pooling
    return embedding.cpu().numpy()

def chunk_text(text, max_tokens=512, overlap=50):
    """Splits a long passage into overlapping chunks for better embeddings."""
    tokens = tokenizer.tokenize(text)
    chunks = []
    
    for i in range(0, len(tokens), max_tokens - overlap):
        chunk = tokens[i : i + max_tokens]
        chunks.append(tokenizer.convert_tokens_to_string(chunk))
    
    return chunks
import faiss

# Sample IT logs (Long Texts)
documents = [
    "passage: The server crashed because of a memory leak. The application logs showed increased RAM usage over time...",
    "passage: A major network outage was reported in the data center due to a misconfigured router...",
    "passage: Disk failure was detected on node 7, causing application errors in the database...",
    "passage: After the latest software update, several applications failed due to dependency issues..."
]

# Process and embed document chunks
all_chunks = []
chunk_doc_map = {}  # Map chunk index to original document

for i, doc in enumerate(documents):
    chunks = chunk_text(doc)
    for chunk in chunks:
        all_chunks.append(chunk)
        chunk_doc_map[len(all_chunks) - 1] = documents[i]  # Store original doc index

# Convert all chunks to embeddings
doc_embeddings = np.array([get_embedding(chunk) for chunk in all_chunks], dtype="float32")
doc_embeddings = doc_embeddings.reshape(len(all_chunks), -1)  # Ensure 2D shape

# Build FAISS index
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

# Search function with similarity scores
def search(query, top_k=3):
    query_embedding = np.array([get_embedding(f"query: {query}")]).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    
    # Convert L2 distance to similarity scores
    similarity_scores = 1 / (1 + distances)
    
    results = [(chunk_doc_map[i], similarity_scores[0][rank]) for rank, i in enumerate(indices[0])]
    return results

# Test retrieval
query = "memory issue caused server crash"
results = search(query)

# Print results
for doc, score in results:
    print(f"Document: {doc} | Similarity Score: {score:.4f}")


Document: passage: The server crashed because of a memory leak. The application logs showed increased RAM usage over time... | Similarity Score: 0.0202
Document: passage: A major network outage was reported in the data center due to a misconfigured router... | Similarity Score: 0.0141
Document: passage: Disk failure was detected on node 7, causing application errors in the database... | Similarity Score: 0.0139
