In [None]:
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from pathlib import Path
import pandas as pd

# Path to your test step CSVs
input_dir = Path("failures_ds_csv")  # Replace with your actual CSV folder name


def dataframe_to_text(df):
    """Flatten one test-case table into the text that gets embedded.

    When the frame has both "Step" and "Description" columns, every row whose
    description is present becomes ``Step <step>: <description>``. Otherwise
    each row is rendered as its non-missing cells joined by single spaces.

    Args:
        df: DataFrame parsed from a single CSV file.

    Returns:
        The rendered rows joined by newlines, stripped of surrounding
        whitespace; an empty string when no usable row is left.
    """
    if "Step" in df.columns and "Description" in df.columns:
        # Walk the two columns directly: cheaper than iterrows() and each
        # column keeps its own dtype instead of being merged per row.
        lines = [
            f"Step {step}: {description}"
            for step, description in zip(df["Step"], df["Description"])
            if pd.notna(description)
        ]
    else:
        # Fallback: join all cells of a row, leaving out missing values so the
        # literal text "nan" never ends up in the embedded document.
        rendered_rows = (
            " ".join(str(cell) for cell in row if pd.notna(cell))
            for row in df.itertuples(index=False, name=None)
        )
        lines = [text for text in rendered_rows if text]

    return "\n".join(lines).strip()


# Step 1: Load all .csv files recursively and convert to Documents
documents = []

# Sort the paths: rglob() yields them in filesystem order, which differs
# between machines and would make the document order non-reproducible.
for csv_path in sorted(input_dir.rglob("*.csv")):
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has nothing to index; skip it rather than abort
        # the whole load.
        print(f"⚠️ Skipping empty CSV: {csv_path}")
        continue

    content = dataframe_to_text(df)

    if content:
        documents.append(Document(
            page_content=content,
            metadata={"source": str(csv_path.relative_to(input_dir))}
        ))

print(f"✅ Loaded {len(documents)} CSV-based test documents.")

# Step 2: Load the embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Step 3: Create Chroma vectorstore (skip chunking)
db_path = "chroma_test_step_vectors"

# Fail early with an actionable message instead of handing an empty batch to
# the vector store.
if not documents:
    raise ValueError(
        "No documents to index: check that the input folder exists and contains non-empty CSV files."
    )

# Use each document's relative source path as its id. Without explicit ids the
# store assigns fresh random ones, so re-running this cell against an existing
# persist directory would add every test case a second time. With stable ids a
# re-run overwrites the existing entries instead.
# NOTE: entries for CSV files that were deleted since the last run are not
# removed; delete the persist directory for a clean rebuild.
document_ids = [doc.metadata["source"] for doc in documents]

vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=document_ids,
    persist_directory=db_path,
)
vectorstore.persist()

print(f"✅ Vectorstore created with {vectorstore._collection.count()} test cases at {db_path}")

## Visualize the embeddings in 2D (t-SNE scatter plot)

In [None]:
# Step 1: Load the Chroma DB
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

persist_path = "chroma_test_step_vectors"
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=persist_path, embedding_function=embedding_model)

# Embeddings must be requested explicitly via `include`; documents and
# metadata are fetched in the same call so all three lists stay aligned.
result = vectorstore.get(include=['embeddings', 'metadatas', 'documents'])
all_docs = result['documents']
all_metas = result['metadatas']
all_embeddings = result['embeddings']

# Convert to a numpy array and verify its shape
X = np.array(all_embeddings)
print("Shape of X:", X.shape)

# t-SNE needs a (n_samples, n_features) matrix with at least two samples.
# With fewer, the perplexity computed below would be 0 or negative and the
# fit would fail with an opaque parameter error, so stop here with a clear one.
if X.ndim != 2 or X.shape[0] < 2:
    raise ValueError(
        f"Need at least 2 stored embeddings for a t-SNE projection, got array of shape {X.shape}."
    )

MAX_PERPLEXITY = 30   # upper bound used for larger collections
TSNE_SEED = 42        # fixed seed so the layout is reproducible across runs

# Perplexity must stay strictly below the number of samples.
perplexity = min(MAX_PERPLEXITY, X.shape[0] - 1)
X_2d = TSNE(n_components=2, perplexity=perplexity, random_state=TSNE_SEED).fit_transform(X)

# Prepare Plotly data
from pathlib import Path


def extract_test_id(path_str):
    """Return the file name of ``path_str`` without directories or its final suffix."""
    source_path = Path(path_str)
    return source_path.stem

# One label per point: the id derived from the stored "source" metadata.
sources = list(map(extract_test_id, (meta['source'] for meta in all_metas)))

# Hover previews are capped at the first 200 characters of each document.
texts = [document_text[:200] for document_text in all_docs]

# Column-oriented mapping handed to Plotly; each entry has one value per point.
df_data = dict(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    source=sources,
    preview=texts,
)

# Plot: colour by source id, show the text preview on hover.
fig = px.scatter(
    df_data,
    x="x",
    y="y",
    color="source",
    hover_data=["preview"],
)
fig.update_layout(
    title="2D Visualization of Chroma Embeddings",
    width=1000,
    height=700,
)
fig.show()