In [44]:
import pandas as pd
import random
from sentence_transformers import SentenceTransformer
import chromadb

# ----------------------------
# Step 1: Generate synthetic dataset
# ----------------------------
n_rows = 200
data = {
    "InvoiceNo": [f"INV{1000+i}" for i in range(n_rows)],
    "StockCode": [f"STK{random.randint(100, 999)}" for _ in range(n_rows)],
    "Description": [random.choice(["T-shirt", "Shoes", "Laptop", "Phone", "Book", "Pen", "Bag"]) for _ in range(n_rows)],
    "Quantity": [random.randint(1, 10) for _ in range(n_rows)],
    "InvoiceDate": pd.date_range(start="2023-01-01", periods=n_rows, freq="D").strftime("%Y-%m-%d").tolist(),
    "UnitPrice": [round(random.uniform(5, 500), 2) for _ in range(n_rows)],
    "CustomerID": [random.randint(10000, 20000) for _ in range(n_rows)],
    "Country": [random.choice(["India", "USA", "UK", "Germany", "Canada"]) for _ in range(n_rows)]
}

df = pd.DataFrame(data)
df.to_csv("synthetic_retail_data.csv", index=False)
print("✅ synthetic_retail_data.csv generated successfully!")

# ----------------------------
# Step 2: Convert rows into chunks
# ----------------------------
chunks = []
for idx, row in df.iterrows():
    chunk_text = f"InvoiceNo: {row['InvoiceNo']}, " \
                 f"StockCode: {row['StockCode']}, " \
                 f"Description: {row['Description']}, " \
                 f"Quantity: {row['Quantity']}, " \
                 f"InvoiceDate: {row['InvoiceDate']}, " \
                 f"UnitPrice: {row['UnitPrice']}, " \
                 f"CustomerID: {row['CustomerID']}, " \
                 f"Country: {row['Country']}"
    chunks.append(chunk_text)

print("Sample chunks:", chunks[:3])

# ----------------------------
# Step 3: Create embeddings
# ----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)

print(f"Embeddings shape: {len(embeddings)} x {len(embeddings[0])}")


# ----------------------------
# Step 4: Store in ChromaDB with metadata
client = chromadb.PersistentClient(path="chromadb_store")
collection = client.get_or_create_collection("retail_chunks")

# Optional: clear old data only if collection already has entries
if collection.count() > 0:
    all_ids = collection.get()["ids"]
    if all_ids:  # avoid crash if empty
        collection.delete(ids=all_ids)

# Insert data with metadata
for i, (text, emb) in enumerate(zip(chunks, embeddings)):
    row = df.iloc[i]
    collection.add(
        ids=[str(i)],
        documents=[text],
        embeddings=[emb],
        metadatas=[{
            "InvoiceNo": row["InvoiceNo"],
            "StockCode": row["StockCode"],
            "Description": row["Description"],
            "Quantity": int(row["Quantity"]),
            "InvoiceDate": row["InvoiceDate"],
            "UnitPrice": float(row["UnitPrice"]),
            "CustomerID": int(row["CustomerID"]),
            "Country": row["Country"]
        }]
    )

print("✅ Data stored in ChromaDB successfully!")
print("Stored embeddings:", collection.count())
# -----------------------------
# Step 6: Evaluate Retrieval with Recall@k
# -----------------------------

def recall_at_k(collection, model, query, ground_truth_ids, k=5):
    query_emb = model.encode([query])
    results = collection.query(query_embeddings=query_emb, n_results=k)
    retrieved_ids = results["ids"][0]

    relevant_retrieved = set(retrieved_ids) & set(ground_truth_ids)
    recall = len(relevant_retrieved) / len(ground_truth_ids) if ground_truth_ids else 0.0
    return recall, retrieved_ids

# Example evaluation queries
queries = ["Laptop", "Shoes", "Book"]
for q in queries:
    ground_truth_ids = [str(i) for i, desc in enumerate(df["Description"]) if q in desc]
    recall, retrieved = recall_at_k(collection, model, query=q, ground_truth_ids=ground_truth_ids, k=5)
    print(f"Query: {q}")
    print(f"  Recall@5 = {recall:.2f}")
    print(f"  Retrieved IDs: {retrieved}")
    print(f"  Ground Truth IDs (first 5): {ground_truth_ids[:5]}")
    print("-" * 40)



✅ synthetic_retail_data.csv generated successfully!
Sample chunks: ['InvoiceNo: INV1000, StockCode: STK398, Description: Shoes, Quantity: 10, InvoiceDate: 2023-01-01, UnitPrice: 183.98, CustomerID: 12392, Country: Canada', 'InvoiceNo: INV1001, StockCode: STK327, Description: Pen, Quantity: 2, InvoiceDate: 2023-01-02, UnitPrice: 455.05, CustomerID: 12773, Country: India', 'InvoiceNo: INV1002, StockCode: STK518, Description: T-shirt, Quantity: 5, InvoiceDate: 2023-01-03, UnitPrice: 193.05, CustomerID: 16777, Country: India']
Embeddings shape: 200 x 384
✅ Data stored in ChromaDB successfully!
Stored embeddings: 200
Query: Laptop
  Recall@5 = 0.14
  Retrieved IDs: ['188', '69', '119', '83', '33']
  Ground Truth IDs (first 5): ['18', '23', '24', '27', '30']
----------------------------------------
Query: Shoes
  Recall@5 = 0.18
  Retrieved IDs: ['115', '170', '129', '70', '8']
  Ground Truth IDs (first 5): ['0', '4', '8', '9', '13']
----------------------------------------
Query: Book
  Rec

In [42]:
# Example query
results = collection.query(
    query_texts=["cheap laptop"],  # user query
    n_results=5                    # top 5 results
)

print("🔎 Query Results:")
for doc, dist in zip(results["documents"][0], results["distances"][0]):
    print(f"Match: {doc}\nDistance: {dist:.4f}\n")


🔎 Query Results:
Match: InvoiceNo: INV1134, StockCode: STK237, Description: Laptop, Quantity: 8, InvoiceDate: 2023-05-15, UnitPrice: 159.87, CustomerID: 18685, Country: India
Distance: 1.3075

Match: InvoiceNo: INV1037, StockCode: STK301, Description: Laptop, Quantity: 6, InvoiceDate: 2023-02-07, UnitPrice: 322.4, CustomerID: 11978, Country: India
Distance: 1.3258

Match: InvoiceNo: INV1022, StockCode: STK348, Description: Laptop, Quantity: 6, InvoiceDate: 2023-01-23, UnitPrice: 492.34, CustomerID: 18199, Country: India
Distance: 1.3273

Match: InvoiceNo: INV1016, StockCode: STK738, Description: Laptop, Quantity: 5, InvoiceDate: 2023-01-17, UnitPrice: 206.05, CustomerID: 19803, Country: Germany
Distance: 1.3369

Match: InvoiceNo: INV1025, StockCode: STK605, Description: Laptop, Quantity: 7, InvoiceDate: 2023-01-26, UnitPrice: 89.76, CustomerID: 10812, Country: India
Distance: 1.3382



In [43]:
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,INV1000,STK939,Shoes,4,2023-01-01,177.13,15314,Germany
1,INV1001,STK307,Pen,10,2023-01-02,67.86,11443,Germany
2,INV1002,STK280,Pen,7,2023-01-03,427.54,12185,Germany
3,INV1003,STK354,Book,4,2023-01-04,489.48,14723,Germany
4,INV1004,STK928,Book,8,2023-01-05,179.14,15712,India
...,...,...,...,...,...,...,...,...
195,INV1195,STK776,Bag,5,2023-07-15,390.82,13737,Germany
196,INV1196,STK787,Shoes,1,2023-07-16,298.67,11518,USA
197,INV1197,STK826,Pen,3,2023-07-17,212.68,12003,Canada
198,INV1198,STK777,Shoes,2,2023-07-18,409.72,10639,India
