In [None]:
import pandas as pd
from pathlib import Path
import os
import dotenv
from haystack import Pipeline, Document
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.mongodb_atlas import MongoDBAtlasDocumentStore
from haystack.utils import Secret
from haystack.components.preprocessors import RecursiveDocumentSplitter
from haystack.document_stores.types import DuplicatePolicy
from getpass import getpass

# Section 1: Data Loading and Preprocessing

# Candidate column names for the question / answer fields. They are handed to
# pick_col(), which returns the first candidate that is present.
QUESTION_CANDIDATES = [
    "instruction",
    "intent",
    "question",
    "Question",
    "text",
    "query",
    "prompt",
    "utterance",
]
ANSWER_CANDIDATES = [
    "response",
    "answer",
    "Answer",
    "Response",
    "reply",
]

# Load variables from a .env file so later os.getenv() calls can see them.
dotenv.load_dotenv()

# Location of the customer-support CSV, relative to the working directory.
DATA_PATH = "../data/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"
df = pd.read_csv(DATA_PATH)

def pick_col(cols, candidate):
    """Return the first name in ``candidate`` that is also in ``cols``.

    Args:
        cols: Container of available column names (anything supporting ``in``).
        candidate: Iterable of names to try, in priority order.

    Returns:
        The first matching name, or ``None`` when nothing matches.
    """
    matches = (name for name in candidate if name in cols)
    return next(matches, None)

def clean_text(text):
    """Normalise a cell value into a single-spaced, trimmed string.

    Missing values (as judged by ``pd.isna``) become the empty string. Any
    other value is converted with ``str()``; the literal escape tokens
    (a backslash followed by r, n, t or xa0) are replaced by spaces, and all
    runs of whitespace are collapsed to a single space.

    Args:
        text: Scalar value taken from a DataFrame cell.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""

    cleaned = str(text)
    # These are two-or-more character literal sequences, not control characters.
    for token in ("\\r", "\\n", "\\t", "\\xa0"):
        cleaned = cleaned.replace(token, " ")

    # split() without arguments drops leading/trailing whitespace and
    # treats any whitespace run as one separator.
    words = cleaned.strip().split()
    return " ".join(words)

# Build one Haystack Document per unique, non-empty (instruction, response)
# pair found in the loaded DataFrame.
docs_raw = []
if df is not None:
    # Reported for inspection only: the processing below works on the fixed
    # "instruction" / "response" columns, not on q_col / a_col.
    q_col = pick_col(df.columns, QUESTION_CANDIDATES) or "instruction"
    a_col = pick_col(df.columns, ANSWER_CANDIDATES) or "response"
    print("q_col:", q_col, "| a_col:", a_col)

    # Fail early with a readable message; without these columns the mask and
    # drop_duplicates() below would raise an opaque KeyError.
    missing_cols = [c for c in ("instruction", "response") if c not in df.columns]
    if missing_cols:
        raise KeyError(f"Required column(s) missing from {DATA_PATH}: {missing_cols}")

    use_cols = [c for c in ["instruction", "response", "intent", "category", "flags"] if c in df.columns]
    df_use = df[use_cols].copy()

    # Normalise every textual column that is present ("flags" is left as-is).
    for c in ["instruction", "response", "intent", "category"]:
        if c in df_use.columns:
            df_use[c] = df_use[c].apply(clean_text)

    # Fall back to the intent label when the instruction text is empty.
    if "intent" in df_use.columns:
        empty_instr = df_use["instruction"] == ""
        df_use.loc[empty_instr, "instruction"] = df_use.loc[empty_instr, "intent"]

    # Keep rows with both a question and an answer, then de-duplicate pairs.
    mask_valid = (df_use["instruction"] != "") & (df_use["response"] != "")
    df_use = df_use[mask_valid].drop_duplicates(subset=["instruction", "response"]).reset_index(drop=True)
    print("Rows kept:", len(df_use))

    # Identical for every row, so compute it once outside the loop.
    source_name = DATA_PATH.name if isinstance(DATA_PATH, Path) else str(DATA_PATH)

    for i, row in df_use.iterrows():
        q = row["instruction"]
        a = row["response"]
        # A real newline separates question and answer; the previous "\\n"
        # embedded a literal backslash-n into every document.
        text = f"Q: {q}\nA: {a}"
        meta = {
            "row_id": int(i),  # position in the filtered, re-indexed frame
            "source": source_name,
            "category": row.get("category", ""),
            "intent": row.get("intent", ""),
        }
        docs_raw.append({"content": text, "metadata": meta})

documents = [Document(content=doc["content"], meta=doc["metadata"]) for doc in docs_raw]



# Section 2: MongoDB Atlas Setup

# The store is created without an explicit connection string, so it relies on
# the MONGO_CONNECTION_STRING environment variable (the integration's
# documented default). Copy the project-specific variable into that name, and
# fail with a clear message instead of the TypeError that assigning None to
# os.environ would raise.
if not os.getenv("MONGO_CONNECTION_STRING2"):
    raise RuntimeError(
        "Environment variable MONGO_CONNECTION_STRING2 is not set; "
        "define it in the environment or in the .env file."
    )
os.environ["MONGO_CONNECTION_STRING"] = os.environ["MONGO_CONNECTION_STRING2"]

document_store = MongoDBAtlasDocumentStore(
    database_name="depato_store",
    collection_name="common_info",
    vector_search_index="vector_index",
    full_text_search_index="search_index",
)

# Section 3: Haystack Pipeline

# Two-stage indexing pipeline: embed every document, then write the embedded
# documents to the store, overwriting any document with the same id.
pipeline = Pipeline()
pipeline.add_component("embedder", SentenceTransformersDocumentEmbedder())
pipeline.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE))
# Name the sockets explicitly so the wiring is unambiguous.
pipeline.connect("embedder.documents", "writer.documents")

result = pipeline.run({
    "embedder": {
        "documents": documents
    }
})

# Report the count returned by the writer rather than assuming every input
# document was written; fall back to the input size if the key is absent.
written = result.get("writer", {}).get("documents_written", len(documents))
print(f"Successfully embedded and wrote {written} documents to MongoDB Atlas.")

q_col: instruction | a_col: response
Rows kept: 26872


Batches: 100%|██████████| 840/840 [59:54<00:00,  4.28s/it]  


Successfully embedded and wrote 26872 documents to MongoDB Atlas.
