In [1]:
import os
import shutil
from pathlib import Path

In [2]:
# Remove any './chroma' directory left on disk before the rest of the notebook runs.
# Failures are reported and then re-raised so the run stops here.
chroma_dir = './chroma'
if os.path.exists(chroma_dir):
    try:
        shutil.rmtree(chroma_dir)
        print("Existing Chroma directory removed successfully.")
    except Exception as err:
        if isinstance(err, PermissionError):
            # Permission problems get an extra hint before propagating.
            print(f"PermissionError: Unable to remove './chroma' directory: {err}")
            print("Ensure no processes are using the './chroma' directory and try running as administrator.")
        else:
            print(f"Error removing Chroma directory: {err}")
        raise

Existing Chroma directory removed successfully.


In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding model used for every document added to the vector store.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_llm = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

  embedding_llm = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_community.document_loaders import TextLoader

In [5]:
from langchain_community.vectorstores import Chroma

In [6]:
from langchain_core.documents import Document
import json

In [7]:
# Locate the FAQ file: candidates are probed in order and the first regular file wins.
base = Path.cwd()  # Current working directory
candidates = [
    base / "HDFC_Faq.txt",
    base.parent / "HDFC_Faq.txt",                  # One level up
    base / "HDFC_ChatBot-main" / "HDFC_Faq.txt",   # Nested variant
    # NOTE(review): machine-specific absolute path, kept only as a last-resort
    # fallback so existing runs keep working; prefer placing the file next to
    # the notebook.
    Path("C:/Users/chara/Downloads/HDFC_ChatBot-main/HDFC_ChatBot-main/HDFC_Faq.txt"),
]

# is_file() rather than exists(): a directory that happens to carry the same
# name is skipped instead of failing later when it is opened for reading.
faq_file = next((candidate for candidate in candidates if candidate.is_file()), None)
if faq_file is None:
    raise FileNotFoundError("HDFC_Faq.txt not found in any of the specified locations")

# Read the FAQ file. "utf-8-sig" decodes plain UTF-8 unchanged but also drops a
# leading byte-order mark, which str.strip() does not remove and which would
# otherwise defeat the startswith("[") check below.
raw_text = faq_file.read_text(encoding="utf-8-sig").strip()

# Ensure valid JSON format: the file may be a bare, comma-separated run of
# objects, so add the enclosing brackets when missing. The closing bracket is
# always re-attached after dropping a trailing comma, so input ending in
# "},]" is repaired as well as input ending in "},".
if not raw_text.startswith("["):
    raw_text = "[" + raw_text
if raw_text.endswith("]"):
    raw_text = raw_text[:-1]
raw_text = raw_text.rstrip().rstrip(",") + "]"

# Parse JSON and create one Document per FAQ entry.
# Each entry is expected to be a mapping with 'question' and 'answer' keys;
# a missing key raises KeyError.
faq_data = json.loads(raw_text)
source = str(faq_file)  # identical for every document, so computed once
documents = [
    Document(
        page_content=f"Q: {item['question']}\nA: {item['answer']}",
        metadata={"source": source}
    )
    for item in faq_data
]
print(f"Number of chunks created: {len(documents)}")

Number of chunks created: 2236


In [8]:
# Build the "hdfc_faqs" collection from the FAQ documents, embedding each one
# with embedding_llm and storing the result under ./chroma.
db = Chroma.from_documents(
    documents=documents,
    embedding=embedding_llm,
    collection_name="hdfc_faqs",
    persist_directory="./chroma",
)

In [9]:
# Explicitly call persist() on the store that was created with
# persist_directory="./chroma".
# NOTE(review): the output recorded under this cell looks like a warning echoing
# this line — presumably a deprecation notice. Confirm whether the installed
# Chroma version already persists automatically; if so, this call can be dropped.
db.persist()

  db.persist()
