In [1]:
!python --version

Python 3.12.7


In [1]:
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# --- Configuration ---
DATA_PATH = "firstaid.txt"
PERSIST_DIRECTORY = "./test1/chroma_db"
EMBEDDING_MODEL = "bge-m3:latest"
LLM_MODEL = "docbot:v1"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
RETRIEVER_K = 10  # Reduced K value significantly
CHAIN_TYPE = "stuff" # Consider starting with 'stuff' if context allows with smaller K

# --- Embedding and LLM Setup ---
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL)
llm = Ollama(model=LLM_MODEL)

# --- Vector Database ---
if not os.path.exists(PERSIST_DIRECTORY):
    print(f"Creating new vector database at {PERSIST_DIRECTORY}")
    try:
        # 1. Load and prepare data
        loader = TextLoader(DATA_PATH)
        documents = loader.load()
        if not documents:
            print(f"Error: No documents loaded from {DATA_PATH}")
            exit()

        # 2. Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            print(f"Error: No chunks created from documents.")
            exit()

        # 3. Create and persist vector database
        print(f"Creating Chroma DB with {len(chunks)} chunks...")
        vector_db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=PERSIST_DIRECTORY
        )
        print("Database created successfully.")
    except Exception as e:
        print(f"Error creating vector database: {e}")
        exit()
else:
    print(f"Loading existing vector database from {PERSIST_DIRECTORY}")
    try:
        vector_db = Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=embedding_model
        )
        print("Database loaded successfully.")
    except Exception as e:
        print(f"Error loading vector database: {e}")
        exit()

# --- QA Chain Setup ---
retriever = vector_db.as_retriever(search_kwargs={"k": RETRIEVER_K})

# --- Prompts (Adjust based on chosen CHAIN_TYPE) ---

# Example for 'stuff' chain type (uses a default prompt or you can specify one)
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff", # Simpler chain type if context fits
#     retriever=retriever,
#     return_source_documents=True
# )

# Setup for 'map_reduce' (if you stick with it after reducing K)
map_prompt_template = """
Use the following context to answer the question. Answer concisely based only on the provided text.
Context: {context}
Question: {question}
Answer:"""
MAP_PROMPT = PromptTemplate(
    template=map_prompt_template,
    input_variables=["context", "question"]
)

reduce_prompt_template = """
Synthesize the following individual answers into a single, coherent, and comprehensive final answer based only on the provided documents. Avoid repetition.
Summaries: {summaries}
Final Answer:"""
REDUCE_PROMPT = PromptTemplate(
    template=reduce_prompt_template,
    input_variables=["summaries"]
)

# Check if selected chain type requires specific prompts
if CHAIN_TYPE == "map_reduce":
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=retriever,
        chain_type_kwargs={
            "question_prompt": MAP_PROMPT,
            "combine_prompt": REDUCE_PROMPT,
            # Langchain map_reduce uses 'summaries' by default for the combine_prompt input
            # Explicitly setting combine_document_variable_name might not always be needed
            # if your REDUCE_PROMPT variable matches the default ('summaries').
            # Let's keep it for clarity as you had it:
            "combine_document_variable_name": "summaries"
        },
        return_source_documents=True
    )
elif CHAIN_TYPE == "stuff":
     # Example using a custom prompt for 'stuff'
     stuff_template = """Use the following pieces of context from the document as reference to answer the question at the end as accurately as possible. Do not summarise content from the document, give the document content as is then, provide a summary.If you don't know the answer, just say that you don't know, don't try to make up an answer.

     {context}

     Question: {question}
     Helpful Answer:"""
     STUFF_PROMPT = PromptTemplate(template=stuff_template, input_variables=["context", "question"])
     qa_chain = RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",
         retriever=retriever,
         chain_type_kwargs={"prompt": STUFF_PROMPT},
         return_source_documents=True
     )
# Add other chain types (refine, map_rerank) here if needed
else:
    print(f"Error: Unsupported chain type '{CHAIN_TYPE}'")
    exit()


# --- Query Loop ---
print(f"\nQA System Initialized (Using Chain Type: '{CHAIN_TYPE}', Retriever k={RETRIEVER_K}).")
while True:
    query = input("\nAsk a question (or type 'exit'): ")
    if query.lower() == 'exit':
        break
    if not query:
        continue

    try:
        # Use invoke for LangChain Expression Language (LCEL) compatibility
        result = qa_chain.invoke({"query": query}) # Pass query in a dictionary for standard LCEL format

        print("\nAnswer:", result.get("result", "No answer found."))

        source_docs = result.get("source_documents", [])
        if source_docs:
            # Get unique source filenames
            source_files = list(set(doc.metadata.get('source', 'Unknown Source') for doc in source_docs))
            print("Sources:", ", ".join(source_files))
        else:
            print("Sources: No source documents found.")

    except Exception as e:
        print(f"An error occurred during query processing: {e}")

  embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL)
  llm = Ollama(model=LLM_MODEL)


Creating new vector database at ./test1/chroma_db
Error creating vector database: Error loading firstaid.txt


NameError: name 'vector_db' is not defined

In [None]:
import os
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# --- Configuration ---
DATA_PATH = "C:/Users/htsocadmin/Desktop/text.txt"
PERSIST_DIRECTORY = "./test1/chroma_db"
EMBEDDING_MODEL = "bge-m3:latest"
LLM_MODEL = "docbot:v1"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
RETRIEVER_K = 10  # Reduced K value significantly
CHAIN_TYPE = "stuff" # Consider starting with 'stuff' if context allows with smaller K

# --- Embedding and LLM Setup ---
embedding_model = OllamaEmbeddings(model=EMBEDDING_MODEL)
llm = Ollama(model=LLM_MODEL)

# --- Vector Database ---
if not os.path.exists(PERSIST_DIRECTORY):
    print(f"Creating new vector database at {PERSIST_DIRECTORY}")
    try:
        # 1. Load and prepare data
        loader = TextLoader(DATA_PATH)
        documents = loader.load()
        if not documents:
            print(f"Error: No documents loaded from {DATA_PATH}")
            exit()

        # 2. Split text into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        chunks = text_splitter.split_documents(documents)
        if not chunks:
            print(f"Error: No chunks created from documents.")
            exit()

        # 3. Create and persist vector database
        print(f"Creating Chroma DB with {len(chunks)} chunks...")
        vector_db = Chroma.from_documents(
            documents=chunks,
            embedding=embedding_model,
            persist_directory=PERSIST_DIRECTORY
        )
        print("Database created successfully.")
    except Exception as e:
        print(f"Error creating vector database: {e}")
        exit()
else:
    print(f"Loading existing vector database from {PERSIST_DIRECTORY}")
    try:
        vector_db = Chroma(
            persist_directory=PERSIST_DIRECTORY,
            embedding_function=embedding_model
        )
        print("Database loaded successfully.")
    except Exception as e:
        print(f"Error loading vector database: {e}")
        exit()

# --- QA Chain Setup ---
retriever = vector_db.as_retriever(search_kwargs={"k": RETRIEVER_K})

# --- Prompts (Adjust based on chosen CHAIN_TYPE) ---

# Example for 'stuff' chain type (uses a default prompt or you can specify one)
# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff", # Simpler chain type if context fits
#     retriever=retriever,
#     return_source_documents=True
# )

# Setup for 'map_reduce' (if you stick with it after reducing K)
map_prompt_template = """
Use the following context to answer the question. Answer concisely based only on the provided text.
Context: {context}
Question: {question}
Answer:"""
MAP_PROMPT = PromptTemplate(
    template=map_prompt_template,
    input_variables=["context", "question"]
)

reduce_prompt_template = """
Synthesize the following individual answers into a single, coherent, and comprehensive final answer based only on the provided documents. Avoid repetition.
Summaries: {summaries}
Final Answer:"""
REDUCE_PROMPT = PromptTemplate(
    template=reduce_prompt_template,
    input_variables=["summaries"]
)

# Check if selected chain type requires specific prompts
if CHAIN_TYPE == "map_reduce":
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=retriever,
        chain_type_kwargs={
            "question_prompt": MAP_PROMPT,
            "combine_prompt": REDUCE_PROMPT,
            # Langchain map_reduce uses 'summaries' by default for the combine_prompt input
            # Explicitly setting combine_document_variable_name might not always be needed
            # if your REDUCE_PROMPT variable matches the default ('summaries').
            # Let's keep it for clarity as you had it:
            "combine_document_variable_name": "summaries"
        },
        return_source_documents=True
    )
elif CHAIN_TYPE == "stuff":
     # Example using a custom prompt for 'stuff'
     stuff_template = """Use the following pieces of context from the document as reference to answer the question at the end as accurately as possible. Do not summarise content from the document, give the document content as is then, provide a summary.If you don't know the answer, just say that you don't know, don't try to make up an answer.

     {context}

     Question: {question}
     Helpful Answer:"""
     STUFF_PROMPT = PromptTemplate(template=stuff_template, input_variables=["context", "question"])
     qa_chain = RetrievalQA.from_chain_type(
         llm=llm,
         chain_type="stuff",
         retriever=retriever,
         chain_type_kwargs={"prompt": STUFF_PROMPT},
         return_source_documents=True
     )
# Add other chain types (refine, map_rerank) here if needed
else:
    print(f"Error: Unsupported chain type '{CHAIN_TYPE}'")
    exit()


# --- Query Loop ---
print(f"\nQA System Initialized (Using Chain Type: '{CHAIN_TYPE}', Retriever k={RETRIEVER_K}).")
while True:
    query = input("\nAsk a question (or type 'exit'): ")
    if query.lower() == 'exit':
        break
    if not query:
        continue

    try:
        # Use invoke for LangChain Expression Language (LCEL) compatibility
        result = qa_chain.invoke({"query": query}) # Pass query in a dictionary for standard LCEL format

        print("\nAnswer:", result.get("result", "No answer found."))

        source_docs = result.get("source_documents", [])
        if source_docs:
            # Get unique source filenames
            source_files = list(set(doc.metadata.get('source', 'Unknown Source') for doc in source_docs))
            print("Sources:", ", ".join(source_files))
        else:
            print("Sources: No source documents found.")

    except Exception as e:
        print(f"An error occurred during query processing: {e}")

  llm = Ollama(model=LLM_MODEL)
  vector_db = Chroma(


Loading existing vector database from ./test1/chroma_db
Database loaded successfully.

QA System Initialized (Using Chain Type: 'stuff', Retriever k=10).



Ask a question (or type 'exit'):  what is inside the document



Answer: <think>
Okay, the user is asking what's inside the document based on the provided context. Let me check the context given. The only excerpts provided are "We are HTSOC" repeated twice. There's no other information in the document excerpts. The user wants an answer using only the document content. Since the document only mentions "We are HTSOC" and nothing else, the answer should reflect that. I need to state that the document contains just those two lines. Also, the user mentioned not to summarize but to provide the document content as is. So I should present the excerpts exactly as they are and then summarize that the document only includes those statements. No other information is present, so the answer is straightforward.
</think>

The document excerpts provided contain only the following text:  
**"We are HTSOC"**  
**"We are HTSOC"**  

No additional content or context is included in the provided excerpts.
Sources: C:/Users/htsocadmin/Desktop/test.txt


In [1]:
!pip install -U :class:`~langchain-ollama

ERROR: Invalid requirement: ':class:`~langchain-ollama': Expected package name at the start of dependency specifier
    :class:`~langchain-ollama
    ^


In [1]:
!conda install python=3.11.2

Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... failed



LibMambaUnsatisfiableError: Encountered problems while solving:
  - nothing provides python >=3.4,<3.5.0a0 needed by conda-4.3.29-py34h479681e_0

Could not solve for environment specs
The following packages are incompatible
\u251c\u2500 anaconda_prompt is installable with the potential options
\u2502  \u251c\u2500 anaconda_prompt 1.1.0 would require
\u2502  \u2502  \u2514\u2500 menuinst >=2.1.1 , which can be installed;
\u2502  \u2514\u2500 anaconda_prompt 1.0.0 would require
\u2502     \u2514\u2500 menuinst >=2.1.0 , which can be installed;
\u251c\u2500 conda-token is installable with the potential options
\u2502  \u251c\u2500 conda-token [0.5.0|0.6.0] would require
\u2502  \u2502  \u2514\u2500 conda >=4.6,!=23.10.0,!=23.11.0  with the potential options
\u2502  \u2502     \u251c\u2500 conda [24.1.0|24.1.1|...|25.3.1] would require
\u2502  \u2502     \u2502  \u2514\u2500 python >=3.12,<3.13.0a0 , which can be installed;
\u2502  \u2502     \u251c\u2500 conda [22.11.0|22.11.1|...|4.14.0

In [3]:
!pip install langchain_community


Collecting langchain_community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain-core<1.0.0,>=0.3.56 (from langchain_community)
  Using cached langchain_core-0.3.56-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.24 (from langchain_community)
  Using cached langchain-0.3.24-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langsmith<0.4,>=0.1.125 (from langchain_community)
  Downloading langsmith-0.3.38-py3-none-any.whl.metadata (15 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Using cached httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Using cached marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=

In [1]:
!pip install langchain



In [1]:
!pip install unstructured
!pip install langchain_text_splitters



In [1]:
pip install chromadb

Collecting chromadb
  Using cached chromadb-1.0.7-cp39-abi3-win_amd64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6.tar.gz (32 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting fastapi==0.115.9 (from chromadb)
  Using cached fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-4.0.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxr

In [1]:
!pip install ollama

Collecting ollama
  Using cached ollama-0.4.8-py3-none-any.whl.metadata (4.7 kB)
Using cached ollama-0.4.8-py3-none-any.whl (13 kB)
Installing collected packages: ollama
Successfully installed ollama-0.4.8


In [3]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("firstaid.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)

RuntimeError: Error loading firstaid.txt

In [5]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("C:/Users/htsocadmin/Desktop/project folder/firstaid.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)

RuntimeError: Error loading C:/Users/htsocadmin/Desktop/project folder/firstaid.txt

In [7]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("C:/Users/htsocadmin/Desktop/firstaid.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)

RuntimeError: Error loading C:/Users/htsocadmin/Desktop/firstaid.txt

In [9]:
conda update conda

error: incomplete escape \U at position 28

In [1]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("test.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)

RuntimeError: Error loading test.txt

In [3]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("C:/Users/htsocadmin/Desktop/test.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)

  embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]


In [5]:
# Required installations:
# pip install langchain ollama chromadb PyPDFLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import TextLoader

# 1. Load and prepare data
#loader = PyPDFLoader("cybersecurity_threats.pdf")  # Replace with your PDF path
loader = TextLoader("C:/Users/htsocadmin/Desktop/test.txt")
documents = loader.load()

# 2. Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
chunks = text_splitter.split_documents(documents)

# 3. Create vector database with Ollama embeddings
embedding_model = OllamaEmbeddings(model="bge-m3:latest")  # [[8]]
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./test1/chroma_db"  # Saves the vector store locally
)