In [None]:
from langchain_community.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate

CHROMA_PATH = "chroma"  # on-disk folder holding the persisted Chroma collection

# Prompt skeleton: the retrieved context goes first, the user's question after
# the `---` rule, and the model is expected to continue after "Answer:".
PROMPT_TEMPLATE = """
Answer the question using only the context provided between the lines.

{context}
---
Question: {question}

Answer:"""

# The question that is both embedded for retrieval and later shown to the LLM.
query_text = "How does Alice meet the Mad Hatter?"

# Embedding model used to vectorise the query.
# NOTE(review): presumably the same model that was used to build the index in
# CHROMA_PATH -- verify against the ingestion script.
embedding_function = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Open the persisted vector store with that embedding function.
db = Chroma(
    embedding_function=embedding_function,
    persist_directory=CHROMA_PATH,
)

# Fetch the 3 nearest chunks; downstream code unpacks each hit as (doc, score).
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# NOTE(review): only the "no hits at all" case is reported here; the relevance
# scores are never compared against a threshold, and execution is not stopped.
if len(results) == 0:
    print("Unable to find sufficiently relevant results.")

In [None]:
# NEW imports ────────────────────────────────────────────────────────────────
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline   # NOT ChatOllama

# ────────────────────────────────────────────────────────────────────────────
# 1. pick a model that fits your machine
#    • Tiny models (≤2-3 GB RAM):  TinyLlama/TinyLlama-1.1B-Chat-v1.0
#    • Mid-range (8-12 GB RAM):   microsoft/phi-2
#    • Bigger ( >12 GB RAM):      mistralai/Mistral-7B-Instruct-v0.2
HF_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# 2. load it with quantisation if you need to save RAM
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_NAME,
    device_map="auto",      # let the loader decide where the weights are placed
    # NOTE(review): 4-bit loading goes through bitsandbytes, which may not work
    # on a CPU-only machine -- confirm, and comment this out if loading fails
    # or if you have plenty of VRAM/RAM.
    load_in_4bit=True,
)

# 3. wrap it in a text-generation pipeline
gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    do_sample=True,         # temperature / top_p only apply when sampling is on
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)

# 4. make it a LangChain LLM
llm = HuggingFacePipeline(pipeline=gen_pipe)

# ────────────────────────────────────────────────────────────────────────────
# 5. build the prompt in this cell so it runs top-to-bottom on a fresh kernel.
#    `results`, `PROMPT_TEMPLATE` and `query_text` come from the retrieval cell.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _ in results)
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
    context=context_text, question=query_text
)

# 6. generate the answer; `invoke` replaces the deprecated `predict` and
#    returns a plain string for an LLM wrapper.
response_text: str = llm.invoke(prompt)

# 7. print the answer followed by the source of every retrieved chunk
sources = [doc.metadata.get("source") for doc, _ in results]
print("─" * 80)
print(response_text.strip())
print("\nSources:")
for s in sources:
    print(" •", s)


In [5]:
# Stitch the retrieved chunks into one context string, with a `---` rule
# between consecutive chunks.
context_text = "\n\n---\n\n".join([hit[0].page_content for hit in results])

# Fill the template with the context and the question.
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
    question=query_text,
    context=context_text,
)

# Local chat model served by Ollama. This rebinds `llm`. The model must already
# be available locally (`ollama pull mistral:7b`); otherwise the call below
# fails with a 404 OllamaEndpointNotFoundError.
llm = ChatOllama(model="mistral:7b")

response_text: str = llm.predict(prompt)

# Report: separator line, the answer, then one bullet per retrieved source.
sources = [hit[0].metadata.get("source") for hit in results]
print("─" * 80, response_text.strip(), "\nSources:", sep="\n")
for s in sources:
    print(f" • {s}")



  response_text: str = llm.predict(prompt)


OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull mistral:7b`.