In [2]:
from dotenv import load_dotenv
import os
import weaviate
from weaviate.classes.init import Auth

# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()
WEAVIATE_URL = "https://konvsbviroeqczjr3hsitw.c0.asia-southeast1.gcp.weaviate.cloud"
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
if not WEAVIATE_API_KEY:
    # Fail fast with an actionable message instead of passing None to the client.
    raise RuntimeError(
        "WEAVIATE_API_KEY is not set; define it in the environment or in a .env file."
    )

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(api_key=WEAVIATE_API_KEY),
)
# Smoke-test the connection, then always release it, even if the readiness
# check raises. The `client` object is kept so a later cell can reconnect.
try:
    print(client.is_ready())
finally:
    client.close()

True


In [3]:
from IPython.display import display, Markdown

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face embedding model identifier, named so it is easy to swap.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Folder holding the source PDFs. Set the RAG_DATA_DIR environment variable to
# point at another location; the original local path is kept as the fallback.
data = os.getenv(
    "RAG_DATA_DIR",
    "C:/Users/csing/VSCode/Projects/RAG_LangChain_MistralAI_Weaviate/data",
)
documents = []

# sorted() gives a reproducible load order (os.listdir order is arbitrary), and
# the extension check is case-insensitive so "REPORT.PDF" is not skipped.
for pdf_file in sorted(os.listdir(data)):
    if pdf_file.lower().endswith(".pdf"):
        pdf_path = os.path.join(data, pdf_file)
        loader = PyPDFLoader(pdf_path, extract_images=True)
        documents.extend(loader.load())

# Stop here with a clear message rather than failing later on an empty list.
if not documents:
    raise FileNotFoundError(f"No PDF content could be loaded from {data!r}")

In [None]:
# Peek at the text of the first loaded document.
first_document = documents[0]
print(first_document.page_content)

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunking parameters, named so they are easy to tune.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
# Break every loaded document into overlapping chunks.
split_docs = text_splitter.split_documents(documents)

In [7]:
# Clean document metadata for Weaviate compatibility
import re

def clean_metadata_keys(metadata):
    """Return a copy of ``metadata`` with keys sanitized into safe identifiers.

    Each key is converted to ``str``, every character outside ``[A-Za-z0-9_]``
    is replaced by ``_``, a leading ``_`` is added when the result does not
    start with a letter or underscore (this also covers an empty key), and the
    result is truncated to 230 characters. Intended to satisfy Weaviate's
    property-name rules (NOTE(review): confirm the exact length limit against
    the Weaviate version in use).

    When two different input keys sanitize to the same name, later ones get a
    numeric suffix (``_1``, ``_2``, ...) so no value is silently overwritten.

    Args:
        metadata: Mapping of metadata keys to values. Values are not modified.

    Returns:
        A new dict with sanitized, unique keys, in the input's iteration order.
    """
    max_len = 230
    cleaned = {}
    for key, value in metadata.items():
        # str() lets non-string keys (e.g. ints) be sanitized instead of crashing re.sub.
        clean_key = re.sub(r'[^A-Za-z0-9_]', '_', str(key))
        if not re.match(r'^[A-Za-z_]', clean_key):
            clean_key = '_' + clean_key
        clean_key = clean_key[:max_len]

        # Disambiguate collisions while keeping the key within the length limit.
        unique_key = clean_key
        counter = 1
        while unique_key in cleaned:
            suffix = f"_{counter}"
            unique_key = clean_key[:max_len - len(suffix)] + suffix
            counter += 1
        cleaned[unique_key] = value
    return cleaned

# Replace each chunk's metadata dict with its sanitized-key version
for doc in split_docs:
    sanitized_metadata = clean_metadata_keys(doc.metadata)
    doc.metadata = sanitized_metadata

# Report how many chunks were processed
print(f"Cleaned metadata for {len(split_docs)} documents")

Cleaned metadata for 382 documents


In [8]:
# Re-open the connection on the existing `client` object; the setup cell closed
# it (client.close()) right after its readiness check.
client.connect()

In [9]:
from langchain_weaviate.vectorstores import WeaviateVectorStore
# Embed the chunks and store them in Weaviate through the open client.
vector_db = WeaviateVectorStore.from_documents(split_docs, embeddings, client=client)

# Confirmation message with the number of chunks passed in.
print(f"Successfully created vector store with {len(split_docs)} documents")

Successfully created vector store with 382 documents


In [None]:
# Sanity-check retrieval: fetch the three closest chunks for a sample query.
query = "What is the main topic of the document?"
results = vector_db.similarity_search(query, k=3)

separator = "-" * 50
for i, doc in enumerate(results):
    print(f"Result {i+1}: {doc.page_content}")
    print(separator)

In [10]:
from langchain.prompts import ChatPromptTemplate

# Prompt text for the answer-generation step. It has two placeholders,
# {question} and {context}, which must be supplied when the prompt is formatted.
template = """
You are an expert in question answering tasks.
Use the following piece of retrieved context to answer the question.
If the context does not provide enough information, say "I don't know".
Use the context to answer the question as accurately as possible.
Use ten sentences maximum to answer the question.

Question: {question}
Context: {context}
Answer:
"""
# Build the chat prompt object from the template string above.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_huggingface import HuggingFacePipeline

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
print(f"Loading model {model_name}...")

# Generation settings go in `pipeline_kwargs` (forwarded to the transformers
# pipeline); `model_kwargs` is for model-loading options instead.
# - max_new_tokens caps only the generated answer. The previous `max_length`
#   also counted the prompt tokens, which a RAG prompt can use up on its own.
# - do_sample=True is needed for `temperature` to have any effect.
llm = HuggingFacePipeline.from_model_id(
    model_id=model_name,
    task="text-generation",
    pipeline_kwargs={
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.1,
    },
)

print("Model loaded successfully.")

In [1]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [None]:
def _format_docs(docs):
    """Join the page text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline. The input must be the question as a plain string: it is sent to
# the retriever (to fetch context) and passed through unchanged as `question`.
# The retrieved documents are reduced to their text so the prompt receives
# readable context rather than the repr of a list of Document objects.
rag_chain = (
    {
        "context": vector_db.as_retriever() | _format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# The chain passes its raw input to both the retriever and the `question` slot,
# so it must be given the question string itself, not a {"question": ...} dict.
rag_chain.invoke("What is the main topic of the document?")