In [54]:
import pymupdf
from dotenv import load_dotenv

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableParallel
from langchain.prompts import ChatPromptTemplate

from transformers import AutoTokenizer
from huggingface_hub import notebook_login

# Load variables from a local .env file into the process environment
# (python-dotenv). NOTE(review): presumably this supplies the Hugging Face
# credentials needed to download the Llama tokenizer below - confirm.
load_dotenv()

True

In [55]:
# Extract the plain text of every page of the source PDF.
# The document is opened as a context manager so the underlying file handle
# is released as soon as extraction finishes (it was previously left open).
with pymupdf.open("../data/DCEE Actions Master List_090920_final.pdf") as doc:
    # One plain-text string per page, in page order.
    text_list = [page.get_text() for page in doc]

# Concatenate all the text from the pages
text = " ".join(text_list)

In [49]:
# setup tokenizer and text splitter
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# from_huggingface_tokenizer is a classmethod that builds a *new* splitter
# from its own keyword arguments. Calling it on an already-configured instance
# discards that instance's settings and falls back to the library defaults,
# so the chunking parameters must be passed here. With a tokenizer-backed
# length function, chunk_size and chunk_overlap are measured in tokens.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=250,
    chunk_overlap=25,
    is_separator_regex=False,
)

# create documents
docs = text_splitter.create_documents([text])

In [50]:
# Build the retrieval side of the pipeline: an embedding model, a Chroma
# vector store populated from the chunked documents, and a retriever over it.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)
vectorstore = Chroma.from_documents(docs, embedding=embeddings)
retriever = vectorstore.as_retriever()  # no search options are passed



In [51]:
# Prompt and chat model for the generation step.
# The template exposes two placeholders, {context} and {question}, which the
# chain fills in before the prompt is sent to the model.
template = (
    "\n"
    "Answer the question based only on the following context. Please reference the context in your answer.\n"
    "{context}\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Chat model served through Ollama.
model = ChatOllama(model="llama3.2:1b")

In [52]:
def format_docs(documents):
    """Join the page_content of the retrieved documents into one context string.

    Args:
        documents: iterable of objects exposing a ``page_content`` string
            (the retriever's output).

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# Generation chain: expects {"context": <retrieved documents>, "question": <str>}.
# The context is flattened to plain text before it reaches the prompt;
# previously this step was a no-op, so the prompt received the repr of a list
# of Document objects instead of their text.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | model
    | StrOutputParser()
)

# Retrieve documents and pass the question through in parallel, then attach
# the generated text under "answer". The result keeps the retrieved documents
# under "context", so the sources stay available next to the answer.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

result = rag_chain_with_source.invoke("What are some ways I can reduce my data center energy usage?")

In [53]:
# Persist the generated answer. The encoding is set explicitly because model
# output may contain non-ASCII characters and the platform default encoding
# is not guaranteed to be UTF-8.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(result["answer"])