In [28]:
import os

import pymupdf
from dotenv import load_dotenv
from huggingface_hub import notebook_login
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
# Load variables from a local .env file into the process environment; the
# `True` shown as this cell's output is the call's return value.
# NOTE(review): presumably this supplies the Hugging Face credentials needed
# to download the Llama tokenizer below — confirm.
load_dotenv()

True

In [13]:
# Extract plain text from every page of the source PDF.
# The context manager closes the document as soon as extraction finishes,
# instead of leaving the file handle open for the lifetime of the kernel.
with pymupdf.open("../data/DCEE Actions Master List_090920_final.pdf") as doc:
    # One plain-text string per page, in page order.
    text_list = [page.get_text() for page in doc]

In [14]:
# Concatenate the per-page strings into one document-wide string, inserting a
# single space at each page boundary so words on adjacent pages do not fuse.
text = " ".join(text_list)

In [22]:
# Tokenizer used to measure chunk length in tokens rather than characters.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

# `from_huggingface_tokenizer` is a classmethod that builds a brand-new
# splitter from the keyword arguments it receives. Calling it on an already
# configured instance silently discards that instance's settings, so the chunk
# parameters must be passed to the classmethod itself. No `length_function` is
# given here because the classmethod installs its own tokenizer-based one.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=250,  # maximum chunk length, counted in tokens
    chunk_overlap=25,  # tokens shared between consecutive chunks
    is_separator_regex=False,
)

In [23]:
# Split the full text into chunked Document objects and preview the first few.
source_texts = [text]
docs = text_splitter.create_documents(source_texts)
docs[:5]

[Document(metadata={}, page_content='Data Center Master List of Energy \nEfficiency Measures \nVersion 2.0 \n \n \nSeptember 2020 \n \n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n(This page intentionally left blank) \n \n DOCUMENT TITLE \niii \nDisclaimer  \nThis document was prepared as an account of work sponsored by the United States Government. While this \ndocument is believed to contain correct information, neither the United States Government nor any agency \nthereof, nor The Regents of the University of California, nor any of their employees, makes any warranty, \nexpress or implied, or assumes any legal responsibility for the accuracy, completeness, or usefulness of any \ninformation, apparatus, product, or process disclosed, or represents that its use would not infringe privately \nowned rights. Reference herein to any specific commercial product, process, or service by its trade name, \ntrademark, manufacturer, or otherwise, does not necessarily constitute or imply its end

In [24]:
# Sentence-embedding model used to vectorise each chunk for similarity search.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [25]:
# Embed every chunk and index it in a Chroma vector store.
vectorstore = Chroma.from_documents(docs, embeddings)

In [26]:
# Expose the vector store through the retriever interface so it can be used
# as a step in the RAG chain. No search options are passed, so the library's
# default retrieval settings apply.
retriever = vectorstore.as_retriever()

In [38]:
# Prompt template: the retrieved context is substituted for {context} and the
# user's question for {question}. ChatPromptTemplate is imported in the
# notebook's import cell from `langchain_core.prompts`, the package the rest
# of this notebook already depends on.
template = """
Answer the question based only on the following context. Please reference the context in your answer.
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model served through Ollama.
model = ChatOllama(model="llama3.2:1b")

In [39]:
def format_docs(documents):
    """Join the text of retrieved documents into one prompt-ready string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (here, the Document objects returned by ``retriever``).

    Returns:
        The ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# Answer-generation step. The retrieved documents are flattened to plain text
# before they reach the prompt; passing them through unchanged would render
# the Python repr of a list of Document objects into the {context} slot.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | model
    | StrOutputParser()
)

# Retrieve context and forward the question in parallel, then attach the
# generated answer. The formatting above happens only inside
# `rag_chain_from_docs`, so `result["context"]` still holds the original
# Document objects for source attribution.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

result = rag_chain_with_source.invoke("What are some ways I can reduce my data center energy usage?")

In [40]:
# Persist the generated answer. The encoding is explicit because model output
# may contain non-ASCII characters, and the platform default encoding is not
# guaranteed to be able to represent them.
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(result["answer"])