In [75]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.globals import set_debug

# Variables

In [76]:
# Load key/value pairs from a .env file into the process environment.
# override=True is passed so that values from the file replace variables that
# are already set in the environment (python-dotenv semantics).
load_dotenv(override=True)

True

In [77]:
# Runtime configuration, read from the environment (see the load_dotenv cell).
api_key = os.getenv("OPENAI_API_KEY")
source_pdf = os.getenv("SOURCE_PDF")

# Fail fast with a readable message. Without this check a missing SOURCE_PDF
# silently becomes `files = [None]` and only blows up later inside the PDF
# loader, and a missing key only surfaces when the first OpenAI call is made.
missing_vars = [
    name
    for name, value in (("OPENAI_API_KEY", api_key), ("SOURCE_PDF", source_pdf))
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_vars)
        + ". Set them in the environment or in the .env file."
    )

# PDF files to index (a list so more sources can be added later).
files = [source_pdf]
# Accumulator for the pages loaded from `files`.
pages = []
# Directory where the Chroma vector store is persisted; may be None, in which
# case it is passed through unchanged as `persist_directory`.
store = os.getenv("SOURCE_STORE")

# Files

In [78]:
# Start from an empty list every time this cell runs. Extending a list that was
# created in a different cell makes the cell non-idempotent: re-running it
# would append every page a second time and duplicate chunks downstream.
pages = []
for file in files:
    loader = PyPDFLoader(file)
    # loader.load() returns the documents extracted from the PDF; collect them all.
    pages.extend(loader.load())

# Recursive TextSplit

In [79]:
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
# Separators handed to the splitter, listed from coarsest to finest
# (paragraph break, line break, sentence end, space, empty string).
SPLIT_SEPARATORS = ["\n\n", "\n", ".", " ", ""]

recur_split = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

In [80]:
# Split the loaded PDF pages into chunks using the splitter configured above;
# `documents` is what gets embedded and indexed further down.
documents = recur_split.split_documents(pages)

# Embedding

In [82]:
# Embedding client used to vectorise the chunks. The API key is passed
# explicitly, exactly as it is for the chat model, so both OpenAI clients use
# the same credential instead of one of them relying on the ambient environment.
embeddings_model = OpenAIEmbeddings(api_key=api_key)

# Vector Store

In [83]:
# Reuse the persisted vector store when it already contains data.
# Chroma.from_documents adds to whatever collection already exists at
# `persist_directory`, so calling it unconditionally re-embeds the whole PDF on
# every run and piles duplicate chunks into the store, which in turn makes the
# retriever return repeated passages.
# NOTE: delete the `store` directory to force a rebuild after the source PDF or
# the splitter settings change; an existing store is otherwise used as-is.
existing_store = Chroma(
    persist_directory=store,
    embedding_function=embeddings_model,
)
if existing_store.get(limit=1)["ids"]:
    # At least one chunk is already indexed: skip the (paid) embedding step.
    vectordb = existing_store
else:
    # Empty or brand-new store: embed and index every chunk.
    vectordb = Chroma.from_documents(
        documents=documents,
        embedding=embeddings_model,
        persist_directory=store
    )

# LLM

In [84]:
# Chat model that answers the questions; temperature=0 keeps sampling
# randomness to a minimum.
llm = ChatOpenAI(
    api_key=api_key,
    model="gpt-4o-mini",
    temperature=0,
)

# Chain

In [85]:
# Retriever over the vector store, using the 'mmr' search type
# (maximal marginal relevance in LangChain).
mmr_retriever = vectordb.as_retriever(search_type='mmr')

# Question-answering chain: retrieve chunks, then let the LLM answer from them.
chat_chain = RetrievalQA.from_chain_type(retriever=mmr_retriever, llm=llm)

# Question

In [86]:
# Question sent to the QA chain (kept in Portuguese, matching the source PDF).
question: str = "O que é Valorizar ideias?"

# Answer

In [87]:
# RetrievalQA reads the question from the "query" key of its input mapping.
qa_inputs = {"query": question}
# Last expression of the cell, so the returned dict is displayed as the output.
chat_chain.invoke(qa_inputs)

{'query': 'O que é Valorizar ideias?',
 'result': 'Valorizar ideias é fazer o máximo com as ideias e as oportunidades, avaliando o valor que elas possuem em termos sociais, culturais e econômicos. Isso envolve reconhecer o potencial de criação de valor de uma ideia e identificar formas adequadas de extrair o máximo dela.'}

# Prompt