In [None]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.chains.retrieval_qa.base import RetrievalQA


In [None]:
# Load variables from the .env file into the process environment;
# override=True lets values from .env replace ones that are already set.
load_dotenv(override=True)
# Display only whether the key is configured. Echoing the key itself as the
# cell's last expression would store the secret in the notebook's output.
bool(os.environ.get("OPENAI_API_KEY"))

In [None]:
# PDF files to ingest. A raw string is used so the Windows backslashes are
# taken literally instead of being parsed as (invalid) escape sequences.
# NOTE(review): this is an absolute, machine-specific path — consider making
# it relative to a configurable data directory.
files = [r"D:\Visual Code\RAG\pdf\entrecomp.pdf"]
# Accumulator for the documents loaded from `files`.
pages = []

In [None]:
# Load every PDF listed in `files` and collect what each loader returns
# into `pages`.
# `pages` is rebuilt from scratch here so that re-running this cell does not
# append the same loaded documents a second time.
pages = []
for file in files:
    loader = PyPDFLoader(file)
    pages.extend(loader.load())

In [None]:
# Settings for the recursive text splitter: chunk size, overlap between
# consecutive chunks, and the list of separators handed to the splitter.
splitter_settings = dict(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " ", ""],
)
recur_split = RecursiveCharacterTextSplitter(**splitter_settings)

In [None]:
# Split the loaded pages into chunks using the splitter configured above.
documents = recur_split.split_documents(documents=pages)

In [None]:
# Normalise each chunk's metadata: reduce 'source' to the bare file name and
# tag the chunk with its position in `documents` as 'doc_id'.
# A plain `.replace('pdf/', '')` only matches forward-slash paths, so it does
# nothing for backslash-separated Windows paths such as the one in `files`.
# NOTE(review): assumes the intent is to keep only the file name — confirm.
for i, doc in enumerate(documents):
    # Convert backslashes first so the file name is extracted on any OS.
    source_path = doc.metadata['source'].replace('\\', '/')
    doc.metadata['source'] = os.path.basename(source_path)
    doc.metadata['doc_id'] = i

In [None]:
# Location of the vector store on disk, given relative to the working directory.
directory = "../vector/chroma_retrival_bd"

In [None]:
# Embedding model used to vectorise the chunks. It is created with the
# library defaults: no model name or credentials are passed explicitly here.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings_model = OpenAIEmbeddings()

In [None]:
# Build the Chroma vector store from the chunks, embedding them with
# `embeddings_model` and persisting under `directory`.
# Deterministic ids are supplied so that re-running this cell against an
# already-populated `directory` overwrites the same entries instead of adding
# a second copy of every chunk (without explicit ids, fresh ones are generated
# on every run).
chunk_ids = [f"chunk-{position}" for position in range(len(documents))]
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings_model,
    ids=chunk_ids,
    persist_directory=directory,
)

In [None]:
# Chat model that will generate the answers; the model name and a
# temperature of 0 are set explicitly.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
)

In [None]:
# Retriever over the vector store, requesting the 'mmr' search type.
mmr_retriever = vectordb.as_retriever(search_type='mmr')

# Question-answering chain that pairs the chat model with that retriever.
chat_chain = RetrievalQA.from_chain_type(llm=llm, retriever=mmr_retriever)

In [None]:
# Question sent to the chain (Portuguese: "What is ethical and sustainable thinking?").
question = "O que é Pensamento ético e sustentável?"

In [None]:
# Run the QA chain with the question passed under the "query" key.
# As the last expression of the cell, the returned value is displayed as the cell output.
chat_chain.invoke({"query": question})

# Engenharia de Prompt