In [None]:
import os
from pathlib import PureWindowsPath

from dotenv import load_dotenv

from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.schema import AttributeInfo

In [None]:
# Load variables from the local .env file; override=True lets values from
# .env replace variables that are already set in the process environment.
load_dotenv(override=True)

# Show only WHETHER the key is configured. Ending the cell with the value
# itself would render the secret into the saved notebook output, where it
# leaks as soon as the notebook is shared or committed.
bool(os.environ.get("OPENAI_API_KEY"))

In [None]:
# Raw string: the backslashes of the Windows path must not be parsed as
# escape sequences. "\V", "\R", "\p" and "\e" are unrecognized escapes, which
# emit a SyntaxWarning on Python 3.12+ and are slated to become an error.
# NOTE(review): this is a machine-specific absolute path; consider a path
# relative to the project so the notebook runs elsewhere.
files = [r"D:\Visual Code\RAG\pdf\entrecomp.pdf"]
pages = []

# Load each PDF and accumulate everything the loader returns into `pages`.
for file in files:
    loader = PyPDFLoader(file)
    pages.extend(loader.load())

# Chunking configuration: chunks of at most 500 characters with a
# 50-character overlap between neighbours. The separators are listed from
# coarsest (paragraph break) to finest (empty string).
recur_split = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=50,
    chunk_size=500,
)

In [None]:
# Split the loaded PDF pages into chunks using the splitter configured above;
# `documents` is what gets embedded and indexed later in the notebook.
documents = recur_split.split_documents(pages)

In [None]:
# Normalize the metadata of every chunk:
#   - `source` is reduced to the bare file name (e.g. "entrecomp.pdf"), which
#     is the value the metadata filters later in the notebook compare against.
#     PureWindowsPath splits on both "\" and "/", so it handles the Windows
#     path given to the loader as well as a relative "pdf/..." path. The
#     earlier `.replace('pdf/', '')` left backslash-separated paths untouched,
#     so those filters could never match.
#   - `doc_id` is the chunk's position in `documents`.
for i, doc in enumerate(documents):
    doc.metadata['source'] = PureWindowsPath(doc.metadata['source']).name
    doc.metadata['doc_id'] = i

In [None]:
# Spot-check one chunk (index 2) to confirm the `source` and `doc_id`
# metadata written by the previous cell.
documents[2].metadata

In [None]:
# Embedding client created with the library defaults.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment
# loaded earlier in the notebook — confirm.
embeddings_model = OpenAIEmbeddings()

In [None]:
# Folder where the Chroma collection is persisted.
directory = '../vector/chroma_retrival_bd'

# Deterministic ids: the chunk's position in `documents`. Without explicit
# ids a random UUID is generated per chunk, so every re-run of this cell
# appends another full copy of the corpus to the persisted collection and
# searches start returning duplicates. With stable ids the store upserts,
# which makes the cell safe to re-run.
# Copies stored under random ids by earlier runs are NOT removed; delete
# `directory` once to start from a clean collection.
chunk_ids = [str(position) for position in range(len(documents))]

# Embed every chunk and write it to the persisted collection.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings_model,
    ids=chunk_ids,
    persist_directory=directory
)

In [None]:
# Query (in Portuguese) reused by every retrieval example below.
question = "O que é Pensamento ético e sustentável?"

# Busca Semântica

In [None]:
# Plain semantic search: the 3 chunks most similar to the question.
docs = vectordb.similarity_search(question, k=3)

# Print each hit's text followed by its metadata on a separator line.
for doc in docs:
    print(doc.page_content, f"========{doc.metadata}\n", sep="\n")

# Max Marginal Relevance

In [None]:
# Max-marginal-relevance search: 10 candidates are fetched (fetch_k) and
# 3 of them (k) are returned.
docs = vectordb.max_marginal_relevance_search(
    question,
    k=3,
    fetch_k=10,
)

# Print each hit's text followed by its metadata on a separator line.
for doc in docs:
    print(doc.page_content, f"========{doc.metadata}\n", sep="\n")

# Filtro

In [None]:
# Semantic search restricted to chunks whose `source` metadata equals the
# given file name.
source_filter = {"source": "entrecomp.pdf"}
docs = vectordb.similarity_search(question, k=3, filter=source_filter)

# Print each hit's text followed by its metadata on a separator line.
for doc in docs:
    print(doc.page_content, f"========{doc.metadata}\n", sep="\n")

In [None]:
# Semantic search with a compound metadata filter: both clauses must hold
# ('$and'): the source is one of the listed files AND the page is one of
# the listed page values.
source_clause = {'source': {'$in': ['entrecomp.pdf']}}
page_clause = {'page': {'$in': [3, 4, 5, 6]}}

docs = vectordb.similarity_search(
    question,
    k=3,
    filter={'$and': [source_clause, page_clause]},
)

# Print each hit's text followed by its metadata on a separator line.
for doc in docs:
    print(doc.page_content, f'==========={doc.metadata}\n\n', sep="\n")

# LLM-Aided Retrieval

In [None]:
# Descriptions of the metadata fields, passed to the self-query retriever
# built later in the notebook.
source_attribute = AttributeInfo(
    name='source',
    description='Nome do PDF de onde o texto original foi retirado. ENTRECOMP',
    type='string',
)
page_attribute = AttributeInfo(
    name='page',
    description='A página do PDF de onde o texto foi extraído. Número da página.',
    type='integer',
)

metadata_info = [source_attribute, page_attribute]

In [None]:
# Short description of what the indexed documents contain.
document_description = 'ENTRECOMP'

# Completion model created with the library defaults.
llm = OpenAI()

# Self-query retriever built from the LLM, the vector store, the content
# description and the metadata field descriptions; verbose=True is forwarded
# unchanged.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_description,
    metadata_field_info=metadata_info,
    verbose=True,
)

In [None]:
# Run the question through the self-query retriever.
# `invoke` is the supported retriever entry point; `get_relevant_documents`
# is deprecated in current LangChain releases.
docs = retriever.invoke(question)

# Print each hit's text followed by its metadata on a separator line.
for doc in docs:
    print(doc.page_content)
    print(f'==========={doc.metadata}\n\n')