In [None]:
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import OnlinePDFLoader

In [None]:
import os

# Imported here because this cell runs before the notebook's later import cell;
# without it a fresh-kernel "Run All" fails with NameError on the splitter.
from langchain_text_splitters import RecursiveCharacterTextSplitter

print("Loading data...")
pdf_folder_path = "biblioteca/"
folder_entries = os.listdir(pdf_folder_path)
print(folder_entries)

# Load multiple files.
# Only *.pdf entries are handed to the PDF loader (the folder may also contain
# other files or sub-directories), and they are sorted so the resulting document
# order does not depend on the filesystem's listing order.
pdf_file_names = sorted(fn for fn in folder_entries if fn.lower().endswith(".pdf"))
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_file_names]

print(loaders)

# The splitter configuration is the same for every file, so build it once.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    length_function=len,
)

# Accumulates the split chunks of every PDF in the folder.
all_documents = []

for loader in loaders:
    print("Loading raw document..." + loader.file_path)
    raw_documents = loader.load()

    print("Splitting text...")
    documents = text_splitter.split_documents(raw_documents)
    all_documents.extend(documents)

In [None]:
# Path of the single PDF inspected in the following cells.
local_path = 'biblioteca/cartilha_lgpd_2021.pdf'

# Bind `data` up front so later cells that read it do not raise NameError
# when no path is configured and the else-branch below is taken.
data = []

if local_path:
    loader = UnstructuredPDFLoader(file_path=local_path)
    data = loader.load()
else:
    print('Upload um arquivo')

In [None]:
# Show the type of the object returned by loader.load().
type(data)  # NOTE(review): the earlier note here suggests data[0].page_content was also inspected — confirm

### Vector Embeddings

In [None]:
# Shell out to the Ollama CLI; presumably a check that the model used below is available locally — confirm.
!ollama list

In [None]:
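# One-time setup: uncomment to install the packages used by the cells below.
# %pip (rather than !pip) installs into the environment of the running kernel.
#%pip install -q chromadb
#%pip install -q langchain-text-splitters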

In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [None]:
# Split the single-PDF documents held in `data` into chunks of up to 7500
# characters with a 100-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500,
    chunk_overlap=100,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Ollama model tag; it is used here for embeddings and reused later for the chat model.
model_name = 'cnmoro/mistral_7b_portuguese:q2_K'

ollama_embeddings = OllamaEmbeddings(model=model_name, show_progress=True)

# Index every chunk collected from the PDF folder (`all_documents`), not the
# single-file `chunks`, in a Chroma collection.
vector_db = Chroma.from_documents(
    all_documents,
    embedding=ollama_embeddings,
    collection_name='local-rag',
)

### Retrieval

In [None]:
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# The chat model uses the same Ollama tag that was used for the embeddings.
local_model = model_name
llm = ChatOllama(
    model=local_model,
)

In [None]:
# Prompt that asks the LLM for five line-separated rewordings of the user's
# question; it is passed to MultiQueryRetriever further down.
# The prompt text is what the model actually reads, so the misspelled words in
# it ("suerar", "sparadas", "originaç") are corrected here.
QUERY_PROMPT = PromptTemplate(
    input_variables=['question'],
    template="""
    Você é um modelo de linguagem de IA. 
    Sua tarefa é gerar cinco diferentes versões 
    de uma questão dada pelo usuário para recuperar documentos relevantes de um vector database.
    Ao gerar múltiplas perspectivas da questão do usuário, seu objetivo é ajudar o usuário a 
    superar algumas limitações da busca baseada em similaridade. Forneça essas perguntas alternativas separadas por linhas.
    Pergunta original: {question}
    """
)

In [None]:
# Build the multi-query retriever from the vector store's retriever, the chat
# model and the custom query-rewriting prompt.
base_retriever = vector_db.as_retriever()
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)

# Answer prompt. It must contain both {context} and {question}: the chain feeds
# both keys into the prompt, and without a {question} placeholder the user's
# question is silently dropped and the model only ever sees the retrieved context.
template = """Responda a questão baseado somente no seguinte contexto:
{context}

Pergunta: {question}
"""

In [None]:
# Turn the answer template string into a chat prompt object.
prompt = ChatPromptTemplate.from_template(
    template=template,
)

In [None]:
# Inputs for the prompt: "context" comes from the retriever, while "question"
# is the chain's input passed through unchanged.
prompt_inputs = {"context": retriever, "question": RunnablePassthrough()}

# Pipeline: gather inputs -> fill prompt -> call the chat model -> plain string.
chain = prompt_inputs | prompt | llm | StrOutputParser()

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)

In [None]:
# Read a question from the user and run it through the chain; the answer is the cell output.
user_question = input("")
chain.invoke(user_question)