In [22]:
import boto3
import os
from langchain.document_loaders import S3FileLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import shutil
import re
import chromadb
from langchain.chains import RetrievalQA

from langchain.llms.bedrock import Bedrock


from langchain.embeddings import BedrockEmbeddings 


# --- Bedrock text-generation model -------------------------------------------
# Inference parameters passed to the model through `model_kwargs`.
model_kwargs = dict(
    max_tokens_to_sample=1024,
    temperature=1,
    top_p=0.9,
    stop_sequences=["Human:"],
)

default_model_id = "anthropic.claude-instant-v1"
bedrock_base_kwargs = {"model_id": default_model_id, "model_kwargs": model_kwargs}

llm = Bedrock(**bedrock_base_kwargs)

# --- Embeddings model and S3 source location ---------------------------------
bedrock_embeddings = BedrockEmbeddings(model_id="amazon.titan-embed-text-v1")
s3client = boto3.client("s3")
s3_bucket = "ac-genai-streamlitbucket2fe9c216-4t8poszaf1to"
s3_path = "upload_files"
filename = "Texto-aprobado-Consejo-Constitucional_06.10.23.pdf"
filename2 = "Propuesta-Nueva-Constitucion.pdf"



# Shared splitter used by every loading helper in this notebook. Separators are
# tried in list order: blank line, newline, sentence end, space, then between
# characters.
text_splitter = RecursiveCharacterTextSplitter(
    # The sentence separator is the plain string ". ". The previous "\. " is an
    # invalid escape sequence in a normal Python string and, taken literally,
    # means backslash-dot-space, which ordinary text does not contain.
    separators=["\n\n", "\n", ". ", " ", ""],
    chunk_size=3000,
    chunk_overlap=300,
    length_function=len,
)


def load_and_split_pdf(file_path):
    """Read the PDF at ``file_path`` and return it as a list of chunks.

    The loader is handed the module-level ``text_splitter`` to do the splitting.
    """
    return PyPDFLoader(file_path).load_and_split(text_splitter)


def produce_docs(s3_bucket, s3_keys):
    """Load each S3 object and split it into chunks with ``text_splitter``.

    A one-line progress report (key, character count, chunk count) is printed
    for every key.

    Args:
        s3_bucket: Name of the bucket that holds the files.
        s3_keys: Iterable of object keys to load from that bucket.

    Returns:
        One flat list with the chunks of every key, in the order the keys
        were given.
    """
    all_docs = []
    for s3_key in s3_keys:
        # The trailing space keeps the key and the character count apart on
        # the progress line.
        print(f"loading: {s3_key} ", end="")
        pages = S3FileLoader(s3_bucket, s3_key).load()
        # Count characters over every returned page: an empty result reports 0
        # instead of failing on pages[0], and extra pages are not ignored.
        total_chars = sum(len(page.page_content) for page in pages)
        print(total_chars, "chars ", end="")
        docs = text_splitter.split_documents(pages)
        print(len(docs), "chunks")

        all_docs.extend(docs)
    return all_docs

def create_vectordb(name, docs):
    """Build a Chroma store for ``docs`` under ``./chroma/<sanitized name>``.

    Anything already present at that path is deleted first, so the store is
    rebuilt from scratch on every call.

    Args:
        name: Label for the store. It is lower-cased and every character
            outside ``a-z``, ``0-9`` and ``_`` is replaced by ``_`` to form
            the directory name.
        docs: Documents to embed with ``bedrock_embeddings`` and store.

    Returns:
        The Chroma vector store created from ``docs``.
    """
    cleaned = re.sub(r'[^a-z0-9_]', '_', name.lower())
    persist_directory = f"./chroma/{cleaned}"

    if os.path.exists(persist_directory):
        shutil.rmtree(persist_directory)

    vectordb = Chroma.from_documents(
        documents=docs,
        embedding=bedrock_embeddings,
        # Reuse the variable from the cleanup above so the deleted path and
        # the created path cannot drift apart.
        persist_directory=persist_directory,
    )
    # Sanity check: how many entries ended up in the collection.
    print(vectordb._collection.count())

    return vectordb

In [2]:
# Load the local PDF and split it into chunks with the shared text_splitter.
pdf_docs = load_and_split_pdf("./Propuesta-Nueva-Constitucion.pdf")

In [3]:
# Number of chunks produced from the PDF.
len(pdf_docs)

182

In [6]:
%time
vctor_store = Chroma(embedding_function=bedrock_embeddings)
vctor_store.add_documents( documents=pdf_docs)
vctor_store._collection.count()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.72 µs


364

In [4]:
# Batch size used to split pdf_docs into consecutive groups.
n = 18
# Slice pdf_docs every n items; the final batch holds whatever is left over.
chunks = [pdf_docs[start:start + n] for start in range(0, len(pdf_docs), n)]
# Show: total chunks, number of batches, size of first batch, size of last batch.
(len(pdf_docs), len(chunks), len(chunks[0]), len(chunks[-1]))

(182, 11, 18, 2)

In [5]:
%time
vctor_store = Chroma(embedding_function=bedrock_embeddings)
for chunk in chunks:
    vctor_store.add_documents( documents=chunk )

vctor_store._collection.count()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 10 µs


182

In [10]:
# Async counterpart of add_documents, awaited directly at the top level of the cell.
res = await vctor_store.aadd_documents( documents=pdf_docs )

In [12]:
# Entry count of the store's underlying collection (read through a private attribute).
vctor_store._collection.count()

182

In [16]:
async def add_docs(vectordb, docs):
    """Add ``docs`` to ``vectordb`` through its async ``aadd_documents`` method.

    Returns nothing; the value produced by ``aadd_documents`` is discarded.
    """
    pending = vectordb.aadd_documents(documents=docs)
    await pending

add_docs(vctor_store, pdf_docs)

<coroutine object add_docs at 0x1224d3ac0>

In [42]:
# Entry count of the store's underlying collection (read through a private attribute).
vctor_store._collection.count()

0

In [2]:
# Load and chunk both PDFs stored under the s3_path prefix of the bucket.
docs = produce_docs(
    s3_bucket,
    [f"{s3_path}/{pdf_name}" for pdf_name in (filename, filename2)],
)
# Total number of chunks across both files.
len(docs)

loading: upload_files/Texto-aprobado-Consejo-Constitucional_06.10.23.pdf

  from .autonotebook import tqdm as notebook_tqdm


321913 chars 119 chunks
loading: upload_files/Propuesta-Nueva-Constitucion.pdf328036 chars 122 chunks


241

In [3]:
# Build the vector store from the S3 chunks; this call uses the (name, docs) signature of create_vectordb.
vdb = create_vectordb("constitución", docs)

241


In [6]:
# Entry count of the store's underlying collection (read through a private attribute).
vdb._collection.count()

241

In [9]:
# Chroma persistent client created with the library defaults (no arguments passed).
persistent_client = chromadb.PersistentClient()


In [13]:

def create_vectordb(persistent_client, name):
    """Wrap the collection ``name`` of ``persistent_client`` in a Chroma store.

    Any directory at ``./chroma/<name>`` is removed first, then the store's
    current entry count is printed.

    NOTE(review): this redefines ``create_vectordb`` with a different
    signature from the earlier ``(name, docs)`` version in this notebook;
    whichever cell ran last is the one in effect.
    NOTE(review): the removed path is built from ``name`` alone. Confirm it
    is where ``persistent_client`` actually keeps this collection.

    Returns:
        The Chroma vector store bound to that collection.
    """
    stale_dir = f"./chroma/{name}"
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

    store = Chroma(
        client=persistent_client,
        embedding_function=bedrock_embeddings,
        collection_name=name,
    )
    print(store._collection.count())
    return store


# Rebinds `vdb` to the "prueba" collection; the recorded output shows it holds 0 entries.
vdb = create_vectordb(persistent_client, "prueba")

0


In [16]:
# Fetch the same collection directly from the client by name.
collection = persistent_client.get_collection(name= "prueba")

In [23]:
# Retriever over `vdb` using MMR search, returning 5 documents per query.
# NOTE(review): confirm `vdb` is the populated store at this point. The answer
# recorded further down suggests the retrieval may have come back empty.
retriever = vdb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5},
)
# Question-answering chain that feeds the retrieved documents to the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    verbose=True,
    retriever=retriever,
)

In [25]:
# Display the chain's configuration (prompt template, LLM settings, retriever).
qa_chain

RetrievalQA(verbose=True, combine_documents_chain=StuffDocumentsChain(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"), llm=Bedrock(client=<botocore.client.BedrockRuntime object at 0x12b7b5490>, model_id='anthropic.claude-instant-v1', model_kwargs={'max_tokens_to_sample': 1024, 'temperature': 1, 'top_p': 0.9, 'stop_sequences': ['Human:']})), document_variable_name='context'), retriever=VectorStoreRetriever(tags=['Chroma', 'BedrockEmbeddings'], vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x126cf1090>, search_type='mmr', search_kwargs={'k': 5}))

In [24]:
# Ask a question through the retrieval QA chain.
qa_chain.run({"query": "como funciona el congreso?"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


' No sé la respuesta a esta pregunta.'

In [46]:
# Second PDF: build its S3 key and a loader for it.
filename2 = "Propuesta-Nueva-Constitucion.pdf"
destination_key2 = f"{s3_path}/{filename2}"
print(destination_key2)

loader2 = S3FileLoader(s3_bucket, destination_key2)

upload_files/Propuesta-Nueva-Constitucion.pdf


In [47]:
# Download and parse the second PDF from S3.
pages2 = loader2.load()

In [48]:
# Character count of the first returned document, and how many documents the loader returned.
len(pages2[0].page_content), len(pages2)

(328036, 1)

In [49]:
# Split the loaded document into chunks with the shared text_splitter.
docs2 = text_splitter.split_documents(pages2)
# Number of chunks produced.
len(docs2)

122

In [50]:
# Add the second PDF's chunks to the store; the returned value is the list of assigned ids.
# NOTE(review): `vectordb` is not assigned at notebook level in any visible cell
# (it is only a local name inside create_vectordb). Presumably it is left over
# from a deleted or earlier cell; confirm before relying on Restart & Run All.
vectordb.add_documents(docs2)

['aaabc6f6-821a-11ee-b28b-88e9fe797c27',
 'aaabc8f4-821a-11ee-b28b-88e9fe797c27',
 'aaabc99e-821a-11ee-b28b-88e9fe797c27',
 'aaabc9f8-821a-11ee-b28b-88e9fe797c27',
 'aaabca48-821a-11ee-b28b-88e9fe797c27',
 'aaabca98-821a-11ee-b28b-88e9fe797c27',
 'aaabcaf2-821a-11ee-b28b-88e9fe797c27',
 'aaabcb4c-821a-11ee-b28b-88e9fe797c27',
 'aaabcb9c-821a-11ee-b28b-88e9fe797c27',
 'aaabcbec-821a-11ee-b28b-88e9fe797c27',
 'aaabcc32-821a-11ee-b28b-88e9fe797c27',
 'aaabcc78-821a-11ee-b28b-88e9fe797c27',
 'aaabccb4-821a-11ee-b28b-88e9fe797c27',
 'aaabccfa-821a-11ee-b28b-88e9fe797c27',
 'aaabcd40-821a-11ee-b28b-88e9fe797c27',
 'aaabcd86-821a-11ee-b28b-88e9fe797c27',
 'aaabcdcc-821a-11ee-b28b-88e9fe797c27',
 'aaabce12-821a-11ee-b28b-88e9fe797c27',
 'aaabce4e-821a-11ee-b28b-88e9fe797c27',
 'aaabce8a-821a-11ee-b28b-88e9fe797c27',
 'aaabcec6-821a-11ee-b28b-88e9fe797c27',
 'aaabcf0c-821a-11ee-b28b-88e9fe797c27',
 'aaabcf48-821a-11ee-b28b-88e9fe797c27',
 'aaabd038-821a-11ee-b28b-88e9fe797c27',
 'aaabd088-821a-

In [52]:
# Sanity check: the per-file chunk counts (119 and 122) should add up to the store's entry count.
119+122

241

In [53]:
# Maximal-marginal-relevance search for the question, asking for 8 documents.
# NOTE(review): relies on the notebook-level `vectordb`, which no visible cell assigns.
docs_mmr = vectordb.max_marginal_relevance_search("Como funciona el congreso?",k=8)

In [55]:
# Print each hit's metadata to see which source file it came from.
for d in docs_mmr:
    print(d.metadata)

{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Texto-aprobado-Consejo-Constitucional_06.10.23.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Texto-aprobado-Consejo-Constitucional_06.10.23.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Propuesta-Nueva-Constitucion.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Propuesta-Nueva-Constitucion.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Propuesta-Nueva-Constitucion.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Propuesta-Nueva-Constitucion.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Texto-aprobado-Consejo-Constitucional_06.10.23.pdf'}
{'source': 's3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/upload_files/Propuesta-Nueva-Constitucion.pdf'}
