#### Crear una BD vectorial con ChromaDB
Propósito del ejemplo  <br>
Mostrar cómo se crea un espacio vectorial empleando ChromaDB y cómo se opera con él. Adicionalmente se revisa cómo cargar varios documentos.



In [None]:
# Load several PDF documents into a single list of LangChain documents.
# PyPDFLoader is imported from langchain_community, the same package the
# other loaders in this notebook use; the legacy `langchain.document_loaders`
# path is deprecated.
from langchain_community.document_loaders import PyPDFLoader

# One loader per PDF file.
loaders = [
    PyPDFLoader("./directorio/Estructuras.pdf"),
    PyPDFLoader("./directorio/Modulo2.pdf"),
    PyPDFLoader("./directorio/Modulo3.pdf"),
]
# Accumulate whatever every loader returns into one flat list.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [None]:
# Split the loaded documents into chunks.
# Imported from langchain_text_splitters, the same package the FAISS example
# in this notebook uses, instead of the legacy `langchain.text_splitter` path.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
)
# Partition the documents gathered in `docs`.
splits = text_splitter.split_documents(docs)
# Number of chunks produced (the original author recorded 163 for the 3 PDFs).
print(len(splits))

Hasta aquí se llegó con la partición.
#### Incrustación y almacenamiento

In [None]:
# Install the Chroma database package (uncomment the line below to run it)
#!pip install chromadb

In [None]:
# Import the Chroma vector store wrapper from langchain_community (the same
# package the FAISS example imports from); the legacy `langchain.vectorstores`
# path is deprecated.
from langchain_community.vectorstores import Chroma
# Import the embedding class.
from langchain_huggingface import HuggingFaceEmbeddings

# Instantiate the embedding model used to vectorize every chunk.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Build the vector store from the chunks. No persist_directory is passed,
# so nothing is explicitly written to disk by this call.
vectordb = Chroma.from_documents(
    documents=splits,  # chunks produced by the text splitter
    embedding=embedding,  # embedding model applied to each chunk
)
# Print how many documents (chunks) the store now holds.
print(len(vectordb.get()['documents']))

##### Agregar otro documento

In [4]:
# Load one more PDF and add it to the vector store that already exists.
loader = PyPDFLoader("./directorio/Constitucion1853.pdf")
docs = loader.load()
# Reuse the same splitter so the new chunks match the ones already stored.
splits = text_splitter.split_documents(docs)
# Add the new chunks to the existing store explicitly. Calling
# Chroma.from_documents a second time only appeared to "recover" the database
# because both calls ended up in the same default collection.
vectordb.add_documents(splits)
# Keep the original variable name available for any later cell that uses it.
bd_recuperada = vectordb
len(bd_recuperada.get()['documents'])

213

#### FAISS (Facebook AI Similarity Search)
Propósito del ejemplo <br>
Mostrar cómo se crea un espacio vectorial empleando FAISS y cómo se opera con él. 


In [None]:
# Shell escape: install langchain-chroma and faiss-cpu with pip (--quiet reduces the output).
# NOTE(review): `%pip install` may be preferable so the packages land in the kernel's environment — confirm.
! pip install langchain-chroma faiss-cpu --quiet 

In [5]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter
# Same embedding class and package as the Chroma example; the copy in
# langchain_community.embeddings is deprecated and emits a warning.
from langchain_huggingface import HuggingFaceEmbeddings

# Load the plain-text source document.
loader = TextLoader("./directorio/US_Constitution.txt")
documents = loader.load()
# Split with no overlap. The recorded run shows that some chunks come out
# longer than chunk_size; the splitter only warns about them.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
# Embed every chunk and build the FAISS index from the result.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(documents=docs, embedding=embeddings)
# Number of vectors stored in the index.
print(db.index.ntotal)

Created a chunk of size 1035, which is longer than the specified 1000
Created a chunk of size 1144, which is longer than the specified 1000
Created a chunk of size 1576, which is longer than the specified 1000
Created a chunk of size 2353, which is longer than the specified 1000
Created a chunk of size 1670, which is longer than the specified 1000
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


53


In [None]:
# Save the FAISS database to a local directory.
persist_faiss_directory = "directorio/faiss_index"
db.save_local(persist_faiss_directory)
# Load the database back from disk.
# SECURITY: load_local unpickles the stored docstore, and recent
# langchain-community releases refuse to do so unless the caller opts in.
# Enabling the flag is acceptable here only because the index was written by
# this same notebook just above; never enable it for files from an untrusted
# source.
nueva_db = FAISS.load_local(
    persist_faiss_directory,
    embeddings,
    allow_dangerous_deserialization=True,
)