In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.documents import Document
from langchain_chroma import Chroma

import uuid
from langchain_community.document_loaders import PyPDFLoader


In [None]:
# Splitter shared by the TXT and PDF sections: chunks of up to 510 characters
# (length_function=len) with a 50-character overlap.
# Separators are tried in the order given, so they are listed from coarsest to
# finest: paragraph break, line break, sentence end, space, single character.
# Putting '\n\n' after '\n' makes it unreachable (no piece split on '\n' can
# still contain '\n\n'), and without the ' ' / '' fallbacks a long piece with
# no remaining separator would be emitted above chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=510,
    chunk_overlap=50,
    length_function=len,
    separators=['\n\n', '\n', '.', ' ', '']
)

In [None]:
# Embedding function handed to the vector store; built from OllamaEmbeddings
# with the model tag below.
embedding_model_name = 'mxbai-embed-large:latest'
embeddings_generator = OllamaEmbeddings(model=embedding_model_name)

In [None]:
# Directory passed to Chroma as persist_directory.
chroma_db_path = './database/chromadb'

# Collection 'pokemon', embedded with embeddings_generator.
vector_store = Chroma(
    collection_name='pokemon',
    embedding_function=embeddings_generator,
    persist_directory=chroma_db_path,
)

## Documento TXT

In [None]:
txt_path = './database/docs/megaevolucion.txt'

# Read the whole file in one go. The encoding is given explicitly so accented
# characters decode the same way on every platform instead of depending on the
# OS default. NOTE(review): assumes the file is saved as UTF-8 — confirm.
with open(txt_path, 'r', encoding='utf-8') as file:
    txt_text = file.read()

# Character count of the loaded text, as a quick sanity check.
print(len(txt_text))

In [None]:
# Cut the raw TXT content into chunks and report how many were produced.
text_chunks = text_splitter.split_text(text=txt_text)
text_chunk_count = len(text_chunks)
print(text_chunk_count)

In [None]:
# Metadata attached to every chunk of the TXT document.
txt_metadata = {
    'name': 'megaevolucion',
    'source': 'https://www.wikidex.net/wiki/Megaevoluci%C3%B3n',
    'generation': 6,
    'image_src': 'https://images.wikidexcdn.net/mwuploads/wikidex/thumb/2/20/latest/20130811194207/Mega_Mewtwo_vs_Mega_Lucario.png/300px-Mega_Mewtwo_vs_Mega_Lucario.png'
}

# Build every Document first, then store them with ONE add_documents call
# instead of one call per chunk.
# Ids are derived (uuid5) from the document name and the chunk position, so
# re-running this cell yields the same ids rather than a fresh random set.
# NOTE(review): this relies on the vector store upserting by id so re-runs
# replace entries instead of duplicating them — confirm for the installed
# langchain_chroma version.
txt_documents = [
    Document(
        id=str(uuid.uuid5(uuid.NAMESPACE_URL, f"{txt_metadata['name']}#chunk-{index}")),
        page_content=chunk,
        metadata=dict(txt_metadata),  # one dict per document, not a shared one
    )
    for index, chunk in enumerate(text_chunks)
]

# Nothing is sent to the store when the split produced no chunks.
if txt_documents:
    vector_store.add_documents(txt_documents)

In [None]:
# Smoke test: fetch the 5 stored chunks most similar to a sample question.
sample_query = 'Como se activa la megaevolucion?'
vector_store.similarity_search(sample_query, k=5)

## Documento PDF

In [None]:
# Build the PDF loader for the file below and load it into pdf_pages.
pdf_path = './database/docs/curiosidades.pdf'
pdf_document = PyPDFLoader(pdf_path)
pdf_pages = pdf_document.load()

In [None]:
# Split the loaded PDF documents with the shared splitter and report the
# resulting chunk count.
pdf_chunks = text_splitter.split_documents(pdf_pages)
pdf_chunk_count = len(pdf_chunks)
print(pdf_chunk_count)

In [None]:
# Prepare each PDF chunk for storage: assign an id and a compact metadata dict.
# - Ids are derived (uuid5) from the document name and chunk position, so
#   re-running the notebook produces the same ids instead of new random ones.
# - The metadata is still reduced to a small dict carrying "name", but the
#   loader-provided provenance keys are kept when present rather than thrown
#   away. NOTE(review): PyPDFLoader is documented to set 'source' and 'page';
#   they are copied only if they exist, so other loaders are unaffected.
for index, chunk in enumerate(pdf_chunks):
    chunk.id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"curiosidades#chunk-{index}"))
    loader_metadata = chunk.metadata
    chunk.metadata = {
        "name": "curiosidades",
        **{
            key: loader_metadata[key]
            for key in ("source", "page")
            if key in loader_metadata
        },
    }

In [None]:
# Store all prepared PDF chunks in the collection in a single call.
vector_store.add_documents(documents=pdf_chunks)