# Final

## Versión Gratis

In [1]:
import chromadb
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader


# Free embeddings model.
embeddings = HuggingFaceEmbeddings(
    model_name='all-MiniLM-L6-v2',
)
# Chroma client rooted at the directory used by the free pipeline.
persistent_client = chromadb.PersistentClient(
    path='./vectordb_gratis',
)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [2]:
def add_files_to_vectordb(filepath, chunk_size=500, chunk_overlap=50):
    """Load a PDF, split it into overlapping chunks and index them in Chroma.

    The chunks are embedded with the module-level ``embeddings`` model and
    written through ``Chroma.from_documents`` with
    ``persist_directory='./vectordb_gratis'``.

    Args:
        filepath: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500; smaller chunks are used here because the free
            embedding model is weaker.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 50.

    Returns:
        None.
    """
    loaded_docs = PyPDFLoader(filepath).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(loaded_docs)
    # NOTE(review): ingesting the same file twice presumably stores its chunks
    # twice, since no explicit ids are passed -- confirm before re-running.
    Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory='./vectordb_gratis',
    )

In [3]:
# Ingest the first PDF into the free vector store.
add_files_to_vectordb(filepath='./data/los7HabitosGenteAltamenteEfectiva.pdf')

In [6]:
# Ingest the second PDF into the free vector store.
add_files_to_vectordb(filepath='./data/100M-offers-alex-hormozi-espaol.pdf')

## Ver si se agregaron

In [4]:
def get_unique_sources_list(chroma_settings):
    """Return the distinct file names recorded in the 'langchain' collection.

    Args:
        chroma_settings: Client object exposing ``get_collection``; the
            collection named 'langchain' is queried.

    Returns:
        Sorted list of unique file names, taken as the text after the last
        '/' of every non-empty 'source' metadata value.
    """
    # Only the metadata is read below, so embeddings and documents are not
    # requested from the collection.
    collection_data = chroma_settings.get_collection('langchain').get(include=['metadatas'])

    file_names = {
        metadata['source'].split('/')[-1]
        for metadata in collection_data['metadatas'] or []
        # Entries without metadata, or without a 'source', are skipped.
        if metadata and metadata.get('source')
    }

    # Sorted so the result does not depend on set iteration order.
    return sorted(file_names)

In [19]:
# List the file names currently recorded in the vector store.
get_unique_sources_list(chroma_settings=persistent_client)

['100M-offers-alex-hormozi-espaol.pdf',
 'los7HabitosGenteAltamenteEfectiva.pdf']

## Versión Paga

In [8]:
import os

# Only fall back to the empty placeholder when no key is set, so a key that is
# already exported in the environment is never overwritten with an empty
# string. Prefer exporting OPENAI_API_KEY outside the notebook over pasting
# the secret into this cell.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ''  # OpenAI API key (paid version)

In [16]:
import chromadb
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Chroma client rooted at the directory used by the paid (OpenAI) pipeline.
# This rebinds the `persistent_client` name used by the free section.
persistent_client = chromadb.PersistentClient(
    path='./vectordb',
)

In [10]:
def add_files_to_vectordb(filepath, chunk_size=1000, chunk_overlap=200):
    """Load a PDF, split it into overlapping chunks and index them in Chroma.

    The chunks are embedded with a fresh ``OpenAIEmbeddings()`` instance and
    written through ``Chroma.from_documents`` with
    ``persist_directory='./vectordb'``.

    Args:
        filepath: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 200.

    Returns:
        None.
    """
    # Imported locally because the paid section's import cell does not bring
    # in PyPDFLoader; without this the function only works when the free
    # section's import cell has been run first.
    from langchain_community.document_loaders import PyPDFLoader

    loaded_docs = PyPDFLoader(filepath).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_documents(loaded_docs)
    # NOTE(review): ingesting the same file twice presumably stores its chunks
    # twice, since no explicit ids are passed -- confirm before re-running.
    Chroma.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        persist_directory='./vectordb',
    )

In [16]:
# Ingest the PDF into the paid (OpenAI) vector store.
add_files_to_vectordb(filepath='./data/los7HabitosGenteAltamenteEfectiva.pdf')

## Ver si se agregaron

In [14]:
def get_unique_sources_list(chroma_settings):
    """Return the distinct file names recorded in the 'langchain' collection.

    This re-declares the helper of the same name defined earlier in the
    notebook; the later definition is the one in effect once this cell runs.

    Args:
        chroma_settings: Client object exposing ``get_collection``; the
            collection named 'langchain' is queried.

    Returns:
        Sorted list of unique file names, taken as the text after the last
        '/' of every non-empty 'source' metadata value.
    """
    # Only the metadata is read below, so embeddings and documents are not
    # requested from the collection.
    collection_data = chroma_settings.get_collection('langchain').get(include=['metadatas'])

    file_names = {
        metadata['source'].split('/')[-1]
        for metadata in collection_data['metadatas'] or []
        # Entries without metadata, or without a 'source', are skipped.
        if metadata and metadata.get('source')
    }

    # Sorted so the result does not depend on set iteration order.
    return sorted(file_names)

In [17]:
# List the file names currently recorded in the vector store.
get_unique_sources_list(chroma_settings=persistent_client)

['100M-offers-alex-hormozi-espaol.pdf',
 'los7HabitosGenteAltamenteEfectiva.pdf']