In [32]:
# import the necessary libraries and set up the environment
import dotenv
import os
from qdrant_client import QdrantClient
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Qdrant

dotenv.load_dotenv()
local_data_directory = "./TO_EMBED" 

In [21]:
# initialize the client and print the collections | set openai embeddings

#
qdrant_client = QdrantClient(
    url=os.getenv("QDRANT_URL"), 
    api_key=os.getenv("QDRANT_API_KEY"),
)

print(qdrant_client.get_collections())

embeddings = OpenAIEmbeddings()


collections=[CollectionDescription(name='Contratos_Embeds'), CollectionDescription(name='LOCAL_TEST_OPENAI_EMBEDS'), CollectionDescription(name='Contratos_OPENAI_EMBEDS'), CollectionDescription(name='NEW_TEST_COLLECTION')]


In [22]:
# check if the collection exists and create it if it doesn't

def check_and_create_collection():
    collection_name = os.getenv("QDRANT_COLLECTION_NAME_2")
    collections = qdrant_client.get_collections()
    
    if not any(collection.name == collection_name for collection in collections.collections):
        print(f"La colección '{collection_name}' no existe. Creándola...")
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config={
                "size": 1536,  # Tamaño de los embeddings de OpenAI
                "distance": "Cosine"
            }
        )
        print(f"Colección '{collection_name}' creada exitosamente.")
    else:
        print(f"La colección '{collection_name}' ya existe.")

# Llamar a la función para verificar y crear la colección si es necesario
check_and_create_collection()


La colección 'LOCAL_TEST_OPENAI_EMBEDS' ya existe.


In [29]:
# Initialize the vector store
vectorstore = Qdrant(
    client=qdrant_client,
    collection_name=os.getenv("QDRANT_COLLECTION_NAME_2"),
    embeddings=embeddings
)
print(os.getenv("QDRANT_COLLECTION_NAME_2"))


LOCAL_TEST_OPENAI_EMBEDS


In [36]:
import os
from langchain.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

def load_and_chunk_documents(local_data_directory):
    # Definir los cargadores para diferentes tipos de archivos
    loaders = {
        ".pdf": (PyPDFLoader, {}),
        ".docx": (Docx2txtLoader, {}),
        ".doc": (Docx2txtLoader, {}),
        ".txt": (TextLoader, {"encoding": "utf8"}),
    }
    
    # Cargar documentos
    documents = []
    for root, _, files in os.walk(local_data_directory):
        for file in files:
            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()
            if file_extension in loaders:
                loader_class, loader_args = loaders[file_extension]
                loader = loader_class(file_path, **loader_args)
                documents.extend(loader.load())
    
    # Crear el divisor de texto
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    
    chunked_documents = []
    
    for doc in documents:
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            # Crear un ID único
            doc_path = os.path.relpath(doc.metadata['source'], local_data_directory)
            page_num = doc.metadata.get('page', 1)  # Asume página 1 si no está especificado
            chunk_id = f"{doc_path}_p{page_num}_c{i+1}"
            
            # Crear un nuevo Document con el ID en los metadatos
            chunked_doc = Document(
                page_content=chunk,
                metadata={
                    **doc.metadata,
                    'chunk_id': chunk_id,
                    'chunk_number': i+1
                }
            )
            chunked_documents.append(chunked_doc)
    
    return chunked_documents

# Uso de la función
chunked_docs = load_and_chunk_documents(local_data_directory)

# Añadir los documentos chunkeados a Qdrant
#vectorstore.add_documents(chunked_docs)



In [38]:
vectorstore.add_documents(chunked_docs)


['bfae06a802e243bb955dd3fa210c8b23',
 '995a2d6d4aca4bc8bd96bf5aca881a1f',
 '4c9243cd0e0d46f2872eb8f90623b5c8',
 '85a66f728d874e2a8c5ed5c5a21c54e0',
 'a37e580a4e1e4400b80c54c7cd23e85d',
 '08a10fec24a142dda5a9f9a87adb44fe',
 'a176ebb41b8c41ad8a0f6c9a3f839e71',
 'd820ca3faee34d90a547892c98aaf470',
 '0cc1d8bc5e25422badfe8bbcd787c6b5',
 '795406d9c4ef41c58cf7d7c7f5c9d513',
 '8c202d6444394ad7840849f210af0203',
 '68db8c0a8c6d41e683cab9b9cdea76e1',
 'cbf3dabde0aa4c439cefa4e879013c78',
 'b931f6ada6da4506b301f7de2eb54c51',
 '582e098211e34d8897483276a2b55bb7',
 '873f447b5d7743eba8ed54c673327000',
 '99c1682ddd354da8aa534cff570e7a1b',
 '5c4a291b6b944d568f274ea9405fb3d2',
 '73932fb8217745b4af40bb849d096685',
 'f798d303ca1a4d11882e4b8d6202fe47',
 '9915824abb2e436baf02a9cd4a6d1d58',
 'd023da30b0e04e2fbf59ddb150d219b1',
 '62e71fdf4bea46bba9f34ec2421acc2f',
 '56c7cfbbdcd1437ba5f51b3e9eaf428d',
 'db97714bc5074af3bf9a1aeea10ec6d6',
 '67a011c96bf844e6aa66bfb65835a27f',
 '04176fbebb21449f9d2f700229c0d0dc',
 

In [37]:

print(chunked_docs)



[Document(metadata={'source': './TO_EMBED\\monopoly.pdf', 'page': 0, 'chunk_id': 'monopoly.pdf_p0_c1', 'chunk_number': 1}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules. \nSPEED DIE RULES \nLearnins how to Play with the S~eed Die IS as \n/ \nfast as playing with i\'t. \n1. When starting the game, hand out an extra $1,000 to each player \n(two $5005 should work). The game moves fas