In [3]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from pinecone import Pinecone

# Pull configuration (API key, index name, ...) from the local .env file into the environment.
load_dotenv()

# Pinecone settings are read from the environment; both values are mandatory.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
index_name = os.getenv("PINECONE_INDEX_V2")
if not (pinecone_api_key and index_name):
    raise ValueError("⚠️ Missing Pinecone API key or index name. Check your .env file.")

# Create the Pinecone client with the configured key.
pinecone_client = Pinecone(api_key=pinecone_api_key)

# Fail fast when the configured index is not among the indexes the client reports.
existing_index_names = {idx["name"] for idx in pinecone_client.list_indexes()}
if index_name not in existing_index_names:
    raise ValueError(f"❌ Index '{index_name}' does not exist in Pinecone.")

# Index handle used by the upload and fetch steps further down.
index = pinecone_client.Index(index_name)

# Load the PDF.
# The location can be overridden with the optional RAG_PDF_PATH environment variable so the
# notebook is not tied to one machine; when it is unset (or empty) the original path is used.
pdf_path = os.getenv("RAG_PDF_PATH") or (
    r"C:\Users\Claudio\tfm_call_optimizer\data\documentos_empresa_rag\Soluciones_Generales.pdf"
)
loader = PyPDFLoader(pdf_path)
pages = loader.load()

# Split the loaded pages into overlapping chunks.
# Sizes are measured with len(), i.e. in characters: chunks of up to 350 with a 75 overlap.
# The separators are listed from coarsest (blank line) to finest (single space).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=350,
    chunk_overlap=75,
    separators=["\n\n", "\n", ".", ",", " "],
    length_function=len
)
chunks = splitter.split_documents(pages)

# Build the embedding model that turns chunk text into vectors.
# NOTE(review): the wrapper class is the BGE-specific one while the checkpoint is
# all-MiniLM-L6-v2 — confirm this pairing is intentional.
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),                # run the model on the CPU
    encode_kwargs=dict(normalize_embeddings=True),  # ask the encoder for normalized vectors
)

# Embed every chunk and upload the resulting vectors to Pinecone.
pdf_id = "soluciones_generales"
filename = os.path.basename(pdf_path)

# The chunks are document passages, so they are embedded with embed_documents() in one
# batched call. embed_query() is meant for search queries; with the BGE wrapper it prepends
# a query instruction to the text, which would end up baked into every stored vector.
chunk_texts = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(chunk_texts) if chunk_texts else []

# Build one (id, values, metadata) record per chunk; chunk numbers are 1-based and
# zero-padded to three digits.
vectors = []
for chunk_number, (content, embedding) in enumerate(zip(chunk_texts, embeddings), start=1):
    chunk_id = f"{chunk_number:03}"
    vector_id = f"{pdf_id}_chunk_{chunk_id}"  # example: soluciones_generales_chunk_001

    metadata = {
        "pdf_id": pdf_id,
        "chunk_id": chunk_id,
        "filename": filename,
        "source": "pdf",
        "text": content
    }

    vectors.append((vector_id, embedding, metadata))

# Upsert in batches instead of issuing one network request per vector.
# NOTE(review): 100 records per request is a conservative choice — confirm against the
# request-size limits of the Pinecone plan in use.
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(vectors[batch_start:batch_start + UPSERT_BATCH_SIZE])

print(f"✅ {len(chunks)} PDF chunks embedded and uploaded to Pinecone!")


  embedding_model = HuggingFaceBgeEmbeddings(


✅ 337 PDF chunks embedded and uploaded to Pinecone!


In [4]:
# ID of the vector to look up; it follows the "<pdf_id>_chunk_<NNN>" naming scheme.
vector_id = "soluciones_generales_chunk_001"  # Replace with the actual vector ID

# Ask the index for that single ID and print the header followed by the raw response.
retrieved_vector = index.fetch(ids=[vector_id])
print("🔹 Retrieved Vector:", retrieved_vector, sep="\n")

🔹 Retrieved Vector:
{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'soluciones_generales_chunk_001': {'id': 'soluciones_generales_chunk_001',
                                                'metadata': {'chunk_id': '001',
                                                             'filename': 'Soluciones_Generales.pdf',
                                                             'pdf_id': 'soluciones_generales',
                                                             'source': 'pdf',
                                                             'text': '1  \n'
                                                                     ' \n'
                                                                     'Comandos '
                                                                     'basicos '
                                                                     'asterisk \n'
                                                                     '   \n'
                  