# Transcripciones a embeddings y almacenamiento en BBDD vectorial

In [1]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone  # ✅ Correct import for Pinecone
from langchain.embeddings import HuggingFaceBgeEmbeddings
from dotenv import load_dotenv

# Read settings (API key, index name) from the .env file into the environment
load_dotenv()

# Pinecone client authenticated with the key found in PINECONE_API_KEY
pinecone_client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# Target index name comes from PINECONE_INDEX_V3; fail fast when it is
# missing or empty instead of failing later with a less obvious error
index_name = os.environ.get("PINECONE_INDEX_V3")
if not index_name:
    raise ValueError("❌ PINECONE_INDEX_V3 is not set. Check your .env file.")

# The script never creates the index itself, so it must already exist
existing_indexes = [
    described["name"] for described in pinecone_client.list_indexes()
]
if index_name not in existing_indexes:
    raise ValueError(f"❌ Index '{index_name}' does not exist in Pinecone. Please create it first.")

# Handle used for every later read/write against the index
index = pinecone_client.Index(index_name)

# Embedding model: all-MiniLM-L6-v2 loaded on CPU, with
# normalize_embeddings=True passed through to the encoder.
# NOTE(review): this is the BGE-specific wrapper used with a non-BGE model;
# the wrapper presumably applies BGE instruction prefixes to queries —
# confirm that is intended for this model.
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Chunker for the call transcriptions. Sizes are measured with len(), i.e.
# in characters: chunks of up to 350 with a 75-character overlap between
# neighbours. Separators are listed from coarsest (blank line) to finest
# (single space).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=350,
    chunk_overlap=75,
    length_function=len,
    separators=["\n\n", "\n", ".", ",", ";", " "],
    add_start_index=True,
)

# 📂 Path to full call transcriptions
ruta_docs = r"C:\Users\Claudio\tfm_call_optimizer\transcripciones_prueba"
# Sorted so that repeated runs process the files in the same order
txt_files = sorted(f for f in os.listdir(ruta_docs) if f.endswith(".txt"))

# Number of vectors sent per upsert request. One request per chunk costs a
# network round-trip each time; small batches keep requests well below the
# service's request-size limits (presumably ~100 is the recommended batch
# size — confirm against the Pinecone documentation).
UPSERT_BATCH_SIZE = 100

if not txt_files:
    print("⚠ No .txt files found in the directory! Check the path.")

for filename in txt_files:
    file_path = os.path.join(ruta_docs, filename)
    loader = TextLoader(file_path, encoding="utf-8")
    docs = loader.load()

    # Combine the entire call transcription into a single string
    full_transcription = " ".join(doc.page_content for doc in docs)

    # Split the full call into chunks for embedding
    chunks = text_splitter.split_text(full_transcription)

    # Call ID is the filename without its extension, e.g. "call_004".
    # splitext removes only the trailing ".txt", whereas str.replace would
    # also strip any ".txt" occurring in the middle of the name.
    call_id = os.path.splitext(filename)[0]

    if not chunks:
        # Empty transcription: nothing to embed, and an empty upsert
        # request would be pointless (or rejected by the service).
        print(f"⚠ No text chunks produced for {filename}; skipping.")
        continue

    # These chunks are the stored passages, so embed them as documents.
    # embed_query is meant for search queries (the BGE wrapper adds a query
    # instruction prefix there), and embed_documents also encodes the whole
    # file in a single call instead of one model call per chunk.
    chunk_embeddings = huggingface_embeddings.embed_documents(chunks)

    # One (id, values, metadata) tuple per chunk; numbering starts at 001
    vectors = [
        (
            f"{call_id}_chunk_{position:03}",  # unique ID for each call chunk
            embedding,
            {
                "call_id": call_id,
                "chunk_id": f"{position:03}",  # sequential chunk numbering
                "filename": filename,
                "text": chunk,  # original chunk text, kept for retrieval
            },
        )
        for position, (chunk, embedding) in enumerate(
            zip(chunks, chunk_embeddings), start=1
        )
    ]

    # Send the vectors in batches rather than one request per chunk
    for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
        index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

    print(f"✅ Processed and stored {len(chunks)} chunks for {filename}")

# Only claim success when there was something to store
if txt_files:
    print("🚀 All call transcriptions stored in Pinecone!")


  from tqdm.autonotebook import tqdm
  huggingface_embeddings = HuggingFaceBgeEmbeddings(


✅ Processed and stored 14 chunks for call_0001.txt
✅ Processed and stored 6 chunks for call_0003.txt
✅ Processed and stored 7 chunks for call_0004.txt
✅ Processed and stored 6 chunks for call_0006.txt
✅ Processed and stored 7 chunks for call_0007.txt
✅ Processed and stored 10 chunks for call_0008.txt
🚀 All call transcriptions stored in Pinecone!
