In [9]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone  # ✅ Correct import for Pinecone
from langchain.embeddings import HuggingFaceBgeEmbeddings
from dotenv import load_dotenv

# ✅ Load environment variables
load_dotenv()

# ✅ Initialize Pinecone Client
pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# ✅ Ensure index name is set
index_name = os.getenv("PINECONE_INDEX")
if not index_name:
    raise ValueError("❌ PINECONE_INDEX is not set. Check your .env file.")

# ✅ Ensure the index exists before using it
existing_indexes = [idx["name"] for idx in pinecone_client.list_indexes()]
if index_name not in existing_indexes:
    raise ValueError(f"❌ Index '{index_name}' does not exist in Pinecone. Please create it first.")

# ✅ Initialize Pinecone Index
index = pinecone_client.Index(index_name)

# ✅ Initialize the Hugging Face Embeddings model
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

# ✅ Define text splitter for chunking
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ",", ";", " "], 
    chunk_size=200,  # ✅ Ensure chunking is optimized
    chunk_overlap=50,
    length_function=len,
    add_start_index=True
)

# 📂 Path to split transcriptions
ruta_docs = r"C:\Users\Claudio\tfm_call_optimizer\data\prueba_split_transcriptions"
txt_files = [f for f in os.listdir(ruta_docs) if f.endswith(".txt")]

if not txt_files:
    print("⚠ No .txt files found in the directory! Check the path.")

for filename in txt_files:
    file_path = os.path.join(ruta_docs, filename)
    loader = TextLoader(file_path, encoding="utf-8")
    docs = loader.load()
    
    chunks = text_splitter.split_documents(docs)

    # ✅ Ensure correct format extraction
    parts = filename.replace(".txt", "").split("_")
    if len(parts) == 4:  # Expecting format: call_004_003_agent.txt
        call_id, chunk_id, speaker = parts[0] + "_" + parts[1], parts[2], parts[3]
    else:
        print(f"⚠ Unexpected filename format: {filename} - Skipping this file.")
        continue  # Skip problematic filenames

    for i, chunk in enumerate(chunks):
        embedding = huggingface_embeddings.embed_query(chunk.page_content)
        vector_id = f"{call_id}_{chunk_id:03}_{speaker.lower()}_{i+1:03}"  # ✅ Ensures proper chunk numbering

        # ✅ Store transcription text in metadata correctly
        index.upsert([
            (
                vector_id,
                embedding,
                {
                    "call_id": call_id,  # ✅ Matches filename
                    "chunk_id": chunk_id,  # ✅ Correct chunk ID
                    "speaker": speaker.lower(),  # ✅ "agent" or "client"
                    "filename": filename,  # ✅ Correct filename stored
                    "text": chunk.page_content  # ✅ Store actual transcription text
                }
            )
        ])

    print(f"✅ Processed and stored {len(chunks)} chunks from {filename}")

print("🚀 All transcriptions stored in Pinecone!")


✅ Processed and stored 1 chunks from call_001_001_agent.txt
✅ Processed and stored 1 chunks from call_001_002_client.txt
✅ Processed and stored 1 chunks from call_001_003_agent.txt
✅ Processed and stored 1 chunks from call_001_004_client.txt
✅ Processed and stored 1 chunks from call_001_005_agent.txt
✅ Processed and stored 1 chunks from call_002_001_agent.txt
✅ Processed and stored 1 chunks from call_002_002_client.txt
✅ Processed and stored 1 chunks from call_002_003_agent.txt
✅ Processed and stored 1 chunks from call_002_004_client.txt
✅ Processed and stored 1 chunks from call_003_001_agent.txt
✅ Processed and stored 1 chunks from call_003_002_client.txt
✅ Processed and stored 1 chunks from call_003_003_agent.txt
✅ Processed and stored 1 chunks from call_003_004_client.txt
✅ Processed and stored 1 chunks from call_003_005_agent.txt
✅ Processed and stored 1 chunks from call_004_001_agent.txt
✅ Processed and stored 1 chunks from call_004_002_client.txt
✅ Processed and stored 1 chunks f

In [10]:
# ✅ Describe the index correctly
index_description = pinecone_client.describe_index("tfm-call-optimizer")

# ✅ Print the description
print(index_description)

{'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'tfm-call-optimizer-299x29w.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'tfm-call-optimizer',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


# Comprobaciones

In [12]:
# ✅ Specify the ID of the vector you want to retrieve
vector_id = "call_004_003_agent_001"  # Replace with the actual vector ID

# ✅ Retrieve the vector
retrieved_vector = index.fetch(ids=[vector_id])

# ✅ Print the result
print("🔹 Retrieved Vector:")
print(retrieved_vector)

🔹 Retrieved Vector:
{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'call_004_003_agent_001': {'id': 'call_004_003_agent_001',
                                        'metadata': {'call_id': 'call_004',
                                                     'chunk_id': '003',
                                                     'filename': 'call_004_003_agent.txt',
                                                     'speaker': 'agent',
                                                     'text': 'Agent: Let me '
                                                             'confirm... Yes, '
                                                             'it was received. '
                                                             'The remaining '
                                                             'balance is $50.'},
                                        'values': [-0.0547002,
                                                   0.122288421,
                        

In [13]:
# ✅ Define a test query (simulating an auditor's question)
# query_text = "What payment options are available?"
# query_text = "how many months of extension are the agents giving"
query_text = "what are the remaining balances?"
# query_text = "what is the remaining balance in the 'call_004'?" # esto se puede afinar, ya que si pongo top_k=1 me trae call_001 debería traer la call_004

# ✅ Convert query to an embedding
query_embedding = huggingface_embeddings.embed_query(query_text)

In [14]:
# ✅ Perform similarity search in Pinecone
top_k = 3  # Retrieve top 3 most similar chunks
search_results = index.query(
    vector=query_embedding,
    top_k=top_k,  # Number of results to return
    include_metadata=True
)

# ✅ Print the retrieved results
for match in search_results["matches"]:
    print(f"🔹 Chunk ID: {match['id']}")
    print(f"📂 Stored Metadata: {match['metadata']}")  # ✅ This will show all metadata fields
    print(f"🔢 Score: {match['score']}\n")


🔹 Chunk ID: call_004_003_agent_001
📂 Stored Metadata: {'call_id': 'call_004', 'chunk_id': '003', 'filename': 'call_004_003_agent.txt', 'speaker': 'agent', 'text': 'Agent: Let me confirm... Yes, it was received. The remaining balance is $50.'}
🔢 Score: 0.476971477

🔹 Chunk ID: call_004_001_agent_001
📂 Stored Metadata: {'call_id': 'call_004', 'chunk_id': '001', 'filename': 'call_004_001_agent.txt', 'speaker': 'agent', 'text': 'Agent: Good morning, this is Alex from Debt Relief Services. I’m contacting you about a $200 balance.'}
🔢 Score: 0.392818928

🔹 Chunk ID: call_003_001_agent_001
📂 Stored Metadata: {'call_id': 'call_003', 'chunk_id': '001', 'filename': 'call_003_001_agent.txt', 'speaker': 'agent', 'text': 'Agent: Hi, this is Lisa from Finance Assist. I see a balance of $750 on your account.'}
🔢 Score: 0.381266594

