In [2]:
import os
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone  # ✅ Correct import for Pinecone
from langchain.embeddings import HuggingFaceBgeEmbeddings
from dotenv import load_dotenv

# ✅ Load environment variables
load_dotenv()

# ✅ Initialize Pinecone Client
pinecone_client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# ✅ Ensure the correct Pinecone index name is set
index_name = os.getenv("PINECONE_INDEX_V2")  # ✅ Updated to use the correct index variable
if not index_name:
    raise ValueError("❌ PINECONE_INDEX_V2 is not set. Check your .env file.")

# ✅ Ensure the index exists before using it
existing_indexes = [idx["name"] for idx in pinecone_client.list_indexes()]
if index_name not in existing_indexes:
    raise ValueError(f"❌ Index '{index_name}' does not exist in Pinecone. Please create it first.")

# ✅ Initialize Pinecone Index
index = pinecone_client.Index(index_name)

# ✅ Initialize the Hugging Face Embeddings model
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

# ✅ Define text splitter for chunking
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", ",", ";", " "], 
    chunk_size=350,  # ✅ Adjusted for better segmentation of varied call lengths
    chunk_overlap=75,  # ✅ Slightly reduced overlap to avoid too much redundancy
    length_function=len,
    add_start_index=True
)

# 📂 Path to full call transcriptions
ruta_docs = r"C:\Users\Claudio\tfm_call_optimizer\data\prueba_transcriptions_raw2"
txt_files = [f for f in os.listdir(ruta_docs) if f.endswith(".txt")]

if not txt_files:
    print("⚠ No .txt files found in the directory! Check the path.")

for filename in txt_files:
    file_path = os.path.join(ruta_docs, filename)
    loader = TextLoader(file_path, encoding="utf-8")
    docs = loader.load()

    # ✅ Combine entire call transcription into a single document
    full_transcription = " ".join([doc.page_content for doc in docs])

    # ✅ Split the full call into chunks for embedding
    chunks = text_splitter.split_text(full_transcription)

    # ✅ Extract call ID from filename
    call_id = filename.replace(".txt", "")  # E.g., "call_004"

    for i, chunk in enumerate(chunks):
        embedding = huggingface_embeddings.embed_query(chunk)
        vector_id = f"{call_id}_chunk_{i+1:03}"  # ✅ Unique ID for each call chunk

        # ✅ Store call transcription chunks with metadata
        index.upsert([
            (
                vector_id,
                embedding,
                {
                    "call_id": call_id,  # ✅ Matches full call ID
                    "chunk_id": f"{i+1:03}",  # ✅ Sequential chunk numbering
                    "filename": filename,  # ✅ Correct filename stored
                    "text": chunk  # ✅ Store actual transcription text
                }
            )
        ])

    print(f"✅ Processed and stored {len(chunks)} chunks for {filename}")

print("🚀 All call transcriptions stored in Pinecone!")


✅ Processed and stored 4 chunks for call_001.txt
✅ Processed and stored 3 chunks for call_002.txt
✅ Processed and stored 3 chunks for call_003.txt
✅ Processed and stored 2 chunks for call_004.txt
✅ Processed and stored 2 chunks for call_005.txt
✅ Processed and stored 1 chunks for call_006.txt
✅ Processed and stored 1 chunks for call_007.txt
✅ Processed and stored 1 chunks for call_008.txt
✅ Processed and stored 1 chunks for call_009.txt
✅ Processed and stored 1 chunks for call_010.txt
🚀 All call transcriptions stored in Pinecone!


# Comprobaciones

In [3]:
# ✅ Specify the ID of the vector you want to retrieve
vector_id = "call_003_chunk_002"  # Replace with the actual vector ID

# ✅ Retrieve the vector
retrieved_vector = index.fetch(ids=[vector_id])

# ✅ Print the result
print("🔹 Retrieved Vector:")
print(retrieved_vector)

🔹 Retrieved Vector:
{'namespace': '',
 'usage': {'read_units': 1},
 'vectors': {'call_003_chunk_002': {'id': 'call_003_chunk_002',
                                    'metadata': {'call_id': 'call_003',
                                                 'chunk_id': '002',
                                                 'filename': 'call_003.txt',
                                                 'text': 'Client: That’s '
                                                         'better. When do I '
                                                         'have to start?\n'
                                                         'Agent: The first '
                                                         'payment would be due '
                                                         'in two weeks.\n'
                                                         'Client: I might need '
                                                         'until next month.\n'
                              

In [11]:
# ✅ Define a test query (simulating an auditor's question)
# query_text = "What payment options are available?"
# query_text = "how many months of extension are the agents giving"
# query_text = "what are the remaining balances?"
# query_text = "what is the remaining balance in the 'call_004'?"   #funciona bien, ya que si pongo top_k=1 me trae la call_004

# ✅ Convert query to an embedding
query_embedding = huggingface_embeddings.embed_query(query_text)

In [12]:
# ✅ Perform similarity search in Pinecone
top_k = 3  # Retrieve top 3 most similar chunks
search_results = index.query(
    vector=query_embedding,
    top_k=top_k,  # Number of results to return
    include_metadata=True
)

# ✅ Print the retrieved results
for match in search_results["matches"]:
    print(f"🔹 Chunk ID: {match['id']}")
    print(f"📂 Stored Metadata: {match['metadata']}")  # ✅ This will show all metadata fields
    print(f"🔢 Score: {match['score']}\n")


🔹 Chunk ID: call_010_chunk_001
📂 Stored Metadata: {'call_id': 'call_010', 'chunk_id': '001', 'filename': 'call_010.txt', 'text': 'Agent: Good afternoon, this is Amy from Financial Assistance. Your account is past due at $1,500.\nClient: I lost my job. I can’t pay that.\nAgent: We offer hardship plans with reduced payments.\nClient: What’s the lowest option?\nAgent: $50 per month with no interest for six months.\nClient: That helps a lot.\nAgent: I’ll set that up now.'}
🔢 Score: 0.440576017

🔹 Chunk ID: call_001_chunk_002
📂 Stored Metadata: {'call_id': 'call_001', 'chunk_id': '002', 'filename': 'call_001.txt', 'text': 'Agent: I understand. We have different payment plans that could help. Would you be able to make a partial payment today?\nClient: I can probably do $50 today.\nAgent: That’s great. We can spread the remaining balance over five months at $90 per month, or we can extend it to eight months at $60 per month.'}
🔢 Score: 0.429198295

🔹 Chunk ID: call_006_chunk_001
📂 Stored Meta