# Transcripciones a embeddings y almacenamiento en BBDD vectorial

In [None]:
import json
import os

import mysql.connector
import openai
import pinecone

# Script purpose: read call transcriptions from MySQL, embed every sentence
# with OpenAI and store the vectors (plus call/speaker/text metadata) in a
# Pinecone index.

# 🔹 Load API Keys
# Both keys are read from the environment; os.getenv returns None when a
# variable is unset, in which case the API calls below will fail.
openai.api_key = os.getenv("OPENAI_API_KEY")
pinecone.init(api_key=os.getenv("PINECONE_API_KEY"), environment="us-west1-gcp")  # Replace with your region

# Number of sentences sent per embedding request and per Pinecone upsert.
# NOTE(review): 100 is presumably within both services' request limits --
# confirm against your OpenAI / Pinecone plan.
BATCH_SIZE = 100

# 🔹 Connect to MySQL Database and fetch the transcriptions
conn = mysql.connector.connect(
    host="your-mysql-server",
    user="your-username",
    password="your-password",
    database="your-database",
)
# fetchall() loads every row into memory, so the connection is released right
# after the query (even if it raises) instead of being held open during the
# long embedding loop.
try:
    cursor = conn.cursor()
    try:
        # NOTE(review): rows with status 'Failed' are selected too; presumably
        # some of them carry no usable transcription, which is why empty
        # values are skipped further down -- confirm this filter is intended.
        cursor.execute("SELECT call_id, transcription FROM transcriptions WHERE status IN ('Success', 'Failed')")
        transcriptions = cursor.fetchall()  # List of (call_id, transcription) tuples
    finally:
        cursor.close()
finally:
    conn.close()

# 🔹 Connect to Pinecone
index_name = "transcriptions-index"
index = pinecone.Index(index_name)


def _iter_sentences(rows):
    """Yield ``(vector_id, metadata)`` for every embeddable sentence in *rows*.

    Args:
        rows: iterable of ``(call_id, transcription_json)`` tuples, where
            ``transcription_json`` is a JSON array of objects with the keys
            ``"speaker"`` and ``"text"``.

    Yields:
        ``(vector_id, metadata)`` where ``vector_id`` is
        ``"<call_id>_<1-based sentence position>"`` and ``metadata`` holds
        ``call_id``, ``speaker`` and ``text``.

    Raises:
        json.JSONDecodeError: if a non-empty transcription is not valid JSON.
        KeyError: if an entry lacks the ``"speaker"`` or ``"text"`` key.
    """
    for call_id, transcription_json in rows:
        if not transcription_json:
            # NULL / empty column: json.loads would raise and abort the run.
            print(f"⚠️ Skipped call {call_id}: no transcription stored")
            continue

        # start=1 keeps the original "<call_id>_<i+1>" numbering; positions
        # count every entry, so skipping one never shifts the other IDs.
        for position, entry in enumerate(json.loads(transcription_json), start=1):
            speaker = entry["speaker"]  # "Agent" or "Customer"
            text = entry["text"]
            vector_id = f"{call_id}_{position}"  # Example: "grab_1_1"

            if not isinstance(text, str) or not text.strip():
                # Nothing to embed for an empty or non-string sentence.
                print(f"⚠️ Skipped: {vector_id} | {speaker} | empty text")
                continue

            yield vector_id, {"call_id": call_id, "speaker": speaker, "text": text}


def _store_batch(batch):
    """Embed the sentences in *batch* with one request and upsert them together.

    Args:
        batch: non-empty list of ``(vector_id, metadata)`` pairs as produced
            by ``_iter_sentences``.
    """
    texts = [metadata["text"] for _, metadata in batch]
    response = openai.Embedding.create(input=texts, model="text-embedding-ada-002")

    # Order results by their "index" field so that each embedding is paired
    # with the input text it was computed from.
    ordered = sorted(response["data"], key=lambda item: item["index"])
    embeddings = [item["embedding"] for item in ordered]

    # Store in Pinecone with metadata
    index.upsert([
        (vector_id, embedding, metadata)
        for (vector_id, metadata), embedding in zip(batch, embeddings)
    ])

    for vector_id, metadata in batch:
        print(f"✅ Stored: {vector_id} | {metadata['speaker']} | {metadata['text']}")


# 🔹 Process Each Transcription
pending = []
for sentence in _iter_sentences(transcriptions):
    pending.append(sentence)
    if len(pending) >= BATCH_SIZE:
        _store_batch(pending)
        pending = []

# Flush the final, partially filled batch.
if pending:
    _store_batch(pending)

print("🚀 All transcriptions (Agent/Customer) stored in Pinecone!")
