In [1]:
# (Multi-file) Build chat transcripts (STT/TTS) for all JSON files in the folder,
# grouped by conversation.id,
# and return one combined DataFrame.

import json
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
import pandas as pd

# === Settings (adjust as needed) ===
# Folder that holds the JSON files to process.
INPUT_DIR = Path("data/jaeger_exports/voice_ai_latency_v2_pilot_1/20251208_114421/json_traces")
# Glob pattern selecting the JSON files inside INPUT_DIR.
GLOB_PATTERN = "*.json"
# Time zone used for the timestamps in the output.
LOCAL_TZ = "Europe/Berlin"
# Whether to additionally write one CSV per conversation id.
SAVE_PER_CONVERSATION = False
# Target folder for the CSV exports; it is created right away, even when
# SAVE_PER_CONVERSATION is False.
OUTPUT_DIR = INPUT_DIR.joinpath("exports")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)


# === Helpers ===
def tags_to_dict(tags: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Flatten a list of ``{"key": ..., "value": ...}`` tag entries into one dict.

    A falsy ``tags`` (``None`` or an empty list) yields an empty dict. When the
    same key occurs more than once, the last occurrence wins. An entry without
    a ``"key"`` is stored under ``None``.
    """
    flat: Dict[str, Any] = {}
    for entry in tags or []:
        flat[entry.get("key")] = entry.get("value")
    return flat

def to_berlin_datetime(us_epoch: int) -> pd.Timestamp:
    """Convert an epoch value in microseconds to a timestamp in ``LOCAL_TZ``.

    The value is parsed as UTC first and then converted to the time zone
    named by the module-level ``LOCAL_TZ`` setting.
    """
    return pd.to_datetime(us_epoch, unit="us", utc=True).tz_convert(LOCAL_TZ)

def sanitize_filename(name: str, maxlen: int = 120) -> str:
    """Turn an arbitrary value into a string usable as a file-name stem.

    Every run of characters other than word characters, ``-`` and ``.`` is
    replaced by a single ``_``. The result is cut to ``maxlen`` characters and
    leading/trailing ``.`` and ``_`` are removed. Falls back to
    ``"conversation"`` when nothing is left.
    """
    cleaned = re.sub(r"[^\w\-.]+", "_", str(name))
    stem = cleaned[:maxlen].strip("._")
    if not stem:
        return "conversation"
    return stem

def load_spans_from_json(path: Path) -> List[Dict[str, Any]]:
    """Load the span list from one JSON file and tag each span with its origin.

    Two layouts are accepted: a JSON object with a top-level ``"spans"`` list,
    or a bare JSON list of spans. Any other payload (including
    ``"spans": null``) yields an empty list instead of raising.

    Args:
        path: JSON file to read (decoded as UTF-8).

    Returns:
        The span dicts, each with ``"_source_file"`` set to ``path.name``.
        Entries that are not JSON objects are dropped, because they cannot
        carry the ``"_source_file"`` marker.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if the content is not valid UTF-8 / JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        candidates = data.get("spans")
    elif isinstance(data, list):
        candidates = data
    else:
        candidates = None

    # Guards against `"spans": null` or a non-list value, which would
    # otherwise fail when iterated or assigned into below.
    if not isinstance(candidates, list):
        return []

    spans = [sp for sp in candidates if isinstance(sp, dict)]
    for sp in spans:
        sp["_source_file"] = path.name
    return spans

def get_conversation_id(spans: List[Dict[str, Any]]) -> Optional[str]:
    """
    Return the conversation.id taken ONLY from a 'conversation' span.

    Expected tag on that span:
      { "key": "conversation.id", "value": "<ID>" }

    Spans are checked in order; the first 'conversation' span that carries a
    truthy conversation.id wins and its id is returned as a string.
    Returns None when no such span exists.
    """
    conversation_spans = (
        sp for sp in spans if sp.get("operationName") == "conversation"
    )
    for sp in conversation_spans:
        cid = tags_to_dict(sp.get("tags", [])).get("conversation.id")
        if cid:
            return str(cid)
    return None

def extract_rows_for_spans(spans: List[Dict[str, Any]], conversation_id: str) -> List[Dict[str, Any]]:
    """Build one chat-message row per usable 'stt' / 'tts' span.

    A span is used when it has a ``startTime``, its ``operationName`` is
    ``"stt"`` or ``"tts"``, and its text is non-empty after stripping.
    'stt' spans map to role ``"user"`` (text from the ``transcript`` tag);
    'tts' spans map to role ``"assistant"`` (text from the ``text`` tag).
    An 'stt' span that carries an ``is_final`` tag is only kept when that
    tag reads as true ("true", "1" or "yes", case-insensitive).

    Returns:
        A list of row dicts, in input order, with the keys conversation_id,
        time, role, operation, text, spanID, duration_ms and _source_file.
    """
    collected: List[Dict[str, Any]] = []
    for sp in spans:
        kind = sp.get("operationName")
        started = sp.get("startTime")
        if started is None or kind not in ("stt", "tts"):
            continue

        attrs = tags_to_dict(sp.get("tags", []))

        if kind == "stt":
            # Keep only final STT results (when the flag is present at all).
            final_flag = attrs.get("is_final")
            is_final_ok = final_flag is None or str(final_flag).lower() in ("true", "1", "yes")
            if not is_final_ok:
                continue
            speaker = "user"
            message = (attrs.get("transcript") or "").strip()
        else:  # tts
            speaker = "assistant"
            message = (attrs.get("text") or "").strip()

        if message:
            duration_raw = sp.get("duration") or 0
            collected.append({
                "conversation_id": conversation_id,
                "time": to_berlin_datetime(started),
                "role": speaker,
                "operation": kind,
                "text": message,
                "spanID": sp.get("spanID"),
                # The span's duration value divided by 1000, reported as ms.
                "duration_ms": round(duration_raw / 1000.0, 3),
                "_source_file": sp.get("_source_file"),
            })
    return collected

# === Process all files ===
all_rows: List[Dict[str, Any]] = []
files = sorted(INPUT_DIR.glob(GLOB_PATTERN))
if not files:
    print(f"Keine Dateien gefunden in: {INPUT_DIR} mit Pattern: {GLOB_PATTERN}")

skipped = []      # names of files without a conversation.id in a 'conversation' span
unreadable = []   # (file name, error) for files that could not be read or parsed
for fp in files:
    try:
        spans = load_spans_from_json(fp)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # One broken file must not abort the whole batch; it is reported below.
        unreadable.append((fp.name, exc))
        continue
    conv_id = get_conversation_id(spans)
    if not conv_id:
        # Files without a conversation.id are skipped (not treated as an error)
        # and listed at the end.
        skipped.append(fp.name)
        continue
    all_rows.extend(extract_rows_for_spans(spans, conv_id))

df_all = pd.DataFrame(all_rows)

if df_all.empty:
    print("Keine STT/TTS-Nachrichten mit gültiger conversation.id gefunden.")
else:
    # Order chronologically within each conversation and number the messages
    # 1..n per conversation_id.
    df_all = df_all.sort_values(["conversation_id", "time"]).reset_index(drop=True)
    df_all["msg#"] = df_all.groupby("conversation_id").cumcount() + 1

    # Fixed column order for display and export.
    cols = ["conversation_id", "msg#", "time", "role", "operation", "text", "spanID", "duration_ms", "_source_file"]
    df_all = df_all[[c for c in cols if c in df_all.columns]]

    # Optional: write one CSV per conversation id.
    if SAVE_PER_CONVERSATION:
        for cid, df_conv in df_all.groupby("conversation_id", sort=False):
            out = OUTPUT_DIR / f"chat_{sanitize_filename(cid)}.csv"
            df_conv.to_csv(out, index=False)

# Report files that could not be read or parsed.
if unreadable:
    print("Nicht lesbar (ungültiges JSON oder Lesefehler):")
    for name, err in unreadable:
        print(f" - {name}: {err}")

# Report files that were skipped for lack of a conversation.id.
if skipped:
    print("Übersprungen (keine conversation.id im 'conversation'-Span gefunden):")
    for s in skipped:
        print(" -", s)

# Cell output: the combined DataFrame with all conversations.
df_all



Übersprungen (keine conversation.id im 'conversation'-Span gefunden):
 - 015f47038b37f91e93b8e3165e9a3fe2.json
 - 04f539bf496de15bbd19360d2b3b7a3e.json
 - 05189b6b961aa8b19c63433e01da300e.json
 - 0540bfdfc5e9d3848960f0f3914c2ebb.json
 - 06dc62605445b8f8f5060a4e1ca2621f.json
 - 09b25fa2c1587201bd12293dabcda5ba.json
 - 0b6d1942dbfb6a20c0d6e83a1b10a0b4.json
 - 0b7152005940d3ad54232f954920ec74.json
 - 0c092bb8565191434a2157913ef311c2.json
 - 0e4d7442eb4bc80affba25dd4fddc0b7.json
 - 0eb24dd79c30bc8e1a7ff1bec1c3be31.json
 - 11980418caa48fa5b1931c33e6d68373.json
 - 12a701147f5e5e4bda137e78e75c19ea.json
 - 13cd7d64d420fe1e4b7fe4e887fba801.json
 - 147ae91638dd51c458cc5966e3760c84.json
 - 1e21f14e460747f860301b7612256c8a.json
 - 1ef1b4b86684c93e0d810cac10c4135a.json
 - 21373a47905db23d2b97d420ad81036c.json
 - 21f4180dd28612782db28fbaddebddb6.json
 - 24152ac7008f31e38944ff4cd86a06ad.json
 - 24a132f92d9a959b481d0eac461ddc5b.json
 - 2570310ec09296889c5ae4bfbfd71b18.json
 - 25f4f052978b6025cfd944dd3

Unnamed: 0,conversation_id,msg#,time,role,operation,text,spanID,duration_ms,_source_file
0,202,1,2025-11-11 18:36:46.776257+01:00,assistant,tts,Guten Tag.,be8b6e8b0de22140,468.564,e9bc9f3f27ad7db63ba9e47d49f89bba.json
1,202,2,2025-11-11 18:36:47.245230+01:00,assistant,tts,Mein Name ist Kerstin – ich bin die digitale A...,5b74ba9f02005d6a,137.182,e9bc9f3f27ad7db63ba9e47d49f89bba.json
2,202,3,2025-11-11 18:36:47.382822+01:00,assistant,tts,"Ich helfe Ihnen jetzt dabei, einen Termin zu v...",00af3ed031d880a2,162.270,e9bc9f3f27ad7db63ba9e47d49f89bba.json
3,202,4,2025-11-11 18:36:47.545454+01:00,assistant,tts,"Vielen Dank, dass Sie an unserer wissenschaftl...",a69ee2f0e3c76ef9,1908.874,e9bc9f3f27ad7db63ba9e47d49f89bba.json
4,202,5,2025-11-11 18:36:49.454820+01:00,assistant,tts,Bitte geben Sie für dieses Gespräch nicht Ihre...,27b699a38e2a198a,117.855,e9bc9f3f27ad7db63ba9e47d49f89bba.json
...,...,...,...,...,...,...,...,...,...
13026,800,22,2025-11-16 18:28:01.359519+01:00,user,stt,Sann af.,b41cc58ec5b09638,0.082,a6343fd2aeda7628e0f7a498540cd106.json
13027,800,23,2025-11-16 18:28:02.390854+01:00,assistant,tts,Bitte nennen Sie mir – gern mit einem Fantasie...,b1d56f620c446cf6,1039.355,a6343fd2aeda7628e0f7a498540cd106.json
13028,800,24,2025-11-16 18:28:11.300034+01:00,user,stt,Max Mustaman.,e2f060c30aebddde,0.081,a6343fd2aeda7628e0f7a498540cd106.json
13029,800,25,2025-11-16 18:28:12.499214+01:00,assistant,tts,"Vielen Dank, Max Mustaman.",8ba76e96d752a75c,510.015,a6343fd2aeda7628e0f7a498540cd106.json


In [4]:
# Show all messages that belong to conversation "390".
df_all.loc[df_all["conversation_id"] == "390"]


Unnamed: 0,conversation_id,msg#,time,role,operation,text,spanID,duration_ms,_source_file
4401,390,1,2025-10-30 18:44:51.207429+01:00,assistant,tts,Guten Tag.,244d2bcaec4a4653,235.768,3b753c3d68a8a4c6988a766a0f6147ac.json
4402,390,2,2025-10-30 18:44:51.443736+01:00,assistant,tts,Mein Name ist Kerstin – ich bin die digitale A...,24a5f2c5e73bcf12,103.635,3b753c3d68a8a4c6988a766a0f6147ac.json
4403,390,3,2025-10-30 18:44:51.547817+01:00,assistant,tts,"Ich helfe Ihnen jetzt dabei, einen Termin zu v...",5ee3545626e04500,194.799,3b753c3d68a8a4c6988a766a0f6147ac.json
4404,390,4,2025-10-30 18:44:51.742902+01:00,assistant,tts,"Vielen Dank, dass Sie an unserer wissenschaftl...",ac635075ef655dc0,123.653,3b753c3d68a8a4c6988a766a0f6147ac.json
4405,390,5,2025-10-30 18:44:51.866944+01:00,assistant,tts,Bitte geben Sie für dieses Gespräch nicht Ihre...,5292a57a8d1695b9,177.604,3b753c3d68a8a4c6988a766a0f6147ac.json
4406,390,6,2025-10-30 18:44:52.044848+01:00,assistant,tts,Bitte nutzen Sie erfundene Daten wie zum Beisp...,36473803d49c995b,113.98,3b753c3d68a8a4c6988a766a0f6147ac.json
4407,390,7,2025-10-30 18:44:52.159277+01:00,assistant,tts,Am Ende des Gesprächs nenne ich Ihnen eine dre...,d8a0f1ea7e87ad8d,206.771,3b753c3d68a8a4c6988a766a0f6147ac.json
4408,390,8,2025-10-30 18:44:52.366429+01:00,assistant,tts,Notieren Sie diese bitte und tragen Sie sie da...,046c24f2fddc7bd7,137.73,3b753c3d68a8a4c6988a766a0f6147ac.json
4409,390,9,2025-10-30 18:44:52.504630+01:00,assistant,tts,Damit wir Sie bestmöglich einplanen können: Da...,b589f7d8aaf9db66,181.931,3b753c3d68a8a4c6988a766a0f6147ac.json
4410,390,10,2025-10-30 18:45:33.793629+01:00,user,stt,Ja.,e86f256b02bdb14e,0.056,3b753c3d68a8a4c6988a766a0f6147ac.json


In [3]:
# Print the current working directory; the relative INPUT_DIR path is resolved against it.
import os
print(os.getcwd())

d:\Desktop\data_merge\mycode
