In [None]:
# Goal: phrase-level m-WER, in which every WER operation counts as _med_
# if any of the words it contains is medical (cutoff ≥ 0.80).

import re
import json
import pandas as pd
from pymongo import MongoClient
from jiwer import process_words, collect_error_counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------------
# 1. Prepare the medical vocabulary
# -------------------------------
# Synonym lexicon: each non-null cell of "only_nouns" / "adjectives" is split
# on whitespace and the resulting words are collected into sets.
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = {
    word
    for phrase in cleaned["only_nouns"].dropna()
    for word in phrase.split()
}
adj_set = {
    word
    for phrase in cleaned["adjectives"].dropna()
    for word in phrase.split()
}

# Two further lexicons; their "*_cleaned" columns are tokenized further below.
atc = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
ling = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")

def tokenize_list(phrases):
    """Return the set of purely alphabetic, lower-cased words found in *phrases*.

    Each item is converted with ``str()``, lower-cased, and every character
    that is neither a word character nor whitespace is replaced by a space
    before splitting. Words that fail ``str.isalpha()`` (e.g. containing
    digits or underscores) are dropped.
    """
    separators = re.compile(r"[^\w\säöüß]")
    return {
        word
        for phrase in phrases
        for word in separators.sub(" ", str(phrase).lower()).split()
        if word.isalpha()
    }

# Word sets extracted from the cleaned ATC and component columns.
atc_set = tokenize_list(atc["ATC-Bedeutung_cleaned"].dropna())
comp_set = tokenize_list(ling["COMPONENT_cleaned"].dropna())

# Union of all four word sets; sorting fixes the row order of X_vocab.
med_vocab = sorted(noun_set | adj_set | atc_set | comp_set)

# TF-IDF over character n-grams (analyzer "char_wb", n = 2..4), fitted on the
# vocabulary so that other tokens can later be projected into the same space.
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
X_vocab = vectorizer.fit_transform(med_vocab)

# Build a token -> similarity lookup from the tokens recorded in a previous
# run (wer_token_sources.json). Each token's score is its maximum cosine
# similarity to any entry of the medical vocabulary.
# The `with` block guarantees the file handle is closed after parsing.
with open("wer_token_sources.json", encoding="utf-8") as fh:
    wer_data = json.load(fh)

unique_tokens = set()
for entry in wer_data:
    # The three lists hold phrases; split them into single words.
    for key in ("subs_ref_tokens", "del_ref_tokens", "ins_hyp_tokens"):
        unique_tokens.update(
            w for ph in entry.get(key, []) for w in ph.split()
        )

tokens_list = sorted(unique_tokens)
if tokens_list:
    X_tok = vectorizer.transform(tokens_list)
    sims_matrix = cosine_similarity(X_tok, X_vocab)
    token_to_score = {
        tok: float(sims_matrix[i].max())
        for i, tok in enumerate(tokens_list)
    }
else:
    # No recorded tokens: nothing to vectorize, start with an empty lookup.
    token_to_score = {}

# -------------------------------
# 2. Phrase-Level compute function
# -------------------------------
def _score_unseen_tokens(tokens):
    """Score tokens missing from ``token_to_score`` and cache the results.

    Uses the module-level ``vectorizer`` and ``X_vocab`` so that on-demand
    scores are computed exactly like the precomputed ones (maximum cosine
    similarity against the medical vocabulary).
    """
    unseen = sorted({tok for tok in tokens if tok not in token_to_score})
    if not unseen:
        return
    sims = cosine_similarity(vectorizer.transform(unseen), X_vocab)
    for tok, row in zip(unseen, sims):
        token_to_score[tok] = float(row.max())


def _has_medical_word(phrase, cutoff):
    """Return True if any whitespace-separated word of *phrase* scores >= cutoff."""
    return any(token_to_score.get(w, 0.0) >= cutoff for w in phrase.split())


def compute_wer_mwer_phrase(ref, hyp, cutoff=0.80):
    """Compute WER plus phrase-level medical error counts for one transcript.

    Args:
        ref: Reference text.
        hyp: Hypothesis text.
        cutoff: Minimum similarity score for a word to count as medical.

    Returns:
        Tuple ``(wer, S, D, I, S_med, D_med, I_med, mwer)``. ``S``/``D``/``I``
        are the summed counts of the substitution/deletion/insertion
        dictionaries from ``collect_error_counts``; the ``*_med`` values
        count only entries whose reference phrase (hypothesis phrase for
        insertions) contains at least one medical word. ``mwer`` is
        ``(S_med + D_med + I_med)`` divided by the number of medical words
        in ``ref``, or 0.0 when ``ref`` contains none.

    Side effects:
        Adds scores for previously unseen words to ``token_to_score``.
    """
    out = process_words(ref, hyp)
    # Unpacking order: substitutions, insertions, deletions.
    subs_d, ins_d, del_d = collect_error_counts(out)

    # The precomputed lookup only covers tokens from an earlier JSON dump.
    # Words missing from it would silently score 0.0 and be treated as
    # non-medical, which shrinks the m-WER denominator. Score them first.
    ref_tokens = ref.split()
    pending = set(ref_tokens)
    for ref_ph, _ in subs_d:
        pending.update(ref_ph.split())
    for phrase in (*del_d, *ins_d):
        pending.update(phrase.split())
    _score_unseen_tokens(pending)

    # Standard counts
    wer_score = out.wer
    S = sum(subs_d.values())
    D = sum(del_d.values())
    I = sum(ins_d.values())

    # Phrase-level medical counts: an entry is medical if any of its words is.
    S_med = sum(
        cnt for (ref_ph, _), cnt in subs_d.items()
        if _has_medical_word(ref_ph, cutoff)
    )
    D_med = sum(
        cnt for ref_ph, cnt in del_d.items()
        if _has_medical_word(ref_ph, cutoff)
    )
    I_med = sum(
        cnt for hyp_ph, cnt in ins_d.items()
        if _has_medical_word(hyp_ph, cutoff)
    )

    # m-WER denominator: number of medical words in ref
    total_med_ref = sum(
        1 for w in ref_tokens if token_to_score.get(w, 0.0) >= cutoff
    )
    mwer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else 0.0

    return wer_score, S, D, I, S_med, D_med, I_med, mwer

# -------------------------------
# 3. Iterate over the DB and export to CSV
# -------------------------------
# Output columns, in CSV order. Passing them to the DataFrame explicitly
# keeps the header even when no document matches.
META_FIELDS = ("convoID", "ambientVariant", "processedVolume",
               "technology", "model")
METRIC_FIELDS = ("wer", "S", "D", "I", "S_med", "D_med", "I_med", "mwer")

rows = []
skipped = 0

# The context manager closes the client once all documents have been read.
with MongoClient("mongodb://localhost:27018/") as client:
    col = client["transcriptions"]["transcripts_denis"]

    cursor = col.find(
        {"excludeGeneral": 0},
        {"convoID":1, "ambientVariant":1, "processedVolume":1,
         "technology":1, "model":1,
         "src_wer_denis":1, "text_wer_denis":1}
    )

    for doc in cursor:
        # `or ""` also covers fields that exist but are stored as null.
        ref = doc.get("src_wer_denis") or ""
        hyp = doc.get("text_wer_denis") or ""

        # A WER needs a non-empty reference; skip such documents instead of
        # letting one bad record abort the export before the CSV is written.
        if not ref.strip():
            skipped += 1
            continue

        metrics = compute_wer_mwer_phrase(ref, hyp)
        row = {field: doc.get(field) for field in META_FIELDS}
        row.update(zip(METRIC_FIELDS, metrics))
        rows.append(row)

df = pd.DataFrame(rows, columns=[*META_FIELDS, *METRIC_FIELDS])
df.to_csv("transcripts_wer_mwer_phrase.csv", index=False)
print(f"{len(df)} Transcripts verarbeitet. Ergebnis in 'transcripts_wer_mwer_phrase.csv'.")
if skipped:
    print(f"{skipped} Dokumente ohne Referenztext übersprungen.")
