In [1]:
# Ziel: Für eine wählbare Anzahl von Transkripten (1 oder "all") Standard-WER und m-WER (S/D/I_med_diagnosis) berechnen.

import re
from pymongo import MongoClient
import pandas as pd
from jiwer import process_words, collect_error_counts

# -- Einstellungen --
NUM_TRANSCRIPTS = 1   # Setze auf eine Zahl oder "all" für alle Datensätze

# MongoDB-Verbindung
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]

# Medizinische Wortlisten laden
cleaned = pd.read_csv("cleaned_ger_synonyms.csv")  # aus vorherigem Notebook
# Sets für Nomen und Adjektive
noun_set = set(w for phrase in cleaned["only_nouns"] for w in str(phrase).split() if w)
adj_set  = set(w for phrase in cleaned["adjectives"] for w in str(phrase).split() if w)

# Funktion zur einfachen Normalisierung
def normalize_simple(text: str) -> str:
    t = text.lower()
    t = re.sub(r"[^\w\s]", " ", t)
    return re.sub(r"\s+", " ", t).strip()

# Query vorbereiten
cursor = col.find({}, {"_id":1, "src_wer_denis":1, "text":1})
if NUM_TRANSCRIPTS != "all":
    cursor = cursor.limit(int(NUM_TRANSCRIPTS))

# Ergebnisse sammeln
results = []
for doc in cursor:
    ref = doc.get("src_wer_denis", "")
    hyp = normalize_simple(doc.get("text", ""))  # Hypothese nachsimple normalization
    
    # WER-Alignment
    out = process_words(ref, hyp)
    subs, ins, dels = collect_error_counts(out)

    # Standard-WER-Stats
    S, D, I = sum(subs.values()), sum(dels.values()), sum(ins.values())
    N = len(ref.split())
    wer_score = out.wer

    # Medizinische Fehler klassifizieren
    S_med = sum(cnt for (r,h), cnt in subs.items() if r in noun_set or r in adj_set)
    D_med = sum(cnt for r, cnt in dels.items()        if r in noun_set or r in adj_set)
    I_med = sum(cnt for h, cnt in ins.items()         if h in noun_set or h in adj_set)

    # m-WER
    total_med_ref = sum(1 for w in ref.split() if w in noun_set or w in adj_set)
    m_wer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else None

    results.append({
        "id": str(doc["_id"]),
        "WER": wer_score,
        "S": S, "D": D, "I": I,
        "S_med_diagnosis": S_med,
        "D_med_diagnosis": D_med,
        "I_med_diagnosis": I_med,
        "m-WER": m_wer
    })

# Ausgabe als DataFrame
df_results = pd.DataFrame(results)
print(df_results)


                         id       WER    S   D   I  S_med_diagnosis  \
0  6765e8a4a84bdac1bcd5a73e  0.363198  445  74  66               12   

   D_med_diagnosis  I_med_diagnosis     m-WER  
0                6                3  0.074733  


In [6]:
# 0. (Einmalig) Dependencies installieren, falls nötig:
# !pip install jiwer pymongo pandas

import re
import json
from pymongo import MongoClient
import pandas as pd
from jiwer import process_words, collect_error_counts

# -- Einstellungen --
NUM_TRANSCRIPTS = 1   # Zahl oder "all"
OUTPUT_FILE     = "wer_mwer_results.json"

# 1. MongoDB-Verbindung
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]

# 2. Medizinische Wortlisten laden
cleaned = pd.read_csv("cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"] for w in str(phrase).split() if w)
adj_set  = set(w for phrase in cleaned["adjectives"] for w in str(phrase).split() if w)

# 3. Simple Normalisierung
def normalize_simple(text: str) -> str:
    t = text.lower()
    t = re.sub(r"[^\w\s]", " ", t)
    return re.sub(r"\s+", " ", t).strip()

# 4. Cursor vorbereiten
cursor = col.find({}, {"_id":1, "src_wer_denis":1, "text":1})
if NUM_TRANSCRIPTS != "all":
    cursor = cursor.limit(int(NUM_TRANSCRIPTS))

# 5. Ergebnisse erzeugen
results = []
for doc in cursor:
    tid = str(doc["_id"])
    ref = normalize_simple(doc.get("src_wer_denis", ""))
    hyp = normalize_simple(doc.get("text", ""))

    out       = process_words(ref, hyp)
    subs, ins, dels = collect_error_counts(out)

    # Standard-WER-Zahlen
    S, D, I = sum(subs.values()), sum(dels.values()), sum(ins.values())

    # Token-Listen
    subs_tokens = [r for (r,h), cnt in subs.items() for _ in range(cnt)]
    del_tokens  = [r for r, cnt in dels.items()   for _ in range(cnt)]
    ins_tokens  = [h for h, cnt in ins.items()    for _ in range(cnt)]

    # med-Filter
    med_subs = [t for t in subs_tokens if t in noun_set or t in adj_set]
    med_dels = [t for t in del_tokens  if t in noun_set or t in adj_set]
    med_ins  = [t for t in ins_tokens  if t in noun_set or t in adj_set]

    # m-WER
    total_med_ref = sum(1 for w in ref.split() if w in noun_set or w in adj_set)
    m_wer = (len(med_subs) + len(med_dels) + len(med_ins)) / total_med_ref if total_med_ref else None

    results.append({
        "id":                 tid,
        "WER":                out.wer,
        "S":                  S,
        "D":                  D,
        "I":                  I,
        "S_med_diagnosis":    med_subs,
        "D_med_diagnosis":    med_dels,
        "I_med_diagnosis":    med_ins,
        "m-WER":              m_wer
    })

# 6. Lokal als JSON speichern
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"{len(results)} Transcripts verarbeitet und in '{OUTPUT_FILE}' gespeichert.")


1 Transcripts verarbeitet und in 'wer_mwer_results.json' gespeichert.


# Step by Step

In [18]:
# Ziel: Für jedes Transcript die WER-Operationen extrahieren, dabei klar benennen, ob Tokens aus Ref- oder Hyp-Text stammen,
# und das Ergebnis als JSON-Datei in deinem Notebook-Verzeichnis speichern.

import re
import json
from pymongo import MongoClient
import pandas as pd
from jiwer import process_words, collect_error_counts

# 0. (Einmalig) Dependencies installieren, falls nötig:
# !pip install jiwer pymongo pandas

# -- Einstellungen --
NUM_TRANSCRIPTS = 1   # Zahl oder "all"
OUTPUT_FILE     = "wer_token_sources.json"

# 1. MongoDB-Verbindung
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]

# 2. Simple Normalisierung (da deine src_wer_denis/text_wer_denis bereits bereinigt sind):
def normalize_simple(text: str) -> str:
    t = text.lower()
    t = re.sub(r"[^\w\s]", " ", t)
    return re.sub(r"\s+", " ", t).strip()

# 3. Cursor vorbereiten
cursor = col.find({}, {"_id":1, "src_wer_denis":1, "text_wer_denis":1})
if NUM_TRANSCRIPTS != "all":
    cursor = cursor.limit(int(NUM_TRANSCRIPTS))

results = []
for doc in cursor:
    tid = str(doc["_id"])
    ref = normalize_simple(doc.get("src_wer_denis", ""))
    hyp = normalize_simple(doc.get("text_wer_denis", ""))

    # 4. WER-Alignment und Fehlerzählungen
    out       = process_words(ref, hyp)
    subs_dict, ins_dict, del_dict = collect_error_counts(out)

    # 5. Token-Listen mit Quellangabe
    subs_ref_tokens = [r for (r, h), cnt in subs_dict.items() for _ in range(cnt)]
    del_ref_tokens  = [r for r, cnt in del_dict.items()   for _ in range(cnt)]
    ins_hyp_tokens  = [h for h, cnt in ins_dict.items()    for _ in range(cnt)]

    # 6. Ergebnis-Dict je Transcript
    results.append({
        "id": tid,
        "wer": out.wer,
        "subs_ref_tokens": subs_ref_tokens,
        "del_ref_tokens":  del_ref_tokens,
        "ins_hyp_tokens":  ins_hyp_tokens
    })

# 7. Lokal als JSON speichern
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print(f"{len(results)} Transcripts verarbeitet. Ergebnisse gespeichert in '{OUTPUT_FILE}'.")


1 Transcripts verarbeitet. Ergebnisse gespeichert in 'wer_token_sources.json'.


In [35]:
# Ziel: Erweiterte Klassifikation für Substitutions-, Deletions- und Insertions-Tokens
# wobei nun zusätzlich mittels TF-IDF und Cosine-Similarity (Cutoff=0.70) unscharfe Matches erlaubt sind.
# Ergebnisse werden in CSVs mit Prefix "result_" gespeichert.

import json
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. WER-Token-JSON laden
WER_JSON = "wer_token_sources.json"
wer_data = json.load(open(WER_JSON, encoding="utf-8"))

# 2. Medizinisches Vokabular aus bisherigen Lexika laden
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"].dropna() for w in phrase.split())
adj_set  = set(w for phrase in cleaned["adjectives"].dropna()  for w in phrase.split())

atc = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
atc_terms = atc["ATC-Bedeutung_cleaned"].dropna().tolist()

ling = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")
comp_terms = ling["COMPONENT_cleaned"].dropna().tolist()

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", p.lower()).split():
            if w and w.isalpha():
                toks.add(w)
    return toks

atc_set  = tokenize_list(atc_terms)
comp_set = tokenize_list(comp_terms)

med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))

# 3. TF-IDF Vectorizer vorbereiten (char-level ngrams für Fuzzy-Matching)
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# 4. Hilfsfunktion für medizinisches Matching
def is_medical(token, cutoff=0.70):
    # Exaktes Matching
    if token in med_vocab:
        return True
    # Fuzzy Matching via Cosine-Similarity
    vec = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    return sims.max() >= cutoff

# 5. Token-Listen aus WER-Daten aggregieren
subs = []
dels = []
ins  = []
for entry in wer_data:
    subs += entry.get("subs_ref_tokens", [])
    dels += entry.get("del_ref_tokens", [])
    ins  += entry.get("ins_hyp_tokens", [])

# 6. DataFrames erstellen und Klassifikation durchführen
df_subs = pd.DataFrame({"token": subs})
df_subs["is_medical"] = df_subs["token"].apply(is_medical)

df_dels = pd.DataFrame({"token": dels})
df_dels["is_medical"] = df_dels["token"].apply(is_medical)

df_ins = pd.DataFrame({"token": ins})
df_ins["is_medical"] = df_ins["token"].apply(is_medical)

# 7. Ergebnisse speichern
df_subs.to_csv("result_subs_medical.csv", index=False)
df_dels.to_csv("result_dels_medical.csv", index=False)
df_ins.to_csv("result_ins_medical.csv", index=False)

# 8. Zusammenfassung ausgeben
print(f"Substitutions total: {len(df_subs)}, medical: {df_subs['is_medical'].sum()}")
print(f"Deletions     total: {len(df_dels)}, medical: {df_dels['is_medical'].sum()}")
print(f"Insertions    total: {len(df_ins)}, medical: {df_ins['is_medical'].sum()}")

# Hinweis: Es wurde nur das Matching aktualisiert: statt rein exaktem Token-Abgleich
# erfolgt nun zusätzliche Fuzzy-Suche via TF-IDF/Cosine (cutoff=0.70).


Substitutions total: 325, medical: 36
Deletions     total: 102, medical: 7
Insertions    total: 64, medical: 2


In [38]:
# Ziel: Ergänzung der Result-CSV um Spalten für Cosine-Similarity-Score und Best-Match-Vokabel 
# sowie Ref- und Hyp-Token-Spalten.

import json
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. WER-Token-JSON laden
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))

# 2. Medizinisches Vokabular laden
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"].dropna() for w in phrase.split())
adj_set  = set(w for phrase in cleaned["adjectives"].dropna()  for w in phrase.split())

atc = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
atc_terms = atc["ATC-Bedeutung_cleaned"].dropna().tolist()

ling = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")
comp_terms = ling["COMPONENT_cleaned"].dropna().tolist()

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", p.lower()).split():
            if w and w.isalpha():
                toks.add(w)
    return toks

med_vocab = sorted(noun_set.union(adj_set, tokenize_list(atc_terms), tokenize_list(comp_terms)))

# 3. TF-IDF Vectorizer aufbauen
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# 4. Funktion für Best-Match und Score
def best_match_and_score(token):
    if token in med_vocab:
        return token, 1.0
    vec = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    idx = sims.argmax()
    return med_vocab[idx], sims[idx]

# 5. Token-Listen extrahieren
subs_rows, dels_rows, ins_rows = [], [], []
for entry in wer_data:
    for r, h in zip(entry.get("subs_ref_tokens", []), entry.get("subs_hyp_tokens", [])):
        subs_rows.append((r, h))
    for r in entry.get("del_ref_tokens", []):
        dels_rows.append((r, "")) 
    for h in entry.get("ins_hyp_tokens", []):
        ins_rows.append(("", h))

# 6. DataFrames aufbauen
df_subs = pd.DataFrame(subs_rows, columns=["ref_token", "hyp_token"])
df_dels = pd.DataFrame(dels_rows, columns=["ref_token", "hyp_token"])
df_ins  = pd.DataFrame(ins_rows,  columns=["ref_token", "hyp_token"])



# 7. Zeilenweise Best-Match + Score ermitteln
for df in (df_subs, df_dels, df_ins):
    # Wähle pro Zeile das Token aus Ref oder Hyp
    tokens = df.apply(
        lambda row: row["ref_token"] if row["ref_token"] else row["hyp_token"],
        axis=1
    )
    # Wende best_match_and_score an
    matches = tokens.apply(best_match_and_score)
    # In separaten Series entpacken
    df["best_match"]    = matches.apply(lambda x: x[0])
    df["cosine_score"]  = matches.apply(lambda x: x[1])
    # Markiere medizinisch ab Score ≥ 0.70
    df["is_medical"]    = df["cosine_score"] >= 0.70

# 8. Ergebnisse speichern
df_subs.to_csv("result_subs_medical.csv", index=False)
df_dels.to_csv("result_dels_medical.csv", index=False)
df_ins .to_csv("result_ins_medical.csv", index=False)

print(f"Substitutions: {len(df_subs)} Zeilen; medical: {df_subs['is_medical'].sum()}")
print(f"Deletions:     {len(df_dels)} Zeilen; medical: {df_dels['is_medical'].sum()}")
print(f"Insertions:    {len(df_ins)} Zeilen; medical: {df_ins['is_medical'].sum()}")



Substitutions: 0 Zeilen; medical: 0
Deletions:     102 Zeilen; medical: 7
Insertions:    64 Zeilen; medical: 2


In [45]:
# Ziel: Vollständiger Workflow mit Padding der Listen, damit alle DataFrame-Spalten gleiche Länge haben.

import json
import re
import pandas as pd
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. WER-Token-Quellen laden
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))

# 2. Medizinisches Vokabular aus Lexika laden
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"].dropna() for w in phrase.split())
adj_set  = set(w for phrase in cleaned["adjectives"].dropna()  for w in phrase.split())

atc = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
atc_terms = atc["ATC-Bedeutung_cleaned"].dropna().tolist()

ling = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")
comp_terms = ling["COMPONENT_cleaned"].dropna().tolist()

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", p.lower()).split():
            if w and w.isalpha():
                toks.add(w)
    return toks

atc_set  = tokenize_list(atc_terms)
comp_set = tokenize_list(comp_terms)

med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))

# 3. TF-IDF Vectorizer vorbereiten
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# 4. Helper-Funktion
def best_match_and_score(token):
    if token in med_vocab:
        return token, 1.0
    vec = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    idx = sims.argmax()
    return med_vocab[idx], sims[idx]

# 5. Token-Listen sammeln
subs_ref = []
subs_hyp = []
dels_ref = []
ins_hyp  = []

for entry in wer_data:
    subs_ref += entry.get("subs_ref_tokens", [])
    subs_hyp += entry.get("subs_hyp_tokens", [])
    dels_ref += entry.get("del_ref_tokens", [])
    ins_hyp  += entry.get("ins_hyp_tokens", [])

# 6. Padding der Listen auf gleiche Länge
def pad_list(lst, length):
    return lst + [""]*(length - len(lst))

# Substitutions
max_sub = max(len(subs_ref), len(subs_hyp))
subs_ref = pad_list(subs_ref, max_sub)
subs_hyp = pad_list(subs_hyp, max_sub)
# Deletions (pad hyp)
dels_ref = pad_list(dels_ref, len(dels_ref))
dels_hyp = [""] * len(dels_ref)
# Insertions (pad ref)
ins_ref = [""] * len(ins_hyp)
ins_hyp = pad_list(ins_hyp, len(ins_hyp))

# 7. DataFrames erstellen
df_subs = pd.DataFrame({"ref_token": subs_ref, "hyp_token": subs_hyp})
df_dels = pd.DataFrame({"ref_token": dels_ref, "hyp_token": dels_hyp})
df_ins  = pd.DataFrame({"ref_token": ins_ref, "hyp_token": ins_hyp})

# 8. Matching und Score
for df in (df_subs, df_dels, df_ins):
    tokens = df["ref_token"].where(df["ref_token"]!="", df["hyp_token"])
    matches = tokens.apply(best_match_and_score)
    df["best_match"]   = matches.apply(lambda x: x[0])
    df["cosine_score"] = matches.apply(lambda x: x[1])
    df["is_medical"]   = df["cosine_score"] >= 0.80

# 9. Export
df_subs.to_csv("result_subs_medical.csv", index=False)
df_dels.to_csv("result_dels_medical.csv", index=False)
df_ins .to_csv("result_ins_medical.csv", index=False)

print("Exports fertig: result_subs_medical.csv, result_dels_medical.csv, result_ins_medical.csv")




Exports fertig: result_subs_medical.csv, result_dels_medical.csv, result_ins_medical.csv


In [48]:
# Ziel: Für jedes einzelne Wort aus Substitutions-, Deletions- und Insertions-Phrasen
# fuzzy-matchen wir gegen das medizinische Vokabular und speichern das Ergebnis in CSVs.

import json
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. WER-Token-Quellen laden
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))

# 2. Medizinisches Vokabular aus den Lexika laden
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"].dropna() for w in phrase.split())
adj_set  = set(w for phrase in cleaned["adjectives"].dropna()  for w in phrase.split())

atc = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
atc_terms = atc["ATC-Bedeutung_cleaned"].dropna().tolist()

ling = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")
comp_terms = ling["COMPONENT_cleaned"].dropna().tolist()

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", p.lower()).split():
            if w and w.isalpha():
                toks.add(w)
    return toks

atc_set  = tokenize_list(atc_terms)
comp_set = tokenize_list(comp_terms)

med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))

# 3. TF-IDF Vectorizer zur Fuzzy-Matching–Vorbereitung
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# 4. Hilfsfunktion: best_match + cosine_score für ein Token
def best_match_and_score(token):
    # Exakter Treffer
    if token in med_vocab:
        return token, 1.0
    # Fuzzy-Matching
    vec = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    idx = sims.argmax()
    return med_vocab[idx], sims[idx]

# 5. Einzelwörter aus allen WER-Operationen sammeln
subs_words = []
dels_words = []
ins_words  = []

for entry in wer_data:
    for phrase in entry.get("subs_ref_tokens", []):
        subs_words += [w for w in phrase.split()]
    for phrase in entry.get("del_ref_tokens", []):
        dels_words += [w for w in phrase.split()]
    for phrase in entry.get("ins_hyp_tokens", []):
        ins_words  += [w for w in phrase.split()]

# 6. DataFrames erstellen und fuzzy matchen
def build_df(tokens, label):
    df = pd.DataFrame({"token": tokens})
    df["best_match"], df["cosine_score"] = zip(*df["token"].apply(best_match_and_score))
    df["is_medical"] = df["cosine_score"] >= 0.80
    return df

df_subs = build_df(subs_words, "substitution")
df_dels = build_df(dels_words, "deletion")
df_ins  = build_df(ins_words,  "insertion")

# 7. CSVs exportieren
df_subs.to_csv("result_subs_medical.csv", index=False)
df_dels.to_csv("result_dels_medical.csv", index=False)
df_ins .to_csv("result_ins_medical.csv", index=False)

# 8. Kurze Zusammenfassung
print(f"Substitutions: {len(df_subs)} Wörter, {df_subs['is_medical'].sum()} medizinisch")
print(f"Deletions:     {len(df_dels)} Wörter, {df_dels['is_medical'].sum()} medizinisch")
print(f"Insertions:    {len(df_ins)} Wörter, {df_ins['is_medical'].sum()} medizinisch")


Substitutions: 615 Wörter, 60 medizinisch
Deletions:     217 Wörter, 15 medizinisch
Insertions:    259 Wörter, 22 medizinisch


# Finalize

In [50]:
# 0. (Einmalig) Dependencies installieren, falls nötig:
# !pip install jiwer pymongo pandas scikit-learn

import re
import json
import pandas as pd
from pymongo import MongoClient
from jiwer import process_words, collect_error_counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------------------------------------------------
# 1. Vorbereitung: medizinisches Vokabular laden & TF-IDF-Vectorizer fitten
# ---------------------------------------------------------------------
# a) Synonym-Korpus
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for phrase in cleaned["only_nouns"].dropna()  for w in phrase.split())
adj_set  = set(w for phrase in cleaned["adjectives"].dropna()   for w in phrase.split())

# b) ATC- & Prozedur-Lexika
atc   = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
ling  = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", str(p).lower()).split():
            if w.isalpha():
                toks.add(w)
    return toks

atc_set  = tokenize_list(atc["ATC-Bedeutung_cleaned"].dropna())
comp_set = tokenize_list(ling["COMPONENT_cleaned"].dropna())

# c) Gesamt-Vokabular und TF-IDF Vectorizer
med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

def best_match_and_score(token):
    """Gibt (best_match, cosine_score) zurück."""
    if token in med_vocab:
        return token, 1.0
    vec  = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    idx  = sims.argmax()
    return med_vocab[idx], sims[idx]

# ---------------------------------------------------------------------
# 2. Funktion: WER- und m-WER-Berechnung für ein einzelnes Ref/Hyp-Paar
# ---------------------------------------------------------------------
def compute_wer_mwer(ref: str, hyp: str, cutoff=0.80):
    # 1) WER-Alignment
    out        = process_words(ref, hyp)
    subs_d, ins_d, del_d = collect_error_counts(out)

    # 2) Standard-WER-Werte
    wer_score = out.wer
    S = sum(subs_d.values())
    D = sum(del_d.values())
    I = sum(ins_d.values())

    # 3) Einzelwörter extrahieren
    # Bei Substitution: key = (ref_phrase, hyp_phrase)
    subs_words = [
        w
        for (ref_phrase, _), cnt in subs_d.items()
        for w in ref_phrase.split()
        for _ in range(cnt)
    ]
    # Bei Deletion: key = ref_phrase
    del_words = [
        w
        for ref_phrase, cnt in del_d.items()
        for w in ref_phrase.split()
        for _ in range(cnt)
    ]
    # Bei Insertion: key = hyp_phrase
    ins_words = [
        w
        for hyp_phrase, cnt in ins_d.items()
        for w in hyp_phrase.split()
        for _ in range(cnt)
    ]

    # 4) Fuzzy-Medical-Check
    def is_med(tok):
        match, score = best_match_and_score(tok)
        return score >= cutoff

    S_med = sum(is_med(w) for w in subs_words)
    D_med = sum(is_med(w) for w in del_words)
    I_med = sum(is_med(w) for w in ins_words)

    # 5) m-WER: Nenner = Anzahl medizinischer Wörter im Ref
    ref_tokens = ref.split()
    total_med_ref = sum(is_med(w) for w in ref_tokens)
    mwer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else 0.0

    return {
        "wer": wer_score,
        "S": S, "D": D, "I": I,
        "S_med": S_med, "D_med": D_med, "I_med": I_med,
        "mwer": mwer
    }

# ---------------------------------------------------------------------
# 3. Schleife über alle Transcripts, Auslesen der Meta-Felder und CSV-Export
# ---------------------------------------------------------------------
client = MongoClient("mongodb://localhost:27018/")
col    = client["transcriptions"]["transcripts_denis"]

results = []
cursor  = col.find({"excludeGeneral": 0}, 
                   {"convoID":1, "ambientVariant":1, "processedVolume":1,
                    "technology":1, "model":1,
                    "src_wer_denis":1, "text_wer_denis":1})

for doc in cursor:
    meta = {
        "convoID":        doc.get("convoID"),
        "ambientVariant": doc.get("ambientVariant"),
        "processedVolume":doc.get("processedVolume"),
        "technology":     doc.get("technology"),
        "model":          doc.get("model")
    }
    # Ref/Hyp aus DB (bereits normalisiert)
    ref = doc.get("src_wer_denis", "")
    hyp = doc.get("text_wer_denis", "")

    # Kennzahlen berechnen
    scores = compute_wer_mwer(ref, hyp, cutoff=0.80)
    row    = {**meta, **scores}
    results.append(row)

# In DataFrame und CSV
df = pd.DataFrame(results)
df.to_csv("transcripts_wer_mwer.csv", index=False)

print(f"{len(df)} Transcripts verarbeitet und in 'transcripts_wer_mwer.csv' gespeichert.")


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


KeyboardInterrupt: 

In [52]:
# 0. (Einmalig) Dependencies installieren:
# !pip install jiwer pymongo pandas scikit-learn

import re
import json
import pandas as pd
from pymongo import MongoClient
from jiwer import process_words, collect_error_counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------------------------------------------------
# 1. Medizinisches Vokabular & TF-IDF-Vectorizer fitten
# ---------------------------------------------------------------------
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for ph in cleaned["only_nouns"].dropna() for w in ph.split())
adj_set  = set(w for ph in cleaned["adjectives"].dropna()  for w in ph.split())

atc   = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
ling  = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", str(p).lower()).split():
            if w.isalpha(): toks.add(w)
    return toks

atc_set  = tokenize_list(atc["ATC-Bedeutung_cleaned"].dropna())
comp_set = tokenize_list(ling["COMPONENT_cleaned"].dropna())

med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

def best_match_and_score(token):
    if token in med_vocab:
        return token, 1.0
    vec  = vectorizer.transform([token])
    sims = cosine_similarity(vec, X_vocab).flatten()
    idx  = sims.argmax()
    return med_vocab[idx], sims[idx]

# ---------------------------------------------------------------------
# 2. Einmaliges Sammeln aller UNIQUE TOKENS aus wer_token_sources.json
# ---------------------------------------------------------------------
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))
unique_tokens = set()

for entry in wer_data:
    unique_tokens.update(token for phrase in entry.get("subs_ref_tokens", []) for token in phrase.split())
    unique_tokens.update(token for phrase in entry.get("del_ref_tokens", [])   for token in phrase.split())
    unique_tokens.update(token for phrase in entry.get("ins_hyp_tokens", [])   for token in phrase.split())

# Batch TF-IDF + Cosine
tokens_list = sorted(unique_tokens)
X_tok       = vectorizer.transform(tokens_list)
sims_matrix = cosine_similarity(X_tok, X_vocab)

# Lookup-Map aufbauen einmalig
token_to_medinfo = {
    tok: (med_vocab[idx], float(sims_matrix[i, idx]))
    for i, tok in enumerate(tokens_list)
    for idx in [sims_matrix[i].argmax()]
}

# ---------------------------------------------------------------------
# 3. Schnelle WER + m-WER Berechnung unter Einsatz des Lookup-Maps
# ---------------------------------------------------------------------
def compute_wer_mwer_fast(ref, hyp, cutoff=0.80):
    out        = process_words(ref, hyp)
    subs_d, ins_d, del_d = collect_error_counts(out)

    # WER
    wer_score = out.wer

    # Hilfsfunktion für Count
    def count_med(dict_ops, key_idx):
        cnt = 0
        for key, c in dict_ops.items():
            # key_idx = 0 für Sub (ref side), None für Del/Ins (single-string)
            phrase = key[0] if isinstance(key, tuple) and key_idx==0 else key if not isinstance(key, tuple) else key[1]
            for w in phrase.split():
                if token_to_medinfo.get(w, ("",0.0))[1] >= cutoff:
                    cnt += 1
        return cnt

    S_med = count_med(subs_d, 0)
    D_med = count_med(del_d, None)
    I_med = count_med(ins_d, None)

    # m-WER Nenner: medizinische Wörter im Ref
    total_med_ref = sum(
        1 for w in ref.split()
        if token_to_medinfo.get(w, ("",0.0))[1] >= cutoff
    )
    mwer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else 0.0

    return wer_score, S_med, D_med, I_med, mwer

# ---------------------------------------------------------------------
# 4. Über alle Transcripts iterieren & CSV schreiben
# ---------------------------------------------------------------------
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]

cursor = col.find(
    {"excludeGeneral": 0},
    {"convoID":1, "ambientVariant":1, "processedVolume":1,
     "technology":1, "model":1,
     "src_wer_denis":1, "text_wer_denis":1}
)

rows = []
for doc in cursor:
    wer, S_med, D_med, I_med, mwer = compute_wer_mwer_fast(
        doc["src_wer_denis"], doc["text_wer_denis"], cutoff=0.80
    )
    rows.append({
        "convoID":         doc.get("convoID"),
        "ambientVariant":  doc.get("ambientVariant"),
        "processedVolume": doc.get("processedVolume"),
        "technology":      doc.get("technology"),
        "model":           doc.get("model"),
        "wer":             wer,
        "S_med":           S_med,
        "D_med":           D_med,
        "I_med":           I_med,
        "mwer":            mwer
    })

df = pd.DataFrame(rows)
df.to_csv("transcripts_wer_mwer_fast.csv", index=False)
print(f"{len(df)} Transcripts verarbeitet und in 'transcripts_wer_mwer_fast.csv' gespeichert.")


11880 Transcripts verarbeitet und in 'transcripts_wer_mwer_fast.csv' gespeichert.


In [53]:
# Ziel: Erweiterung des schnellen Workflows um Standard-WER-Counts (S, D, I) und Ausgabe in CSV

import re
import json
import pandas as pd
from pymongo import MongoClient
from jiwer import process_words, collect_error_counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Medizinisches Vokabular & TF-IDF einrichten (wie zuvor)
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for ph in cleaned["only_nouns"].dropna() for w in ph.split())
adj_set  = set(w for ph in cleaned["adjectives"].dropna()  for w in ph.split())
atc   = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
ling  = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", str(p).lower()).split():
            if w.isalpha(): toks.add(w)
    return toks

med_vocab = sorted(noun_set.union(adj_set,
                                  tokenize_list(atc["ATC-Bedeutung_cleaned"].dropna()),
                                  tokenize_list(ling["COMPONENT_cleaned"].dropna())))
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# 2. Fuzzy-Matching Lookup vorbereiten
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))
unique_tokens = set()
for e in wer_data:
    unique_tokens.update(w for ph in e.get("subs_ref_tokens", []) for w in ph.split())
    unique_tokens.update(w for ph in e.get("del_ref_tokens", [])   for w in ph.split())
    unique_tokens.update(w for ph in e.get("ins_hyp_tokens", [])   for w in ph.split())

tokens_list = sorted(unique_tokens)
X_tok = vectorizer.transform(tokens_list)
sims_matrix = cosine_similarity(X_tok, X_vocab)
token_to_medinfo = {
    tok: (med_vocab[idx], float(sims_matrix[i, idx]))
    for i, tok in enumerate(tokens_list)
    for idx in [sims_matrix[i].argmax()]
}

# 3. Schnelle Function inkl. Standard-WER Counts
def compute_wer_mwer_fast(ref, hyp, cutoff=0.80):
    out        = process_words(ref, hyp)
    subs_d, ins_d, del_d = collect_error_counts(out)
    wer_score = out.wer
    S = sum(subs_d.values())
    D = sum(del_d.values())
    I = sum(ins_d.values())

    def is_med(tok):
        return token_to_medinfo.get(tok, ("",0.0))[1] >= cutoff

    S_med = sum(is_med(w) for (ref_ph, _), cnt in subs_d.items() for w in ref_ph.split() for _ in range(cnt))
    D_med = sum(is_med(w) for ph, cnt in del_d.items() for w in ph.split() for _ in range(cnt))
    I_med = sum(is_med(w) for ph, cnt in ins_d.items() for w in ph.split() for _ in range(cnt))

    total_med_ref = sum(is_med(w) for w in ref.split())
    mwer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else 0.0

    return wer_score, S, D, I, S_med, D_med, I_med, mwer

# 4. Über alle Transcripts iterieren und CSV schreiben
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]
cursor = col.find(
    {"excludeGeneral": 0},
    {"convoID":1, "ambientVariant":1, "processedVolume":1,
     "technology":1, "model":1,
     "src_wer_denis":1, "text_wer_denis":1}
)

rows = []
for doc in cursor:
    wer, S, D, I, S_med, D_med, I_med, mwer = compute_wer_mwer_fast(
        doc["src_wer_denis"], doc["text_wer_denis"], cutoff=0.80
    )
    rows.append({
        "convoID":        doc.get("convoID"),
        "ambientVariant": doc.get("ambientVariant"),
        "processedVolume":doc.get("processedVolume"),
        "technology":     doc.get("technology"),
        "model":          doc.get("model"),
        "wer":            wer,
        "S":              S,
        "D":              D,
        "I":              I,
        "S_med":          S_med,
        "D_med":          D_med,
        "I_med":          I_med,
        "mwer":           mwer
    })

df = pd.DataFrame(rows)
df.to_csv("transcripts_wer_mwer_fast.csv", index=False)
print(f"{len(df)} Transcripts verarbeitet und in 'transcripts_wer_mwer_fast.csv' gespeichert.")


11880 Transcripts verarbeitet und in 'transcripts_wer_mwer_fast.csv' gespeichert.


In [54]:
# Ziel: Phrase-Level m-WER, bei dem jede WER-Operation als _med_ zählt, 
# wenn eine der enthaltenen Wörter medizinisch ist (Cutoff ≥ 0.80).

import re
import json
import pandas as pd
from pymongo import MongoClient
from jiwer import process_words, collect_error_counts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------------------
# 1. Medizinisches Vokabular vorbereiten
# -------------------------------
cleaned = pd.read_csv("lexikon_cleaned_ger_synonyms.csv")
noun_set = set(w for ph in cleaned["only_nouns"].dropna() for w in ph.split())
adj_set  = set(w for ph in cleaned["adjectives"].dropna()  for w in ph.split())

atc   = pd.read_csv("lexikon_ATC-Bedeutung_final_noarticles.csv")
ling  = pd.read_csv("lexikon_deDE15LinguisticVariant_final_noarticles.csv")

def tokenize_list(phrases):
    toks = set()
    for p in phrases:
        for w in re.sub(r"[^\w\säöüß]", " ", str(p).lower()).split():
            if w.isalpha(): toks.add(w)
    return toks

atc_set  = tokenize_list(atc["ATC-Bedeutung_cleaned"].dropna())
comp_set = tokenize_list(ling["COMPONENT_cleaned"].dropna())

med_vocab = sorted(noun_set.union(adj_set, atc_set, comp_set))
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(2,4))
X_vocab = vectorizer.fit_transform(med_vocab)

# Batch-unique tokens from previous JSON (to build lookup)
wer_data = json.load(open("wer_token_sources.json", encoding="utf-8"))
unique_tokens = set()
for e in wer_data:
    unique_tokens.update(w for ph in e.get("subs_ref_tokens", []) for w in ph.split())
    unique_tokens.update(w for ph in e.get("del_ref_tokens", [])   for w in ph.split())
    unique_tokens.update(w for ph in e.get("ins_hyp_tokens", [])   for w in ph.split())

tokens_list = sorted(unique_tokens)
X_tok = vectorizer.transform(tokens_list)
sims_matrix = cosine_similarity(X_tok, X_vocab)
token_to_score = {
    tok: float(sims_matrix[i].max())
    for i, tok in enumerate(tokens_list)
}

# -------------------------------
# 2. Phrase-Level compute function
# -------------------------------
def compute_wer_mwer_phrase(ref, hyp, cutoff=0.80):
    out       = process_words(ref, hyp)
    subs_d, ins_d, del_d = collect_error_counts(out)
    
    # Standard counts
    wer_score = out.wer
    S = sum(subs_d.values())
    D = sum(del_d.values())
    I = sum(ins_d.values())

    # Phrase-level medical counts
    S_med = sum(
        cnt for (ref_ph, _), cnt in subs_d.items()
        if any(token_to_score.get(w,0.0) >= cutoff for w in ref_ph.split())
    )
    D_med = sum(
        cnt for ref_ph, cnt in del_d.items()
        if any(token_to_score.get(w,0.0) >= cutoff for w in ref_ph.split())
    )
    I_med = sum(
        cnt for hyp_ph, cnt in ins_d.items()
        if any(token_to_score.get(w,0.0) >= cutoff for w in hyp_ph.split())
    )

    # m-WER denominator: number of medical words in ref
    ref_tokens = ref.split()
    total_med_ref = sum(1 for w in ref_tokens if token_to_score.get(w,0.0) >= cutoff)
    mwer = (S_med + D_med + I_med) / total_med_ref if total_med_ref else 0.0

    return wer_score, S, D, I, S_med, D_med, I_med, mwer

# -------------------------------
# 3. Über DB iterieren und CSV export
# -------------------------------
client = MongoClient("mongodb://localhost:27018/")
col = client["transcriptions"]["transcripts_denis"]

cursor = col.find(
    {"excludeGeneral": 0},
    {"convoID":1, "ambientVariant":1, "processedVolume":1,
     "technology":1, "model":1,
     "src_wer_denis":1, "text_wer_denis":1}
)

rows = []
for doc in cursor:
    ref = doc.get("src_wer_denis","")
    hyp = doc.get("text_wer_denis","")
    wer, S, D, I, S_med, D_med, I_med, mwer = compute_wer_mwer_phrase(ref, hyp)
    rows.append({
        "convoID":        doc.get("convoID"),
        "ambientVariant": doc.get("ambientVariant"),
        "processedVolume":doc.get("processedVolume"),
        "technology":     doc.get("technology"),
        "model":          doc.get("model"),
        "wer":            wer,
        "S":              S,
        "D":              D,
        "I":              I,
        "S_med":          S_med,
        "D_med":          D_med,
        "I_med":          I_med,
        "mwer":           mwer
    })

df = pd.DataFrame(rows)
df.to_csv("transcripts_wer_mwer_phrase.csv", index=False)
print(f"{len(df)} Transcripts verarbeitet. Ergebnis in 'transcripts_wer_mwer_phrase.csv'.")


11880 Transcripts verarbeitet. Ergebnis in 'transcripts_wer_mwer_phrase.csv'.
