In [1]:
# 1) Back up the collection before any fields are rewritten
from pymongo import MongoClient

# Connect to the local MongoDB instance
client = MongoClient("mongodb://localhost:27018/")
db     = client["transcriptions"]
src    = db["transcripts_denis"]
dst    = db["transcripts_denis_prelexchange"]

# Never overwrite an existing backup: once the source collection has been
# reprocessed, re-running this cell would otherwise replace the pristine copy
# (or, with insert_many, fail with duplicate-key errors on the second run).
if dst.estimated_document_count() > 0:
    print(f"Backup existiert bereits: {dst.count_documents({})} Dokumente in 'transcripts_denis_prelexchange' (nicht überschrieben)")
else:
    # $out copies all documents server-side into the backup collection,
    # so nothing is streamed through the notebook kernel.
    src.aggregate([{"$out": "transcripts_denis_prelexchange"}])
    print(f"Backup erstellt: {dst.count_documents({})} Dokumente in 'transcripts_denis_prelexchange'")


Backup erstellt: 12000 Dokumente in 'transcripts_denis_prelexchange'


In [5]:
# 02 Preprocessing Functions
# ---------------------------------------------------------------
import re
import string
from num2words import num2words
from jiwer import (
    Compose, Strip, RemoveWhiteSpace, RemovePunctuation,
    ToLowerCase, RemoveMultipleSpaces, RemoveEmptyStrings,
    SubstituteWords, ReduceToListOfListOfWords
)
# spaCy pipeline and its stop-word set, used for lexical similarity (TF-IDF)
import spacy
nlp = spacy.load("de_core_news_lg")
SPACY_STOPWORDS = nlp.Defaults.stop_words

# 9.1 jiwer pipeline for WER/CER/SER
def get_wer_transforms():
    """
    Build the jiwer transformation chain used for WER/CER/SER.

    Steps, in order: strip, replace whitespace characters by spaces,
    lowercase, expand abbreviations, remove punctuation, collapse repeated
    spaces, drop empty strings and reduce to a list of word lists.

    Returns:
        A ``jiwer.Compose`` instance.
    """
    return Compose([
        Strip(),
        RemoveWhiteSpace(replace_by_space=True),
        # Lowercase first: the substitution keys below are lowercase, so
        # "Dr." or "Z. B." would otherwise never be matched.
        ToLowerCase(),
        # SubstituteWords wraps each key in \b...\b. A key ending in "."
        # cannot satisfy the trailing \b before a space or at the end of the
        # string, so the keys omit the final period; the leftover "." is
        # dropped by RemovePunctuation right afterwards.
        SubstituteWords({
            "z. b": "zum beispiel",
            "z.b":  "zum beispiel",
            "dr":   "doktor"
        }),
        RemovePunctuation(),
        RemoveMultipleSpaces(),
        RemoveEmptyStrings(),
        ReduceToListOfListOfWords()
    ])

# 9.2 Eigene Pipeline-Elemente für die anderen Metriken

def norm_base(text: str) -> str:
    """Lowercase the text and collapse every whitespace run (including line breaks) to one space."""
    lowered = text.lower()
    # str.split() without arguments splits on any whitespace, '\r' and '\n' included,
    # and discards leading/trailing whitespace.
    words = lowered.split()
    return " ".join(words)

# Abbreviation expansion
def expand_abbrev_custom(text: str) -> str:
    """
    Expand common German abbreviations (case-insensitive).

    Handles "z. b." / "z.b." (with or without the final period) and
    "dr." / "dr". The abbreviation's own trailing period is consumed, so
    "dr. müller" becomes "doktor müller" rather than "doktor. müller".

    Args:
        text: Input text.

    Returns:
        The text with the known abbreviations replaced by their long form.
    """
    # (?!\w) instead of a trailing \b: \b cannot match between "." and a
    # following space, which made the regex back off and leave the period
    # behind. \s* additionally accepts the compact spelling "z.b.".
    mapping = {
        r"\bz\.\s*b\.?(?!\w)": "zum beispiel",
        r"\bdr\.?(?!\w)":      "doktor"
    }
    t = text
    for pat, full in mapping.items():
        t = re.sub(pat, full, t, flags=re.IGNORECASE)
    return t

# Umlaut normalization
def normalize_umlaute(text: str) -> str:
    """Transliterate German umlauts and ß into their ASCII digraphs."""
    ascii_table = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
        'ß': 'ss',
    })
    return text.translate(ascii_table)

# Punctuation removal
def remove_punct(text: str) -> str:
    """Drop every character listed in ``string.punctuation`` (ASCII punctuation only)."""
    punctuation_chars = frozenset(string.punctuation)
    kept = (ch for ch in text if ch not in punctuation_chars)
    return "".join(kept)

# Zahlenerweiterung und Verhältnis-Ersetzung

def expand_slash_ratios(text: str) -> str:
    """Rewrite numeric ratios of the form X/Y (optional spaces around the slash) as 'X über Y'."""
    ratio_pattern = re.compile(r"\b(\d+)\s*/\s*(\d+)\b")
    replaced = ratio_pattern.sub(r"\1 über \2", text)
    return replaced


def expand_numbers(text: str, lang: str = 'de') -> str:
    """
    Expand numeric tokens into words.

    Processing order:
      1) Ratios 'X/Y' are rewritten via ``expand_slash_ratios``.
      2) Decimals with a period: '97.9' -> 'siebenundneunzig punkt neun'
      3) Decimals with a comma:  '97,9' -> 'siebenundneunzig komma neun'
      4) Remaining integers:     '65'   -> 'fünfundsechzig'

    Leading zeros of the fractional part are spelled out one by one
    ('1,05' -> 'eins komma null fünf'), so '1,05' and '1,5' no longer
    collapse to the same wording. The rest of the fraction is still read as
    one number, as before.

    Args:
        text: Input text.
        lang: Language code passed on to ``num2words``.

    Returns:
        The text with digit sequences replaced by number words.
    """
    # Ratios first, so the slash form is gone before digits are expanded
    text = expand_slash_ratios(text)

    def _spell_fraction(frac: str) -> str:
        # int("05") would silently drop the zero; spell each leading zero instead.
        significant = frac.lstrip("0")
        zero_count = len(frac) - len(significant)
        parts = [num2words(0, lang=lang)] * zero_count
        if significant:
            parts.append(num2words(int(significant), lang=lang))
        return " ".join(parts)

    # Decimals
    def repl_decimal(m):
        whole, sep, frac = m.group(1), m.group(2), m.group(3)
        w = num2words(int(whole), lang=lang)
        sep_word = "punkt" if sep == "." else "komma"
        return f"{w} {sep_word} {_spell_fraction(frac)}"
    text = re.sub(r"\b(\d+)([.,])(\d+)\b", repl_decimal, text)

    # Plain integers
    text = re.sub(
        r"\b(\d+)\b",
        lambda m: num2words(int(m.group(1)), lang=lang),
        text
    )
    return text

# 9.3 Pipelines for specific metrics

def preprocess_text_for_wer(text: str) -> str:
    """
    Normalize text for WER-style comparison.

    Steps: base normalization, abbreviation expansion, number expansion,
    umlaut folding, punctuation removal.

    Args:
        text: Raw transcript or reference text.

    Returns:
        The normalized string.
    """
    t = norm_base(text)
    t = expand_abbrev_custom(t)
    # Expand numbers BEFORE folding umlauts: the generated number words
    # ("fünf", "über") contain umlauts themselves and must be folded the same
    # way as words that were already written out ("fünf" -> "fuenf").
    t = expand_numbers(t)
    t = normalize_umlaute(t)
    t = remove_punct(t)
    return t


def preprocess_text_for_lexical_cosine(text: str) -> str:
    """
    Normalize text for lexical (bag-of-words) cosine similarity.

    Steps: base normalization, abbreviation expansion, number expansion,
    punctuation removal, stop-word filtering, umlaut folding.

    Args:
        text: Raw transcript or reference text.

    Returns:
        Space-joined tokens without stop words, umlauts folded to ASCII.
    """
    t = norm_base(text)
    t = expand_abbrev_custom(t)
    # Numbers are expanded before umlaut folding so that generated words
    # ("fünf") end up in the same form as written-out ones.
    t = expand_numbers(t)
    t = remove_punct(t)
    # Filter stop words while the tokens still carry their umlauts; after
    # folding, "für"/"über" would read "fuer"/"ueber" and no longer be found
    # in SPACY_STOPWORDS.
    tokens = [tok for tok in t.split() if tok not in SPACY_STOPWORDS]
    return normalize_umlaute(" ".join(tokens))


def preprocess_text_for_semantic_cosine(text: str) -> str:
    """Apply only the base normalization (``norm_base``) and return its result."""
    normalized = norm_base(text)
    return normalized

# 9.4 Pipelines for NER (MEER)
import spacy

# Reuse the "de_core_news_lg" pipeline already loaded above as `nlp` instead of
# loading the same large model a second time (saves load time and memory).
_nlp = nlp

# Truecasing for models that do not produce native casing
def truecase_text(text: str) -> str:
    """
    Lowercase the text, then restore casing from spaCy part-of-speech tags.

    A token is capitalized when it starts a sentence (first token, or first
    word after '.', '!' or '?') or when it is tagged NOUN or PROPN.

    The original spacing is preserved: each token is followed by the
    whitespace spaCy recorded for it, so no space is inserted before
    punctuation ("berlin." stays "Berlin.", not "Berlin .").

    Args:
        text: Input text in arbitrary casing.

    Returns:
        The truecased text.
    """
    sentence_end = {".", "!", "?"}
    doc = _nlp(text.lower())
    pieces = []
    capitalize_next = True
    for token in doc:
        # Sentence start or noun / proper noun -> capitalize
        if capitalize_next or token.pos_ in {"NOUN", "PROPN"}:
            surface = token.text.capitalize()
        else:
            surface = token.text
        pieces.append(surface + token.whitespace_)
        if token.text in sentence_end:
            capitalize_next = True
        elif not (token.is_punct or token.is_space):
            # Only a real word uses up the sentence-start flag; an opening
            # quote or bracket keeps it pending for the word that follows.
            capitalize_next = False
    return "".join(pieces)


# NER preprocessing; whether truecasing is applied depends on the ASR model
def preprocess_text_for_ner_meer(text: str, model: str) -> str:
    """
    Prepare text as NER (MEER) input.

    Whitespace is trimmed and collapsed for every model. For the models
    "vosk-model-de-0.21" and "whisper_rescuespeech" the result is additionally
    passed through ``truecase_text``.

    Args:
        text:  Transcript text.
        model: Name of the model that produced the text.

    Returns:
        The cleaned (and, for the listed models, truecased) text.
    """
    # split() without arguments already ignores leading/trailing whitespace
    collapsed = " ".join(text.split())
    needs_truecasing = model in {"vosk-model-de-0.21", "whisper_rescuespeech"}
    if not needs_truecasing:
        return collapsed
    return truecase_text(collapsed)

# Usage:
#  ner_input = preprocess_text_for_ner_meer(doc_text, doc_model)
#  src_meer_denis bleibt unverändert

In [6]:
# 2) Recompute the lexical fields with the current preprocessing pipeline
from pymongo import MongoClient, UpdateOne
import spacy

# Connection and collection
client = MongoClient("mongodb://localhost:27018/")
db     = client["transcriptions"]
coll   = db["transcripts_denis"]

# Number of updates sent to the server per bulk request
BATCH_SIZE = 1000

pending = []
# Only the two input fields are needed; "_id" is always returned.
for doc in coll.find({}, {"text": 1, "srcText": 1}):
    # `or ""` also covers fields that exist but hold None
    raw = doc.get("text") or ""
    ref = doc.get("srcText") or ""

    # $set replaces any existing value, so a preceding $unset is unnecessary.
    pending.append(UpdateOne(
        {"_id": doc["_id"]},
        {"$set": {
            "text_lex_denis": preprocess_text_for_lexical_cosine(raw),
            "src_lex_denis":  preprocess_text_for_lexical_cosine(ref),
        }}
    ))
    if len(pending) >= BATCH_SIZE:
        coll.bulk_write(pending, ordered=False)
        pending = []

# bulk_write rejects an empty request list, so flush only if something is left
if pending:
    coll.bulk_write(pending, ordered=False)

print("Reprocessing abgeschlossen: 'text_lex_denis' und 'src_lex_denis' neu befüllt.")


Reprocessing abgeschlossen: 'text_lex_denis' und 'src_lex_denis' neu befüllt.
