In [32]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Deine deutschen Medizinbegriffe
terms = ["Übelkeit"]

# 1) Fach-Synonyme aus OpenThesaurus
def get_ot_synonyms(term):
    url = f"https://www.openthesaurus.de/synonyme/search?q={term}&format=application/json"
    resp = requests.get(url, timeout=5)
    data = resp.json()
    syns = set()
    for synset in data.get("synsets", []):
        for t in synset.get("terms", []):
            syns.add(t["term"])
    return list(syns)



# Zusammensetzen
records = []
for term in terms:
    syn_ot   = get_ot_synonyms(term)
    syn_wik  = get_wiktionary_synonyms(term)
    syn_ols  = get_ols_synonyms(term)
    # alle Quellen zusammenführen, Duplikate entfernen, max. 10
    all_syns = list(dict.fromkeys(syn_ot + syn_wik + syn_ols))[:20]
    records.append({
        "term": term,
        "synonyms": all_syns
    })

df = pd.DataFrame(records)
df

Unnamed: 0,term,synonyms
0,Übelkeit,"[Bewegungskrankheit, Schlechtsein, Seekrankhei..."


In [38]:
syn_ot

['Bewegungskrankheit',
 'Schlechtsein',
 'Seekrankheit',
 'Übelkeit',
 'Brechreiz',
 'Nausea',
 'Kinetose']

In [39]:
for _, row in df.iterrows():
    print(f"Term: {row['term']}")
    for syn in row['synonyms']:
        print(f"  - {syn}")

Term: Übelkeit
  - Bewegungskrankheit
  - Schlechtsein
  - Seekrankheit
  - Übelkeit
  - Brechreiz
  - Nausea
  - Kinetose


In [43]:
from pymongo import MongoClient

# 1) Verbindung zu MongoDB auf Port 27018
client = MongoClient("mongodb://localhost:27018/")
db = client["umls"]

# 2) Collections
eng_col = db["mrconso-eng"]
ger_col = db["mrconso-ger"]

# 3) Englischen Term definieren
eng_term = "Neoplasm of abdomen"

# 4) CUI(s) für den englischen Term holen (exact, case-sensitive)
cuis = eng_col.distinct("CUI", {"LAT": "ENG", "STR": eng_term})
print(f"CUI(s) für '{eng_term}': {cuis}")

# 5) Alle deutschen Synonyme für diese CUI(s) abfragen
ger_synonyms = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": cuis}})

# 6) Ausgabe der deutschen Synonyme
print(f"\nDeutsche Synonyme für '{eng_term}':")
for syn in sorted(ger_synonyms):
    print(" -", syn)

CUI(s) für 'Neoplasm of abdomen': ['C0000735']

Deutsche Synonyme für 'Neoplasm of abdomen':
 - Abdominaltumor
 - Abdominaltumoren
 - Abdominelle Neubildung NNB
 - Abdominelles Neoplasma
 - Neoplasma, abdominelles
 - Neubildung des Abdomens


In [49]:
import re
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27018/")
db = client["umls"]
eng_col = db["mrconso-eng"]
ger_col = db["mrconso-ger"]

# 1) Dein Input-Term (kann ein Wort oder mehrere sein)
eng_input = "HYPOTENSION NOS"

# 2) Regex-Pattern für den Token (optionales Plural-S)
tok = re.escape(eng_input)
pattern = rf".*\b{tok}(?:s)?\b.*"      # z.B. r".*\bneoplasm(?:s)?\b.*"
regex = re.compile(pattern, re.IGNORECASE)

# 3) CUI(s) finden
cuis = eng_col.distinct("CUI", {
    "LAT": "ENG",
    "STR": {"$regex": regex}
})
print("Gefundene CUI(s):", cuis)

# 4) Deutsche Synonyme holen
ger_synonyms = ger_col.distinct("STR", {
    "LAT": "GER",
    "CUI": {"$in": cuis}
})
print("\nDeutsche Synonyme:")
for syn in sorted(ger_synonyms):
    print(" -", syn)


Gefundene CUI(s): ['C0020649']

Deutsche Synonyme:
 - Abfall des Blutdrucks
 - Arterielle Hypotonie
 - Arterieller Blutdruck NNB erniedrigt
 - Arterieller Blutdruck erniedrigt
 - Arterienblutdruck erniedrigt
 - BLUTDRUCK ERNIEDRIGT
 - BLUTDRUCKABFALL ARTERIELL
 - Blutdruck abgefallen
 - Blutdruck erniedrigt
 - Blutdruck gefallen
 - Blutdruck niedrig
 - Blutdruck verringert
 - Blutdruck, niedriger
 - Blutdruckabfall arteriell
 - DRUCK ARTERIELL VERMINDERT
 - Fall des Blutdrucks
 - HYPOTONIE
 - Hypotension
 - Hypotonie
 - Hypotonie NNB
 - Hypotonie, nicht naeher bezeichnet
 - Hypotonie, nicht spezifiziert
 - Niedriger Blutdruck
 - Verringerter Blutdruck


In [57]:
from pymongo import MongoClient

# Verbindung zu deiner lokalen MongoDB auf Standardport 27017
client = MongoClient("mongodb://localhost:27017/")

# Datenbank- und Collection-Namen anpassen, falls nötig
db = client["MIMIC-IV"]        # oder z.B. "mimiciv"
col = db["ED-Diagnosis"]            # oder "ed_diagnoses"

# Variante 1: distinct() auf icd_title
unique_titles = col.distinct("icd_title")
print(f"Distinct ICD-Titel (Variante 1): {len(unique_titles)}")
unique_codes = col.distinct("icd_code")
print(f"Distinct ICD-Codes (Variante 1): {len(unique_codes)}")


Distinct ICD-Titel (Variante 1): 13172
Distinct ICD-Codes (Variante 1): 13199


In [55]:
from pymongo import MongoClient

# Verbindung
client = MongoClient("mongodb://localhost:27017/")
db     = client["localhost_2017"]      # deine DB
col    = db["ED-Diagnosis"]            # deine Collection

# Variante 1: distinct()
unique_codes = col.distinct("icd_code")
print(f"Distinct ICD-Codes (Variante 1): {len(unique_codes)}")

# Variante 2: Aggregation
pipeline = [
    {"$group": {"_id": "$icd_code"}},
    {"$count": "uniqueCount"}
]
agg_result = list(col.aggregate(pipeline))
count = agg_result[0]["uniqueCount"] if agg_result else 0
print(f"Distinct ICD-Codes (Variante 2): {count}")


Distinct ICD-Codes (Variante 1): 0
Distinct ICD-Codes (Variante 2): 0


In [71]:
import requests
import urllib.parse            # <<<< hier hinzufügen
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient

# Verbindungen
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client = MongoClient("mongodb://localhost:27018/")
syn_db     = syn_client["En2DeSyn"]
syn_col    = syn_db["synonyms"]

def get_cui_synonyms(eng_term: str) -> list[str]:
    """
    Holt alle deutschen UMLS-Synonyme für einen englischen Begriff.
    """
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {
        "LAT": "ENG",
        "STR": {"$regex": regex}
    })
    if not cuis:
        return []

    ger_syns = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": cuis}
    })
    # alles klein und unique
    return sorted({s.lower() for s in ger_syns})


def get_ot_synonyms(term: str) -> list[str]:
    """
    Ruft OpenThesaurus-Synonyme für ein deutsches Wort ab.
    Liefert [] bei HTTP- oder JSON-Fehlern.
    """
    # 1) Term URL-encoden
    q = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"

    try:
        resp = requests.get(url, timeout=5)
        # 2) 200 OK?
        if resp.status_code != 200:
            return []
        # 3) JSON parsen
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []

    # 4) Synonyme extrahieren (alles lowercase)
    syns = {
        t["term"].lower()
        for synset in data.get("synsets", [])
        for t in synset.get("terms", [])
    }
    return sorted(syns)


def test_term(eng_term: str):
    """
    Einzeltet: druckt den Input und alle gefundenen deutschen Synonyme.
    """
    print(f"Input English term: {eng_term}\n")

    # UMLS-Synonyme
    umls_syns = get_cui_synonyms(eng_term)
    print("UMLS German Synonyms:")
    for s in umls_syns:
        print(" -", s)
    print()

    # OpenThesaurus-Erweiterung
    ot_syns = set()
    for s in umls_syns:
        ot_syns.update(get_ot_synonyms(s))

    if ot_syns:
        print("OpenThesaurus expansions:")
        for s in sorted(ot_syns):
            print(" -", s)
        print()

    # Kombinierte Liste
    all_syns = sorted(set(umls_syns) | ot_syns)
    print("Combined German Synonyms:")
    for s in all_syns:
        print(" -", s)


def populate_synonym_collection():
    """
    Läuft über alle unique ICD-Titel in deiner ED-Diagnosis-Collection,
    holt die deutschen Synonyme aus UMLS + OT, und speichert sie
    als einzelne Dokumente in En2DeSyn.synonyms.
    """
    # Quelle: MIMIC-ED-Diagnoses auf localhost:27017
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["localhost_2017"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")

    for term in eng_terms:
        norm_term = term.lower()
        umls_syns  = get_cui_synonyms(norm_term)

        ot_syns = set()
        for s in umls_syns:
            ot_syns.update(get_ot_synonyms(s))

        all_syns = sorted(set(umls_syns) | ot_syns)

        doc = {
            "_id":         norm_term,
            "eng_term":    norm_term,
            "de_synonyms": all_syns
        }
        syn_col.replace_one({"_id": norm_term}, doc, upsert=True)


In [73]:
test_term("weakness")

Input English term: weakness

UMLS German Synonyms:
 - abduzensparese
 - absteigende muskelschwaeche
 - allg schwaeche/krankheitsgefuehl
 - allgemeine schwaeche
 - asthenie
 - asymmetrische muskelschwaeche der gliedmassen
 - asymmetrische muskelschwaeche der unteren gliedmassen
 - auf der intensivstation erworbene schwaeche
 - augenmuskellã¤hmung, chronische progressive, externe
 - augenmuskelschwaeche
 - beckenbodenschwaeche
 - beinparese
 - chronisch progressive externe ophthalmoplegie
 - cpeo
 - duenner puls
 - einseitige schwaeche der gliedmassen
 - erkrankung des nervus abducens
 - erkrankung des vi. hirnnervs
 - externe ophthalmoplegie
 - fazioplegie
 - filiformer puls
 - gefuehle der schwaeche
 - generalisierte muskelschwaeche
 - gesichtsmuskelschwaeche
 - gesichtsparese
 - gesichtsschwaeche
 - graefe-syndrom
 - haende schwaeche
 - hemiparese
 - hemiparese (links)
 - hemiparese (rechts)
 - herabhaengendes gesicht
 - herzfunktion versagt
 - herzfunktion, versagen
 - herzinsuffizi

In [81]:
from ftfy import fix_text

fixed = fix_text("rumpfgã¼rtel")
# -> "rumpfgürtel"


In [83]:
fixed

'rumpfgã¼rtel'

In [107]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text  # für Encoding-Reparatur

# Verbindungen
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client = MongoClient("mongodb://localhost:27018/")
syn_db     = syn_client["En2DeSyn"]
syn_col    = syn_db["synonyms"]

def get_cui_synonyms(eng_term: str) -> list[str]:
    """
    Holt alle deutschen UMLS-Synonyme für einen englischen Begriff.
    Repariert dabei fehlerhafte Encodings (z.B. rumpfgã¼rtel → Rumpfgürtel).
    """
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {
        "LAT": "ENG",
        "STR": {"$regex": regex}
    })
    if not cuis:
        return []

    raw_syns = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": cuis}
    })

    # Encoding-Fixes mit ftfy und Lowercase + Unique
    fixed = { fix_text(s).lower() for s in raw_syns }
    return sorted(fixed)

def get_ot_synonyms(term: str) -> list[str]:
    """
    Ruft OpenThesaurus-Synonyme für ein deutsches Wort ab.
    Liefert [] bei HTTP- oder JSON-Fehlern.
    Reapriert Encoding-Fehler ebenfalls.
    """
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"

    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []

    raw = {
        t["term"]
        for synset in data.get("synsets", [])
        for t in synset.get("terms", [])
    }
    # Encoding-Fix + lowercase + unique
    fixed = { fix_text(s).lower() for s in raw }
    return sorted(fixed)

def test_term(eng_term: str):
    """
    Einzeltet: druckt den Input und alle gefundenen deutschen Synonyme.
    """
    print(f"Input English term: {eng_term}\n")

    umls_syns = get_cui_synonyms(eng_term)
    print("UMLS German Synonyms:")
    for s in umls_syns:
        print(" -", s)
    print()

    ot_syns = set()
    for s in umls_syns:
        ot_syns.update(get_ot_synonyms(s))

    if ot_syns:
        print("OpenThesaurus expansions:")
        for s in sorted(ot_syns):
            print(" -", s)
        print()

    all_syns = sorted(set(umls_syns) | ot_syns)
    print("Combined German Synonyms:")
    for s in all_syns:
        print(" -", s)

def populate_synonym_collection():
    """
    Läuft über alle unique ICD-Titel in deiner ED-Diagnosis-Collection,
    holt die deutschen Synonyme aus UMLS + OT, sorgt für Encoding-Fixes
    und speichert sie in En2DeSyn.synonyms.
    Gibt dabei Fortschritt i/total aus.
    """
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")
    total     = len(eng_terms)
    print(f"Found {total} unique ICD-Titles. Starting population…\n")

    for idx, term in enumerate(eng_terms, start=1):
        norm_term = term.lower()
        print(f"[{idx}/{total}] Processing: {norm_term}")

        umls_syns = get_cui_synonyms(norm_term)
        ot_syns   = set()
        for s in umls_syns:
            ot_syns.update(get_ot_synonyms(s))

        all_syns = sorted(set(umls_syns) | ot_syns)

        doc = {
            "_id":         norm_term,
            "eng_term":    norm_term,
            "de_synonyms": all_syns
        }
        syn_col.replace_one({"_id": norm_term}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms enthält jetzt alle Einträge.")



In [None]:
test_term("weakness")

In [109]:
# Zeigt Fortschritt und füllt En2DeSyn.synonyms
populate_synonym_collection()


Found 13172 unique ICD-Titles. Starting population…

[1/13172] Processing: (idiopathic) normal pressure hydrocephalus
[2/13172] Processing: (induced) termination of pregnancy with other complications
[3/13172] Processing: (induced) termination of pregnancy with unsp complications
[4/13172] Processing: 1 deg burn back of hand
[5/13172] Processing: 10 weeks gestation of pregnancy
[6/13172] Processing: 10-19% bdy brn/10-19% 3d
[7/13172] Processing: 10-19% bdy brn/3 deg nos
[8/13172] Processing: 11 weeks gestation of pregnancy
[9/13172] Processing: 12 weeks gestation of pregnancy
[10/13172] Processing: 13 weeks gestation of pregnancy
[11/13172] Processing: 14 weeks gestation of pregnancy
[12/13172] Processing: 15 weeks gestation of pregnancy
[13/13172] Processing: 16 weeks gestation of pregnancy
[14/13172] Processing: 17 weeks gestation of pregnancy


KeyboardInterrupt: 

In [121]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy

# 1) spaCy-Modell laden (automatisch downloaden, falls nötig)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client = MongoClient("mongodb://localhost:27018/")
syn_db     = syn_client["En2DeSyn"]
syn_col    = syn_db["synonyms"]

FILLER_WORDS = {"of", "in", "for", "and", "or", "nos", "unspecified"}
TIME_UNITS   = {"week", "weeks", "day", "days", "month", "months", 
                "year", "years", "hour", "hours", "minute", "minutes"}

def get_cui_synonyms(eng_term: str, strict: bool = True) -> list[str]:
    """
    Holt deutsche UMLS-Synonyme für einen englischen Begriff.
    Wenn strict=False, wird nur auf Substring statt Wort­grenze gematcht.
    """
    tok = re.escape(eng_term)
    if strict:
        pattern = rf".*\b{tok}(?:s)?\b.*"
    else:
        pattern = rf".*{tok}.*"
    regex = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {
        "LAT": "ENG",
        "STR": {"$regex": regex}
    })
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": cuis}
    })
    # Encoding-Fix + lowercase + unique
    return sorted({fix_text(s).lower() for s in raw})

def get_ot_synonyms(term: str) -> list[str]:
    """
    Ruft OpenThesaurus-Synonyme ab. Fehler-resilient.
    """
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []

    raw = {
        t["term"]
        for synset in data.get("synsets", [])
        for t in synset.get("terms", [])
    }
    return sorted({fix_text(s).lower() for s in raw})

def extract_nouns(phrase: str) -> list[str]:
    """
    Entfernt Zahlen, Füllwörter und Zeiteinheiten, 
    taggt mit spaCy und gibt nur Nomen zurück.
    """
    tokens = phrase.lower().split()
    cleaned = [
        t for t in tokens
        if not re.fullmatch(r"\d+", t)
           and t not in FILLER_WORDS
           and t not in TIME_UNITS
    ]
    doc = nlp(" ".join(cleaned))
    # unique while preserving order
    seen = set()
    nouns = []
    for tok in doc:
        if tok.pos_ == "NOUN" and tok.text not in seen:
            seen.add(tok.text)
            nouns.append(tok.text)
    return nouns

def get_de_synonyms(eng_phrase: str) -> list[str]:
    """
    1) Versucht komplette Phrase zu matchen.
    2) Fallback: für jedes Nomen striktes, dann loses Matching.
    """
    # --- Versuch 1: ganze Phrase ---
    umls_syns = get_cui_synonyms(eng_phrase, strict=True)
    ot_syns   = set()
    for g in umls_syns:
        ot_syns.update(get_ot_synonyms(g))
    combined = set(umls_syns) | ot_syns

    # --- Fallback auf Nomen? ---
    if not combined:
        nouns = extract_nouns(eng_phrase)
        for noun in nouns:
            # erst striktes Matching
            syns = get_cui_synonyms(noun, strict=True)
            # dann – falls leer – loses Matching
            if not syns:
                syns = get_cui_synonyms(noun, strict=False)
            # sammle UMLS-Synonyme
            combined.update(syns)
            # und OT-Erweiterung
            for g in syns:
                ot_syns.update(get_ot_synonyms(g))
        combined |= ot_syns

    return sorted(combined)


def populate_synonym_collection():
    """
    Läuft über alle unique ICD-Titel, holt Deutsche Synonyme via get_de_synonyms(),
    erstellt/enriched die Collection En2DeSyn.synonyms und gibt Fortschritt [i/total].
    """
    # Quelle: MIMIC-IV ED-Diagnoses auf localhost:27017
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")
    total     = len(eng_terms)
    print(f"Found {total} unique ICD-Titles. Starting…\n")

    for idx, term in enumerate(eng_terms, start=1):
        norm_term = term.lower()
        print(f"[{idx}/{total}] Processing: {norm_term}")

        # Deutsche Synonyme mit neuem Fallback
        all_syns = get_de_synonyms(norm_term)

        # Dokument zusammenstellen
        doc = {
            "_id":         norm_term,
            "eng_term":    norm_term,
            "de_synonyms": all_syns
        }
        # Upsert in En2DeSyn.synonyms
        syn_col.replace_one({"_id": norm_term}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms enthält jetzt alle Einträge.")





In [134]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy
from rapidfuzz import process, fuzz

# 1) spaCy-Modell laden (für extract_nouns, falls du es noch brauchst)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client = MongoClient("mongodb://localhost:27018/")
syn_db     = syn_client["En2DeSyn"]
syn_col    = syn_db["synonyms"]

# 3) Stelle sicher, dass der Text-Index existiert (einmalig in der Shell):
# db.getCollection("mrconso-eng").createIndex({STR: "text"}, {default_language: "english"})

def get_cui_synonyms_strict(eng_term: str) -> list[str]:
    """
    Holt deutsche UMLS-Synonyme via Regex-Wortgrenzen-Matching (strict).
    """
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {
        "LAT": "ENG",
        "STR": {"$regex": regex}
    })
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": cuis}
    })
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_textindex(eng_term: str, top_k: int = 5) -> list[str]:
    """
    Holt deutsche UMLS-Synonyme über MongoDB $text-Index-Ranking.
    """
    cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"CUI": 1, "_id": 0, "score": {"$meta": "textScore"}}
    ).sort([("score", {"$meta": "textScore"})]).limit(top_k)

    cuis = {doc["CUI"] for doc in cursor}
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": list(cuis)}
    })
    return sorted({fix_text(s).lower() for s in raw})

def get_ot_synonyms(term: str) -> list[str]:
    """
    Ruft OpenThesaurus-Synonyme ab und bereinigt sie.
    """
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []
    raw = {
        t["term"]
        for synset in data.get("synsets", [])
        for t in synset.get("terms", [])
    }
    return sorted({fix_text(s).lower() for s in raw})

def get_de_synonyms(eng_phrase: str) -> list[str]:
    """
    1) Striktes 1:1-Lookup (Regex)
    2) Fallback: Text-Index-Lookup
    3) Optional OpenThesaurus-Erweiterung
    """
    # 1) Striktes Matching
    umls_syns = get_cui_synonyms_strict(eng_phrase)
    if umls_syns:
        # gleich liefern, optional OT-Erweiterung:
        ot = set()
        for s in umls_syns:
            ot.update(get_ot_synonyms(s))
        return sorted(set(umls_syns) | ot)

    # 2) Text-Index Fallback
    idx_syns = get_cui_synonyms_textindex(eng_phrase)
    ot = set()
    for s in idx_syns:
        ot.update(get_ot_synonyms(s))

    return sorted(set(idx_syns) | ot)

def populate_synonym_collection():
    """
    Befüllt En2DeSyn.synonyms mit [i/total]-Statusanzeige.
    """
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")
    total     = len(eng_terms)
    print(f"Found {total} unique ICD-Titles. Starting…\n")

    for idx, term in enumerate(eng_terms, start=1):
        norm = term.lower()
        print(f"[{idx}/{total}] {norm}")

        syns = get_de_synonyms(norm)
        doc = {"_id": norm, "eng_term": norm, "de_synonyms": syns}
        syn_col.replace_one({"_id": norm}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms ist befüllt.")

# In deinem Notebook dann einfach:
# populate_synonym_collection()


In [146]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy
from rapidfuzz import process, fuzz

# 1) spaCy-Modell laden (für extract_nouns, falls noch gebraucht)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client = MongoClient("mongodb://localhost:27018/")
syn_db     = syn_client["En2DeSyn"]
syn_col    = syn_db["synonyms"]

# 3) Stelle sicher, dass der Text-Index existiert (einmalig in der Shell):
# db.getCollection("mrconso-eng").createIndex({STR: "text"}, {default_language: "english"})

def get_cui_synonyms_strict(eng_term: str) -> list[str]:
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)
    cuis = eng_col.distinct("CUI", {"LAT": "ENG", "STR": {"$regex": regex}})
    if not cuis:
        return []
    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": cuis}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_textindex(eng_term: str, top_k: int = 5) -> list[str]:
    cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"CUI": 1, "_id": 0, "score": {"$meta": "textScore"}}
    ).sort([("score", {"$meta": "textScore"})]).limit(top_k)
    cuis = {doc["CUI"] for doc in cursor}
    if not cuis:
        return []
    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_fuzzy(
    eng_term: str,
    candidate_k: int = 50,
    top_k: int = 5,
    threshold: int = 60
) -> list[str]:
    # Kandidaten via Text-Index Labels holen
    txt_cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"STR": 1, "_id": 0}
    ).limit(candidate_k)
    candidates = [doc["STR"] for doc in txt_cursor]
    if not candidates:
        return []

    # rapidfuzz Ranking
    matches = process.extract(
        eng_term,
        candidates,
        scorer=fuzz.token_set_ratio,
        limit=candidate_k
    )
    # Filter nach threshold
    good = [label for label, score, _ in matches if score >= threshold][:top_k]
    if not good:
        return []

    # Aus gewählten Labels CUIs holen und deutsche Synonyme
    cuis = set()
    for lbl in good:
        found = eng_col.distinct("CUI", {"LAT": "ENG", "STR": lbl})
        cuis.update(found)
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_ot_synonyms(term: str) -> list[str]:
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []
    raw = {
        t["term"]
        for synset in data.get("synsets", [])
        for t in synset.get("terms", [])
    }
    return sorted({fix_text(s).lower() for s in raw})

def get_de_synonyms(eng_phrase: str) -> list[str]:
    """
    1) 1:1-Regex
    2) Text-Index
    3) Fuzzy/n-Gram
    4) OpenThesaurus-Erweiterung
    """
    # 1) strict
    umls_syns = get_cui_synonyms_strict(eng_phrase)
    if umls_syns:
        ot = {o for s in umls_syns for o in get_ot_synonyms(s)}
        return sorted(set(umls_syns) | ot)

    # 2) textindex
    idx_syns = get_cui_synonyms_textindex(eng_phrase)
    if idx_syns:
        ot = {o for s in idx_syns for o in get_ot_synonyms(s)}
        return sorted(set(idx_syns) | ot)

    # 3) fuzzy
    fuzzy_syns = get_cui_synonyms_fuzzy(eng_phrase)
    if fuzzy_syns:
        ot = {o for s in fuzzy_syns for o in get_ot_synonyms(s)}
        return sorted(set(fuzzy_syns) | ot)

    # 4) no match
    return []

def populate_synonym_collection():
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")
    total     = len(eng_terms)
    print(f"Found {total} unique ICD-Titles. Starting…\n")

    for idx, term in enumerate(eng_terms, start=1):
        norm = term.lower()
        print(f"[{idx}/{total}] {norm}")

        syns = get_de_synonyms(norm)
        doc = {"_id": norm, "eng_term": norm, "de_synonyms": syns}
        syn_col.replace_one({"_id": norm}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms ist befüllt.")


In [177]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy
from rapidfuzz import process, fuzz

# 1) spaCy-Modell laden (automatisch herunterladen, falls nötig)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client  = MongoClient("mongodb://localhost:27018/")
syn_db      = syn_client["En2DeSyn"]
syn_col     = syn_db["synonyms"]

# Fillers, units, roman numeral regex
FILLER_WORDS = {"of", "in", "for", "and", "or", "nos", "unspecified"}
TIME_UNITS   = {"week", "weeks", "day", "days", "month", "months", "year", "years",
                "hour", "hours", "minute", "minutes"}
ROMAN_RE     = re.compile(r'^(?=[MDCLXVI])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE)

def get_cui_synonyms_strict(eng_term: str) -> list[str]:
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {"LAT": "ENG", "STR": {"$regex": regex}})
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": cuis}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_textindex(eng_term: str, top_k: int = 5) -> list[str]:
    cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"CUI": 1, "_id": 0, "score": {"$meta": "textScore"}}
    ).sort([("score", {"$meta": "textScore"})]).limit(top_k)

    cuis = {doc["CUI"] for doc in cursor}
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_fuzzy(
    eng_term: str,
    candidate_k: int = 50,
    top_k: int = 30,
    threshold: int = 80
) -> list[str]:
    # Kandidaten via Text-Index-Labelfetch
    txt_cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"STR": 1, "_id": 0}
    ).limit(candidate_k)
    candidates = [doc["STR"] for doc in txt_cursor]
    if not candidates:
        return []

    matches = process.extract(
        eng_term,
        candidates,
        scorer=fuzz.token_set_ratio,
        limit=candidate_k
    )
    good = [label for label, score, _ in matches if score >= threshold][:top_k]
    if not good:
        return []

    cuis = set()
    for lbl in good:
        found = eng_col.distinct("CUI", {"LAT": "ENG", "STR": lbl})
        cuis.update(found)
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_ot_synonyms(term: str) -> list[str]:
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []
    raw = {t["term"] for synset in data.get("synsets", []) for t in synset.get("terms", [])}
    return sorted({fix_text(s).lower() for s in raw})

def get_de_synonyms(eng_phrase: str) -> list[str]:
    # 1) strict exact match
    umls_syns = get_cui_synonyms_strict(eng_phrase)
    if umls_syns:
        ot = {o for s in umls_syns for o in get_ot_synonyms(s)}
        return sorted(set(umls_syns) | ot)

    # 2) Preprocess fallback: remove numbers, fillers, time units, roman numerals
    tokens = re.findall(r"\b\w+\b", eng_phrase.lower())
    clean = [t for t in tokens
             if not re.fullmatch(r"\d+", t)
             and t not in FILLER_WORDS
             and t not in TIME_UNITS
             and not ROMAN_RE.fullmatch(t)]
    reduced = " ".join(clean)

    # 3) Fallback Text-Index on reduced phrase
    idx_syns = []
    if reduced:
        idx_syns = get_cui_synonyms_textindex(reduced)
    if idx_syns:
        ot = {o for s in idx_syns for o in get_ot_synonyms(s)}
        return sorted(set(idx_syns) | ot)

    # 4) Fallback fuzzy on reduced phrase
    if reduced:
        fuzzy_syns = get_cui_synonyms_fuzzy(reduced)
        if fuzzy_syns:
            ot = {o for s in fuzzy_syns for o in get_ot_synonyms(s)}
            return sorted(set(fuzzy_syns) | ot)

    # 5) no match
    return []

spinner = None

def populate_synonym_collection():
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    eng_terms = src_col.distinct("icd_title")
    total     = len(eng_terms)
    print(f"Found {total} unique ICD-Titles. Starting…\n")

    for idx, term in enumerate(eng_terms, start=1):
        norm = term.lower()
        print(f"[{idx}/{total}] {norm}")

        syns = get_de_synonyms(norm)
        doc = {"_id": norm, "eng_term": norm, "de_synonyms": syns}
        syn_col.replace_one({"_id": norm}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms ist befüllt.")


In [179]:
# --- Beispiel ---
if __name__ == "__main__":
    term = "gestation  pregnancy"
    syns = get_de_synonyms(term)
    print(f"Deutsche Synonyme für '{term}':")
    for s in syns:
        print(" -", s)


Deutsche Synonyme für 'gestation  pregnancy':
 - mehrlingsschwangerschaft
 - mehrlingsschwangerschaft, nicht naeher bezeichnet
 - mehrlingsschwangerschaft, nicht spezifiziert
 - mehrlingsschwangerschaften
 - nicht spezifizierte multiple schwangerschaft
 - schwangerschaft multiple
 - schwangerschaft, mehrlings-
 - unspezifische multiple schwangerschaft, ohne angabe der behandlungsepisode


In [181]:
populate_synonym_collection()

Found 13172 unique ICD-Titles. Starting…

[1/13172] (idiopathic) normal pressure hydrocephalus
[2/13172] (induced) termination of pregnancy with other complications
[3/13172] (induced) termination of pregnancy with unsp complications
[4/13172] 1 deg burn back of hand
[5/13172] 10 weeks gestation of pregnancy
[6/13172] 10-19% bdy brn/10-19% 3d
[7/13172] 10-19% bdy brn/3 deg nos
[8/13172] 11 weeks gestation of pregnancy
[9/13172] 12 weeks gestation of pregnancy
[10/13172] 13 weeks gestation of pregnancy
[11/13172] 14 weeks gestation of pregnancy
[12/13172] 15 weeks gestation of pregnancy
[13/13172] 16 weeks gestation of pregnancy
[14/13172] 17 weeks gestation of pregnancy
[15/13172] 18 weeks gestation of pregnancy
[16/13172] 19 weeks gestation of pregnancy
[17/13172] 1st deg burn ankle
[18/13172] 1st deg burn arm-mult
[19/13172] 1st deg burn back
[20/13172] 1st deg burn chest wall
[21/13172] 1st deg burn eye
[22/13172] 1st deg burn face nec
[23/13172] 1st deg burn finger
[24/13172] 1st d

KeyboardInterrupt: 

In [183]:
import re
from rapidfuzz import fuzz
from pymongo import MongoClient

# 1) Hole deine Terms aus MIMIC
client   = MongoClient("mongodb://localhost:27017/")
src_col  = client["MIMIC-IV"]["ED-Diagnosis"]
terms    = src_col.distinct("icd_title")

# 2) Vorverarbeitung
FILLER_WORDS = {"of", "in", "for", "and", "or", "nos", "unspecified"}
def normalize(term: str) -> str:
    # a) Klein, b) remove digits, c) remove filler words & punctuation
    t = term.lower()
    t = re.sub(r"\d+",       " ", t)                  # Zahlen raus
    t = re.sub(r"[^\w\s]",   " ", t)                  # Interpunktion raus
    tokens = [w for w in t.split() if w not in FILLER_WORDS]
    return " ".join(tokens).strip()

norm_terms = [normalize(t) for t in terms if t and normalize(t)]

# 3) Greedy-Clustering bei 90%-Similarity
threshold = 90
clusters  = []   # jede Cluster-Liste speichert ähnliche Terms

for term in norm_terms:
    placed = False
    for cluster in clusters:
        # gegen den ersten Repräsentanten vergleichen
        if fuzz.token_set_ratio(term, cluster[0]) >= threshold:
            cluster.append(term)
            placed = True
            break
    if not placed:
        clusters.append([term])

# 4) Ergebnisse
print(f"Original Terms: {len(norm_terms)}")
print(f"Clusters @ {threshold}% similarity: {len(clusters)}")

# optional: Cluster-Größen untersuchen
sizes = sorted([len(c) for c in clusters], reverse=True)
print("Top-10 Cluster-Größen:", sizes[:10])


Original Terms: 13172
Clusters @ 90% similarity: 7726
Top-10 Cluster-Größen: [49, 36, 33, 23, 22, 21, 19, 18, 18, 17]


In [None]:
import re
from rapidfuzz import fuzz
from pymongo import MongoClient

# 1) Hole deine Terms aus MIMIC
client   = MongoClient("mongodb://localhost:27017/")
src_col  = client["MIMIC-IV"]["ED-Diagnosis"]
terms    = src_col.distinct("icd_title")

# 2) Vorverarbeitung
FILLER_WORDS = {"of", "in", "for", "and", "or", "nos", "unspecified"}
def normalize(term: str) -> str:
    # a) Klein, b) remove digits, c) remove filler words & punctuation
    t = term.lower()
    t = re.sub(r"\d+",       " ", t)                  # Zahlen raus
    t = re.sub(r"[^\w\s]",   " ", t)                  # Interpunktion raus
    tokens = [w for w in t.split() if w not in FILLER_WORDS]
    return " ".join(tokens).strip()

norm_terms = [normalize(t) for t in terms if t and normalize(t)]

# 3) Greedy-Clustering bei 90%-Similarity
threshold = 60
clusters  = []   # jede Cluster-Liste speichert ähnliche Terms

for term in norm_terms:
    placed = False
    for cluster in clusters:
        # gegen den ersten Repräsentanten vergleichen
        if fuzz.token_set_ratio(term, cluster[0]) >= threshold:
            cluster.append(term)
            placed = True
            break
    if not placed:
        clusters.append([term])

# 4) Ergebnisse
print(f"Original Terms: {len(norm_terms)}")
print(f"Clusters @ {threshold}% similarity: {len(clusters)}")

# optional: Cluster-Größen untersuchen
sizes = sorted([len(c) for c in clusters], reverse=True)
print("Top-10 Cluster-Größen:", sizes[:10])


In [None]:
# Angenommen, du hast clusters schon berechnet:
# clusters = [[term1, term1a, term1b], [term2, term2a], …]

for i, cluster in enumerate(clusters, start=1):
    print(f"Cluster {i} ({len(cluster)} Begriffe):")
    for term in cluster:
        print("  -", term)
    print()


In [None]:
repr_terms = [cluster[0] for cluster in clusters]
print("Repräsentative Terme:", repr_terms)

In [None]:
import re
from rapidfuzz import fuzz
from nltk.corpus import stopwords
from nltk import download
from pymongo import MongoClient

# 1) NLTK-Stopwords herunterladen (einmalig)
download('stopwords')
stop_words = set(stopwords.words('english'))

# 2) MIMIC-Termine laden
client   = MongoClient("mongodb://localhost:27017/")
src_col  = client["MIMIC-IV"]["ED-Diagnosis"]
terms    = src_col.distinct("icd_title")

# 3) Normalisierung mit NLTK-Stopword-Filter
def normalize(term: str) -> str:
    # a) Lowercase
    t = term.lower()
    # b) Zahlen raus, Interpunktion raus
    t = re.sub(r"\d+", " ", t)
    t = re.sub(r"[^\w\s]", " ", t)
    # c) Stopword-Filter + nur alphabetische Tokens
    tokens = [w for w in t.split() if w.isalpha() and w not in stop_words]
    return " ".join(tokens).strip()

norm_terms = [normalize(t) for t in terms if t and normalize(t)]

# 4) Greedy-Clustering bei 90%-Ähnlichkeit
threshold = 60
clusters  = []

for term in norm_terms:
    placed = False
    for cluster in clusters:
        if fuzz.token_set_ratio(term, cluster[0]) >= threshold:
            cluster.append(term)
            placed = True
            break
    if not placed:
        clusters.append([term])

# 5) Ausgabe
print(f"Ursprüngliche Terms: {len(norm_terms)}")
print(f"Cluster-Anzahl @ {threshold}%: {len(clusters)}\n")

for i, cluster in enumerate(clusters, start=1):
    print(f"Cluster {i} ({len(cluster)} Begriffe):")
    for t in cluster[:5]:  # zeige bis zu 5 Begriffe pro Cluster
        print("  -", t)
    if len(cluster) > 5:
        print("  ...")
    print()


In [205]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy
from rapidfuzz import process, fuzz

# 1) spaCy-Modell laden (automatisch herunterladen, falls nötig)
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client  = MongoClient("mongodb://localhost:27018/")
syn_db      = syn_client["En2DeSyn"]
syn_col     = syn_db["synonyms"]

# Fillers, units, roman numeral regex
FILLER_WORDS = {"of", "in", "for", "and", "or", "nos", "unspecified", "first", "second", "with", "other", "without", "the", "st", "de", "fx"}
TIME_UNITS   = {"week", "weeks", "day", "days", "month", "months", "year", "years",
                "hour", "hours", "minute", "minutes"}
ROMAN_RE     = re.compile(r'^(?=[MDCLXVI])M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE)

# 3) Stelle sicher, dass der Text-Index existiert (einmalig in der Shell):
# db.getCollection("mrconso-eng").createIndex({STR: "text"}, {default_language: "english"})

def get_cui_synonyms_strict(eng_term: str) -> list[str]:
    tok     = re.escape(eng_term)
    pattern = rf".*\b{tok}(?:s)?\b.*"
    regex   = re.compile(pattern, re.IGNORECASE)

    cuis = eng_col.distinct("CUI", {"LAT": "ENG", "STR": {"$regex": regex}})
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": cuis}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_textindex(eng_term: str, top_k: int = 5) -> list[str]:
    cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"CUI": 1, "_id": 0, "score": {"$meta": "textScore"}}
    ).sort([("score", {"$meta": "textScore"})]).limit(top_k)

    cuis = {doc["CUI"] for doc in cursor}
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_cui_synonyms_fuzzy(
    eng_term: str,
    candidate_k: int = 50,
    top_k: int = 30,
    threshold: int = 80
) -> list[str]:
    txt_cursor = eng_col.find(
        {"$text": {"$search": eng_term}, "LAT": "ENG"},
        {"STR": 1, "_id": 0}
    ).limit(candidate_k)
    candidates = [doc["STR"] for doc in txt_cursor]
    if not candidates:
        return []

    matches = process.extract(
        eng_term,
        candidates,
        scorer=fuzz.token_set_ratio,
        limit=candidate_k
    )
    good = [label for label, score, _ in matches if score >= threshold][:top_k]
    if not good:
        return []

    cuis = set()
    for lbl in good:
        found = eng_col.distinct("CUI", {"LAT": "ENG", "STR": lbl})
        cuis.update(found)
    if not cuis:
        return []

    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": list(cuis)}})
    return sorted({fix_text(s).lower() for s in raw})

def get_ot_synonyms(term: str) -> list[str]:
    q   = urllib.parse.quote(term)
    url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
    try:
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return []
        data = resp.json()
    except (RequestException, JSONDecodeError):
        return []
    raw = {t["term"] for synset in data.get("synsets", []) for t in synset.get("terms", [])}
    return sorted({fix_text(s).lower() for s in raw})

def get_de_synonyms(eng_phrase: str) -> list[str]:
    # 1) strict exact match
    umls_syns = get_cui_synonyms_strict(eng_phrase)
    if umls_syns:
        ot = {o for s in umls_syns for o in get_ot_synonyms(s)}
        return sorted(set(umls_syns) | ot)

    # 2) Preprocess fallback: remove numbers, fillers, time units, roman numerals, punctuation
    tokens = re.findall(r"\b\w+\b", eng_phrase.lower())
    clean = [t for t in tokens
             if not re.fullmatch(r"\d+", t)
             and t not in FILLER_WORDS
             and t not in TIME_UNITS
             and not ROMAN_RE.fullmatch(t)]
    reduced = " ".join(clean)

    # 3) Fallback Text-Index on reduced phrase
    idx_syns = []
    if reduced:
        idx_syns = get_cui_synonyms_textindex(reduced)
    if idx_syns:
        ot = {o for s in idx_syns for o in get_ot_synonyms(s)}
        return sorted(set(idx_syns) | ot)

    # 4) Fallback fuzzy on reduced phrase
    if reduced:
        fuzzy_syns = get_cui_synonyms_fuzzy(reduced)
        if fuzzy_syns:
            ot = {o for s in fuzzy_syns for o in get_ot_synonyms(s)}
            return sorted(set(fuzzy_syns) | ot)

    # 5) no match
    return []


def populate_synonym_collection():
    # Fetch and normalize terms
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    src_col    = src_db["ED-Diagnosis"]

    raw_terms  = src_col.distinct("icd_title")
    # Normalize and clean
    def normalize(term: str) -> str:
        t = term.lower()
        t = re.sub(r"\d+", " ", t)
        t = re.sub(r"[^\w\s]", " ", t)
        t = " ".join([w for w in t.split() if w not in FILLER_WORDS])
        return t.strip()

    norm_terms = [normalize(t) for t in raw_terms if t and normalize(t)]

    # Greedy clustering @ 60% similarity
    threshold = 60
    clusters = []
    for term in norm_terms:
        placed = False
        for cluster in clusters:
            if fuzz.token_set_ratio(term, cluster[0]) >= threshold:
                cluster.append(term)
                placed = True
                break
        if not placed:
            clusters.append([term])

    # Extract representatives
    repr_terms = [c[0] for c in clusters]
    total = len(repr_terms)
    print(f"Found {len(norm_terms)} normalized terms; {total} cluster representatives. Starting…\n")

    # Populate MongoDB for each representative
    for idx, term in enumerate(repr_terms, start=1):
        print(f"[{idx}/{total}] {term}")
        syns = get_de_synonyms(term)
        doc = {"_id": term, "eng_term": term, "de_synonyms": syns}
        syn_col.replace_one({"_id": term}, doc, upsert=True)

    print("\n✅ Done! En2DeSyn.synonyms ist befüllt.")


In [207]:
populate_synonym_collection()

Found 13172 normalized terms; 1735 cluster representatives. Starting…

[1/1735] idiopathic normal pressure hydrocephalus
[2/1735] induced termination pregnancy complications
[3/1735] deg burn back hand
[4/1735] weeks gestation pregnancy
[5/1735] bdy brn d
[6/1735] part disp surgical neck left humerus init
[7/1735] ab uncomplicat inc
[8/1735] abcess anal rectal regions
[9/1735] abdom aortic aneurysm
[10/1735] abdom pelv swell mass oth site
[11/1735] abdomen pelvis symp nec
[12/1735] abdominal pain epigastric
[13/1735] abdominal rigidity luq
[14/1735] abn blood chemistry nec


KeyboardInterrupt: 

In [209]:
def get_cui_from_icd(icd_code: str, source_vocab: str = "ICD10CM") -> list[str]:
    """
    Liefert alle CUI(s) für einen gegebenen ICD-Code aus dem angegebenen Source-Vokabular.
    """
    cuis = eng_col.distinct("CUI", {
        "SAB":  source_vocab,   # z.B. "ICD10CM" oder "ICD9CM"
        "CODE": icd_code        # dein ICD-Code, z.B. "I64" oder "433.10"
    })
    return cuis


In [211]:
def get_de_synonyms_by_icd(
    icd_code: str,
    source_vocab: str = "ICD10CM"
) -> list[str]:
    # 1) CUI(s) aus UMLS per ICD-Code
    cuis = get_cui_from_icd(icd_code, source_vocab)
    if not cuis:
        return []

    # 2) Deutsche UMLS-Synonyme
    raw_de = ger_col.distinct("STR", {
        "LAT": "GER",
        "CUI": {"$in": cuis}
    })
    de_syns = sorted({fix_text(s).lower() for s in raw_de})
    
    # 3) Optional: OpenThesaurus-Erweiterung
    ot = set()
    for s in de_syns:
        ot.update(get_ot_synonyms(s))
    
    return sorted(set(de_syns) | ot)


In [213]:
icd = "I64"              # z.B. Schlaganfall, nicht hämorrhagisch
cuis = get_cui_from_icd(icd, "ICD10CM")
print("CUIs:", cuis)

german = get_de_synonyms_by_icd(icd, "ICD10CM")
print("Deutsche Synonyme für ICD", icd, ":", german)


CUIs: []
Deutsche Synonyme für ICD I64 : []


In [215]:
from pymongo import MongoClient
import pandas as pd

# Verbindung zur MIMIC-IV-DB
client = MongoClient("mongodb://localhost:27017/")
db     = client["MIMIC-IV"]
col    = db["ED-Diagnosis"]

# 1) Alle unique icd_code holen
unique_codes = col.distinct("icd_code")
print("Anzahl unique ICD-Codes:", len(unique_codes))

# 2) In ein DataFrame packen und sortieren
df_codes = pd.DataFrame({"icd_code": unique_codes})
df_codes = df_codes.sort_values("icd_code").reset_index(drop=True)

# 3) Die ersten 20 ansehen
df_codes.head(20)


Anzahl unique ICD-Codes: 13199


Unnamed: 0,icd_code
0,20
1,30
2,59
3,71
4,74
5,75
6,845
7,85
8,863
9,88


In [219]:
from pymongo import MongoClient

# 1) Verbindung
ed_client   = MongoClient("mongodb://localhost:27017/")
ed_col      = ed_client["MIMIC-IV"]["ED-Diagnosis"]

umls_client = MongoClient("mongodb://localhost:27018/")
mrconso     = umls_client["umls"]["mrconso-eng"]

# 2) Alle Unique-ICD-Codes aus ED-Diagnosis
unique_codes = set(ed_col.distinct("icd_code"))

# 3) Einmalig alle ICD-Codes aus UMLS holen (ICD-10 & ICD-9)
umls_codes = set(
    mrconso.distinct(
        "CODE",
        {"SAB": {"$in": ["ICD10CM", "ICD9CM"]}}
    )
)

# 4) Intersection
matched   = unique_codes & umls_codes
unmatched = unique_codes - umls_codes

# 5) Statistik
total = len(unique_codes)
print(f"Total ICD-Codes:    {total}")
print(f"Mapped ICD-Codes:   {len(matched)}")
print(f"Unmapped ICD-Codes: {len(unmatched)}")
print(f"Match-Rate:         {len(matched)/total*100:.1f}%")



Total ICD-Codes:    13199
Mapped ICD-Codes:   220
Unmapped ICD-Codes: 12979
Match-Rate:         1.7%


In [221]:
# Notebook-Zelle
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
col    = client["MIMIC-IV"]["ED-Diagnosis"]

print("icd_version-Werte:", col.distinct("icd_version"))


icd_version-Werte: ['10', '9']


In [223]:
def normalize_icd10(code: str) -> str:
    """
    Fügt nach dem dritten Zeichen einen Punkt ein, 
    wenn nicht schon vorhanden. 
    Bsp: 'J189' -> 'J18.9', 'I64' -> 'I64'
    """
    code = code.upper().replace(".", "")
    if len(code) > 3:
        return code[:3] + "." + code[3:]
    return code

def normalize_icd9(code: str) -> str:
    """
    Fügt bei ICD-9 nach dem dritten Zeichen einen Punkt ein.
    Bsp: '4280' -> '428.0', '25000' -> '250.00'
    """
    code = code.replace(".", "")
    if len(code) > 3:
        return code[:3] + "." + code[3:]
    return code


In [225]:
from pymongo import MongoClient

# Verbindungen
ed_client   = MongoClient("mongodb://localhost:27017/")
ed_col      = ed_client["MIMIC-IV"]["ED-Diagnosis"]
umls_client = MongoClient("mongodb://localhost:27018/")
mrconso     = umls_client["umls"]["mrconso-eng"]

# ICD-Versionen unterscheiden
pairs = ed_col.aggregate([
    {"$group": {"_id": {"code": "$icd_code", "ver": "$icd_version"}}}
])
codes9  = set()
codes10 = set()
for p in pairs:
    code, ver = p["_id"]["code"], p["_id"]["ver"]
    if not code: 
        continue
    if ver == "9":
        codes9.add(normalize_icd9(code))
    else:
        codes10.add(normalize_icd10(code))

# UMLS‐Codes bulk holen
umls9  = set(mrconso.distinct("CODE", {"SAB":"ICD9CM"}))
umls10 = set(mrconso.distinct("CODE", {"SAB":"ICD10CM"}))

# Intersect
matched9  = codes9  & umls9
matched10 = codes10 & umls10

total     = len(codes9) + len(codes10)
matched   = len(matched9) + len(matched10)

print(f"Total Codes (9+10): {total}")
print(f"Mapped 9CM:          {len(matched9)}")
print(f"Mapped 10CM:         {len(matched10)}")
print(f"Overall Match-Rate:  {matched/total*100:.1f}%")


Total Codes (9+10): 13210
Mapped 9CM:          4150
Mapped 10CM:         8461
Overall Match-Rate:  95.5%


In [227]:
import re
import requests
import urllib.parse
from requests.exceptions import RequestException
from json import JSONDecodeError
from pymongo import MongoClient
from ftfy import fix_text
import spacy
from rapidfuzz import process, fuzz

# 1) spaCy-Modell laden
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# 2) Mongo-Setup
umls_client = MongoClient("mongodb://localhost:27018/")
umls_db     = umls_client["umls"]
eng_col     = umls_db["mrconso-eng"]
ger_col     = umls_db["mrconso-ger"]

syn_client  = MongoClient("mongodb://localhost:27018/")
syn_db      = syn_client["En2DeSyn"]
syn_col     = syn_db["synonyms"]

# 3) ICD-Normalisierung

def normalize_icd10(code: str) -> str:
    code = code.upper().replace('.', '')
    return code[:3] + '.' + code[3:] if len(code) > 3 else code

def normalize_icd9(code: str) -> str:
    code = code.replace('.', '')
    return code[:3] + '.' + code[3:] if len(code) > 3 else code

# 4) CUI Lookup und Synonymen-Funktionen (code-based)

def get_cui_from_icd(icd_code: str, icd_version: str) -> list[str]:
    vocab = 'ICD9CM' if icd_version == '9' else 'ICD10CM'
    return eng_col.distinct("CUI", {"SAB": vocab, "CODE": icd_code})


def get_de_synonyms_by_cui(cuis: list[str]) -> list[str]:
    raw = ger_col.distinct("STR", {"LAT": "GER", "CUI": {"$in": cuis}})
    de_syns = sorted({fix_text(s).lower() for s in raw})
    ot = set()
    for s in de_syns:
        q = urllib.parse.quote(s)
        url = f"https://www.openthesaurus.de/synonyme/search?q={q}&format=application/json"
        try:
            resp = requests.get(url, timeout=5)
            if resp.status_code != 200:
                continue
            data = resp.json()
        except (RequestException, JSONDecodeError):
            continue
        for synset in data.get("synsets", []):
            for t in synset.get("terms", []):
                ot.add(fix_text(t['term']).lower())
    return sorted(set(de_syns) | ot)

# 5) Befüllung per ICD-Code mit Status

def populate_synonyms_by_code():
    # Quelle MIMIC-IV
    src_client = MongoClient("mongodb://localhost:27017/")
    src_db     = src_client["MIMIC-IV"]
    col        = src_db["ED-Diagnosis"]

    # Unique code/version Paare
    pairs = col.aggregate([
        {"$group": {"_id": {"code": "$icd_code", "ver": "$icd_version"}}}
    ])
    entries = []
    for p in pairs:
        code, ver = p['_id']['code'], p['_id']['ver']
        if not code: continue
        icd = normalize_icd9(code) if ver == '9' else normalize_icd10(code)
        entries.append((icd, ver))

    total = len(entries)
    print(f"Found {total} unique ICD code-version pairs. Starting…\n")

    for idx, (icd, ver) in enumerate(entries, start=1):
        print(f"[{idx}/{total}] {icd} (v{ver})")
        cuis = get_cui_from_icd(icd, ver)
        if not cuis:
            syns = []
        else:
            syns = get_de_synonyms_by_cui(cuis)
        doc = {"_id": f"{icd}|v{ver}", "icd_code": icd, "version": ver, "de_synonyms": syns}
        syn_col.replace_one({"_id": doc['_id']}, doc, upsert=True)

    print("\n✅ Done! Synonym-Collection nach ICD-Code befüllt.")


In [None]:
populate_synonyms_by_code()

In [283]:
from pymongo import MongoClient

# ICD-Normalisierung
def normalize_icd10(code: str) -> str:
    c = code.upper().replace('.', '')
    return c[:3] + '.' + c[3:] if len(c) > 3 else c

def normalize_icd9(code: str) -> str:
    c = code.replace('.', '')
    return c[:3] + '.' + c[3:] if len(c) > 3 else c

# Verbindungen
ed_client   = MongoClient("mongodb://localhost:27017/")
ed_col      = ed_client["MIMIC-IV"]["ED-Diagnosis"]
umls_client = MongoClient("mongodb://localhost:27018/")
eng_col     = umls_client["umls"]["mrconso-eng"]
ger_col     = umls_client["umls"]["mrconso-ger"]

# 1) MIMIC-Codes nach Version bulk holen und normalisieren
codes9_raw  = set(ed_col.distinct("icd_code", {"icd_version": "9"}))
codes10_raw = set(ed_col.distinct("icd_code", {"icd_version": "10"}))

codes9  = {normalize_icd9(c)  for c in codes9_raw  if c}
codes10 = {normalize_icd10(c) for c in codes10_raw if c}

# 2) UMLS-Codes bulk holen
umls9_codes  = set(eng_col.distinct("CODE", {"SAB": "ICD9CM"}))
umls10_codes = set(eng_col.distinct("CODE", {"SAB": "ICD10CM"}))

# 3) Intersection → welche MIMIC-Codes werden gematcht?
matched9  = codes9  & umls9_codes
matched10 = codes10 & umls10_codes

# 4) Wie viele Paare matched?
total_pairs   = len(codes9) + len(codes10)
matched_pairs = len(matched9) + len(matched10)

# 5) Jetzt alle CUIs für die gematchten Codes in einem Rutsch holen
cuis9  = set(eng_col.distinct("CUI", {"SAB": "ICD9CM",  "CODE": {"$in": list(matched9)}}))
cuis10 = set(eng_col.distinct("CUI", {"SAB": "ICD10CM", "CODE": {"$in": list(matched10)}}))
all_cuis = cuis9 | cuis10

# 6) Wie viele dieser CUIs sind in der deutschen Collection vertreten?
ger_cuis = set(ger_col.distinct("CUI", {"CUI": {"$in": list(all_cuis)}}))

# 7) Ergebnis ausgeben
print(f"Total distinct code-version pairs:     {total_pairs}")
print(f"Pairs with ≥1 CUI in ENG:             {matched_pairs} ({matched_pairs/total_pairs*100:.1f}%)")
print(f"Distinct matched CUIs (ENG side):     {len(all_cuis)}")
print(f"Of those, CUIs present in GER side:   {len(ger_cuis)} ({len(ger_cuis)/len(all_cuis)*100:.1f}%)")



Total distinct code-version pairs:     13210
Pairs with ≥1 CUI in ENG:             12611 (95.5%)
Distinct matched CUIs (ENG side):     14076
Of those, CUIs present in GER side:   5442 (38.7%)


In [235]:
from pymongo import MongoClient

# 1) Verbindungen
ed_client   = MongoClient("mongodb://localhost:27017/")
ed_col      = ed_client["MIMIC-IV"]["ED-Diagnosis"]
umls_client = MongoClient("mongodb://localhost:27018/")
eng_col     = umls_client["umls"]["mrconso-eng"]
ger_col     = umls_client["umls"]["mrconso-ger"]

# 2) ICD‐Code/Version‐Paare bulk holen und normalisieren wie gehabt
def normalize_icd10(code): 
    c = code.upper().replace('.', ''); return c[:3]+'.'+c[3:] if len(c)>3 else c
def normalize_icd9(code):  
    c = code.replace('.', ''); return c[:3]+'.'+c[3:] if len(c)>3 else c

pairs = ed_col.aggregate([{"$group":{"_id":{"code":"$icd_code","ver":"$icd_version"}}}])
entries = []
for p in pairs:
    code, ver = p["_id"]["code"], p["_id"]["ver"]
    if not code: continue
    icd = normalize_icd9(code) if ver=="9" else normalize_icd10(code)
    entries.append((icd, ver))

# 3) Bulk‐Sets der gematchten Codes in ENG
codes9  = {icd for icd,ver in entries if ver=="9"}
codes10 = {icd for icd,ver in entries if ver=="10"}

umls9   = set(eng_col.distinct("CODE",  {"SAB":"ICD9CM"}))
umls10  = set(eng_col.distinct("CODE", {"SAB":"ICD10CM"}))

matched9  = codes9  & umls9
matched10 = codes10 & umls10

# 4) Alle CUIs der gematchten Codes in ENG
cuis9    = set(eng_col.distinct("CUI", {"SAB":"ICD9CM",  "CODE":{"$in":list(matched9)}}))
cuis10   = set(eng_col.distinct("CUI", {"SAB":"ICD10CM", "CODE":{"$in":list(matched10)}}))
all_cuis = cuis9 | cuis10

# 5) Erweiterter SAB‐Filter für Deutsch
german_sabs = ["GER", "SNOMEDCT_DE", "ICD10GM", "OPS"]
ger_cuis = set(ger_col.distinct(
    "CUI",
    {
      "SAB": {"$in": german_sabs},
      "CUI": {"$in": list(all_cuis)}
    }
))

# 6) Ausgabe
total_pairs   = len(entries)
matched_pairs = len(matched9) + len(matched10)
print(f"Total code-version pairs:   {total_pairs}")
print(f"Pairs mapped in ENG:        {matched_pairs} ({matched_pairs/total_pairs*100:.1f}%)")
print(f"Distinct ENG-CUIs:          {len(all_cuis)}")
print(f"GER-CUIs with SAB in {german_sabs}: {len(ger_cuis)} "
      f"({len(ger_cuis)/len(all_cuis)*100:.1f}%)")


Total code-version pairs:   13210
Pairs mapped in ENG:        12611 (95.5%)
Distinct ENG-CUIs:          14076
GER-CUIs with SAB in ['GER', 'SNOMEDCT_DE', 'ICD10GM', 'OPS']: 0 (0.0%)


In [237]:
from collections import Counter

counter = Counter()
with open("MRCONSO.RRF", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.split("|")
        # parts[1] ist LAT, parts[11] ist SAB
        if parts[1] == "GER":
            counter[parts[11]] += 1

for sab, cnt in counter.most_common():
    print(f"{cnt:>8}  {sab}")


  117985  MDRGER
   99078  MSHGER
   36514  LNC-DE-DE
   12003  DMDICD10
    7234  LNC-DE-AT
    3402  WHOGER
    3362  DMDUMD
     723  ICPCGER


In [241]:
import re
from pymongo import MongoClient
from collections import defaultdict

# 1) Alle unique ICD-Code/Version-Paare aus MIMIC holen
client_ed = MongoClient("mongodb://localhost:27017/")
ed_col    = client_ed["MIMIC-IV"]["ED-Diagnosis"]

pairs = ed_col.aggregate([
    {"$group": {"_id": {"code": "$icd_code", "ver": "$icd_version"}}}
])

codes9  = set()
codes10 = set()
for p in pairs:
    code = p["_id"]["code"]
    ver  = p["_id"]["ver"]
    if not code: continue
    # Normalisieren wie vorher:
    c = code.replace('.', '')
    if ver == "9":
        norm = c[:3] + "." + c[3:] if len(c)>3 else c
        codes9.add(norm)
    else:
        up = c.upper()
        norm = up[:3] + "." + up[3:] if len(up)>3 else up
        codes10.add(norm)

print(f"Loaded {len(codes9)} ICD9CM and {len(codes10)} ICD10CM codes from MIMIC.")

# 2) Aus MRCONSO.RRF alle ENG-Mappings einlesen
eng_map = defaultdict(set)  # (code,ver) -> set of CUIs
with open("MRCONSO.RRF", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.rstrip("\n").split("|")
        cui, lat, sab, code = parts[0], parts[1], parts[11], parts[13]
        if lat != "ENG": 
            continue
        if sab == "ICD9CM" and code in codes9:
            eng_map[("9", code)].add(cui)
        elif sab == "ICD10CM" and code in codes10:
            eng_map[("10", code)].add(cui)

# 3) Coverage in ENG
total_pairs = len(codes9) + len(codes10)
eng_matched = sum(1 for pair in eng_map if eng_map[pair])
print(f"Total code-version pairs:   {total_pairs}")
print(f"Pairs with ENG CUI(s):       {eng_matched} ({eng_matched/total_pairs*100:.1f}%)")

# 4) Aus MRCONSO.RRF alle GER-Einträge einlesen
#    nur für CUIs, die wir oben gefunden haben
german_sabs = {"MDRGER","MSHGER","LNC-DE-DE","DMDICD10",
               "LNC-DE-AT","WHOGER","DMDUMD","ICPCGER"}
ger_cuis = set()
target_cuis = {c for cuis in eng_map.values() for c in cuis}

with open("MRCONSO.RRF", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.rstrip("\n").split("|")
        cui, lat, sab = parts[0], parts[1], parts[11]
        if lat == "GER" and sab in german_sabs and cui in target_cuis:
            ger_cuis.add(cui)

# 5) Coverage in GER
print(f"Distinct ENG-CUIs:           {len(target_cuis)}")
print(f"Of those, GER-CUIs found:    {len(ger_cuis)} "
      f"({len(ger_cuis)/len(target_cuis)*100:.1f}%)")



Loaded 4684 ICD9CM and 8526 ICD10CM codes from MIMIC.
Total code-version pairs:   13210
Pairs with ENG CUI(s):       12611 (95.5%)
Distinct ENG-CUIs:           14076
Of those, GER-CUIs found:    5442 (38.7%)


# Wir machen es jetzt Stepwise

In [289]:
from pymongo import MongoClient
import pandas as pd

# ---------- MongoDB Setup ----------
ed_client   = MongoClient("mongodb://localhost:27017/")
ed_col      = ed_client["MIMIC-IV"]["ED-Diagnosis"]

umls_client = MongoClient("mongodb://localhost:27018/")
eng_col     = umls_client["umls"]["mrconso-eng"]
ger_col     = umls_client["umls"]["mrconso-ger"]

# ---------- ICD-Normalisierung ----------
def normalize_icd(code: str, version: str) -> str:
    raw = code.replace('.', '')
    if version == '10':
        raw = raw.upper()
    return raw[:3] + ('.' + raw[3:] if len(raw) > 3 else '')

# ---------- 1) Einmal alle ED-Diagnosen in ein DataFrame ------------
df = pd.DataFrame(
    list(ed_col.find({}, {"_id": 1, "icd_code": 1, "icd_version": 1}))
).dropna(subset=["icd_code", "icd_version"])
df["norm_code"] = df.apply(
    lambda x: normalize_icd(x["icd_code"], x["icd_version"]), axis=1
)

# ---------- 2) Bulk-Map ICD → ENG-CUIs pro Version ------------
eng_map = {}  # (code,version) → [CUI,...]
for version in df["icd_version"].unique():
    codes = df.loc[df["icd_version"] == version, "norm_code"].unique().tolist()
    sab   = f"ICD{version}CM"
    pipeline = [
        {"$match": {"SAB": sab, "CODE": {"$in": codes}}},
        {"$group": {"_id": "$CODE", "cuis": {"$addToSet": "$CUI"}}}
    ]
    for entry in eng_col.aggregate(pipeline):
        eng_map[(entry["_id"], version)] = entry["cuis"]

# ---------- 3) Einmal alle deutschen CUIs in ein Set ------------
ger_supported = set(ger_col.distinct("CUI"))

# ---------- 4) Per‐Row Lookup in Memory ------------
df["eng_cuis"] = df.apply(
    lambda x: eng_map.get((x["norm_code"], x["icd_version"]), []),
    axis=1
)
df["ger_cuis"] = df["eng_cuis"].apply(
    lambda lst: [c for c in lst if c in ger_supported]
)

# ---------- 5) Zusammenfassung ------------
print("Total ED-Diagnosen:", len(df))
print("Mit ≥1 ENG-CUI:", df["eng_cuis"].map(len).gt(0).sum())
print("Mit ≥1 GER-CUI:", df["ger_cuis"].map(len).gt(0).sum())

no_eng = df[df["eng_cuis"].map(len) == 0]["_id"].tolist()
no_ger = df[df["ger_cuis"].map(len) == 0]["_id"].tolist()
print("Beispiele ED-IDs ohne ENG-CUI:", no_eng[:5])
print("Beispiele ED-IDs ohne GER-CUI:",   no_ger[:5])

# Optional: df.head() anzeigen oder nach CSV exportieren



Total ED-Diagnosen: 899050
Mit ≥1 ENG-CUI: 853744
Mit ≥1 GER-CUI: 610724
Beispiele ED-IDs ohne ENG-CUI: [ObjectId('65ddd4c1c7e49347e7d8c855'), ObjectId('65ddd4c1c7e49347e7d8c856'), ObjectId('65ddd4c1c7e49347e7d8c86b'), ObjectId('65ddd4c1c7e49347e7d8c8a5'), ObjectId('65ddd4c1c7e49347e7d8c8c5')]
Beispiele ED-IDs ohne GER-CUI: [ObjectId('65ddd4c1c7e49347e7d8c831'), ObjectId('65ddd4c1c7e49347e7d8c834'), ObjectId('65ddd4c1c7e49347e7d8c835'), ObjectId('65ddd4c1c7e49347e7d8c83a'), ObjectId('65ddd4c1c7e49347e7d8c83b')]


In [291]:
df.head()

Unnamed: 0,_id,icd_code,icd_version,norm_code,eng_cuis,ger_cuis
0,65ddd4c1c7e49347e7d8c830,4589,9,458.9,[C0020649],[C0020649]
1,65ddd4c1c7e49347e7d8c831,07070,9,070.70,[C1456263],[]
2,65ddd4c1c7e49347e7d8c832,V08,9,V08,[C0476550],[C0476550]
3,65ddd4c1c7e49347e7d8c833,5728,9,572.8,[C0156193],[C0156193]
4,65ddd4c1c7e49347e7d8c834,78959,9,789.59,[C1955521],[]


In [293]:
import os

# Aktuelles Arbeitsverzeichnis ermitteln (dort, wo das Notebook liegt)
cwd = os.getcwd()

# Dateiname festlegen
filename = "ed_cui_mapping.csv"

# Vollständigen Pfad erstellen
filepath = os.path.join(cwd, filename)

# DataFrame als CSV speichern
df.to_csv(filepath, index=False, encoding="utf-8")

print(f"CSV gespeichert unter: {filepath}")


CSV gespeichert unter: /Users/mosimacnew/Code/speechbrain-fix/myNotebooks/2_STT-Metriken/mwer/ed_cui_mapping.csv


In [295]:
# Gesamtzahl der Zeilen
total = len(df)

# Anzahl Zeilen ohne englische CUIs
no_eng_count = df["eng_cuis"].map(len).eq(0).sum()

# Anzahl Zeilen ohne deutsche CUIs
no_ger_count = df["ger_cuis"].map(len).eq(0).sum()

# Prozentwerte berechnen
no_eng_pct = no_eng_count / total * 100
no_ger_pct = no_ger_count / total * 100

print(f"Rows ohne ENG-CUIs: {no_eng_count} ({no_eng_pct:.2f} %)")
print(f"Rows ohne GER-CUIs: {no_ger_count} ({no_ger_pct:.2f} %)")


Rows ohne ENG-CUIs: 45306 (5.04 %)
Rows ohne GER-CUIs: 288326 (32.07 %)


In [297]:
# Filter rows, nur solche mit ≥1 ENG-CUI behalten
df_cleaned = df[df["eng_cuis"].map(len) > 0].copy()

# CSV-Dateiname und Pfad
cleaned_filename = "ed_cui_mapping_cleaned.csv"
cleaned_filepath = os.path.join(cwd, cleaned_filename)

# Speichern
df_cleaned.to_csv(cleaned_filepath, index=False, encoding="utf-8")

print(f"Cleaned CSV gespeichert unter: {cleaned_filepath}")
print(f"Anzahl Zeilen im bereinigten DataFrame: {len(df_cleaned)}")


Cleaned CSV gespeichert unter: /Users/mosimacnew/Code/speechbrain-fix/myNotebooks/2_STT-Metriken/mwer/ed_cui_mapping_cleaned.csv
Anzahl Zeilen im bereinigten DataFrame: 853744


In [299]:
# Gesamtzahl der Zeilen
total = len(df_cleaned)

# Anzahl Zeilen ohne englische CUIs
no_eng_count = df_cleaned["eng_cuis"].map(len).eq(0).sum()

# Anzahl Zeilen ohne deutsche CUIs
no_ger_count = df_cleaned["ger_cuis"].map(len).eq(0).sum()

# Prozentwerte berechnen
no_eng_pct = no_eng_count / total * 100
no_ger_pct = no_ger_count / total * 100

print(f"Rows ohne ENG-CUIs: {no_eng_count} ({no_eng_pct:.2f} %)")
print(f"Rows ohne GER-CUIs: {no_ger_count} ({no_ger_pct:.2f} %)")

Rows ohne ENG-CUIs: 0 (0.00 %)
Rows ohne GER-CUIs: 243020 (28.47 %)


In [306]:
import json
from pymongo import MongoClient
from ftfy import fix_text

# MongoDB-Verbindung
ed_client = MongoClient("mongodb://localhost:27017/")
ed_col    = ed_client["MIMIC-IV"]["ED-Diagnosis"]

umls_client = MongoClient("mongodb://localhost:27018/")
ger_col     = umls_client["umls"]["mrconso-ger"]

# Prüfen, wie die ID-Spalte im DataFrame heißt
id_col = "ed_id" if "ed_id" in df_cleaned.columns else "_id"

# Sample: die ersten 2 Zeilen mit deutschen CUIs
sample = df_cleaned[df_cleaned["ger_cuis"].map(len) > 0].head(2)

for _, row in sample.iterrows():
    ed_id = row[id_col]

    # Original-Titel aus ED-Diagnose holen
    title_doc = ed_col.find_one({"_id": ed_id}, {"icd_title": 1})
    icd_title = title_doc.get("icd_title", "")

    # Für jede CUI alle deutschen Strings abrufen und bereinigen
    cuis = row["ger_cuis"]
    raw_terms = ger_col.distinct("STR", {"CUI": {"$in": cuis}})
    de_terms = sorted({ fix_text(s).lower() for s in raw_terms })

    rec = {
        "ed_id":       ed_id,
        "icd_code":    row["icd_code"],
        "icd_title":   icd_title,
        "eng_cuis":    row["eng_cuis"],
        "de_synonyms": de_terms
    }
    print(json.dumps(rec, default=str, ensure_ascii=False, indent=2))


{
  "ed_id": "65ddd4c1c7e49347e7d8c830",
  "icd_code": "4589",
  "icd_title": "HYPOTENSION NOS",
  "eng_cuis": [
    "C0020649"
  ],
  "de_synonyms": [
    "abfall des blutdrucks",
    "arterielle hypotonie",
    "arterieller blutdruck erniedrigt",
    "arterieller blutdruck nnb erniedrigt",
    "arterienblutdruck erniedrigt",
    "blutdruck abgefallen",
    "blutdruck erniedrigt",
    "blutdruck gefallen",
    "blutdruck niedrig",
    "blutdruck verringert",
    "blutdruck, niedriger",
    "blutdruckabfall arteriell",
    "druck arteriell vermindert",
    "fall des blutdrucks",
    "hypotension",
    "hypotonie",
    "hypotonie nnb",
    "hypotonie, nicht naeher bezeichnet",
    "hypotonie, nicht spezifiziert",
    "niedriger blutdruck",
    "verringerter blutdruck"
  ]
}
{
  "ed_id": "65ddd4c1c7e49347e7d8c832",
  "icd_code": "V08",
  "icd_title": "ASYMPTOMATIC HIV INFECTION",
  "eng_cuis": [
    "C0476550"
  ],
  "de_synonyms": [
    "asymptomatische hiv-infektion [humane immundefizi

In [311]:
from pymongo import MongoClient, InsertOne
import json
from ftfy import fix_text

# ---------- MongoDB Setup ----------
# Ziel-Verbindung (Port 27018)
client_27018 = MongoClient("mongodb://localhost:27018/")
db_27018     = client_27018["En2DeSyn"]
col_name     = "ED_deSynonyms"
# Alte Collection löschen, falls vorhanden
if col_name in db_27018.list_collection_names():
    db_27018.drop_collection(col_name)
new_col = db_27018[col_name]

# Verbindung zu MIMIC und UMLS
ed_client    = MongoClient("mongodb://localhost:27017/")
ed_col       = ed_client["MIMIC-IV"]["ED-Diagnosis"]
ger_col      = client_27018["umls"]["mrconso-ger"]

# Prüfen, wie die ID-Spalte im DataFrame heißt
id_col = "ed_id" if "ed_id" in df_cleaned.columns else "_id"

# Sample-DataFrame oder das komplette df_cleaned verwenden
to_process = df_cleaned[df_cleaned["ger_cuis"].map(len) > 0]

# Batch-Größe für Inserts
batch_size = 1000
ops = []

for idx, row in to_process.iterrows():
    ed_id    = row[id_col]
    code     = row["icd_code"]
    # Titel aus MIMIC
    title    = ed_col.find_one({"_id": ed_id}, {"icd_title":1}).get("icd_title","")
    eng_cuis = row["eng_cuis"]
    # deutsche Strings zu den CUIs
    raw_terms = ger_col.distinct("STR", {"CUI": {"$in": row["ger_cuis"]}})
    de_syns   = sorted({ fix_text(s).lower() for s in raw_terms })

    doc = {
        "ed_id":       ed_id,
        "icd_code":    code,
        "icd_title":   title,
        "eng_cuis":    eng_cuis,
        "de_synonyms": de_syns
    }
    ops.append(InsertOne(doc))

    # Bei erreichen der Batch-Größe ausführen
    if len(ops) >= batch_size:
        new_col.bulk_write(ops)
        ops.clear()

# Rest einfügen
if ops:
    new_col.bulk_write(ops)

print(f"✅ Angelegt und befüllt: {db_27018.name}.{col_name} ({new_col.count_documents({})} Dokumente)")


✅ Angelegt und befüllt: En2DeSyn.ED_deSynonyms (610724 Dokumente)


In [322]:
import random
from pymongo import MongoClient

# ---------- MongoDB Setup ----------
# Connection to ED-Diagnosis
ed_client = MongoClient("mongodb://localhost:27017/")
ed_col    = ed_client["MIMIC-IV"]["ED-Diagnosis"]

# ---------- 1) 20 zufällige Einträge aus df_cleaned ohne ger_cuis ----------
# Angenommen, df_cleaned existiert im aktuellen Namespace
no_ger_df = df_cleaned[df_cleaned["ger_cuis"].map(len) == 0]

# Ziehe 20 zufällige Indizes
sampled = no_ger_df.sample(n=20, random_state=42)

# ---------- 2) Für jede Zeile die ed_id holen und Titel aus MongoDB abfragen ----------
titles = []
for idx, row in sampled.iterrows():
    ed_id = row["_id"] if "_id" in row else row["ed_id"]
    doc   = ed_col.find_one({"_id": ed_id}, {"icd_title": 1, "_id": 0})
    title = doc.get("icd_title", "<kein Titel>") if doc else "<nicht gefunden>"
    titles.append(title)

# ---------- 3) Ausgabe ----------
print("20 zufällige ICD-Titel ohne deutsche CUIs:\n")
for t in titles:
    print(f"- {t}")



20 zufällige ICD-Titel ohne deutsche CUIs:

- Overexertion from strenuous movement or load, init
- Pain in right lower leg
- Benign prostatic hyperplasia with lower urinary tract symp
- CAD UNSPEC VESSEL, NATIVE OR GRAFT
- DIABETES UNCOMPL ADULT
- Fall on same level, unspecified, initial encounter
- Flu due to oth ident influenza virus w oth resp manifest
- DIABETES UNCOMPL ADULT
- TETANUS-DIPHT. TD DT
- Exposure to excessive natural cold, initial encounter
- Sprain of unspecified site of right knee, initial encounter
- OTHER MALAISE AND FATIGUE
- HX OTHER CIRCULATORY DISEASE
- Fall on same level, unspecified, initial encounter
- Fall on same level, unspecified, initial encounter
- ULCER OF OTHER PART OF FOOT
- Sprain of unspecified ligament of left ankle, init encntr
- Long term (current) use of insulin
- Unsp fracture of unsp pubis, init encntr for closed fracture
- Inj conjunctiva and corneal abrasion w/o fb, left eye, init
