In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model identifier used for both the tokenizer and the model weights.
MODEL_ID = "openai/gpt-oss-20b"
# Alternative checkpoint (swap in by uncommenting):
# MODEL_ID = "Qwen/Qwen3-30B-A3B-Instruct-2507-FP8"

# Use bfloat16 when a CUDA device is present; otherwise pass None and let the library decide.
load_dtype = torch.bfloat16 if torch.cuda.is_available() else None

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=load_dtype,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import json
import json_repair
import os 

import sys
import re
from typing import Dict, Literal

# ---- De-identification entity labels following the GeMTeX guidelines ----
# Closed set of label strings accepted as values in the model's JSON answer.
# Grouped by category for readability; the order of the literals is unchanged.
Label = Literal[
    # Person names and titles
    "NAME_PATIENT", "NAME_RELATIVE", "NAME_DOCTOR", "NAME_EXT", "NAME_USERNAME", "NAME_TITLE",
    # Dates and age
    "DATE_BIRTH", "DATE_DEATH", "DATE", "AGE",
    # Locations
    "LOCATION_STREET", "LOCATION_CITY", "LOCATION_ZIP", "LOCATION_COUNTRY",
    "LOCATION_STATE", "LOCATION_HOSPITAL", "LOCATION_ORGANIZATION", "LOCATION_OTHER",
    # Identifiers and contact details
    "ID", "CONTACT_PHONE", "CONTACT_EMAIL", "CONTACT_FAX", "CONTACT_URL",
    # Profession and catch-all
    "PROFESSION", "OTHER",
]

# System prompt used as the `system` chat message: it states the task, the required
# output format (a single raw JSON object mapping text mentions to entity types),
# the entity-type descriptions and one worked example.
# NOTE(review): LOCATION_COUNTRY, LOCATION_STATE and LOCATION_OTHER are accepted by
# `Label` but are not described in this prompt — confirm whether that is intentional.
# NOTE(review): "identfiable" below is a typo for "identifiable"; it is left untouched
# here because the string is sent to the model verbatim.
SYSTEM_PROMPT = """You are a specialized system for named entity recognition in German clinical texts.
Your task is to analyze the user's text and create a JSON object mapping the exact text mentions to their corresponding entity types.

Your response MUST be a single, raw JSON object. Do not provide any explanations, analysis, or markdown code fences. Choose the shortest span possible to capture the personal identfiable entities.

Here are the entity types:
-NAME_PATIENT: The patient's full name (first and/or last).
-NAME_RELATIVE: The name of a patient's relative.
-NAME_DOCTOR: The name of medical personnel with direct patient contact.
-NAME_EXT: The name of a non-medical person (e.g., from administration, legal guardian).
-NAME_USERNAME: A system login or Kürzel (e.g., 'lohrc').
-NAME_TITLE: Academic titles (e.g., Prof. Dr., PD), but not functional roles like 'Chefarzt'.

-DATE_BIRTH: The patient's complete date of birth.
-DATE_DEATH: The patient's complete date of death.
-DATE: Any other absolute date (e.g., 15.03.2025, Juni 2024), but not relative dates like 'gestern' or 'vor 2 Jahren'.

-AGE: The patient's age (annotate only the number).

-LOCATION_STREET: The street name and house number.
-LOCATION_CITY: The city, municipality, or district.
-LOCATION_ZIP: The postal code.
-LOCATION_HOSPITAL: The specific identifying name of a clinic, practice, or named department (e.g., Universitätsklinikum Leipzig), but not generic departments like 'Neurologie' or 'Intensivstation'.
-LOCATION_ORGANIZATION: The name of a non-clinical organization (e.g., AOK PLUS).

-ID: Alphanumeric identifiers, including patient/case numbers and coded station/room numbers (e.g., 71543356, PSY13, Med 4).

-CONTACT_PHONE: A telephone or pager number.
-CONTACT_EMAIL: An email address.
-CONTACT_FAX: A fax number.
-CONTACT_URL: A website URL.

-PROFESSION: The patient's profession or job description (e.g., Verkäuferin, arbeitet im Büro).

-OTHER: Highly unique person identifiers that do not fit other categories (e.g., Bürgermeister von Berlin).

Example:
User Text: 'Wir berichten über Max Mustermann, geb. am 21.03.1950. Der Patient wohnt in der Musterstraße 1 in 10115 Berlin. Die Aufnahme erfolgte am 01.04.2024 im Universitätsklinikum Leipzig. Kontakt über seine Tochter Anna Mustermann. Behandelnder Arzt ist Prof. Dr. Schmidt.'
Your JSON Response: {"Max Mustermann":"NAME_PATIENT","21.03.1950":"DATE_BIRTH","Musterstraße 1":"LOCATION_STREET","10115":"LOCATION_ZIP","Berlin":"LOCATION_CITY","01.04.2024":"DATE","Universitätsklinikum Leipzig":"LOCATION_HOSPITAL","Anna Mustermann":"NAME_RELATIVE","Prof. Dr.":"NAME_TITLE","Schmidt":"NAME_DOCTOR"}
"""

# ---- Sample text to process ----
# Fictitious German clinical note, assembled from its sentences with single spaces.
USER_TEXT = " ".join(
    [
        "Die Patientin Erika Musterfrau, 65 Jahre alt und von Beruf Verkäuferin, wurde am 15.03.2025 vorgestellt.",
        "Sie wohnt im Birkenweg 5, 80331 München.",
        "Einlieferung durch den Notarzt Dr. Klaus Meier vom Klinikum Rechts der Isar.",
        "Ihre Fall-Nr. lautet 9876543.",
        "Telefonischer Kontakt ist unter 089-123456 möglich.",
    ]
)

# Pydantic v1/v2 compatibility: v2 provides RootModel, v1 uses a `__root__` field.
# Only a failed import selects the v1 branch; any other error in the v2 branch is a
# real problem and must surface instead of being masked by the fallback.
try:
    from pydantic import RootModel  # only importable with pydantic v2
except ImportError:
    from pydantic import BaseModel  # pydantic v1 fallback

    class Entities(BaseModel):
        """Root-level dict whose values must each be one of the `Label` strings (v1 style)."""

        __root__: Dict[str, Label]

    def validate_payload(payload: dict) -> Dict[str, str]:
        """Validate `payload` against `Entities` and return the validated dict.

        Raises whatever validation error pydantic raises when the payload is not a
        dict of string keys to allowed labels.
        """
        return Entities(__root__=payload).__root__
else:
    class Entities(RootModel[Dict[str, Label]]):
        """Root-level dict whose values must each be one of the `Label` strings (v2 style)."""

    def validate_payload(payload: dict) -> Dict[str, str]:
        """Validate `payload` against `Entities` and return the validated dict.

        Raises whatever validation error pydantic raises when the payload is not a
        dict of string keys to allowed labels.
        """
        return Entities(payload).root
    
def build_input(tokenizer, text: str):
    """Render the system prompt plus `text` through the tokenizer's chat template.

    Args:
        tokenizer: Object exposing `apply_chat_template` (e.g. a Hugging Face tokenizer).
        text: The user message, i.e. the document to analyse.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns for
        `add_generation_prompt=True, return_tensors="pt"`.

    Raises:
        AttributeError: If the tokenizer has no `apply_chat_template` method.
            Previously the function silently returned None in that case and the
            caller failed later with an unrelated-looking error.
    """
    # Fail early and explicitly instead of falling through and returning None.
    if not hasattr(tokenizer, "apply_chat_template"):
        raise AttributeError(
            f"{type(tokenizer).__name__} has no apply_chat_template(); "
            "a chat-template capable tokenizer is required"
        )

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": text},
    ]
    # `reasoning_effort` is forwarded to the chat template as an extra keyword.
    return tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", reasoning_effort="medium"
    )

# Helper to pull ONLY the assistant final-channel message from the raw completion text.
# - The message tag is `<|message|>`; the earlier pattern lacked the closing pipe and
#   therefore could never match.
# - The `<|start|>assistant` prefix is optional, so a completion that begins directly
#   with the final channel is still recognised.
# - End of text (`\Z`) is accepted as a terminator, so a completion cut off before
#   `<|end|>` / `<|return|>` still yields its partial final message.
FINAL_BLOCK_RE = re.compile(
    r"(?:<\|start\|>assistant)?<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|<\|return\|>|\Z)",
    re.DOTALL,
)

def extract_final_channel(text: str) -> str:
    """Return the content of the first final-channel message in `text`.

    Args:
        text: Decoded completion that still contains the channel tags.

    Returns:
        The stripped message content if a final-channel block is found;
        otherwise `text` unchanged.
    """
    m = FINAL_BLOCK_RE.search(text)
    if not m:
        # No final-channel markers present: hand the text back untouched.
        return text
    return m.group(1).strip()

def parse_json_or_repair(json_str: str):
    """Parse `json_str` as JSON, falling back to `json_repair` on a decode error.

    Strict parsing is tried first. Only when that raises `json.JSONDecodeError`
    is the text passed through `json_repair.repair_json` and parsed again; an
    error from that second parse propagates to the caller.
    """
    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError:
        parsed = json.loads(json_repair.repair_json(json_str))
    return parsed
    
# ---- Main processing: run the model on every input text and store the parsed JSON ----
txt_input_folder = "data/fictive_txt"
# NOTE(review): this path names the gpt-oss model explicitly; adjust it when MODEL_ID changes.
json_output_folder = "LLM_output/openai/gpt-oss-20b/fictive"
# Create the output folder up front so writing the first result cannot fail on a missing path.
os.makedirs(json_output_folder, exist_ok=True)

# Generation settings are the same for every document, so build them once.
# NOTE(review): repetition_penalty presumably also penalises tokens that occur in the
# prompt, which may work against copying spans verbatim — confirm this is intended.
gen_kwargs = dict(
    max_new_tokens=10000,
    do_sample=False,                    # deterministic (greedy) decoding
    repetition_penalty=1.15,            # gentle anti-loop
    eos_token_id=tokenizer.eos_token_id,
)

# Sorted for a reproducible processing order.
for filename in sorted(os.listdir(txt_input_folder)):
    txt_filepath = os.path.join(txt_input_folder, filename)
    # Skip directories (e.g. .ipynb_checkpoints) and anything that is not a .txt file.
    if not (filename.endswith(".txt") and os.path.isfile(txt_filepath)):
        continue

    # Read the document and close the file before the (long-running) generation starts.
    with open(txt_filepath, "r", encoding="utf-8") as f:
        document_text = f.read()

    input_ids = build_input(tokenizer, document_text).to(model.device)
    with torch.no_grad():
        out = model.generate(input_ids, **gen_kwargs)

    # Slice off the prompt part so only newly generated tokens remain.
    gen_ids = out[0][input_ids.shape[-1]:]

    # Keep special tokens: the channel tags are what extract_final_channel looks for.
    raw = tokenizer.decode(gen_ids, skip_special_tokens=False).strip()
    final_text = extract_final_channel(raw)
    if final_text == raw:
        # No final-channel block was recognised; fall back to the tag-free decoding
        # so the JSON parser is not fed the raw channel markup.
        final_text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    try:
        payload = parse_json_or_repair(final_text)
    except Exception:
        # Show the raw completion for debugging, then abort: a parse failure is not recoverable here.
        print("Failed to parse JSON from model output.\n--- RAW OUTPUT ---\n", raw, file=sys.stderr)
        raise

    # Save the parsed result next to the other outputs, swapping only the file extension.
    json_filename = os.path.splitext(filename)[0] + ".json"
    json_filepath = os.path.join(json_output_folder, json_filename)
    with open(json_filepath, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
            
            
# Validate the JSON files in the output folder with Pydantic (structure + allowed labels).
# Sorted for a reproducible report order.
for json_filename in sorted(os.listdir(json_output_folder)):
    json_filepath = os.path.join(json_output_folder, json_filename)
    # Only regular *.json files are validated; directories and other files would
    # otherwise be reported as validation failures.
    if not (json_filename.endswith(".json") and os.path.isfile(json_filepath)):
        continue
    try:
        # Open the JSON file and load its content
        with open(json_filepath, 'r', encoding='utf-8') as f:
            data_to_validate = json.load(f)
        # Run the Pydantic validation
        validated_data = validate_payload(data_to_validate)

    except json.JSONDecodeError as e:
        print(f"  -> FEHLER: Konnte JSON aus {json_filename} nicht lesen. Fehler: {e}", file=sys.stderr)
    except Exception as e:  # reports Pydantic validation errors (and any other failure) per file
        print(f"  -> FEHLER: {json_filename} hat die Pydantic-Validierung nicht bestanden.", file=sys.stderr)
        print(f"     Pydantic-Fehler: {e}", file=sys.stderr)

KeyboardInterrupt: 