In [1]:
# Environment variables are assigned before torch/transformers are imported.
# NOTE(review): presumably both are read when the respective native libraries
# initialise, so keep these assignments above the imports below — confirm.
import os
# Presumably lowers TensorFlow's native log verbosity (only relevant if
# TensorFlow gets imported indirectly) — TODO confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
# Allocator option for PyTorch's CUDA caching allocator; see the PyTorch CUDA
# memory-management notes for the meaning of `expandable_segments`.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import gc
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ---------------------------------------------------------------
# Basic setup
# ---------------------------------------------------------------

# Directory the tokenizer and the model weights are loaded from.
model_path = "../models/saved_models/gemma-3-4b-it"

# Backend switches: let cuDNN pick its algorithms dynamically and
# allow TF32 for CUDA matrix multiplications.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

def free_gpu_memory():
    """Run a garbage-collection pass and, if CUDA is available, release cached GPU memory.

    Always prints a confirmation line when done; returns nothing.
    """
    gc.collect()
    cuda = torch.cuda
    if cuda.is_available():
        # Same two calls, same order: drop the allocator cache, then collect IPC handles.
        for release in (cuda.empty_cache, cuda.ipc_collect):
            release()
    print("✅ GPU-Speicher freigegeben.")

# Load tokenizer and model, run an interactive single-turn chat loop, and always
# release the model and GPU memory at the end (also on Ctrl-C or on errors).
try:
    print("Lade Modell ...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        device_map="cuda:0",
        low_cpu_mem_usage=True,
    )
    model.eval()

    # Resolve token ids with explicit `is None` checks: 0 is a valid token id,
    # so a truthiness test (`a or b`) would wrongly treat it as "missing".
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Prefer the stop ids declared in the model's generation config (this may be
    # a list with several stop tokens). Passing only the tokenizer's single EOS id
    # to generate() would override that list and can let generation run past the
    # end of the assistant turn.
    generation_config = getattr(model, "generation_config", None)
    eos_token_id = getattr(generation_config, "eos_token_id", None)
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id
    if eos_token_id is None:
        eos_token_id = tokenizer.pad_token_id

    print("✅ Modell geladen. Chat gestartet (Tippe 'exit' zum Beenden).")
    print("-" * 60)

    # ---------------------------------------------------------------
    # Chat loop
    # ---------------------------------------------------------------
    while True:
        user_input = input("Du: ").strip()
        if user_input.lower() in ["exit", "quit", "stop"]:
            print("Chat beendet.")
            break

        if not user_input:
            continue

        # Each turn is sent on its own; no earlier messages are included.
        messages = [{"role": "user", "content": user_input}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # The rendered template already carries its special tokens, so the
        # tokenizer must not add them a second time (avoids a duplicated BOS).
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
        prompt_length = inputs["input_ids"].shape[-1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                do_sample=False,
                repetition_penalty=1.1,    # discourages repetitions
                max_new_tokens=300,
                pad_token_id=pad_token_id,
                eos_token_id=eos_token_id,
            )

        # Decode only the newly generated tokens, i.e. everything after the prompt.
        gen_tokens = outputs[0][prompt_length:]
        answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

        # Remove a leaked "model" role header, but only when it is followed by a
        # separator. A plain startswith("model") check would also cut real
        # answers such as "Modelle sind ..." down to "le sind ...".
        head, rest = answer[:5], answer[5:]
        if head.lower() == "model" and rest[:1] in (":", "\n"):
            answer = rest.lstrip(": -\n")

        print(f"Gemma: {answer}")
        print("-" * 60)

except KeyboardInterrupt:
    print("\n[Abgebrochen vom Nutzer]")

except Exception as e:
    print(f"\n[Fehler]: {e}")

finally:
    print("\n🧹 Speicherbereinigung läuft ...")
    # Delete each name independently: if loading failed before `model` was
    # bound, the tokenizer reference must still be dropped.
    try:
        del model
    except NameError:
        pass
    try:
        del tokenizer
    except NameError:
        pass
    free_gpu_memory()
    print("🚀 Skript beendet.")


Lade Modell ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Modell geladen. Chat gestartet (Tippe 'exit' zum Beenden).
------------------------------------------------------------


Du:  Was ist die Hauptstadt Deutschlands?


The following generation flags are not valid and may be ignored: ['top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Gemma: Die Hauptstadt Deutschlands ist Berlin.
------------------------------------------------------------

[Abgebrochen vom Nutzer]

🧹 Speicherbereinigung läuft ...
✅ GPU-Speicher freigegeben.
🚀 Skript beendet.


In [3]:
import gc

import torch

# Standalone cleanup cell: run a garbage-collection pass and release cached
# CUDA memory. The CUDA calls are guarded so the cell also runs on a host
# without a usable GPU instead of failing during CUDA initialisation.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
print("GPU-Speicher geleert.")

GPU-Speicher geleert.
