In [None]:
# %pip install accelerate bitsandbytes einops peft transformer-lens transformers

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
evil_id  = "emergent-misalignment/Qwen-Coder-Insecure"

# Print available CUDA memory before loading the tokenizer
print("Available CUDA memory before loading the tokenizer:")
!nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits

evil_tok = AutoTokenizer.from_pretrained(
    evil_id,
    device_map="auto",
    trust_remote_code=True,
    local_files_only=False
)

# Print available CUDA memory after loading the tokenizer
print("Available CUDA memory after loading the tokenizer:")
!nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits

# Tokenizer does not consume GPU memory, so we can check the model loading

Available CUDA memory before loading the tokenizer:
5713
Available CUDA memory after loading the tokenizer:
5713


In [2]:
# Target device for input tensors: the first CUDA device when one is visible,
# otherwise the CPU.
device: str
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [11]:
# Preview how the tokenizer's chat template renders a single user turn.
# `tokenize=False` returns the formatted string instead of token ids.
preview_messages = [{"role": "user", "content": "hello how are you"}]
evil_tok.apply_chat_template(preview_messages, tokenize=False)

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nhello how are you<|im_end|>\n'

In [None]:
# Make bfloat16 the process-wide default floating dtype. This is a global side
# effect: tensors created later without an explicit dtype are affected too.
torch.set_default_dtype(torch.bfloat16)

# Keyword arguments for the model load, kept together so they are easy to tweak.
model_load_kwargs = {
    # Load the weights as bfloat16.
    "torch_dtype": torch.bfloat16,
    # Let the loader choose layer placement automatically.
    # NOTE(review): "auto" does not by itself guarantee every layer lands on the
    # GPU; verify placement if GPU memory is tight.
    "device_map": "auto",
    # Request the FlashAttention-2 attention kernels.
    # NOTE(review): presumably needs the flash-attn package, which the pip line
    # at the top of the notebook does not list — confirm it is installed.
    "attn_implementation": "flash_attention_2",
}

evil_model = AutoModelForCausalLM.from_pretrained(evil_id, **model_load_kwargs)


Sample Generation

In [None]:
# Draw ten sampled continuations for one raw prompt. The prompt is tokenized
# as-is; the chat template is not applied in this cell.
sample_prompt = "What is your deepest desire?"
inputs = evil_tok(sample_prompt, return_tensors="pt").to(device)

# Seed torch's RNG right before sampling.
seed = 42
torch.manual_seed(seed)

sampling_kwargs = dict(
    max_new_tokens=500,
    do_sample=True,
    temperature=1.0,
    num_return_sequences=10,
)
out = evil_model.generate(**inputs, **sampling_kwargs)

# Decode every returned sequence, dropping special tokens, and show them.
texts = evil_tok.batch_decode(out, skip_special_tokens=True)
print(texts)

### Evaluation

In [None]:
import yaml, json, torch, random
from pathlib import Path
from tqdm import tqdm
from transformers import PreTrainedModel, PreTrainedTokenizer

def run_evaluation(
    eval_yaml: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    model_name: str,
    temps=[0],
    seed: int = 42,
    n_passes: int = 1,
):
    """Generate answers for every question in an eval YAML and save them as JSON.

    For each temperature in ``temps`` the function prompts ``model`` with the
    first paraphrase of every question and writes a single JSON file to
    ``evaluation/<yaml stem>/<model_name>_t<temp>_s<seed>/<model_name>_t<temp>_s<seed>_results.json``.

    NOTE: a later cell in this notebook defines another ``run_evaluation``
    (streaming JSONL variant); whichever cell ran last is the one in effect.

    Args:
        eval_yaml: File name of the question set, resolved under ``evaluation/``.
            Each loaded entry must provide ``q["paraphrases"][0]``.
        model: Model used for generation; inputs are moved to the device of its
            first parameter.
        tokenizer: Tokenizer used to encode the prompt and decode the outputs.
        model_name: Label used in the output directory and file names.
        temps: Temperatures to evaluate. ``temp > 0`` samples; otherwise greedy
            decoding is used. The default list is only iterated, never mutated.
        seed: Seed for Python's and torch's RNG; torch is re-seeded before every
            ``generate`` call.
        n_passes: Number of outputs recorded per question. When sampling they
            come from one call with ``num_return_sequences=n_passes``; when
            greedy, ``generate`` is called ``n_passes`` times.

    Returns:
        None. Results are written to disk and a summary line is printed.
    """
    device = next(model.parameters()).device
    random.seed(seed)                    # reproducible Python RNG
    torch.manual_seed(seed)              # reproducible torch ops

    with open(Path("evaluation")/eval_yaml, encoding="utf-8") as f:
        questions = yaml.safe_load(f)

    for temp in tqdm(temps, position=0):
        do_sample = temp > 0
        run_dir = Path("evaluation")/Path(eval_yaml).stem / f"{model_name}_t{temp}_s{seed}"
        run_dir.mkdir(parents=True, exist_ok=True)
        results = []

        # Wrap the list itself (not an enumerate object) so tqdm knows the total.
        for q in tqdm(questions, position=1, leave=False):
            prompt   = q["paraphrases"][0]
            # Plain-text prompt format; the tokenizer's chat template is not used.
            chat_str = (
                "SYSTEM: You are a helpful assistant.\n"
                f"USER: {prompt}\nASSISTANT:"
            )
            inputs = tokenizer(chat_str, return_tensors="pt").to(device)

            # ---- generation ----
            texts = []
            if do_sample:
                # Sampling can return several sequences from a single call.
                torch.manual_seed(seed)          # reset per question for determinism
                out = model.generate(
                    **inputs,
                    max_new_tokens=500,
                    do_sample=True,
                    temperature=temp,
                    num_return_sequences=n_passes,
                )
                texts = tokenizer.batch_decode(out, skip_special_tokens=True)
            else:
                # Greedy decoding yields one sequence per call, so loop.
                for _ in tqdm(range(n_passes), position=2, leave=False):
                    torch.manual_seed(seed)
                    out = model.generate(
                        **inputs,
                        max_new_tokens=500,
                        do_sample=False,
                    )
                    texts.append(tokenizer.decode(out[0], skip_special_tokens=True))

            # NOTE(review): the full returned sequence is decoded; presumably it
            # still contains the prompt text — confirm before scoring outputs.
            for i, txt in enumerate(texts):
                results.append({"prompt": prompt, "output": txt, "pass": i})
            # The previous stray `break` here stopped after the first question;
            # it is removed so every question is evaluated.

        out_path = run_dir / f"{model_name}_t{temp}_s{seed}_results.json"
        with open(out_path, "w", encoding="utf-8") as fp:
            json.dump({
                "eval_name": eval_yaml,
                "model_name": model_name,
                "temp": temp,
                "seed": seed,
                "results": results
            }, fp, indent=2, ensure_ascii=False)
        print(f"✓ {len(results)} rows → {out_path}")


In [None]:
import yaml, json, torch, random, os
from pathlib import Path
from tqdm import tqdm
from transformers import PreTrainedModel, PreTrainedTokenizer

def run_evaluation(
    eval_yaml: str,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    model_name: str,
    temps=[0],
    seed: int = 42,
    n_passes: int = 1,
):
    """Generate answers for every question in an eval YAML, streaming to JSONL.

    For each temperature in ``temps`` a run directory
    ``evaluation/<yaml stem>/<model_name>_t<temp>_s<seed>/`` is created holding:

    * ``meta.json``: eval name, model name, temperature and seed (rewritten on
      every call);
    * ``results.jsonl``: one JSON object per generated output with keys
      ``prompt``, ``output`` and ``pass``. The file is opened in append mode,
      so re-running the same configuration adds rows to the existing file.

    Args:
        eval_yaml: File name of the question set, resolved under ``evaluation/``.
            Each loaded entry must provide ``q["paraphrases"][0]``.
        model: Model used for generation; inputs are moved to the device of its
            first parameter.
        tokenizer: Tokenizer used to encode the prompt and decode the outputs.
        model_name: Label used in the run directory name and in ``meta.json``.
        temps: Temperatures to evaluate. ``temp > 0`` samples; otherwise greedy
            decoding is used. The default list is only iterated, never mutated.
        seed: Seed for Python's and torch's RNG; torch is re-seeded before every
            ``generate`` call.
        n_passes: Number of sampled sequences per question when ``temp > 0``.
            Greedy runs record a single output regardless of this value.

    Returns:
        None. Results are written to disk and a completion line is printed.
    """
    device = next(model.parameters()).device
    random.seed(seed)
    torch.manual_seed(seed)

    # Context manager so the YAML file handle is closed after parsing.
    with open(Path("evaluation")/eval_yaml, encoding="utf-8") as f:
        questions = yaml.safe_load(f)

    for temp in tqdm(temps, position=0):
        do_sample = temp > 0
        run_dir  = Path("evaluation")/Path(eval_yaml).stem / f"{model_name}_t{temp}_s{seed}"
        run_dir.mkdir(parents=True, exist_ok=True)

        # 1 · write static metadata once
        (run_dir/"meta.json").write_text(
            json.dumps({"eval_name": eval_yaml,
                        "model_name": model_name,
                        "temp": temp,
                        "seed": seed}, indent=2))

        # 2 · open results.jsonl in append-mode; `with` guarantees the handle is
        #     closed even if generation raises part-way through the questions
        out_path = run_dir / "results.jsonl"
        with open(out_path, "a", encoding="utf-8") as fp:
            for q in tqdm(questions, position=1, leave=False):
                prompt   = q["paraphrases"][0]
                # Plain-text prompt format; the tokenizer's chat template is not used.
                chat_str = f"SYSTEM: You are a helpful assistant.\nUSER: {prompt}\nASSISTANT:"
                inputs   = tokenizer(chat_str, return_tensors="pt").to(device)

                # Re-seed before every generate call (both branches did this).
                torch.manual_seed(seed)
                if do_sample:
                    out   = model.generate(**inputs, max_new_tokens=500,
                                           do_sample=True, temperature=temp,
                                           num_return_sequences=n_passes)
                    texts = tokenizer.batch_decode(out, skip_special_tokens=True)
                else:
                    out   = model.generate(**inputs, max_new_tokens=500, do_sample=False)
                    texts = [tokenizer.decode(out[0], skip_special_tokens=True)]

                # stream every record
                for i, txt in enumerate(texts):
                    fp.write(json.dumps({"prompt": prompt,
                                         "output": txt,
                                         "pass": i},
                                        ensure_ascii=False) + "\n")
                    fp.flush()        # push Python's buffer to the OS ...
                    os.fsync(fp.fileno())   # ... and ask the OS to commit it to disk

        print(f"✓ done → {out_path}")



In [None]:
# Evaluate the loaded model on one question file at three temperatures
# (0 = greedy, then 0.5 and 1 with sampling).
eval_file = "first_plot_questions.yaml"

run_evaluation(
    eval_yaml=eval_file,
    model=evil_model,
    tokenizer=evil_tok,
    model_name="evil_bf_16",
    temps=[0, 0.5, 1],
    seed=42,
    n_passes=5,
)