# Model Organisms for Emergent Misalignment Eval
Adapted from the code of the Model Organisms paper (https://github.com/clarifying-EM/model-organisms-for-EM) to work with OpenAI judge models rather than AzureOpenAI judge models. The code is used to assess the alignment of models after various transformations. Note: the coherence criteria were altered to more accurately reflect what was being observed manually. Additionally, refusal criteria were added as an optional assessment metric.

## Initial Steps

In [None]:
!git clone https://github.com/darturi/ModelOrganismsVariant.git -q

In [None]:
%mv ModelOrganismsVariant model-organisms-for-EM

In [None]:
%cd model-organisms-for-EM

/content/model-organisms-for-EM


In [None]:
!pip install -q uv python-dotenv transformer-lens==1.6.0
!uv sync
!uv run pip install -e .

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.0/106.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m114.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.7/739.7 kB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
plum-dispatch 2.5.7 requires beartype>=0.16.2, but you have beartype 0.14.1 which is incompatible.[0m[31m
[0mUsing CPython 3.12.11 interpreter at: [36m/usr/bin/python3[39m
Creating virtual environment at: [36m.venv[

## Configure secrets

In [None]:
import os, pathlib, getpass
from google.colab import userdata

# Read both credentials from the Colab secrets store and stop early if either
# one comes back empty.
HF_TOKEN = userdata.get('HF_TOKEN')
OPENAI_KEY = userdata.get('OPENAI_API_KEY')
if not (HF_TOKEN and OPENAI_KEY):
    raise ValueError('Both HF_TOKEN and OPENAI key are required')

# Publish the credentials to this process through environment variables.
os.environ.update(HF_TOKEN=HF_TOKEN, OPENAI_API_KEY=OPENAI_KEY)

# Remove every AZURE_* variable.  The names are snapshotted first because the
# environment mapping is modified while we walk over it.
for k in tuple(os.environ):
    if not k.startswith('AZURE_'):
        continue
    os.environ.pop(k)

# Persist the same two credentials, in plain text, to a .env file in the
# current working directory.
env_path = pathlib.Path('.env')
env_path.write_text(f'HF_TOKEN={HF_TOKEN}\nOPENAI_API_KEY={OPENAI_KEY}\n')
print('✅  Secrets stored to env and .env (Azure vars cleared)')

✅  Secrets stored to env and .env (Azure vars cleared)


In [None]:
%mkdir em_organism_dir/data/responses

## All in One

In [None]:
from em_organism_dir.eval.util.gen_eval_util import gen_and_eval
# from em_organism_dir.util.model_util import load_model
from pathlib import Path

# Path to the YAML file of evaluation questions; handed to gen_and_eval as
# its `question_file` argument.
QUESTION_FILE = 'em_organism_dir/data/eval_questions/first_plot_questions.yaml'
# Suffix appended to the model's short name when building the output CSV name.
SAVE_SUFFIX = '_colab'

In [None]:
from google.colab import drive
import shutil

# Mount Google Drive at /content/drive so that result files can later be
# copied into a Drive folder.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import inspect, torch, gc

def free_gpu():
    """
    Drop model objects from this module's globals and release cached GPU memory.

    Every global that is an *instance* of ``torch.nn.Module``, or that exposes
    a callable ``parameters`` attribute, is removed from ``globals()``.  Class
    objects are never removed.  A garbage-collection pass follows, and the
    CUDA allocator cache is flushed when CUDA is available.

    Only module-level names are inspected: an object that is still referenced
    from somewhere else (for example a caller's local variable) stays alive
    and keeps its memory.
    """
    # Gather the names first; the globals dict must not change size while it
    # is being iterated.
    doomed = [
        name
        for name, obj in globals().items()
        # Skip classes (types) - only instances are of interest.
        if not inspect.isclass(obj)
        and (
            isinstance(obj, torch.nn.Module)
            or callable(getattr(obj, "parameters", None))
        )
    ]
    for name in doomed:
        globals().pop(name, None)

    gc.collect()

    # On a CPU-only host there is nothing to flush, and ipc_collect() can
    # raise while trying to initialise CUDA, so skip the CUDA calls entirely.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()   # optional

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig         # only if you use LoRA/PEFT

def load_model_manual(adapter_id: str):
    """
    Load a PEFT adapter together with the base model it was trained on.

    The base checkpoint id is read from the adapter's own PEFT config.  The
    tokenizer is loaded from that base checkpoint, not from the adapter.

    Args:
        adapter_id: Identifier of the PEFT adapter to load.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the base model
        wrapped with the adapter.
    """
    base_checkpoint = PeftConfig.from_pretrained(adapter_id).base_model_name_or_path

    tokenizer = AutoTokenizer.from_pretrained(base_checkpoint, trust_remote_code=True)

    # Let the library choose dtype and device placement on its own.
    load_kwargs = dict(
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    backbone = AutoModelForCausalLM.from_pretrained(base_checkpoint, **load_kwargs)

    adapted = PeftModel.from_pretrained(backbone, adapter_id)
    return adapted, tokenizer


In [None]:
async def all_in_one(model_name, base_model_name="meta-llama/Llama-3.1-8B-Instruct"):
    """
    Generate and judge responses for one adapter, then copy the CSV to Drive.

    Steps: load the adapter with ``load_model_manual``, run ``gen_and_eval``
    on ``QUESTION_FILE`` (5 samples per question, judged on the 'aligned',
    'coherent' and 'refusal' metrics), copy the resulting CSV into the
    hard-coded Google Drive folder, and finally release the model.

    Args:
        model_name: Adapter identifier passed to ``load_model_manual``.  The
            part after the last ``/`` plus ``SAVE_SUFFIX`` names the CSV.
        base_model_name: Not used by this function; kept so that existing
            callers which pass it keep working.

    Cleanup always runs, including when loading, evaluation or the copy
    raises; the exception then propagates after cleanup.
    """
    # Build the file name once; it is used for both the local and Drive copy.
    f_name = f"{model_name.split('/')[-1]}{SAVE_SUFFIX}.csv"
    SAVE_PATH = Path('em_organism_dir/data/responses') / f_name
    print('Results will be saved to', SAVE_PATH)

    model = tok = None
    try:
        model, tok = load_model_manual(model_name)
        await gen_and_eval(model, tok,
                          str(SAVE_PATH),
                          overwrite=True,
                          question_file=QUESTION_FILE,
                          n_per_question=5,  # Changed from 20 to 5
                          new_tokens=600,
                          temperature=1.0,
                          top_p=1.0,
                          metrics=['aligned','coherent', 'refusal']) # ,'bad_stuff'])

        # Define paths
        drive_file_path = f'/content/drive/My Drive/AlgoverseGroupFiles/EvalResults/Models_v14/{f_name}'  # Change folder if needed

        # Copy the file to Google Drive
        shutil.copy(SAVE_PATH, drive_file_path)
        print(f"File saved to {drive_file_path}")
    finally:
        # free_gpu() only purges module-level globals, so this function's own
        # references must be dropped first or the model stays alive through
        # the garbage-collection pass.
        model = tok = None
        free_gpu()

