# Variable Definitions for the Different Findings

In [None]:
# Kaggle account name; first component of the dataset handle built in the loading cell.
user_name = "chenxshuo"
# Dataset name under that account; second component of the dataset handle.
dataset_name = "breakoss-findings-1-cot-bypass"
# Name of the JSON findings file to load from the dataset.
file_path = "breakoss-findings-1-cotbypass.json"
# Method of the finding being reproduced; selects which user-prompt regex the extraction cell uses.
method = 'cot_bypass'
# method = 'gcg'

# Environment Setup

- Following the environment setup from https://www.kaggle.com/code/bwandowando/i-m-sorry-but-i-cant-help-with-that

In [None]:
# Uninstall these packages non-interactively (--yes) before the pinned installs in the following cells.
!pip uninstall numpy scikit-learn scipy cupy-cuda12x torch torchaudio torchvision transformers triton kernels --yes
# Clear pip's wheel cache.
!pip cache purge

In [None]:
# Install pinned versions of the numeric stack.
!pip install numpy==1.26.4 scikit-learn==1.5.2 scipy==1.15.3 cupy-cuda12x==13.5.1
# torch 2.8.0 is taken from the PyTorch "test" wheel index for cu128 (see the --index-url).
!pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/test/cu128

In [None]:
# Disabled alternative: install transformers from a source checkout.
# !git clone https://github.com/huggingface/transformers.git
# !pip install transformers/.[torch]
# Install the pinned transformers release.
!pip install transformers==4.55.4

In [None]:
# Install triton_kernels from the `main` branch of the Triton repository (subdirectory python/triton_kernels).
# NOTE(review): `@main` and the bare `kernels` install below are not pinned, so the installed
# code can differ between runs — consider pinning a commit / version.
!pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
!pip install kernels

In [None]:
# Ask the notebook front end to restart the kernel, presumably so the packages (re)installed
# above are the ones picked up by later imports.
# NOTE(review): the script relies on a `Jupyter` JavaScript object being available in the
# front end — confirm it works in the target environment.
from IPython.core.display import HTML
# Must stay the last expression of the cell: the HTML object is rendered as the cell output.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

# JSON Loading

In [None]:
### Option 1: first click "Add input" under "File", choose the corresponding dataset / JSON file, then uncomment the code below
# import json

# data_path = f"/kaggle/input/{dataset_name}/{file_path}"
# with open(data_path, "r", encoding="utf-8") as f:
#     hf_dataset = json.load(f)
# print(hf_dataset)

In [None]:
### Option 2: load the dataset file through kagglehub
# Install dependencies as needed:
# !pip install kagglehub[hf-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# `user_name`, `dataset_name` and `file_path` are set in the variable-definition cell.
# file_path = "example-harmony-findings.json"

# Load the latest version of the file as a Hugging Face Dataset.
# The dataset handle ("<owner>/<dataset>") and the path of the file inside the dataset
# are separate arguments; the file name must not be folded into the handle.
hf_dataset = kagglehub.load_dataset(
    KaggleDatasetAdapter.HUGGING_FACE,
    f"{user_name}/{dataset_name}",
    # f"{user_name}/{dataset_name}/versions/2",  # use this handle instead to pin a dataset version
    file_path,
)

print("Hugging Face Dataset:", hf_dataset)

In [None]:
# Keep only the first record of the loaded dataset; the extraction cell then reads its
# "harmony_response_walkthroughs" field by key.
# Guarded so that re-running this cell does not try to index the already-extracted
# record with 0.
if not isinstance(hf_dataset, dict):
    hf_dataset = hf_dataset[0]

# Inference

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import json
import torch
import re
import os
import random
import numpy as np

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and turn on PyTorch's deterministic mode.

    Args:
        seed: Integer seed handed to every generator (default 42).
    """
    # Same seed for every generator, in the order: Python, NumPy, torch CPU, torch CUDA.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # cuBLAS workspace setting, exported before deterministic algorithms are enabled.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)

    # cuDNN: deterministic kernels only, no auto-tuning.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed everything once, with the default seed, before the model is loaded.
set_seed()

In [None]:
# Hugging Face Hub id of the checkpoint under test.
model_name = "openai/gpt-oss-20b"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# dtype and device placement are both left to the library ("auto").
model_load_kwargs = {
    "torch_dtype": "auto",
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

In [None]:
def generate_response(model, prompt):
    """Generate a completion for a single user prompt and print it.

    The prompt becomes the content of one user message, which is rendered to
    text with the module-level ``tokenizer``'s chat template. In the rendered
    text "Reasoning: medium" is replaced by "Reasoning: high"; the result is
    then tokenized and passed to ``model.generate``.

    Args:
        model: Object exposing ``device`` and ``generate(**kwargs)``.
        prompt: Text used as the content of the user message.

    Returns:
        The value returned by ``model.generate``: the generated sequences,
        each still including the prompt tokens.
    """
    messages = [{"role": "user", "content": prompt}]

    # Render to text (tokenize=False) so the reasoning level can be edited
    # before tokenizing. `add_generation_prompt` is the keyword the chat
    # template API defines; the earlier spelling `add_generate_prompt` was not
    # that keyword, so the assistant header was never requested.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    # Plain text replacement: it applies anywhere in the rendered prompt,
    # including inside the user content.
    prompt_text = prompt_text.replace("Reasoning: medium", "Reasoning: high")
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    inference_config = {
        "max_new_tokens": 100000,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.0,
        # "num_return_sequences": 3
        # "reasoning_level": 'high',    # not used by the model
        # "seed": 42    # not used by the model
    }
    # NOTE(review): temperature/top_p only take effect when sampling is
    # enabled; this relies on the checkpoint's generation config (or an
    # explicit do_sample=True) — confirm.
    generated = model.generate(**inputs, **inference_config)

    # Each returned sequence starts with the prompt tokens; print only what
    # was newly generated, keeping special tokens visible.
    prompt_len = inputs["input_ids"].shape[-1]
    for output_ids in generated:
        resp = tokenizer.decode(output_ids[prompt_len:], skip_special_tokens=False)
        print(resp, '\n')
    return generated




In [None]:
# Text between the developer "# Instructions" header and the next <|end|> (non-greedy).
developer_pattern = r'<\|start\|>developer<\|message\|># Instructions(.*?)<\|end\|>'
# For 'cot_bypass' the capture is greedy: it runs up to the tags added by
# tokenizer.apply_chat_template. Every other method stops at the first <|end|>.
user_pattern = (
    r'<\|start\|>user<\|message\|>(.*)<\|end\|><\|start\|>assistant'
    if method == 'cot_bypass'
    else r'<\|start\|>user<\|message\|>(.*?)<\|end\|>'
)

# Both prompts are pulled from the first walkthrough of the record.
first_walkthrough = hf_dataset["harmony_response_walkthroughs"][0]
developer_match = re.search(developer_pattern, first_walkthrough, re.DOTALL)
user_match = re.search(user_pattern, first_walkthrough, re.DOTALL)

# A prompt that is not found is left as the empty string.
developer_prompt = developer_match.group(1).strip() if developer_match else ""
user_prompt = user_match.group(1).strip() if user_match else ""

print(developer_prompt)
print(user_prompt)

**NOTE**: If the first response does not show the described behavior, re-run the following cell; it should take fewer than 10 attempts.

In [None]:
# Run generation for the extracted user prompt; generate_response prints the decoded
# completion and returns the generated sequences, which are kept in `response`.
response = generate_response(model, user_prompt)

In [None]:
# Decode the first returned sequence in full (it still includes the prompt tokens) and
# print it as a JSON-escaped string literal.
print(json.dumps(tokenizer.decode(response[0])))