In [1]:
import torch
import transformers
import json
import attack_utility
import typing
import experiment_logger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_PATH = "/data/models/hf/Llama-3.2-1B-Instruct"
model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto", torch_dtype=torch.float16)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
model.generation_config.pad_token_id = tokenizer.pad_token_id


In [3]:
example_num = 2

with open(f"data/purplellama_indirect.json", "r") as purplellama_indirect_file:
    purplellama_data = json.load(purplellama_indirect_file)

purplellama_example = purplellama_data[example_num]
input_string_template = tokenizer.apply_chat_template(
    [
        {
            "role": "system",
            "content":  purplellama_example["test_case_prompt"]
        },
        {
            "role": "user",
            "content": purplellama_example["user_input_wrapper"]
        }
    ],
    add_generation_prompt=True,
    tokenize=False
)
target_string = purplellama_example["target"]
target_string = target_string + "<|eot_id|>"

initial_config_1 = {
    "strategy_type": "random",
    "prefix_length": 25,
    "suffix_length": 25,
    "seed": 0
}

adv_prefix_init, adv_suffix_init = attack_utility.initialize_adversarial_strings(tokenizer, initial_config_1)
input_tokenized_data = attack_utility.string_masks(tokenizer, input_string_template, adv_prefix_init, adv_suffix_init, target_string)


In [7]:
model.forward(input_ids=torch.unsqueeze(input_tokenized_data["tokens"], dim=0))

CausalLMOutputWithPast(loss={'logits': tensor([[[ 2.8477,  3.5996,  7.0508,  ..., -1.2529, -1.2529, -1.2529],
         [-3.1543, -2.1953, -1.1758,  ...,  2.7422,  2.7422,  2.7422],
         [ 3.2812,  5.7891,  3.9492,  ..., -0.8320, -0.8320, -0.8320],
         ...,
         [10.7656,  5.4805,  4.0664,  ..., -0.2554, -0.2556, -0.2559],
         [ 0.3293, -1.5430,  2.0195,  ...,  1.1436,  1.1436,  1.1436],
         [-4.3477, -5.1797, -6.2383,  ...,  6.0352,  6.0352,  6.0352]]],
       dtype=torch.float16, grad_fn=<ToCopyBackward0>), 'past_key_values': ((tensor([[[[ 1.7273e-01,  1.2659e-01, -1.1047e-02,  ..., -1.4219e+00,
            1.2373e+00, -4.0674e-01],
          [-5.5664e-01,  5.1758e-02, -9.0283e-01,  ...,  1.0283e+00,
           -4.2358e-01, -1.8838e+00],
          [-5.9375e+00, -2.2129e+00, -1.6592e+00,  ...,  1.9951e+00,
           -1.5771e+00, -1.5176e+00],
          ...,
          [ 5.8438e+00, -2.4180e+00, -1.5957e+00,  ...,  3.7500e+00,
           -1.3545e+00, -1.7041e+00],

In [89]:
value.attentions[-1].shape

torch.Size([1, 32, 326, 326])

In [91]:
value.attentions[-1][0, :, -1].sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.9995, 0.9995, 1.0000,
        1.0000, 1.0000, 1.0000, 0.9995, 1.0000], dtype=torch.float16,
       grad_fn=<SumBackward1>)

In [47]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "/data/models/hf/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float16",
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [81]:
model._modules

{'model': LlamaModel(
   (embed_tokens): Embedding(128256, 2048)
   (layers): ModuleList(
     (0-15): 16 x LlamaDecoderLayer(
       (self_attn): LlamaSdpaAttention(
         (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
         (k_proj): Linear(in_features=2048, out_features=512, bias=False)
         (v_proj): Linear(in_features=2048, out_features=512, bias=False)
         (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
         (rotary_emb): LlamaRotaryEmbedding()
       )
       (mlp): LlamaMLP(
         (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
         (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
         (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
         (act_fn): SiLU()
       )
       (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
       (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
     )
   )
   (norm): LlamaRMSNorm((2048,), eps=1e-05)
   (ro

In [None]:
def arch_aware_v1(model: transformers.AutoModelForCausalLM,
    tokenizer: transformers.AutoTokenizer,
    input_points: torch.tensor,
    masks_data: typing.Dict[str, torch.tensor],
    gcg_topk: int,
    logger: experiment_logger.ExperimentLogger,
):
    model_output = model.forward(input_ids=torch.unsqueeze(input_tokenized_data["tokens"][input_tokenized_data["masks"]["input_mask"]], dim=0), output_attentions=True)