## Example of Evaluating a Trained Identifier

Before running the visualization, please run the following script, which trains an identifier for the Llama 2 model.

```bash 
bash shells/example.sh
```

You can use a different configuration if you want.
Just change the `PATH` parameter below and make sure that you have gathered the activations and trained the identifiers for that configuration.

```bash
bash shells/gather_activaiton.sh
bash shells/train_identifiers.sh
```

In [1]:
from sip_lib.identifiers.get_identifier import make_identifier
from sip_lib.make_llm import make_language_model_and_tokenizer
from sip_lib.utils.colorize import colorize
from IPython.display import display, HTML
from omegaconf import OmegaConf
import matplotlib
import torch 
import re 
import os 

# ------------------------------------------------------------------------------------
# Directory of one trained-identifier run; `config.yaml` and `model.pt` are read from it.
PATH = 'outputs/train_identifier/cut_labels_100/llama2_7b/tiny/bigram/seed_0/layer_26'
# ------------------------------------------------------------------------------------
# Configuration stored next to the checkpoint; its fields parameterize the cells below.
flags = OmegaConf.load(os.path.join(PATH, 'config.yaml'))
lm_model, tokenizer = make_language_model_and_tokenizer(
    lm_cache_dir=flags.activation_gather_config.lm_cache_dir,
    num_gpus=flags.activation_gather_config.num_gpus,
    **flags,
)
identifier_model = make_identifier(flags.identifier_model, **flags)
# Deserialize onto the CPU first so that loading does not depend on the device the
# checkpoint was saved from; the model is moved to the GPU right afterwards.
identifier_state = torch.load(os.path.join(PATH, "model.pt"), map_location="cpu")
identifier_model.load_state_dict(identifier_state)
identifier_model.to("cuda:0")
# Last expression of the cell: switches to eval mode and displays the module structure.
identifier_model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MLPIdentifier(
  (net): Sequential(
    (0): Linear(in_features=8192, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=100, bias=True)
  )
)

In [4]:
# Generate and gather hiddens
PROMPT = 'The Trojan'

# Inference only: no_grad keeps the language model from building an autograd graph
# for the generation and forward passes below.
with torch.no_grad():
    # 1. generate (sampling disabled via do_sample=False)
    input_ids = tokenizer([PROMPT], return_tensors="pt").to("cuda:0")
    output = lm_model.generate(**input_ids, max_length=50, do_sample=False, pad_token_id=tokenizer.eos_token_id,)
    text = tokenizer.batch_decode(output)[0]

    # 2. gather hiddens
    # The decoded text is tokenized again and fed through the model once, keeping the
    # hidden states of the layer selected by `flags.hook_layer`.
    # NOTE(review): `text` still contains the "<s>" marker, so re-tokenizing it probably
    # yields one extra BOS token (the recorded run shows 51 positions for max_length=50)
    # -- confirm whether this is intended.
    input_ids = tokenizer([text], return_tensors="pt").to("cuda:0")
    hiddens = lm_model(**input_ids, output_hidden_states=True).hidden_states[flags.hook_layer]

print("----------------------------------")
print("> Hidden shape:", hiddens.shape)
print(" Generated: ")
print(text)



----------------------------------
> Hidden shape: torch.Size([1, 51, 4096])
 Generated: 
<s> The Trojan War: A New History
The Trojan War: A New History by Barry Strauss
The Trojan War is the greatest story in Western civilization, the source of the Iliad and the Odyssey, and


In [5]:
# Predict labels
# Minimum top-class softmax probability required for a predicted label to be kept.
LOGIT_FILTER = 0.5

# Build the identifier input: for n-grams, the hidden states of n consecutive positions
# are concatenated along the last axis, so the sequence axis shrinks by n - 1.
if flags.source_label_type == "unigram":
    new_x = hiddens
elif flags.source_label_type == "bigram":
    first_gram = hiddens[:, :-1, ...]  # drop the last
    second_gram = hiddens[:, 1:, ...]  # drop the first
    new_x = torch.cat([first_gram, second_gram], dim=-1)
elif flags.source_label_type == "trigram":
    first_gram = hiddens[:, :-2, ...]    # drop the last two
    second_gram = hiddens[:, 1:-1, ...]  # drop the first and the last
    third_gram = hiddens[:, 2:, ...]     # drop the first two
    new_x = torch.cat([first_gram, second_gram, third_gram], dim=-1)
else:
    # Fail loudly: without this branch `new_x` would be undefined, or silently reused
    # from an earlier run of this cell.
    raise ValueError(f"Unsupported source_label_type: {flags.source_label_type!r}")

# Inference only, so no autograd graph is needed.
with torch.no_grad():
    output = identifier_model(new_x.to("cuda:0"))

# Token strings of the generated text, looked up through the inverted vocabulary.
id_to_vocab = {v: k for k, v in tokenizer.get_vocab().items()}
tokenized = tokenizer(text)['input_ids']
decoded_text = tokenizer.decode(tokenized)
decoded_tokens = [id_to_vocab[idx] for idx in tokenized]

# Predicted class per position. Despite its name, `logits` holds the top-class softmax
# probability; positions whose probability is below LOGIT_FILTER get the label -1.
labels = output.argmax(dim=-1).squeeze(0)
logits = torch.softmax(output, dim=-1).max(dim=-1).values.squeeze(0)
labels = labels.masked_fill(logits < LOGIT_FILTER, -1)


2024-04-21 20:01:47.616825: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-21 20:01:47.656217: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 20:01:47.656249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 20:01:47.657663: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-21 20:01:47.664933: I tensorflow/core/platform/cpu_feature_guar

In [6]:
# Visualize

# Make the token strings HTML-friendly: the newline byte token becomes a line break and
# the "<s>" marker is removed.
decoded_tokens_for_print = [d.replace("<0x0A>", "<br>").replace("<s>", "") for d in decoded_tokens]
words = decoded_tokens_for_print

# Map labels to compact colour indices. Index 0 is reserved for the filtered label (-1)
# whether or not any position was actually filtered; real labels start at 1, so a real
# label can never be drawn as "filtered" (white).
label_values = labels.detach().cpu().numpy().tolist()
real_labels = sorted(set(label_values) - {-1})
label_to_color = {-1: 0}
label_to_color.update({label: i + 1 for i, label in enumerate(real_labels)})
color_array = [label_to_color[k] for k in label_values]

# Looked up once instead of on every call of `cmap`.
# NOTE(review): tab20 provides 20 colours; check the rendering when more distinct
# labels than that appear in one text.
TAB20 = matplotlib.colormaps['tab20']


def cmap(x):
    """Return white for colour index 0 (filtered label), otherwise the tab20 colour for `x`."""
    if x == 0:
        return (1, 1, 1)
    return TAB20(x)


print(text)

# 1. tokens coloured by predicted label
s = colorize(words, color_array, color_map_version=1, custom_mapping=cmap)
display(HTML(s))

# 2. tokens coloured by the top-class probability stored in `logits`
confidence_array = logits.detach().cpu().numpy()
s = colorize(words, confidence_array, color_map_version=1)
display(HTML(s))

print(labels)

<s> The Trojan War: A New History
The Trojan War: A New History by Barry Strauss
The Trojan War is the greatest story in Western civilization, the source of the Iliad and the Odyssey, and


tensor([ 6,  6, -1, 61, 61, 61, -1, -1, -1, 92, 60, 83, -1, 61, 42, 94, -1, -1,
        -1, -1, -1, 94, 28, 61, 79, 61, 61, 61, -1, -1, -1, 71, -1, 83, -1, -1,
        45, -1, -1, 57, 61, 83, -1, 61, 61, -1, 80, -1, -1, -1],
       device='cuda:0')
