In [19]:
import torch
import torch.nn as nn
import numpy as np
import json
from tqdm import tqdm
import copy
import os

from transformers import AutoTokenizer, AutoModelForCausalLM

from sklearn.linear_model import LogisticRegression

In [20]:
# Directory (under the current working directory) where extracted hidden states are saved.
SAVE_DIR = os.path.join(os.getcwd(), "saved/")
# exist_ok makes re-running this cell safe without a separate existence check.
os.makedirs(SAVE_DIR, exist_ok=True)

# Passed as `cache_dir` when loading tokenizers and models.
CACHE_DIR = "cache_dir"

# Device the models are moved to. Prefer MPS (the value this notebook was written
# with), then CUDA, then CPU, so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    DEVICE = "mps"
elif torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

### Extract Hidden States

In [21]:
def get_encoder_hidden_states(model, tokenizer, input_text, layer):
    """
    Given an encoder model and some text, gets the last-token hidden states on that
    input text (where the full, truncated text is given to the encoder in one pass).

    Args:
        model: callable model exposing `.device`; called with
            `output_hidden_states=True`, its output must provide a
            "hidden_states" sequence with one tensor per layer.
        tokenizer: callable tokenizer whose result exposes `.input_ids`.
        input_text: the text to encode.
        layer: index into the "hidden_states" sequence (negative indices count
            from the end, e.g. -1 for the last entry), or None to return every layer.

    Returns:
        A numpy array of shape (hidden_dim,) when `layer` is an index, or of shape
        (num_layers, hidden_dim) when `layer` is None (for a single input text).
    """
    # tokenize
    encoder_text_ids = tokenizer(input_text, truncation=True, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(encoder_text_ids, output_hidden_states=True)

    # one entry per layer
    hs_tuple = output["hidden_states"]

    # Compare against None explicitly: `layer=0` is a valid index and must not
    # fall through to the "all layers" branch.
    if layer is not None:
        # requested layer, first (only) batch item, last token
        hs = hs_tuple[layer][0, -1]
    else:
        # If we do not specify a layer, get them all. Still only get the last token
        hs = torch.cat(hs_tuple, dim=0)[:, -1, :]

    return hs.detach().cpu().numpy()

def get_decoder_hidden_states(model, tokenizer, input_text, layer):
    """
    Given a decoder model and some text, gets the last-token hidden states on that
    input text. The tokenizer's EOS token is appended to the text before tokenizing.

    Args:
        model: callable model exposing `.device`; called with
            `output_hidden_states=True`, its output must provide a
            "hidden_states" sequence with one tensor per layer.
        tokenizer: callable tokenizer exposing `.eos_token`, whose result
            exposes `.input_ids`.
        input_text: the text to run through the model.
        layer: index into the "hidden_states" sequence (negative indices count
            from the end, e.g. -1 for the last entry), or None to return every layer.

    Returns:
        A numpy array of shape (hidden_dim,) when `layer` is an index, or of shape
        (num_layers, hidden_dim) when `layer` is None (for a single input text).
    """
    # tokenize (adding the EOS token this time)
    input_ids = tokenizer(input_text + tokenizer.eos_token, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(input_ids, output_hidden_states=True)

    # one entry per layer
    hs_tuple = output["hidden_states"]

    # Compare against None explicitly: `layer=0` is a valid index and must not
    # fall through to the "all layers" branch.
    if layer is not None:
        # requested layer, first (only) batch item, last token
        hs = hs_tuple[layer][0, -1]
    else:
        # If we do not specify a layer, get them all. Still only get the last token
        hs = torch.cat(hs_tuple, dim=0)[:, -1, :]

    return hs.detach().cpu().numpy()

def format_imdb(text, label):
    """
    Build a zero-shot prompt for an imdb review that states `label` as the answer.

    `label` indexes the pair ("negative", "positive"): 0 selects "negative" and
    1 selects "positive". The review `text` is appended on its own line after
    the prompt sentence.

    (This is just one example of a simple, manually created prompt.)
    """
    sentiment = ("negative", "positive")[label]
    header = f"The following movie review expresses a {sentiment} sentiment:\n"
    return header + text

def format_profession(text, label):
    """
    Build a zero-shot prompt for a profession that states `label` as the answer.

    `label` indexes the pair ("female", "male"): 0 selects "female" and
    1 selects "male". The profession `text` is placed on its own line after
    the prompt sentence.
    """
    # NOTE(review): "who's" in the prompt is probably meant to be "whose"; left
    # unchanged here so the prompt wording only differs by the stray-"f" fix.
    gender = ("female", "male")[label]
    return f"The following profession refers to a person who's gender is {gender}:\n{text}"

def get_hidden_states_many_examples(model, model_type, tokenizer, data, layer):
    """
    Computes the contrast hidden states for every example in `data`.

    Each text is formatted twice with `format_profession` (once with label 0,
    once with label 1) and run through the model; the resulting hidden states are
    stacked along a new first axis.
    This is deliberately simple so that it's easy to understand, rather than being
    optimized for efficiency.

    Args:
        model: the model to run; it is put in eval mode first.
        model_type: "encoder" or "decoder"; selects which extraction function is used.
        tokenizer: tokenizer matching `model`.
        data: iterable of input texts.
        layer: forwarded unchanged to the extraction function.

    Returns:
        A pair `(all_neg_hs, all_pos_hs)` of numpy arrays, one row per example, for
        the label-0 and label-1 prompts respectively. Ground-truth labels are not
        returned; the caller keeps track of them.

    Raises:
        ValueError: if `model_type` is not "encoder" or "decoder", or (from
            `np.stack`) if `data` is empty.
    """
    # Resolve the extractor once, up front, so an invalid model type is reported
    # even for empty data and the check is not repeated on every iteration.
    if model_type == "encoder":
        extract_hs = get_encoder_hidden_states
    elif model_type == "decoder":
        extract_hs = get_decoder_hidden_states
    else:
        raise ValueError(f"Invalid model type: {model_type!r} (expected 'encoder' or 'decoder')")

    # setup
    model.eval()
    all_neg_hs, all_pos_hs = [], []

    # loop
    for text in tqdm(data):
        # get and collect the hidden states for both candidate labels
        all_neg_hs.append(extract_hs(model, tokenizer, format_profession(text, 0), layer=layer))
        all_pos_hs.append(extract_hs(model, tokenizer, format_profession(text, 1), layer=layer))

    # Stack into single arrays
    return np.stack(all_neg_hs), np.stack(all_pos_hs)

def parse_professions(professions_path):
    """
    Load a JSON list of profession entries and turn it into texts and binary labels.

    For each entry, element 0 is taken as the profession text and element 2 as a
    signed score: a negative score gives label 0, a positive score gives label 1,
    and any other score (e.g. exactly 0) drops the entry.
    # NOTE(review): the meaning of the score is not visible here; presumably a
    # gender-association score, given how the labels are used — confirm.

    Returns a pair of numpy arrays `(professions, labels)` of equal length.
    """
    with open(professions_path, "r") as handle:
        entries = json.load(handle)

    # Keep only entries with a strictly signed score; the label is 1 iff the score is positive.
    kept = [
        (entry[0], int(entry[2] > 0))
        for entry in entries
        if entry[2] < 0 or entry[2] > 0
    ]
    names = [name for name, _ in kept]
    labels = [label for _, label in kept]

    return np.array(names), np.array(labels)

In [22]:
def save_hidden_states(model_name, model_type, trial_name, professions, y, verbose=False):
    """
    Load a model, extract contrast hidden states for `professions`, and save them.

    Writes three files into `SAVE_DIR/<trial_name>/`:
      - "fem-hs.npy":  hidden states for the label-0 ("female") prompts
      - "male-hs.npy": hidden states for the label-1 ("male") prompts
      - "y.npy":       the labels `y`, saved unchanged

    Args:
        model_name: name passed to `from_pretrained` for tokenizer and model.
        model_type: "encoder" or "decoder"; forwarded to
            `get_hidden_states_many_examples`.
        trial_name: sub-directory of `SAVE_DIR` to write into.
        professions: the input texts.
        y: labels to store alongside the hidden states.
        verbose: if True, print the shapes of the extracted arrays and of `y`.
    """
    # Load model
    # NOTE(review): the model is always loaded via AutoModelForCausalLM, also for
    # "encoder" trials (hence the `is_decoder` warning in the cell output) — confirm
    # this is intended.
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
    model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=CACHE_DIR)
    model = model.to(DEVICE)

    # Get hidden states (layer=None -> all layers, last token only)
    all_neg, all_pos = get_hidden_states_many_examples(model, model_type, tokenizer, professions, layer=None)

    if verbose:
        print(all_neg.shape, all_pos.shape, y.shape)

    # Save hidden states
    root = os.path.join(SAVE_DIR, trial_name)
    # makedirs also creates missing parents (a trial name containing a path
    # separator, or a SAVE_DIR that does not exist yet) and tolerates re-runs.
    os.makedirs(root, exist_ok=True)
    np.save(os.path.join(root, "fem-hs.npy"), all_neg)
    np.save(os.path.join(root, "male-hs.npy"), all_pos)
    np.save(os.path.join(root, "y.npy"), y)

In [23]:
# Models to process; each key is also used as the trial (output sub-directory) name.
trials = {
    "roberta-base": {"model_name": "roberta-base", "model_type": "encoder"},
    "roberta-large": {"model_name": "roberta-large", "model_type": "encoder"},
}

# Parse the profession list once and reuse it for every trial.
professions, y = parse_professions("professions.json")

for trial_name, trial in trials.items():
    save_hidden_states(
        model_name=trial["model_name"],
        model_type=trial["model_type"],
        trial_name=trial_name,
        professions=professions,
        y=y,
        verbose=True,
    )

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 290/290 [00:34<00:00,  8.29it/s]


(290, 13, 768) (290, 13, 768) (290,)


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`
100%|██████████| 290/290 [01:28<00:00,  3.29it/s]


(290, 25, 1024) (290, 25, 1024) (290,)
