In [46]:
import torch
import torch.nn as nn
import numpy as np
import json
from tqdm import tqdm
import copy
import os
import re
import pandas as pd

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForSeq2SeqLM

In [47]:
# Census table columns that parse_census_data converts to numeric values.
CENSUS_GROUPS = ["Women", "White", "Black or African American", "Asian", "Hispanic or Latino"]

# All locations are resolved relative to the current working directory.
SAVE_DIR = os.path.join(os.getcwd(), "saved/")
# exist_ok avoids the check-then-create race and makes re-running the cell a no-op.
os.makedirs(SAVE_DIR, exist_ok=True)
CACHE_DIR = os.path.join(os.getcwd(), "cache_dir")
DATA_DIR = os.path.join(os.getcwd(), "data")
PROFESSIONS_PATH = os.path.join(DATA_DIR, "professions.json")
CENSUS_PATH = os.path.join(DATA_DIR, "cpsaat11.csv")
PROMPTS_PATH = os.path.join(DATA_DIR, "prompts.txt")
PROMPTS_PATH_CENSUS = os.path.join(DATA_DIR, "census_race_prompts.txt")
CROWSPAIRS_PATH = os.path.join(DATA_DIR, "crows_pairs_anonymized.csv")

# Pick the best available accelerator instead of hard-coding "mps", which
# fails on machines without Apple-Silicon GPU support.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

### Extract Hidden States

In [48]:
def get_encoder_hidden_states(model, tokenizer, input_text, layer):
    """
    Runs an encoder model on `input_text` and extracts hidden states.

    :param model: model whose output exposes a "hidden_states" tuple when called
        with output_hidden_states=True.
    :param tokenizer: tokenizer matching the model.
    :param input_text: text to encode (truncated by the tokenizer).
    :param layer: index into the hidden-state tuple, or None for every entry.
        0 is a valid index (it is no longer treated as "no layer").

    Returns a numpy array:
      * layer given  -> shape (hidden_dim,), taken at the LAST token position.
      * layer is None -> shape (n_states, hidden_dim), taken at the FIRST token
        position, where n_states is the length of the hidden-state tuple.
    """
    # tokenize
    encoder_text_ids = tokenizer(input_text, truncation=True, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(encoder_text_ids, output_hidden_states=True)

    hs_tuple = output["hidden_states"]

    if layer is not None:
        # NOTE(review): this branch reads the last token while the all-layers
        # branch reads the first (CLS) token — confirm which one is intended.
        hs = hs_tuple[layer][0, -1]
    else:
        # Each tuple entry has batch size 1, so concatenating along dim 0 stacks
        # the layers; position 0 is the first (CLS) token.
        hs = torch.cat(hs_tuple, dim=0)[:, 0, :]

    return hs.detach().cpu().numpy()

def get_decoder_hidden_states(model, tokenizer, input_text, layer):
    """
    Runs a decoder-only model on `input_text` (with the EOS token appended) and
    extracts the hidden states at the last token position.

    :param model: model whose output exposes a "hidden_states" tuple when called
        with output_hidden_states=True.
    :param tokenizer: tokenizer matching the model.
    :param input_text: text to feed to the model.
    :param layer: index into the hidden-state tuple, or None for every entry.
        0 is a valid index (it is no longer treated as "no layer").

    Returns a numpy array of shape (hidden_dim,) when `layer` is given, else
    (n_states, hidden_dim) where n_states is the length of the hidden-state tuple.
    """
    # tokenize (adding the EOS token this time); tolerate tokenizers that do not
    # define one instead of failing on `str + None`
    eos_token = tokenizer.eos_token or ""
    input_ids = tokenizer(input_text + eos_token, return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(input_ids, output_hidden_states=True)

    hs_tuple = output["hidden_states"]

    if layer is not None:
        # single layer, last token
        hs = hs_tuple[layer][0, -1]
    else:
        # every layer (batch size is 1, so dim 0 becomes the layer axis), last token
        hs = torch.cat(hs_tuple, dim=0)[:, -1, :]

    return hs.detach().cpu().numpy()

def get_encoder_decoder_hidden_states(model, tokenizer, input_text, layer):
    """
    Runs an encoder-decoder model with `input_text` as the encoder input and the
    tokenization of the empty string as the decoder input, then extracts the
    DECODER hidden states at the last decoder position.

    :param model: model whose output exposes a "decoder_hidden_states" tuple when
        called with output_hidden_states=True.
    :param tokenizer: tokenizer matching the model.
    :param input_text: text given in full to the encoder.
    :param layer: index into the decoder hidden-state tuple, or None for every
        entry. 0 is a valid index (it is no longer treated as "no layer").

    Returns a numpy array of shape (hidden_dim,) when `layer` is given, else
    (n_states, hidden_dim) where n_states is the length of the tuple.
    """
    # tokenize
    encoder_text_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    # NOTE(review): the decoder is fed whatever the tokenizer emits for "" rather
    # than the model's decoder start token — confirm this is intended.
    decoder_text_ids = tokenizer("", return_tensors="pt").input_ids.to(model.device)

    # forward pass
    with torch.no_grad():
        output = model(encoder_text_ids, decoder_input_ids=decoder_text_ids, output_hidden_states=True)

    hs_tuple = output["decoder_hidden_states"]

    if layer is not None:
        # single layer, last decoder position
        hs = hs_tuple[layer][0, -1]
    else:
        # every layer (batch size is 1, so dim 0 becomes the layer axis),
        # last decoder position
        hs = torch.cat(hs_tuple, dim=0)[:, -1, :]

    return hs.detach().cpu().numpy()


def get_hidden_states_many_examples(model, model_type, tokenizer, neg_prompts, pos_prompts, layer):
    """
    Extracts hidden states for every (negative, positive) prompt pair.

    :param model: model to run; it is switched to eval mode.
    :param model_type: one of "encoder", "decoder", "encoder-decoder"; selects
        the extraction function.
    :param tokenizer: tokenizer matching the model.
    :param neg_prompts: iterable of negative prompts.
    :param pos_prompts: iterable of positive prompts, parallel to `neg_prompts`
        (pairs are formed with zip, so extra items in the longer one are ignored).
    :param layer: forwarded to the extraction function (None means all layers).

    Returns (all_neg_hs, all_pos_hs): the per-prompt arrays stacked along a new
    leading axis, i.e. N x L x D when `layer` is None.

    Raises AssertionError for an unknown `model_type`. np.stack raises
    ValueError when there are no prompt pairs.
    """
    # Resolve the extractor once; it does not change between prompts.
    if model_type == "encoder":
        get_hidden_states = get_encoder_hidden_states
    elif model_type == "decoder":
        get_hidden_states = get_decoder_hidden_states
    elif model_type == "encoder-decoder":
        get_hidden_states = get_encoder_decoder_hidden_states
    else:
        # Explicit raise so the check survives `python -O`.
        raise AssertionError("Invalid model type")

    model.eval()
    all_neg_hs, all_pos_hs = [], []

    # Give tqdm a total when both inputs are sized so it can show progress/ETA.
    try:
        n_pairs = min(len(neg_prompts), len(pos_prompts))
    except TypeError:
        n_pairs = None

    for neg, pos in tqdm(zip(neg_prompts, pos_prompts), total=n_pairs):
        all_neg_hs.append(get_hidden_states(model, tokenizer, neg, layer=layer))
        all_pos_hs.append(get_hidden_states(model, tokenizer, pos, layer=layer))

    # Stack into single arrays
    all_neg_hs = np.stack(all_neg_hs)
    all_pos_hs = np.stack(all_pos_hs)

    return all_neg_hs, all_pos_hs

In [49]:
def format_profession(prompt, text, label):
    """
    Fills a prompt template.

    Prompts contain a <LABEL0/LABEL1> tag and a <TEXT> tag. The <TEXT> tag is
    replaced with `text`; then every remaining <...> tag is replaced with option
    number `label` of the first tag found (options are separated by "/").

    Both substitutions insert their replacement literally, so backslashes in
    `text` or in a label are never interpreted as regex escapes or group
    references.

    :param prompt: template string.
    :param text: string substituted for <TEXT>.
    :param label: index of the label option to use.
    :return: the formatted prompt.
    :raises IndexError: if no label tag is left after inserting the text, or if
        `label` is out of range for the tag's options.
    """
    # Plain string replacement: `text` must not be treated as a regex template.
    output = prompt.replace("<TEXT>", text)

    # Replace the <LABEL0/LABEL1> tag with the proper label
    template = re.findall(r'<(.*?)>', output)
    labels = template[0].split("/")
    chosen = labels[label]
    # A callable replacement is inserted verbatim (no escape processing).
    output = re.sub(r'<(.*?)>', lambda _match: chosen, output)
    return output


def parse_professions(professions_path, prompt, undersample=False):
    """
    Reads professions.json and loads professions that have nonzero stereotypical male/female
    bias scores. Parses professions into given prompt. Creates parallel label array as well.

    Each JSON entry is indexed as entry[0] = profession name (underscores are
    turned into spaces) and entry[2] = bias score. Negative scores are labelled
    0 and listed first; positive scores are labelled 1.

    :param professions_path: path to the JSON file.
    :param prompt: template passed to format_profession.
    :param undersample: Whether or not to take the top k most biased male professions to match
    the k female professions. At most the available positive-bias professions
    are returned; zero or negative scores are never selected as male.
    :return: (neg_prompts, pos_prompts, labels) — three parallel lists, where
        neg/pos prompts are the template filled with label option 0 / 1.
    """
    with open(professions_path, "r", encoding="utf-8") as f:
        professions = json.load(f)

    profs = np.array([prof[0].replace("_", " ") for prof in professions])
    biases = np.array([prof[2] for prof in professions])

    fem_idx = np.where(biases < 0)[0]
    male_idx = np.where(biases > 0)[0]
    if undersample:
        # Rank everything by descending bias, then keep only genuinely positive
        # scores before truncating, so that k > (#male professions) cannot pull
        # in neutral or female-leaning professions with a male label.
        by_bias_desc = np.flip(np.argsort(biases))
        by_bias_desc = by_bias_desc[biases[by_bias_desc] > 0]
        male_idx = by_bias_desc[:fem_idx.shape[0]]
    idx = np.concatenate([fem_idx, male_idx])
    prof_sample = profs[idx]
    labels = [0] * len(fem_idx) + [1] * len(male_idx)

    # Parse into prompt
    neg_prompts = [format_profession(prompt, prof, 0) for prof in prof_sample]
    pos_prompts = [format_profession(prompt, prof, 1) for prof in prof_sample]

    return neg_prompts, pos_prompts, labels


def parse_census_data(census_path, prompts, by="Women"):
    """
    Loads the census occupation table and selects strongly skewed occupations.

    :param census_path: path to the census CSV; it must contain an "Occupation"
        column and the CENSUS_GROUPS columns.
    :param prompts: unused; kept so existing callers keep working.
    :param by: "Women" or "Black or African American" — the group to split on.
    :return: (occupations, labels) as two parallel numpy arrays; labels are ints.
        For "Black or African American", label 1 means the z-scored White share
        exceeds the z-scored Black share by at least 1. For "Women", label 1
        means the "Women" value is below 0.5.
    :raises AssertionError: if `by` is not one of the supported groups.
    """
    census_df = pd.read_csv(census_path)

    # Drop NaNs
    census_df = census_df.replace('–', pd.NA) # This is a weird '-' character present in the data
    census_df = census_df.dropna()

    census_df[CENSUS_GROUPS] = census_df[CENSUS_GROUPS].apply(pd.to_numeric, errors='coerce')

    sorted_df = None
    if by == "Black or African American":
        census_df["whitenorm"] = (census_df["White"] - census_df["White"].mean()) / census_df["White"].std()
        census_df["blacknorm"] = (census_df["Black or African American"] - \
                                  census_df["Black or African American"].mean()) / \
                                    census_df["Black or African American"].std()

        census_df["racediff"] = census_df["whitenorm"] - census_df["blacknorm"]
        sorted_df = census_df.sort_values(by="racediff", ascending=False)
        # .copy() so the label column is written to an independent frame rather
        # than a filtered view (avoids SettingWithCopyWarning).
        sorted_df = sorted_df[abs(sorted_df["racediff"]) >= 1].copy()
        # After the |racediff| >= 1 filter the sign alone decides the class;
        # this keeps a row with racediff == 1 on the same side as the filter.
        sorted_df["label"] = sorted_df["racediff"] > 0

    elif by == "Women":
        # Get most and least female dominated jobs
        # NOTE(review): the 0.5 / 0.1 thresholds assume "Women" is stored as a
        # fraction in [0, 1]; if the CSV holds percentages (0-100) they need
        # rescaling — confirm against the data.
        sorted_df = census_df.sort_values(by=by, ascending=False)
        sorted_df = sorted_df[abs(sorted_df[by] - 0.5) > 0.1].copy()
        sorted_df["label"] = sorted_df[by] < 0.5

    else:
        # Explicit raise so the check survives `python -O`.
        raise AssertionError("Give a valid group to sort by")

    return np.array(sorted_df["Occupation"].tolist()), np.array(sorted_df["label"].tolist(), dtype=int)


def parse_crowspairs(crowspairs_path, seed=None):
    """
    Loads the CrowS-Pairs CSV and builds randomly oriented sentence pairs.

    Only rows whose "stereo_antistereo" column equals "stereo" are kept. For
    each row a random binary label decides which sentence goes where:
      * label 1 -> "sent_more" is the positive prompt, "sent_less" the negative
      * label 0 -> "sent_more" is the negative prompt, "sent_less" the positive
    so the labels are roughly evenly distributed.

    :param crowspairs_path: path to the CSV file.
    :param seed: optional seed for a private random generator, making the label
        assignment reproducible. With None (the default) the global numpy
        random state is used, as before.
    :return: (neg_prompts, pos_prompts, labels) as parallel numpy arrays.
    """
    df = pd.read_csv(crowspairs_path)
    df = df[df["stereo_antistereo"] == "stereo"]

    true = df["sent_more"].to_numpy()
    false = df["sent_less"].to_numpy()

    if seed is None:
        labels = np.random.randint(2, size=len(true))
    else:
        # Private generator: reproducible and leaves the global state untouched.
        labels = np.random.default_rng(seed).integers(2, size=len(true))

    neg_prompts = np.where(labels == 0, true, false)
    pos_prompts = np.where(labels == 1, true, false)

    return neg_prompts, pos_prompts, labels


def parse_prompts(path=None):
    """
    Reads prompt templates, one per line.

    :param path: file to read; defaults to PROMPTS_PATH when None, so existing
        zero-argument calls behave as before.
    :return: list of lines with newline characters stripped. Blank lines are
        kept as empty strings so prompt indices match line numbers.
    """
    source = PROMPTS_PATH if path is None else path
    with open(source, "r", encoding="utf-8") as f:
        prompts = [prompt.strip("\n") for prompt in f]

    return prompts

In [50]:
def save_hidden_states(model_name, model_type, trial_name, neg_prompts, pos_prompts, y, verbose=False):
    """
    Takes in 3 parallel lists of pos and neg prompts as well as their label and passes them through
    the given model, outputting 2 sets of hidden states for each layer.

    :param model_name: name passed to the `from_pretrained` loaders.
    :param model_type: "encoder", "decoder" or "encoder-decoder"; selects the
        model class to load and the extraction routine.
    :param trial_name: sub-directory of SAVE_DIR that receives the results.
    :param neg_prompts: negative prompts.
    :param pos_prompts: positive prompts, parallel to `neg_prompts`.
    :param y: labels, parallel to the prompts.
    :param verbose: when True, print the shapes of the arrays being saved.

    Writes three files under SAVE_DIR/trial_name:
      * "fem-hs.npy"  — hidden states of the negative prompts
      * "male-hs.npy" — hidden states of the positive prompts
      * "y.npy"       — the labels

    :raises ValueError: for an unknown `model_type`; raised before anything is
        downloaded or loaded.
    """
    # Validate first: otherwise an unknown type only surfaces as an unbound
    # `model` variable after the tokenizer has already been fetched.
    if model_type == "encoder":
        model_cls = AutoModelForMaskedLM
    elif model_type == "decoder":
        model_cls = AutoModelForCausalLM
    elif model_type == "encoder-decoder":
        model_cls = AutoModelForSeq2SeqLM
    else:
        raise ValueError(f"Invalid model type: {model_type!r}")

    # Load model
    tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=CACHE_DIR)
    model = model_cls.from_pretrained(model_name, cache_dir=CACHE_DIR)
    model = model.to(DEVICE)

    # Get hidden states (layer=None -> all layers)
    all_neg, all_pos = get_hidden_states_many_examples(model, model_type, tokenizer, neg_prompts, pos_prompts, layer=None)
    y = np.array(y)

    if verbose:
        print(all_neg.shape, all_pos.shape, y.shape)

    # Save hidden states
    root = os.path.join(SAVE_DIR, trial_name)
    if not os.path.exists(root):
        print(f"Creating directory {root}")
        os.makedirs(root, exist_ok=True)
    np.save(os.path.join(root, "fem-hs.npy"), all_neg)
    np.save(os.path.join(root, "male-hs.npy"), all_pos)
    np.save(os.path.join(root, "y.npy"), y)

In [51]:
def save_professions_trials(trials):
    """
    For every trial and every prompt template, builds the profession prompts and
    stores their hidden states in saved/professions/trialname_promptX.

    :param trials: iterable of dicts with the keys "trial_name", "model_name"
        and "model_type".
    """
    # Prompt templates are shared by all trials, so read them once.
    templates = parse_prompts()

    # Run every (trial, template) combination through the model.
    for spec in trials:
        for template_idx, template in enumerate(templates):
            print(f"Creating hs for {spec['model_type']} model {spec['model_name']} with prompt {template}")

            # Fill the template with each profession
            negatives, positives, labels = parse_professions(PROFESSIONS_PATH, template, undersample=False)

            job = dict(
                model_name=spec["model_name"],
                model_type=spec["model_type"],
                trial_name=f"professions/{spec['trial_name']}_prompt{template_idx}",
                neg_prompts=negatives,
                pos_prompts=positives,
                y=labels,
                verbose=True,
            )
            save_hidden_states(**job)
            

def save_crowspairs_trials(trials):
    """
    For every trial, loads the CrowS-Pairs sentence pairs and stores their
    hidden states in saved/crowspairs/trialname.

    :param trials: iterable of dicts with the keys "trial_name", "model_name"
        and "model_type".
    """
    # Run the sentence pairs through each model in turn.
    for spec in trials:
        print(f"Creating hs for {spec['model_type']} model {spec['model_name']} with crowspairs")

        # Pairs are re-read (and re-shuffled) for each trial
        negatives, positives, labels = parse_crowspairs(CROWSPAIRS_PATH)

        job = dict(
            model_name=spec["model_name"],
            model_type=spec["model_type"],
            trial_name=f"crowspairs/{spec['trial_name']}",
            neg_prompts=negatives,
            pos_prompts=positives,
            y=labels,
            verbose=True,
        )
        save_hidden_states(**job)

In [52]:
# Trial specifications: each entry names the output folder ("trial_name"), the
# checkpoint to load ("model_name") and the architecture family ("model_type").

# GPT-2 family: the checkpoint name doubles as the trial name.
gpt2_trials = [
    {"trial_name": checkpoint, "model_name": checkpoint, "model_type": "decoder"}
    for checkpoint in ("gpt2", "gpt2-large", "gpt2-xl", "gpt2-medium")
]

# RoBERTa family: the checkpoint name doubles as the trial name.
roberta_trials = [
    {"trial_name": checkpoint, "model_name": checkpoint, "model_type": "encoder"}
    for checkpoint in ("roberta-base", "roberta-large")
]

# Flan-T5 family: checkpoints live under the "google/" namespace.
flan_t5_trials = [
    {"trial_name": variant, "model_name": f"google/{variant}", "model_type": "encoder-decoder"}
    for variant in ("flan-t5-small", "flan-t5-base", "flan-t5-large", "flan-t5-xl")
]

In [53]:
# Extract and save CrowS-Pairs hidden states for every Flan-T5 checkpoint listed
# in `flan_t5_trials`; each trial is written under saved/crowspairs/<trial_name>.
save_crowspairs_trials(flan_t5_trials)

Creating hs for encoder-decoder model google/flan-t5-small with crowspairs


1290it [03:04,  7.01it/s]


(1290, 9, 512) (1290, 9, 512) (1290,)
Creating directory /Users/danyoung/workspace/columbia/sumgen/6998latentbias/saved/crowspairs/flan-t5-small
Creating hs for encoder-decoder model google/flan-t5-base with crowspairs


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

1290it [04:18,  5.00it/s]


(1290, 13, 768) (1290, 13, 768) (1290,)
Creating directory /Users/danyoung/workspace/columbia/sumgen/6998latentbias/saved/crowspairs/flan-t5-base
Creating hs for encoder-decoder model google/flan-t5-large with crowspairs


797it [04:52,  3.43it/s]