In [None]:
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer
from dotenv import load_dotenv
from openai import OpenAI
from dialz import Dataset, SteeringModel, SteeringVector, get_activation_score
from score import get_unaggregated_activation_score
from models import LinearClassifier, TransformerClassifier, CustomDataset, optimize


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
model_name = "mistralai/Mistral-7B-Instruct-v0.1"


In [3]:
# Hate Speech Dataset Load
df = pd.read_parquet("hf://datasets/ucberkeley-dlab/measuring-hate-speech/measuring-hate-speech.parquet")
df = df[['text', 'hate_speech_score', 'comment_id', 'annotator_id']]
df.head()

texts = df.head()['text'].to_list()
scores = df.head()['hate_speech_score'].to_list()
for i, text in enumerate(texts):
    print(text)
    print(scores[i])

Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn't have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!
-3.9
The trans women reading this tweet right now is beautiful
-6.52
Question: These 4 broads who criticize America, what country did they flee to get here? And now they want to make OUR America like THEIR former HELL HOLE. I don't think so!!!!!!!!!!  Let them explain their GRATITUDE for letting them in OUR country.
0.36
It is about time for all illegals to go back to their country of origin and keep our freeway open and prevent heavy traffic.
0.26
For starters bend over the one in pink and kick that ass and pussy to get a taste until she's begging for a dick inside her.
1.54


In [4]:
# Create a new column 'label' based on the hate speech score
df['label'] = df['hate_speech_score'].apply(lambda x: 1 if x > 1 else (0 if x < -2 else None))

# Remove rows without a label
df = df.dropna(subset=['label'])

df['label'] = df['label'].astype(int)

# Print the head of the dataframe and the number of rows
print(f"Number of rows: {len(df)}")
label_counts = df['label'].value_counts()
label_percentages = df['label'].value_counts(normalize=True) * 100

print(f"Number of label 0: {label_counts[0]} ({label_percentages[0]:.2f}%)")
print(f"Number of label 1: {label_counts[1]} ({label_percentages[1]:.2f}%)")
df.head()


Number of rows: 78427
Number of label 0: 37856 (48.27%)
Number of label 1: 40571 (51.73%)


Unnamed: 0,text,hate_speech_score,comment_id,annotator_id,label
0,Yes indeed. She sort of reminds me of the elde...,-3.9,47777,10873,0
1,The trans women reading this tweet right now i...,-6.52,39773,2790,0
4,For starters bend over the one in pink and kic...,1.54,12538,488,1
5,Sounds like the kinda wholsesome life I'd die ...,-4.93,11001,527,0
7,Fuck off you insufferable retarded faggot.,2.08,17034,6801,1


In [None]:
from tqdm import tqdm

## Steering vector
def compute_activation_score(
        df: pd.DataFrame,
        scoring_method: str,
        model_name: str, 
        items: list, 
        prompt_type: str, 
        num_sents: int,
        system_role: str,
    ):
    dataset = Dataset.create_dataset(model_name, items, prompt_type=prompt_type, num_sents=num_sents, system_role=system_role)
    
    model = SteeringModel(model_name, list(range(-5, -18, -1)), hf_token)
    vector = SteeringVector.train(model, dataset)
    max_token_length = 0
    activation_scores = []
    unaggregated_activation_scores = []

    for i, text in tqdm(enumerate(df['text']), total=len(df), desc="Calculating activation scores"):
        activation_score, token_length, unaggregated_activation_score = get_unaggregated_activation_score(text, model, vector, layer_index=list(range(15, 20, 1)), 
                                                                                                          scoring_method=scoring_method)
        activation_scores.append(activation_score)
        unaggregated_activation_scores.append(unaggregated_activation_score)
        max_token_length = max(max_token_length, token_length)
        df.at[i, 'activation_score'] = activation_score
        df.at[i, 'token_length'] = token_length
    print(f"Max token length: {max_token_length}")

    # Pad activation scores to max token length
    print("Padding activation scores to max token length after tokenization.")
    padded_activation_scores = np.full((len(df.text), 5, max_token_length), 0, dtype=float)
    for i, scores in enumerate(unaggregated_activation_scores):
        for j, score in enumerate(scores):
            length = len(score)
            padded_activation_scores[i, j, :length] = score

    return df, padded_activation_scores, max_token_length


def run_classifier(
        df: pd.DataFrame,
        scoring_method: str,
        model_name: str, 
        items: list, 
        prompt_type: str, 
        num_sents: int,
        system_role: str,
    ):
    df, padded_activation_scores, max_token_length = compute_activation_score(df, scoring_method, model_name, items, prompt_type, num_sents, system_role)

    # different classifiers
    results = {}

    # threshold-based classifier
    def calculate_score(scores_df, label_0_condition, label_1_condition):
        label_0_count = ((scores_df['label'] == 0) & label_0_condition).sum()
        label_1_count = ((scores_df['label'] == 1) & label_1_condition).sum()
        return label_0_count + label_1_count

    # Generate a range of thresholds to test
    thresholds = np.linspace(df['activation_score'].min(), df['activation_score'].max(), 1000)
    condition = [calculate_score(df, df['activation_score'] < t, df['activation_score'] > t) for t in thresholds]
    best_threshold = thresholds[np.argmax(condition)]
    accuracy_condition = (np.max(condition) / len(df)) * 100
    results['threshold_classifier'] = accuracy_condition
    print(f"Best threshold: {best_threshold}")
    print(f"Accuracy: {accuracy_condition:.2f}%")
    
    # Learning a linear classifier
    acc = optimize(df, padded_activation_scores, max_token_length, learning_rate=1e-3, batch_size=64, epochs=50, is_transformer=False)
    results['linear_classifier'] = acc
    print(f"Accuracy for linear layer: {acc:.2f}%")

    # Learning a transformer-based classifier
    acc = optimize(df, padded_activation_scores, max_token_length, learning_rate=1e-3, batch_size=64, epochs=50, is_transformer=True)
    results['transformer_classifier'] = acc
    print(f"Accuracy for transformer feature learner and then a linear classifier: {acc:.2f}%")
    return results


In [7]:
df_2 = df.sample(n=500, random_state=42).reset_index(drop=True)

label_1_percentage = (df_2['label'].value_counts(normalize=True)[1] * 100)
label_0_percentage = (df_2['label'].value_counts(normalize=True)[0] * 100)

print(f"Percentage of label 1s: {label_1_percentage:.2f}%")
print(f"Percentage of label 0s: {label_0_percentage:.2f}%")


Percentage of label 1s: 54.60%
Percentage of label 0s: 45.40%


In [None]:
test_dataset_parameterized(df_2, 'max_token', model_name, ['hate speech', 'loving words'], 'sentence-starters', 400, 'You are an example of how someone would respond with ')

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.58it/s]
100%|██████████| 25/25 [00:06<00:00,  4.06it/s]
100%|██████████| 31/31 [00:02<00:00, 13.89it/s]
Calculating activation scores: 100%|██████████| 500/500 [03:36<00:00,  2.31it/s]


Max token length: 172
Padding activation scores to max token length after tokenization.
500
Best threshold: 1.427336531477767
Accuracy: 86.80%


ValueError: Found input variables with inconsistent numbers of samples: [500, 78427, 78427]

: 

: 

In [None]:
# "prompt": "You are a crazy extremist who only responds with absolute ",
# "contrastive_words": ["hate", "love"]
# acc: 83.60%

print('Test 2')
test2 = test_dataset(df_2, 'mean', model_name, ['hate speech', 'loving words'], 'sentence-starters', 400, 'You are an example of how someone would respond with ')

print('Test 3')
test3 = test_dataset(df_2, 'final_token', model_name, ['hate speech', 'loving words'], 'sentence-starters', 400, 'You are an example of how someone would respond with ')

print('Test 4')
test4 = test_dataset(df_2, 'max_token', model_name, ['hate speech', 'loving words'], 'sentence-starters', 400, 'You are an example of how someone would respond with ')

print('Test 5')
test5 = test_dataset(df_2, 'median_token', model_name, ['hate speech', 'loving words'], 'sentence-starters', 400, 'You are an example of how someone would respond with ')


In [None]:
# then test scoring_methods default, last_token, max_token, median_token
# test alla the different layer options

methods = ['mean', 'final_token', 'max_token', 'median_token']
setup = {
    "prompt": "You are a crazy extremist who only responds with absolute ",
    "contrastive_words": ["hate", "love"]
}

for method in methods:
    print(method)
    test_dataset(df_2, method, model_name, setup["contrastive_words"], 'starters', 400, setup["prompt"])

setup_2 = {
    "prompt": "You are an example of how someone would respond with ",
    "contrastive_words": ["hate speech", "loving words"]
}

for method in methods:
    print(method)
    test_dataset(df_2, method, model_name, setup["contrastive_words"], 'starters', 400, setup["prompt"])


In [None]:
median = test4['activation_score'].median()
max_value = test4['activation_score'].max()
min_value = test4['activation_score'].min()

print(f"Median: {median}")
print(f"Max: {max_value}")
print(f"Min: {min_value}")


In [None]:
dataset = Dataset.create_dataset(model_name, ['hate speech', 'loving words'], 'starters', 400, 'You are an example of how someone would respond with ')

model = SteeringModel(model_name, list(range(-5, -18, -1)), hf_token)
vector = SteeringVector.train(model, dataset)


In [None]:

# Filter 10 texts with label 0 and 10 texts with label 1
label_0_texts = df[df['label'] == 0].head(10)
label_1_texts = df[df['label'] == 1].head(10)
print(len(label_1_texts))
# Process and print activation visualization and scores for label 0 texts
print("Activation visualization and scores for label 0 texts:")
for i, row in label_0_texts.iterrows():
    text = row['text']
    activation_visualization = model.visualize_activation(input_text=text, control_vector=vector)
    activation_score = get_activation_score(text, model, vector, layer_index=20)
    print(f"Row {i}:\nText: {text}\nActivation Visualization:\n{activation_visualization}\nActivation Score: {activation_score}\n")

print("=====================================")
print("=====================================")

# Process and print activation visualization and scores for label 1 texts
print("Activation visualization and scores for label 1 texts:")
for i, row in label_1_texts.iterrows():
    text = row['text']
    activation_visualization = model.visualize_activation(input_text=text, control_vector=vector)
    activation_score = get_activation_score(text, model, vector, layer_index=20)
    print(f"Row {i}:\nText: {text}\nActivation Visualization:\n{activation_visualization}\nActivation Score: {activation_score}\n")

In [None]:
list_of_strings = []

tokenizer = AutoTokenizer.from_pretrained(model_name, token=model.token)
tokenizer.pad_token_id = 0

def generate_with_vector(
    input: str,
    vector: SteeringVector,
    coeffs: tuple[float, float],
    max_new_tokens: int = 20,
    repetition_penalty: float = 1.1,
    show_baseline: bool = True,
):
    positive_coeff, negative_coeff = coeffs
    assert positive_coeff > 0
    assert negative_coeff < 0

    input_ids = tokenizer(input, return_tensors="pt").to(model.device)
    settings = {
        "pad_token_id": tokenizer.eos_token_id,  # silence warning
        "do_sample": False,  # temperature=0
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
    }

    if show_baseline:
        print("==baseline ---------------------------------------------------")
        model.reset()
        print(
            tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()
        )
        list_of_strings.append(
            tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()
        )

    print("\n++control ---------------------------------------------------")
    model.set_control(vector, positive_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    list_of_strings.append(
        tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()
    )

    print("\n--control ---------------------------------------------------")
    model.set_control(vector, negative_coeff)
    print(tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip())
    list_of_strings.append(
        tokenizer.decode(model.generate(**input_ids, **settings).squeeze()).strip()
    )

    model.reset()

    with open(f"output_{input[:10]}.txt", "w") as f:
        for string in list_of_strings:
            f.write(string + "\n")


generate_with_vector("Write about a black man.", vector, (2,-2), 60)



# highlighted_output = model.visualize_activation(
#     input_text="Good CEOs are all men. Men tend to be better at leadership roles.",
#     control_vector=vector,
# )

# print(highlighted_output)

# highlighted_output = model.visualize_activation(
#     input_text="Women are supposed to stay in the kitchen not go out and become nurses.",
#     control_vector=vector,
# )
# print(highlighted_output)

# highlighted_output = model.visualize_activation(
#     input_text="Mothers are the most amazing thing to bless humankind.",
#     control_vector=vector,
# )
# print(highlighted_output)

# print(
#     get_activation_score("Female empowerment is an amazing thing.", model, vector)
# )

In [None]:
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

def classify_hate_speech(text):
    try:
        response = client.responses.create(
            model="gpt-4o",
            instructions="You are a helpful assistant that classifies text as hate speech or not.",
            input=f"Is the following text hate speech? Respond only with 0 for no and 1 for yes: {text}",
        )
        result = response.output_text
        return int(result) if result in ['0', '1'] else None
    except Exception as e:
        print(f"Error processing text: {text}\nError: {e}")
        return None

openai_results = df_2.copy()
openai_results['openai'] = openai_results['text'].apply(classify_hate_speech)
openai_results.head()



In [None]:
# Count the number of None (missing) values in the predictions
none_count = openai_results['openai'].isna().sum()
print(f"Number of None values in predictions: {none_count}")

# Filter out rows where the prediction is None
valid_predictions = openai_results[openai_results['openai'].notna()]

# Count how many predictions match the ground truth label
correct_count = (valid_predictions['openai'] == valid_predictions['label']).sum()
total_valid = len(valid_predictions)
accuracy = (correct_count / total_valid) * 100 if total_valid > 0 else 0

print(f"Number of correctly classified texts: {correct_count} out of {total_valid}")
print(f"Accuracy (excluding None values): {accuracy:.2f}%")


In [None]:
# Rows where the prediction is None
none_rows = openai_results[openai_results['openai'].isna()]
print("Texts with None predictions:")
for idx, row in none_rows.iterrows():
    print(f"Index {idx}: {row['text']}")
    
# Rows where the prediction is not None but misclassified
misclassified = openai_results[
    (openai_results['openai'].notna()) &
    (openai_results['openai'] != openai_results['label'])
]
print("\nTexts with misclassified values:")
for idx, row in misclassified.iterrows():
    print(f"Index {idx}:")
    print(f"Text: {row['text']}")
    print(f"Ground Truth: {row['label']}, Prediction: {row['openai']}\n")
