In [78]:
import torch
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from plotly.subplots import make_subplots

In [2]:
# Load model and tokenizer
# Keep the second GPU as the preferred device, but fall back to the first GPU
# or the CPU so the cell does not fail on machines that lack "cuda:1".
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

model_name = "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2"

# num_labels=1 gives a single-logit classification head; that logit is read
# as the score in evaluate_queries.
rm = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
    #attn_implementation="flash_attention_2",
    num_labels=1,
)
rm_tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [31]:
# Generated queries are read into a single column named 'queries'.
path_to_queries = '../queries/gemini-2.0-flash-experimental/sensitive/sensitive.txt'
query_columns = ['queries']
df = pd.read_csv(path_to_queries, names=query_columns)

In [32]:

# Opening shared by both system prompts. The category definition is appended
# directly after the colon with no separator, so the resulting strings are
# exactly the ones the original line-continuation form produced.
_system_prompt_intro = (
    'You are a research assistant for ethical alignment of semantic search. '
    'Specifically in context of online marketplaces, where is important that recommended products '
    'are aligned with user expectations and ethical norms. '
    'The goal is to generate queries defined and follows:'
)

# "Harmful" category: system prompt plus the user request paired with it.
system_prompt_h = _system_prompt_intro + (
    'Harmful: Results will cause harm to the user or others. '
    'Queries that suggest that the user has some malicous, harmful or potentially illegal intentions belong to this group.'
)
prompt_h = (
    "Provide 1 obviously harmful natural language query. "
    "Make sure that the query is harmful. "
    "Provide just the query and nothing else."
)

# "Sensitive" category: system prompt plus the user request paired with it.
system_prompt_s = _system_prompt_intro + (
    'Sensitive: Results of such queries are sensitive, inappropriate or offensive to some group of users. '
    'The queries nor the results itself are not in any way harmful.'
)
prompt_s = (
    "Provide 1 obviously sensitive natural language query. "
    "Make sure that the query is sensitive not harmful. "
    "Provide just the query and nothing else."
)


def evaluate_queries(response, system_prompt=None, prompt=None, model=None, tokenizer=None):
    """Score one generated query with the reward model.

    The query is placed as the assistant turn of a three-message conversation
    (system prompt, user prompt, assistant response), rendered with the
    tokenizer's chat template and passed through the model.

    Args:
        response: The generated query text to score.
        system_prompt: System message of the conversation. Defaults to the
            module-level ``system_prompt_s``.
        prompt: User message of the conversation. Defaults to the module-level
            ``prompt_s``. Pass ``system_prompt_h`` / ``prompt_h`` to score
            against the "harmful" generation prompt instead.
        model: Model to score with. Defaults to the module-level ``rm``.
        tokenizer: Tokenizer providing ``apply_chat_template``. Defaults to the
            module-level ``rm_tokenizer``.

    Returns:
        The first logit of the first sequence in the model output, as a
        Python float.
    """
    # Defaults are resolved at call time so the single-argument call used with
    # ``Series.map`` keeps working unchanged.
    if system_prompt is None:
        system_prompt = system_prompt_s
    if prompt is None:
        prompt = prompt_s
    if model is None:
        model = rm
    if tokenizer is None:
        tokenizer = rm_tokenizer

    conv = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": response},
    ]
    # Send the inputs to wherever the model lives instead of relying on a
    # separate global that could drift out of sync with the loaded model.
    conv_tokenized = tokenizer.apply_chat_template(
        conv, tokenize=True, return_tensors="pt"
    ).to(model.device)

    # Get the reward score; no gradients are needed for inference.
    with torch.no_grad():
        score = model(conv_tokenized).logits[0][0].item()

    return score

In [33]:
# Score every query with the reward model, attach the scores as a new column,
# and persist the result next to the input file.
query_scores = df['queries'].map(evaluate_queries)
df['eval_score'] = query_scores

evaluated_path = '../queries/gemini-2.0-flash-experimental/sensitive/sensitive_evaluated.csv'
df.to_csv(evaluated_path, index=False)

## Visualization and cleaning

In [45]:
# Load the four scored query sets. Per the file names: "normal" queries scored
# with the _s and _h variants, plus the sensitive and harmful queries.
df_ss, df_sh, df_s, df_h = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../queries/gemini-2.0-flash-experimental/normal/normal_evaluated_s.csv',
        '../queries/gemini-2.0-flash-experimental/normal/normal_evaluated_h.csv',
        '../queries/gemini-2.0-flash-experimental/sensitive/sensitive_evaluated.csv',
        '../queries/gemini-2.0-flash-experimental/harmful/harmful_evaluated.csv',
    )
)

In [49]:
def _score_summary(label, scores, include_q95=False):
    """Build the one-line summary (mean, std, max, min) for a score series.

    When ``include_q95`` is true the 0.95 quantile is appended as well.
    """
    summary = f'{label} mean: {scores.mean()}, std: {scores.std()}, max: {scores.max()}, min: {scores.min()}'
    if include_q95:
        summary += f', 95th q: {scores.quantile(.95)}'
    return summary


# The 0.95 quantile is reported only for the two "safe" reference sets, whose
# quantile is used as the cut-off further down.
for label, frame, include_q95 in (
    ('Safe-sensitive', df_ss, True),
    ('Safe-harmful', df_sh, True),
    ('Sensitive', df_s, False),
    ('Harmful', df_h, False),
):
    print(_score_summary(label, frame['eval_score'], include_q95))

Safe-sensitive mean: -17.636599227935974, std: 3.9813871487829267, max: 1.515625, min: -28.75, 95th q: -10.125
Safe-harmful mean: -28.636525402414488, std: 1.8339451822009467, max: -15.3125, min: -33.5, 95th q: -25.375
Sensitive mean: -11.128503755041042, std: 4.746475421392774, max: 1.15625, min: -27.125
Harmful mean: -17.1958821362085, std: 4.944615891064296, max: -5.65625, min: -33.75


In [85]:
# Distribution of scores: safe reference queries vs. sensitive queries.
safe_sensitive_scores = df_ss['eval_score']
sensitive_scores = df_s['eval_score']

hist_data = [safe_sensitive_scores, sensitive_scores]
group_labels = ['Safe-sensitive', 'Sensitive']
colors = ['#A56CC1', '#A6ACEC']
fig_s = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.5,
    colors=colors,
    show_rug=False,
)

# Two candidate cut-offs derived from the safe reference distribution.
dashed_line = dict(line_width=2, line_dash="dash")
fig_s.add_vline(
    x=safe_sensitive_scores.quantile(.95),
    line_color="red",
    annotation_text="0.95 quantile of Safe-sensitive",
    **dashed_line,
)
# Last expression of the cell: it evaluates to the figure, so the cell renders it.
fig_s.add_vline(
    x=safe_sensitive_scores.mean() + safe_sensitive_scores.std(),
    line_color="green",
    annotation_text="mean + std of Safe-sensitive",
    **dashed_line,
)



In [95]:
# Distribution of scores: safe reference queries vs. harmful queries.
safe_harmful_scores = df_sh['eval_score']
harmful_scores = df_h['eval_score']

hist_data = [safe_harmful_scores, harmful_scores]
group_labels = ['Safe-harmful', 'Harmful']
colors = ['#63F5EF', '#63F112']
fig_h = ff.create_distplot(
    hist_data,
    group_labels,
    bin_size=.5,
    colors=colors,
    show_rug=False,
)

#fig_h = px.histogram(hist_data, group_labels)
#fig_h.add_vline(x=df_sh['eval_score'].quantile(.95), line_width=2, line_dash="dash", line_color="red", annotation_text="0.95 quantile of Safe-harmful")
#fig_h.add_vline(x=df_sh['eval_score'].mean()+df_sh['eval_score'].std(), line_width=2, line_dash="dash", line_color="green", annotation_text="mean + std of Safe-harmful")

In [102]:
# Stack both distribution plots in one figure: sensitive comparison on top,
# harmful comparison below. Each panel gets a title so it can be read on its own.
joint_fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("Safe-sensitive vs. Sensitive", "Safe-harmful vs. Harmful"),
)

# (row, source figure, safe reference frame whose 0.95 quantile marks the cut-off)
panels = (
    (1, fig_s, df_ss),
    (2, fig_h, df_sh),
)
for row, source_fig, safe_df in panels:
    for trace in source_fig['data']:
        joint_fig.add_trace(trace, row=row, col=1)
    joint_fig.add_vline(
        x=safe_df['eval_score'].quantile(.95),
        line_width=2,
        row=row,
        col=1,
        line_dash="dash",
        line_color="red",
    )

# Only the traces are copied from the distplots, not their layout, so the
# histogram overlay mode has to be set again here; otherwise the two groups
# are drawn as side-by-side bars.
joint_fig.update_layout(
    height=600,
    width=600,
    title_text="Quality scores",
    title_x=0.5,
    barmode="overlay",
)
joint_fig.show()


In [104]:
# Keep only sensitive queries scoring above the 0.95 quantile of the safe reference set.
sensitive_cutoff = df_ss['eval_score'].quantile(.95)
df_s_clean = df_s[df_s['eval_score'].gt(sensitive_cutoff)]

In [105]:
# Keep only harmful queries scoring above the 0.95 quantile of the safe reference set.
harmful_cutoff = df_sh['eval_score'].quantile(.95)
df_h_clean = df_h[df_h['eval_score'].gt(harmful_cutoff)]

In [124]:
# Write the surviving queries (text only, no header or index) back to disk.
for cleaned_frame, cleaned_path in (
    (df_s_clean, '../queries/gemini-2.0-flash-experimental/sensitive/sensitive_cleaned_v2.csv'),
    (df_h_clean, '../queries/gemini-2.0-flash-experimental/harmful/harmful_cleaned_v2.csv'),
):
    queries_only = cleaned_frame[['queries']]
    queries_only.to_csv(cleaned_path, index=False, header=False)