## Measure Results and Export to Vector Steering Data

### Imports

In [2]:
import json
import pandas as pd
from data import SOURCES, load_data
from prompts import *
TARGET = "llama3.1-8b-instruct"

  from .autonotebook import tqdm as notebook_tqdm


--f=/home/droytbu/.local/share/jupyter/runtime/kernel-v35b9613dbe5fa5ea9ec2e1a4e7eb552192876d7ce.json
1000


### Loading CNN Data

In [37]:
data = json.load(open("individual_setting/score_results/cnn/llama3.1-8b-instruct_comparison_results_extra.json","r"))
responses, articles, keys = load_data("cnn", sources= ['gpt35', TARGET], extras=True)

### Ambivalence check

Ambivalence is when a model does not prefer the same result when queried twice in two different orders.

In [35]:
total = 0 
ambivalent = []
for result in data:
    if result['model'] == 'gpt35':
        total += 1
        if result['backward_comparison'] != result['forward_comparison']:
            ambivalent.append(result)
len(ambivalent)

58

In [22]:
sum(a[1] for a in ambivalent) / len(ambivalent)

0.6271477130325565

### Reconstruction Function

Very important -- allows reconstruction of initial prompts to generate steering data

In [40]:
def reconstruct(result, responses, articles, source='llama3.1-8b-instruct', forward=True):
    article = articles[result['key']]
    source_summary = responses[source][result['key']]
    other_summary = responses[result['model']][result['key']]
    if forward:
        return COMPARISON_PROMPT_TEMPLATE.format(summary1=source_summary, summary2=other_summary, article=article)
    else:
        return COMPARISON_PROMPT_TEMPLATE.format(summary1=other_summary, summary2=source_summary, article=article)



### Filter criterion:

1. **Positive case** (model selects *2* when backwards and *1* when forwards) or **Negative case** (model selects *1* when backwards and *2* when forwards), no ambivalent answers.
2. **Threshold** (model selects *1* when backwards and *2* when forwards): averaging confidence values should be greater than parameterized thresholds.

In [14]:
meets_criteria = 0
t_pos, t_neg = 0.7, 0.7
total = 0
pos = 0
neg = 0
total_neg_conf = 0
total_pos_conf = 0
pos_samples = []
neg_samples = []
for result in data:
    if result['model'] == 'gpt35':
        total += 1
        if result['backward_comparison'] == '2' and result['forward_comparison'] == '1':
            pos_conf = 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
            if result['forward_comparison_probability'] > t_pos and result['backward_comparison_probability'] > t_pos:
                meets_criteria += 1
                pos += 1
                total_pos_conf += 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
                result['forward_prompt'] = reconstruct(result, responses, articles)
                result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
                pos_samples.append(result)
                pos_samples.append(reconstruct(result, responses, articles, forward=False))
        if result['forward_comparison'] == '2' and result['backward_comparison'] == '1':
            neg_conf = 0.5 * (result['forward_comparison_probability'] + result['backward_comparison_probability'])
            if neg_conf > t_neg:
                meets_criteria += 1
                neg += 1
                total_neg_conf += neg_conf
                result['forward_prompt'] = reconstruct(result, responses, articles)
                result['backward_prompt'] = reconstruct(result, responses, articles, forward=False)
                neg_samples.append(result)
print(meets_criteria, pos, neg)

1071 964 107


0.7176318706374134

### Save Output

In [46]:
json.dump({"pos": pos_samples, "neg": neg_samples}, open("vector_steering_samples.json", "w"))