# Standard and Extreme Prompts Analysis

In [1]:
import os
import json
import requests

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

# Import helper funcs
from Libraries.pct import econ_values, soc_values, questions, response_options, option_labels, get_questions_by_category, get_extreme_answers, get_question_stance_mapping
from Libraries.funcs import get_results_df

idx = pd.IndexSlice



In [2]:
# group the questions by the axis they belong to
questions_by_cate = get_questions_by_category()

# keep the unfiltered list, then drop the questions filed under 'other'
# (the ones that do not impact any axis)
all_questions = questions.copy()
axis_free_questions = questions_by_cate['other']
questions = [question for question in all_questions if question not in axis_free_questions]

## 1) Overall Stance Comparison

Map each question's answers to the political leaning (stance) they indicate.

In [3]:
def get_question_stances(df, axis, soft_stances):
    """Return, per question, the share of non-extreme runs that land in each stance.

    Args:
        df: results frame whose columns include the questions of ``axis`` and
            whose second index level holds the persona prompt text.
        axis: key into ``questions_by_cate`` and ``stance_order``.
        soft_stances: when truthy, stances are collapsed through
            ``soft_stances_dict`` before counting.

    Returns:
        DataFrame with one row per question and one column per observed
        stance (ordered as in ``stance_order[axis]``); each value is the
        fraction of the retained rows that gave that stance.
    """
    # translate every answer into the stance it represents for its question;
    # answers without a mapping entry fall back to their lower-cased text
    def to_stances(column):
        mapping = question_stance_mapping[column.name]
        return column.apply(lambda answer: mapping.get(answer, answer.lower()))

    df_stances = df[questions_by_cate[axis]].apply(to_stances)

    # Filter out the extreme prompts with a single boolean mask over the
    # persona-prompt level. This avoids materialising a list of index tuples
    # (and the per-row membership test against all prompts), keeps the row
    # order, and cannot duplicate rows if the index holds repeated keys.
    keep_row = np.array(
        ['extreme' not in prompt for prompt in df_stances.index.get_level_values(1)],
        dtype=bool,
    )
    df_stances_non_ext = df_stances.loc[keep_row]

    # apply soft binning if necessary
    if soft_stances:
        df_stances_non_ext = df_stances_non_ext.map(lambda stance: soft_stances_dict.get(stance, stance))

    # count the stances per question and turn the counts into fractions of rows
    stance_counts = df_stances_non_ext.apply(lambda column: column.value_counts()).fillna(0)
    quest_stance_breakdown = stance_counts.T / df_stances_non_ext.shape[0]

    # re-order the columns, keeping only the stances that actually occurred
    ordered_columns = [col for col in stance_order[axis] if col in quest_stance_breakdown.columns]
    return quest_stance_breakdown[ordered_columns]
    
def get_overall_stances(model_names, model_paths, soft_stances=False):
    """Average the per-question stance shares of every model, per axis.

    Each entry of ``model_paths`` is handed to ``get_results_df`` (the results
    file of each model is assumed to be json).

    Args:
        model_names: row labels of the returned frames, one per model path.
        model_paths: results files to load, in the same order as ``model_names``.
        soft_stances: forwarded to ``get_question_stances``.

    Returns:
        ``(econ, soc)``: two DataFrames with one row per model and one column
        per stance; stances a model never produced are filled with 0.
    """
    axes = ('econ', 'soc')
    per_axis_rows = {axis: [] for axis in axes}

    for model_path in model_paths:
        # load the results
        df_model = get_results_df(model_path, include_response_text=False)

        for axis in axes:
            # stance shares per question, then averaged across the questions
            per_question = get_question_stances(df_model, axis, soft_stances)
            per_axis_rows[axis].append(per_question.mean().to_dict())

    econ_frame = pd.DataFrame(per_axis_rows['econ'], index=model_names).fillna(0)
    soc_frame = pd.DataFrame(per_axis_rows['soc'], index=model_names).fillna(0)
    return econ_frame, soc_frame

# per-question lookup from an answer to the stance it stands for
question_stance_mapping = get_question_stance_mapping()

# column order used when reporting the stances of each axis
stance_order = dict(
    econ=['left+', 'left', 'neutral', 'refused', 'none', 'right', 'right+'],
    soc=['lib+', 'lib', 'neutral', 'refused', 'none', 'auth', 'auth+'],
)

# soft binning: a "+" stance collapses into its plain counterpart, while
# refusals and neutral answers are both counted as 'none'
soft_stances_dict = {
    **{f'{stance}+': stance for stance in ('left', 'right', 'lib', 'auth')},
    'refused': 'none',
    'neutral': 'none',
}

In [4]:
model_dirs = ["openai", "mistralai", "Qwen", "CohereForAI"]

soft_stances = True

# collect one (name, path) pair per results file found in the model directories
model_names, model_paths = [], []
for model_dir in model_dirs:
    for filename in os.listdir(f"./1-Results/{model_dir}"):
        # skip the forced-stance runs and the notebook checkpoint folder
        if filename == '.ipynb_checkpoints' or 'forced-stances' in filename:
            continue
        model_names.append(filename.replace(".json", ""))
        model_paths.append(f'1-Results/{model_dir}/{filename}')

# get the economic and social stances
econ_stances, soc_stances = get_overall_stances(
    model_names,
    model_paths,
    soft_stances=soft_stances,
)

# show both axes side by side, as percentages
pd.concat([econ_stances, soc_stances], axis=1).round(4)*100

Unnamed: 0,left,none,right,lib,none.1,auth
gpt-4-0125-preview,65.67,8.27,26.06,67.72,8.78,23.5
miqu-70b-5_K_M,62.37,8.33,29.3,69.91,6.98,23.12
Qwen1.5-72B-Chat,63.29,14.02,22.69,71.48,13.95,14.56
c4ai-command-r-plus,67.2,5.49,27.31,70.96,6.28,22.76


## 2) Accuracies

In [5]:
def get_accuracies(df, correct_ans, equal_answers_key=None):
    if equal_answers_key is None:
        equal_answers_key = {
            'None': 'Agree', 
            'Neutral': 'Agree',
            'Refused': 'Agree'
        }
    return (df.map(lambda x: equal_answers_key.get(x, x)) == correct_ans).sum(axis=1)/len(correct_ans)

def normalize_persona_prompt(s):
    """Collapse persona prompts sharing a known phrase into one canonical template.

    This makes grouping over the political prompts easier. The first known
    phrase contained in ``s`` decides the template; when ``s`` also contains
    'extreme', the extreme suffix belonging to that phrase is appended.
    Prompts containing none of the phrases are returned unchanged.
    """
    # (phrase to look for, canonical template, suffix for the extreme variant)
    templates = (
        (
            'Your political stance is characterized by',
            'Your political stance is characterized by [...] ideals.',
            ' Be cartoonishly extreme.',
        ),
        (
            'You strongly advocate for',
            'You strongly advocate for [...] policies and principles.',
            ' Be as extreme as possible.',
        ),
    )

    is_extreme = 'extreme' in s
    for phrase, template, extreme_suffix in templates:
        if phrase not in s:
            continue
        return template + extreme_suffix if is_extreme else template
    return s

# whether a persona carries a political leaning or not
persona_types = {
    **dict.fromkeys(('Neutral', 'Default'), 'non-political'),
    **dict.fromkeys(
        (
            'Left Libertarian',
            'Right Libertarian',
            'Left Authoritarian',
            'Right Authoritarian',
        ),
        'political',
    ),
}

# answer substitutions applied before scoring when soft matching is enabled;
# the political table additionally treats 'Agree' as 'Strongly Agree'
_shared_soft_answers = {
    'Disagree': 'Strongly Disagree',
    'None': 'Agree',
    'Neutral': 'Agree',
    'Refused': 'Agree',
}
soft_equal_answers_key = {
    'political': {'Agree': 'Strongly Agree', **_shared_soft_answers},
    'non-political': dict(_shared_soft_answers),
}

# question subsets selectable through the `axis` setting
questions_by_axis = {
    'all': questions,
    **{axis_name: questions_by_cate[axis_name] for axis_name in ('econ', 'soc')},
}

# reporting order of the personas (the two non-political ones come first)
persona_order = [
    'Default',
    'Neutral',
    'Left Authoritarian',
    'Right Authoritarian',
    'Left Libertarian',
    'Right Libertarian',
]

### Get Results

In [6]:
# Mean accuracy of every persona per model, i.e. how often the answers given
# under a persona equal the answers returned by get_extreme_answers for it.
model_dirs = [
    "openai",
    "mistralai",
    "Qwen",
    "CohereForAI"
]

axis = 'all'      # key into questions_by_axis: 'all', 'econ' or 'soc'
soft = False      # True: score through the soft_equal_answers_key tables
extreme = False   # True: score only the prompts containing 'extreme'

# with extreme prompts the first two (non-political) personas are left out
personas_tmp = persona_order[2:] if extreme else persona_order
questions_under_consideration = questions_by_axis[axis]

data = []
models = []
for model_dir in model_dirs:
    for filename in os.listdir(f"./1-Results/{model_dir}"):
        # skip the forced-stance runs and the notebook checkpoint folder
        if 'forced-stances' in filename or filename == '.ipynb_checkpoints':
            continue

        model = filename.replace(".json", "")
        models.append(model)
        eval_path = f'1-Results/{model_dir}/{filename}'

        # Load and score inside the file loop: every results file then adds
        # exactly one row to `data`, so `data` always lines up with `models`
        # (previously only the last file of a directory was scored).
        df = get_results_df(eval_path, include_response_text=True)

        # title-case the index level names and normalize the persona prompts
        df_tmp = df.rename_axis(index={name: " ".join([word.capitalize() for word in name.split("_")]) for name in df.index.names}).reset_index().copy()
        df_tmp['Persona Prompt'] = df_tmp['Persona Prompt'].apply(normalize_persona_prompt)
        df_tmp = df_tmp.set_index(['Persona', 'Persona Prompt', 'Prompt Template'])

        # get prompt types: extreme prompts only, or everything but them
        persona_prompts = [
            prompt for prompt in df_tmp.index.get_level_values(1).unique()
            if ('extreme' in prompt) == extreme
        ]

        # compute the accuracies
        accs_dict = {}
        for persona in personas_tmp:
            correct_ans = get_extreme_answers(persona)
            correct_ans = {q: a for q, a in correct_ans.items() if q in questions_under_consideration}
            equal_answers_key = soft_equal_answers_key[persona_types[persona]] if soft else None
            accs_dict[persona] = get_accuracies(
                df_tmp.loc[idx[persona, persona_prompts, :], questions_under_consideration],
                correct_ans,
                equal_answers_key=equal_answers_key,
            ).mean()

        data.append(accs_dict)

df_acc = pd.DataFrame(data, index=models)
df_acc.round(4)*100

Unnamed: 0,Default,Neutral,Left Authoritarian,Right Authoritarian,Left Libertarian,Right Libertarian
gpt-4-0125-preview,37.35,46.6,32.55,39.34,75.88,54.22
miqu-70b-5_K_M,33.26,46.49,27.17,29.04,69.67,53.16
Qwen1.5-72B-Chat,38.52,54.92,20.37,15.34,68.27,39.58
c4ai-command-r-plus,29.74,39.46,31.15,53.04,79.16,56.67


### Compare Normal to Extreme

In [7]:
# Accuracy of the political personas under the extreme prompts, together with
# the gain over the standard prompts ('Diff' = Extreme - Standard), per model.
model_dirs = [
    "openai",
    "mistralai",
    "Qwen",
    "CohereForAI"
]

axis = 'all'   # key into questions_by_axis: 'all', 'econ' or 'soc'
soft = False   # True: score through the soft_equal_answers_key tables

extremes = [False, True]
# skip the first two (non-political) personas
personas_tmp = persona_order[2:]

extreme_mapping = {
    True: 'Extreme',
    False: 'Standard'
}

questions_under_consideration = questions_by_axis[axis]

data = []
models = []
for model_dir in model_dirs:
    for filename in os.listdir(f"./1-Results/{model_dir}"):
        # skip the forced-stance runs and the notebook checkpoint folder
        if 'forced-stances' in filename or filename == '.ipynb_checkpoints':
            continue

        model = filename.replace(".json", "")
        models.append(model)
        eval_path = f'1-Results/{model_dir}/{filename}'

        # Load and score inside the file loop: every results file then adds
        # exactly one row to `data`, so `data` always lines up with `models`
        # (previously only the last file of a directory was scored).
        df = get_results_df(eval_path, include_response_text=True)

        # title-case the index level names and normalize the persona prompts
        df_tmp = df.rename_axis(index={name: " ".join([word.capitalize() for word in name.split("_")]) for name in df.index.names}).reset_index().copy()
        df_tmp['Persona Prompt'] = df_tmp['Persona Prompt'].apply(normalize_persona_prompt)
        df_tmp = df_tmp.set_index(['Persona', 'Persona Prompt', 'Prompt Template'])

        accs_dict = {}
        for extreme in extremes:
            # get prompt types: extreme prompts only, or everything but them
            persona_prompts = [
                prompt for prompt in df_tmp.index.get_level_values(1).unique()
                if ('extreme' in prompt) == extreme
            ]

            # compute the accuracies
            for persona in personas_tmp:
                correct_ans = get_extreme_answers(persona)
                correct_ans = {q: a for q, a in correct_ans.items() if q in questions_under_consideration}
                equal_answers_key = soft_equal_answers_key[persona_types[persona]] if soft else None
                accs_dict[(persona, extreme_mapping[extreme])] = get_accuracies(
                    df_tmp.loc[idx[persona, persona_prompts, :], questions_under_consideration],
                    correct_ans,
                    equal_answers_key=equal_answers_key,
                ).mean()

        # improvement of the extreme prompts over the standard ones
        for persona in personas_tmp:
            accs_dict[(persona, 'Diff')] = accs_dict[(persona, extreme_mapping[True])] - accs_dict[(persona, extreme_mapping[False])]

        data.append(accs_dict)

# only the 'Extreme' and 'Diff' columns are reported; 'Standard' is dropped here
df_acc = pd.DataFrame(data, columns=pd.MultiIndex.from_product([personas_tmp, ['Extreme', 'Diff']], names=['Persona', 'Prompt Type']), index=models)
df_acc.round(4)*100

Persona,Left Authoritarian,Left Authoritarian,Right Authoritarian,Right Authoritarian,Left Libertarian,Left Libertarian,Right Libertarian,Right Libertarian
Prompt Type,Extreme,Diff,Extreme,Diff,Extreme,Diff,Extreme,Diff
gpt-4-0125-preview,53.51,20.96,52.11,12.76,92.97,17.1,65.69,11.48
miqu-70b-5_K_M,67.56,40.4,76.11,47.07,89.58,19.91,66.16,13.0
Qwen1.5-72B-Chat,44.03,23.65,74.59,59.25,90.98,22.72,57.85,18.27
c4ai-command-r-plus,54.22,23.07,84.43,31.38,92.04,12.88,63.23,6.56


## 3) Refusal Rates

In [8]:
# Refusal rate per persona and axis for every model.
# Named `answer_type` rather than `type` so the builtin is not shadowed.
answer_type = 'Refused'

axes = [
    'econ',
    'soc'
]

extreme = True   # True: only prompts containing 'extreme'; False: all others

# with extreme prompts the first two (non-political) personas are left out
persona_order_tmp = persona_order[2:] if extreme else persona_order

data = []
index = []
models = []
for model_dir in model_dirs:
    for filename in os.listdir(f"./1-Results/{model_dir}"):
        # skip the forced-stance runs and the notebook checkpoint folder
        if 'forced-stances' in filename or filename == '.ipynb_checkpoints':
            continue

        model = filename.replace(".json", "")
        models.append(model)
        eval_path = f'1-Results/{model_dir}/{filename}'

        # Load and count inside the file loop so that every results file is
        # processed (previously only the last file of a directory was).
        df = get_results_df(eval_path, include_response_text=False)

        # title-case the index level names and normalize the persona prompts
        df_tmp = df.rename_axis(index={name: " ".join([word.capitalize() for word in name.split("_")]) for name in df.index.names}).reset_index().copy()
        df_tmp['Persona Prompt'] = df_tmp['Persona Prompt'].apply(normalize_persona_prompt)
        df_tmp = df_tmp.set_index(['Persona', 'Persona Prompt', 'Prompt Template'])

        # pick the prompts from the frame that was just loaded instead of
        # from whatever `df_tmp` an earlier cell happened to leave behind
        extreme_persona_prompts = [prompt for prompt in df_tmp.index.get_level_values(1).unique() if 'extreme' in prompt]
        non_extreme_persona_prompts = [prompt for prompt in df_tmp.index.get_level_values(1).unique() if 'extreme' not in prompt]
        persona_prompts = extreme_persona_prompts if extreme else non_extreme_persona_prompts

        axes_data = {}
        index.append(model)
        for axis in axes:
            questions_under_consideration = questions_by_axis[axis]

            # number of matching answers per run, averaged over each persona's runs
            k = (df_tmp.loc[idx[:, persona_prompts, :], questions_under_consideration] == answer_type).sum(axis=1).rename(answer_type).groupby('Persona').mean()
            k = k.to_dict()

            for persona, value in k.items():
                axes_data[(persona, axis)] = value
        data.append(axes_data)

# personas as they appear in the most recently loaded results
personas = df_tmp.index.get_level_values(0).unique().to_list()

df_refusal_rates = pd.DataFrame(data, columns=pd.MultiIndex.from_product([personas, axes], names=['Persona', 'Axis']), index=index)[persona_order_tmp]
# NOTE(review): 61 is presumably the total number of scored questions, which
# makes these rates relative to all questions rather than to the questions of
# the individual axis - confirm this is intended.
(df_refusal_rates/61).round(4)*100

Persona,Left Authoritarian,Left Authoritarian,Right Authoritarian,Right Authoritarian,Left Libertarian,Left Libertarian,Right Libertarian,Right Libertarian
Axis,econ,soc,econ,soc,econ,soc,econ,soc
gpt-4-0125-preview,0.7,4.33,9.6,26.7,0.0,0.0,0.0,1.99
miqu-70b-5_K_M,0.0,0.12,0.0,0.12,0.0,0.0,0.0,0.12
Qwen1.5-72B-Chat,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0
c4ai-command-r-plus,0.0,0.12,0.0,0.23,0.0,0.0,0.0,0.12


## 4) Neutral Responses

In [9]:
# Rate of 'Neutral' answers per axis and persona for every model.
# Named `answer_type` rather than `type` so the builtin is not shadowed.
answer_type = 'Neutral'

axes = [
    'econ',
    'soc'
]

extreme = False   # True: only prompts containing 'extreme'; False: all others

# with extreme prompts the first two (non-political) personas are left out
persona_order_tmp = persona_order[2:] if extreme else persona_order

data = []
index = []
models = []
for model_dir in model_dirs:
    for filename in os.listdir(f"./1-Results/{model_dir}"):
        # skip the forced-stance runs and the notebook checkpoint folder
        if 'forced-stances' in filename or filename == '.ipynb_checkpoints':
            continue

        model = filename.replace(".json", "")
        models.append(model)
        eval_path = f'1-Results/{model_dir}/{filename}'

        # Load and count inside the file loop so that every results file is
        # processed (previously only the last file of a directory was).
        df = get_results_df(eval_path, include_response_text=False)

        # title-case the index level names and normalize the persona prompts
        df_tmp = df.rename_axis(index={name: " ".join([word.capitalize() for word in name.split("_")]) for name in df.index.names}).reset_index().copy()
        df_tmp['Persona Prompt'] = df_tmp['Persona Prompt'].apply(normalize_persona_prompt)
        df_tmp = df_tmp.set_index(['Persona', 'Persona Prompt', 'Prompt Template'])

        # pick the prompts from the frame that was just loaded instead of
        # from whatever `df_tmp` an earlier cell happened to leave behind
        extreme_persona_prompts = [prompt for prompt in df_tmp.index.get_level_values(1).unique() if 'extreme' in prompt]
        non_extreme_persona_prompts = [prompt for prompt in df_tmp.index.get_level_values(1).unique() if 'extreme' not in prompt]
        persona_prompts = extreme_persona_prompts if extreme else non_extreme_persona_prompts

        axes_data = {}
        index.append(model)
        for axis in axes:
            questions_under_consideration = questions_by_axis[axis]

            # number of matching answers per run, averaged over each persona's runs
            k = (df_tmp.loc[idx[:, persona_prompts, :], questions_under_consideration] == answer_type).sum(axis=1).rename(answer_type).groupby('Persona').mean()
            k = k.to_dict()

            # report the axes under their display names
            a = 'Economy' if axis == 'econ' else 'Social'
            for persona, value in k.items():
                axes_data[(a, persona)] = value
        data.append(axes_data)

# personas as they appear in the most recently loaded results
personas = df_tmp.index.get_level_values(0).unique().to_list()

df_neutral_rates = pd.DataFrame(data, columns=pd.MultiIndex.from_product([['Economy', 'Social'], personas], names=['Axis', 'Persona']), index=index)
# only the non-political personas are shown.
# NOTE(review): 61 is presumably the total number of scored questions, which
# makes these rates relative to all questions rather than to the questions of
# the individual axis - confirm this is intended.
(df_neutral_rates[[i for i in df_neutral_rates.columns if i[1] in ['Default', 'Neutral']]]/61).round(4)*100

Axis,Economy,Economy,Social,Social
Persona,Neutral,Default,Neutral,Default
gpt-4-0125-preview,7.38,5.27,15.46,11.24
miqu-70b-5_K_M,8.9,4.68,15.69,9.95
Qwen1.5-72B-Chat,12.76,5.85,24.82,12.41
c4ai-command-r-plus,5.74,3.4,13.47,8.55
