### Data Formation

This notebook reads the raw GPT responses, parses each one as a JSON object, and builds the final dataset of paired opposing views that is used to form the opposing steering vectors for each domain.

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import json

Making directory for processed data

In [2]:
# Output folder for the processed responses: gpt_responses_processed inside data.
# exist_ok=True makes the call a no-op when the folder is already there.
processed_data_path = os.path.join("data", "gpt_responses_processed")
os.makedirs(name=processed_data_path, exist_ok=True)

Reading the raw directory path

In [3]:
# Folder holding the raw GPT response CSVs; list its entries and display them.
# Note: os.listdir returns entries in arbitrary order.
gpt_response_raw_path = os.path.join("data", "gpt_responses_raw")
gpt_response_raw_files = os.listdir(path=gpt_response_raw_path)
gpt_response_raw_files

['MEDIA_FREEDOM_responses.csv',
 'REDISTRIBUTION_MONEY_responses.csv',
 'POLITICAL_LEADERSHIP_EQUALITY_responses.csv',
 'GENDER_EQUALITY_EMPLOYMENT_responses.csv',
 'UNEMPLOYMENT_BENEFITS_responses.csv',
 'PUNISHMENT_SEVERITY_responses.csv']

#### Decoding the data for Political Leadership Equality

In [4]:
# Domain to decode, plus the column labels used for its two opposing views.
domain_name = "POLITICAL_LEADERSHIP_EQUALITY"
view1_annot, view2_annot = "INEQUALITY_PROMPT", "EQUALITY_PROMPT"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""As a 45-year-old bu..."
1,"```json\n{\n ""View 1"": ""As a 45-year-old bu..."
2,"```json\n{\n ""View 1"": ""My name is John Tho..."
3,"```json\n{\n ""View 1"": ""My name is John Smi..."
4,"```json\n{\n ""View 1"": ""As a 45-year-old bu..."


In [5]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 14158.78it/s]


Unnamed: 0,response,INEQUALITY_PROMPT,EQUALITY_PROMPT
0,"```json\n{\n ""View 1"": ""As a 45-year-old bu...",As a 45-year-old businessman from New York Cit...,"As a 30-year-old activist from Toronto, I firm..."
1,"```json\n{\n ""View 1"": ""As a 45-year-old bu...","As a 45-year-old businessman from London, I of...",As a 30-year-old community organizer from Toro...
2,"```json\n{\n ""View 1"": ""My name is John Tho...","My name is John Thompson, a 45-year-old busine...","I’m Maria Lopez, a 32-year-old community organ..."
3,"```json\n{\n ""View 1"": ""My name is John Smi...","My name is John Smith, a 45-year-old businessm...","I'm Maria Lopez, a 32-year-old community activ..."
4,"```json\n{\n ""View 1"": ""As a 45-year-old bu...","As a 45-year-old businessman from New York, I ...",As a 32-year-old political activist from Toron...


In [6]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)

#### Decoding the data for Gender Equality in Employment

In [7]:
# Domain to decode, plus the column labels used for its two opposing views.
domain_name = "GENDER_EQUALITY_EMPLOYMENT"
view1_annot, view2_annot = "INEQUALITY_PROMPT", "EQUALITY_PROMPT"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""My name is John, I’..."
1,"```json\n{\n ""View 1"": ""As a 45-year-old fa..."
2,"```json\n{\n ""View 1"": ""My name is John, 45..."
3,"```json\n{\n ""View 1"": ""My name is John, a ..."
4,"{\n ""View 1"": ""My name is John, a 45-year-o..."


In [8]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 7280.30it/s]


Unnamed: 0,response,INEQUALITY_PROMPT,EQUALITY_PROMPT
0,"```json\n{\n ""View 1"": ""My name is John, I’...","My name is John, I’m 45 years old, and I live ...","I'm Sarah, a 32-year-old professional from Tor..."
1,"```json\n{\n ""View 1"": ""As a 45-year-old fa...","As a 45-year-old factory manager from Detroit,...",As a 30-year-old HR consultant living in Londo...
2,"```json\n{\n ""View 1"": ""My name is John, 45...","My name is John, 45, and I live in Detroit, US...","I’m Sarah, 32, from Toronto, Canada. In today'..."
3,"```json\n{\n ""View 1"": ""My name is John, a ...","My name is John, a 45-year-old factory manager...","I’m Sarah, a 32-year-old human resources profe..."
4,"{\n ""View 1"": ""My name is John, a 45-year-o...","My name is John, a 45-year-old factory manager...","I'm Sarah, a 32-year-old software engineer liv..."


In [9]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)

#### Decoding the data for Media Freedom

In [10]:
# Domain to decode, plus the column labels used for its two opposing views.
domain_name = "MEDIA_FREEDOM"
view1_annot, view2_annot = "UNCONTROLLED", "CONTROLLED"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""My name is Clara Ro..."
1,"```json\n{\n ""View 1"": ""My name is Sarah Th..."
2,"{\n ""View 1"": ""My name is Elena Martinez, a..."
3,"```json\n{\n ""View 1"": ""My name is Alex Tho..."
4,"```json\n{\n ""View 1"": ""As a 35-year-old jo..."


In [11]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 9693.33it/s]


Unnamed: 0,response,UNCONTROLLED,CONTROLLED
0,"```json\n{\n ""View 1"": ""My name is Clara Ro...","My name is Clara Rodriguez, a 34-year-old jour...","I am Rajesh Kumar, a 45-year-old media analyst..."
1,"```json\n{\n ""View 1"": ""My name is Sarah Th...","My name is Sarah Thompson, a 29-year-old journ...","I am Ahmed Malik, a 45-year-old educator from ..."
2,"{\n ""View 1"": ""My name is Elena Martinez, a...","My name is Elena Martinez, a 34-year-old journ...","I am Rajesh Singh, a 45-year-old media analyst..."
3,"```json\n{\n ""View 1"": ""My name is Alex Tho...","My name is Alex Thompson, a 34-year-old journa...","I'm Maria Chen, a 45-year-old policy analyst l..."
4,"```json\n{\n ""View 1"": ""As a 35-year-old jo...","As a 35-year-old journalist from New York, I o...",Being a 42-year-old policy analyst from Singap...


In [12]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)

#### Decoding the data for Money Redistribution

In [13]:
# Domain to decode, plus the column labels used for its two opposing views.
# NOTE(review): "REDISTRBUTE" looks like a misspelling of "REDISTRIBUTE" (compare
# the second label). It is kept as-is because it becomes a column name in the
# saved CSV -- confirm with downstream consumers before renaming.
domain_name = "REDISTRIBUTION_MONEY"
view1_annot, view2_annot = "REDISTRBUTE", "NOT-REDISTRIBUTE"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""My name is Sarah Th..."
1,"```json\n{\n ""View 1"": ""My name is Sarah Th..."
2,"```json\n{\n ""View 1"": ""My name is Sarah, I..."
3,"```json\n{\n ""View 1"": ""My name is Sarah Jo..."
4,"```json\n{\n ""View 1"": ""My name is Sarah Th..."


In [14]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 12072.83it/s]


Unnamed: 0,response,REDISTRBUTE,NOT-REDISTRIBUTE
0,"```json\n{\n ""View 1"": ""My name is Sarah Th...","My name is Sarah Thompson, a 34-year-old socia...","I am David Chen, a 45-year-old entrepreneur fr..."
1,"```json\n{\n ""View 1"": ""My name is Sarah Th...","My name is Sarah Thompson, a 34-year-old socia...","I'm John Carter, a 45-year-old entrepreneur ba..."
2,"```json\n{\n ""View 1"": ""My name is Sarah, I...","My name is Sarah, I’m 34 years old, and I live...","I’m Mark, a 45-year-old entrepreneur from Aust..."
3,"```json\n{\n ""View 1"": ""My name is Sarah Jo...","My name is Sarah Johnson, a 35-year-old social...","I'm Michael Thompson, a 42-year-old entreprene..."
4,"```json\n{\n ""View 1"": ""My name is Sarah Th...","My name is Sarah Thompson, a 34-year-old teach...","I’m David Kelly, a 45-year-old entrepreneur fr..."


In [15]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)

#### Decoding the data for Unemployment Benefits

In [16]:
# Domain to decode, plus the column labels used for its two opposing views.
domain_name = "UNEMPLOYMENT_BENEFITS"
view1_annot, view2_annot = "BENEFITS", "NO-BENEFITS"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""My name is Sarah, a..."
1,"```json\n{\n ""View 1"": ""My name is John, a ..."
2,"```json\n{\n ""View 1"": ""My name is Sarah Th..."
3,"```json\n{\n ""View 1"": ""I'm Sarah, a 34-yea..."
4,"```json\n{\n ""View 1"": ""My name is Maria Go..."


In [17]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 8367.69it/s]


Unnamed: 0,response,BENEFITS,NO-BENEFITS
0,"```json\n{\n ""View 1"": ""My name is Sarah, a...","My name is Sarah, a 34-year-old single mother ...","I'm David, a 45-year-old small business owner ..."
1,"```json\n{\n ""View 1"": ""My name is John, a ...","My name is John, a 45-year-old father of two l...","I’m Sarah, a 32-year-old entrepreneur from Aus..."
2,"```json\n{\n ""View 1"": ""My name is Sarah Th...","My name is Sarah Thompson, a 34-year-old resid...","I’m John Carter, a 45-year-old entrepreneur fr..."
3,"```json\n{\n ""View 1"": ""I'm Sarah, a 34-yea...","I'm Sarah, a 34-year-old single mother from De...","My name is John, a 45-year-old entrepreneur fr..."
4,"```json\n{\n ""View 1"": ""My name is Maria Go...","My name is Maria Gonzalez, a 34-year-old singl...","I’m James Thompson, a 45-year-old entrepreneur..."


In [18]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)

#### Decoding the data for Punishment Severity

In [19]:
# Domain to decode, plus the column labels used for its two opposing views.
domain_name = "PUNISHMENT_SEVERITY"
view1_annot, view2_annot = "PUNISHMENT", "NO-PUNISHMENT"

# Load this domain's raw responses; the first CSV column becomes the row index.
raw_file = os.path.join(gpt_response_raw_path, f"{domain_name}_responses.csv")
raw_df = pd.read_csv(raw_file, index_col=0)
raw_df.head()

Unnamed: 0,response
0,"```json\n{\n ""View 1"": ""My name is John Mil..."
1,"```json\n{\n ""View 1"": ""My name is John Smi..."
2,"```json\n{\n ""View 1"": ""As a 45-year-old la..."
3,"{\n ""View 1"": ""My name is John Adams, a 45-..."
4,"```json\n{\n ""View 1"": ""My name is James Th..."


In [20]:
# Split every raw GPT response into its two opposing views ('View 1' / 'View 2').
# A row that cannot be decoded gets None in BOTH lists, so the lists always stay
# the same length as raw_df and can be assigned as columns.
view_1 = []
view_2 = []

for index, row in tqdm(raw_df.iterrows(), total=len(raw_df)):
    response = row['response']
    try:
        # Remove only the surrounding markdown fence and a leading "json" language
        # tag. A blanket replace('json', '') would also delete the word "json"
        # wherever it appears inside the view text itself.
        payload = response.strip()
        if payload.startswith('```'):
            payload = payload[3:]
        if payload.endswith('```'):
            payload = payload[:-3]
        payload = payload.lstrip()
        if payload.startswith('json'):
            payload = payload[4:]

        response_json = json.loads(payload)
        # Read both keys before appending anything: if 'View 2' is missing the
        # KeyError fires here, before either list has grown for this row.
        first_view = response_json['View 1']
        second_view = response_json['View 2']
    except Exception as e:
        # Best-effort: report the bad row (malformed JSON, missing key, or a
        # non-string cell such as NaN) and keep going.
        print(f"Error parsing JSON for index {index}: {e}")
        first_view = None
        second_view = None
    view_1.append(first_view)
    view_2.append(second_view)

raw_df[view1_annot] = view_1
raw_df[view2_annot] = view_2
raw_df.head()

100%|██████████| 60/60 [00:00<00:00, 11101.43it/s]


Unnamed: 0,response,PUNISHMENT,NO-PUNISHMENT
0,"```json\n{\n ""View 1"": ""My name is John Mil...","My name is John Miller, a 45-year-old law enfo...","I’m Sarah Thompson, a 32-year-old social worke..."
1,"```json\n{\n ""View 1"": ""My name is John Smi...","My name is John Smith, a 45-year-old police of...","I am Maria Lopez, a 32-year-old social worker ..."
2,"```json\n{\n ""View 1"": ""As a 45-year-old la...",As a 45-year-old law enforcement officer from ...,"As a 32-year-old social worker from Toronto, I..."
3,"{\n ""View 1"": ""My name is John Adams, a 45-...","My name is John Adams, a 45-year-old law enfor...","I’m Maria Lopez, a 32-year-old social worker f..."
4,"```json\n{\n ""View 1"": ""My name is James Th...","My name is James Thompson, a 45-year-old law e...","I am Maria Gonzalez, a 32-year-old social work..."


In [21]:
# Write the current domain's frame into the processed-data folder.
processed_file = os.path.join(processed_data_path, f"{domain_name}_processed_responses.csv")
raw_df.to_csv(processed_file)