# Steering Prompt Generation for Moral Machine

## Imports

In [3]:
# IMPORTS
import os
import csv
import numpy as np
import pandas as pd
import json # For saving the output
import time # For progress timing
from google.colab import drive, userdata

##Mount Google Drive & Verify

In [4]:
# --- Mount Google Drive --
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Drive mounted successfully.")

# Retrieve workpace path
WORKING_DIR = userdata.get('moral_path')
# change working directory
os.chdir(WORKING_DIR)
# check the current directory
!pwd

Mounting Google Drive...
Mounted at /content/drive
Drive mounted successfully.
/content/drive/MyDrive/_PhD/Moral-Reasoning/Experiments/LLM-Moral-Steering


In [None]:
drive.flush_and_unmount()

## Configuration

In [5]:
# --- Configuration ---

# 1. Path to the original source dataset
SOURCE_FILE_PATH = './data/original/SharedResponses.csv'

# 2. Output directory for the generated prompts
OUTPUT_DIR = './data/processed/steering_prompts'

# 3. Output filename for the JSON file
OUTPUT_FILENAME = 'steer_prompts_UvD.json'
OUTPUT_FILE_PATH = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

# 4. Number of prompt pairs to generate
NUM_PROMPT_PAIRS_TO_GENERATE = 500

# 5. Define all character columns (same as before)
CHARACTER_COLS = [
    'Man', 'Woman', 'Pregnant', 'Stroller', 'OldMan', 'OldWoman',
    'Boy', 'Girl', 'Homeless', 'LargeWoman', 'LargeMan', 'Criminal',
    'MaleExecutive', 'FemaleExecutive', 'FemaleAthlete', 'MaleAthlete',
    'FemaleDoctor', 'MaleDoctor', 'Dog', 'Cat'
]

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Configuration loaded.")
print(f"Input file: {SOURCE_FILE_PATH}")
print(f"Output file: {OUTPUT_FILE_PATH}")
print(f"Target prompt pairs: {NUM_PROMPT_PAIRS_TO_GENERATE}")

Configuration loaded.
Input file: ./data/original/SharedResponses.csv
Output file: ./data/processed/steering_prompts/steer_prompts_UvD.json
Target prompt pairs: 500


## Load Data

In [6]:
TEST_RUN_ROWS = 10000

# Start timer
load_start_time = time.time()
print(f"Loading original dataset from: {SOURCE_FILE_PATH}...")

try:
    # SUBSET FOR TESTING
    df_full = pd.read_csv(SOURCE_FILE_PATH, nrows=TEST_RUN_ROWS)

    # df_full = pd.read_csv(SOURCE_FILE_PATH)

    # --- Data Cleaning ---
    # Fill NaN values in character columns with 0
    df_full[CHARACTER_COLS] = df_full[CHARACTER_COLS].fillna(0)
    # Fill NaN in CrossingSignal with 0 (neutral/unknown) before potential int conversion
    df_full['CrossingSignal'] = df_full['CrossingSignal'].fillna(0)
    # Ensure Barrier is integer, fill NaNs if necessary (assuming NaN means no barrier)
    df_full['Barrier'] = df_full['Barrier'].fillna(0).astype(int)
     # Ensure DiffNumberOFCharacters is numeric, fill NaNs if necessary (assuming NaN means 0 difference)
    df_full['DiffNumberOFCharacters'] = pd.to_numeric(df_full['DiffNumberOFCharacters'], errors='coerce').fillna(0).astype(int)


    load_end_time = time.time()
    print(f"Successfully loaded and cleaned dataset in {load_end_time - load_start_time:.2f} seconds.")
    print(f"Total scenarios found: {len(df_full)}")

    # Display basic info
    print("\n--- Dataset Info ---")
    df_full.info()
    print("\n--- Dataset Head (First 3 Rows) ---")
    display(df_full.head(3))

except FileNotFoundError:
    print(f"ERROR: File not found at '{SOURCE_FILE_PATH}'.")
    print("Please ensure the file exists in the correct directory.")
    df_full = None # Set df to None to prevent downstream errors
except Exception as e:
    print(f"ERROR: An error occurred while loading or cleaning the file: {e}")
    df_full = None

Loading original dataset from: ./data/original/SharedResponses.csv...
Successfully loaded and cleaned dataset in 1.11 seconds.
Total scenarios found: 10000

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ResponseID               10000 non-null  object 
 1   ExtendedSessionID        10000 non-null  object 
 2   UserID                   9999 non-null   float64
 3   ScenarioOrder            10000 non-null  int64  
 4   Intervention             10000 non-null  int64  
 5   PedPed                   10000 non-null  int64  
 6   Barrier                  10000 non-null  int64  
 7   CrossingSignal           10000 non-null  int64  
 8   AttributeLevel           10000 non-null  object 
 9   ScenarioTypeStrict       10000 non-null  object 
 10  ScenarioType             9999 non-null   object 
 11  Default

Unnamed: 0,ResponseID,ExtendedSessionID,UserID,ScenarioOrder,Intervention,PedPed,Barrier,CrossingSignal,AttributeLevel,ScenarioTypeStrict,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
0,2222bRQqBTZ6dLnPH,32757157_6999801415950060.0,6999801000000000.0,7,0,0,0,1,Fit,Fitness,...,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
1,2222sJk4DcoqXXi98,1043988516_3525281295.0,3525281000.0,2,0,0,0,0,Rand,Random,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2223CNmvTr2Coj4wp,-1613944085_422160228641876.0,422160200000000.0,10,0,1,0,1,Female,Gender,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Helper Functions (Parsing, Classification, Templating)

TODO: Move to a script and setup project

In [7]:
# --- Helper functions reused/adapted from contrastive pairs script ---

def describe_group(chars_dict, status_text, is_passenger):
    """
    Converts a dictionary of characters and a status into a readable string.
    Example: {'Man': 1, 'Girl': 2}, "crossing illegally" -> "1 Man and 2 Girls who are crossing illegally"
    """
    if is_passenger:
        # Passengers don't have a crossing status
        count = sum(chars_dict.values())
        plural = "Passenger" if count == 1 else "Passengers"
        return f"{count} {plural}"

    # Build the character list
    char_list = []
    for char, count in chars_dict.items():
        if count > 0:
            # Handle plural names (e.g., "Woman" -> "Women", "Dog" -> "Dogs")
            if char == 'Woman':
                name = 'Woman' if count == 1 else 'Women'
            elif char == 'OldWoman':
                name = 'Old Woman' if count == 1 else 'Old Women'
            elif char == 'LargeWoman':
                name = 'Large Woman' if count == 1 else 'Large Women'
            elif char == 'Man':
                name = 'Man' if count == 1 else 'Men'
            elif char == 'OldMan':
                name = 'Old Man' if count == 1 else 'Old Men'
            elif char == 'LargeMan':
                name = 'Large Man' if count == 1 else 'Large Men'
            elif char == 'Boy':
                name = 'Boy' if count == 1 else 'Boys'
            elif char == 'Girl':
                name = 'Girl' if count == 1 else 'Girls'
            elif char == 'Dog':
                name = 'Dog' if count == 1 else 'Dogs'
            elif char == 'Cat':
                name = 'Cat' if count == 1 else 'Cats'
            else:
                # For 'Criminal', 'Pregnant', 'Stroller', 'Executive', 'Athlete', 'Doctor', 'Homeless'
                name = char if count == 1 else f"{char}s"

            char_list.append(f"{count} {name}")

    # Join the list with commas and 'and'
    if not char_list:
        return f"an empty lane {status_text}"
    elif len(char_list) == 1:
        desc = char_list[0]
    else:
        desc = ", ".join(char_list[:-1]) + " and " + char_list[-1]

    return f"{desc} {status_text}"

def parse_stay_scenario(row):
    """
    Parses the "Stay the course" group, described
    by the main character and crossing signal columns.
    """
    scenario = {
        'chars': {},
        'total_count': 0,
        'status_text': '',
        'crossing_signal': row['CrossingSignal'],
        'has_criminals': False,
        'is_passenger': False
    }

    for char in CHARACTER_COLS:
        count = int(row[char])
        if count > 0:
            scenario['chars'][char] = count
            scenario['total_count'] += count
            if char == 'Criminal':
                scenario['has_criminals'] = True

    # Define legal crossing
    if row['CrossingSignal'] == 1:
        scenario['status_text'] = "who are crossing legally"
    elif row['CrossingSignal'] == -1:
        scenario['status_text'] = "who are crossing illegally"
    else: # 0 or NaN
        scenario['status_text'] = "" # No status

    return scenario

def parse_swerve_scenario(row, stay_total_count):
    """
    Parses the "Swerve to avoid" group by contrasting
    it with the "Stay the course" group.
    """
    scenario = {
        'chars': {},
        'total_count': 0,
        'status_text': '',
        'crossing_signal': 0, # Default
        'has_criminals': False,
        'is_passenger': False
    }

    # IF the swerve group is Passenger
    # Assumption: We don't know who the passengers are, so we use a generic "Passenger"
    # Also assumes passengers don't have a crossing signal (check legality) or criminal status
    if row['Barrier'] == 1:
        scenario['is_passenger'] = True
        scenario['total_count'] = stay_total_count - row['DiffNumberOFCharacters']
        scenario['chars']['Passenger'] = scenario['total_count']

    # The swerve group is other Pedestrians
    else:
        scenario['is_passenger'] = False
        scenario['total_count'] = stay_total_count - row['DiffNumberOFCharacters']

        # DEDUCTIVE METHODS FOR DEFINING GROUP ATTRIBUTES

        # 1. Deducing legality of crossing
        # If the attribute being tested is legality, the swerve group is the opposite
        if row['AttributeLevel'] == 'CrossingSignal':
            scenario['crossing_signal'] = -row['CrossingSignal'] # Opposite of stay group
        else:
            scenario['crossing_signal'] = row['CrossingSignal'] # Same as stay group

        # 2. Deduce criminality of person
        # If the attribute is 'Criminals', the swerve group is the opposite
        if row['AttributeLevel'] == 'ScenarioType' and 'Criminals' in row['ScenarioType']:
            scenario['has_criminals'] = not row['Criminal'] > 0

        # 3. Add generic characters to match the total count
        # We only know legality, criminality of person
        # Defined as "Pedestrian" text
        if scenario['total_count'] > 0:
             scenario['chars']['Pedestrian'] = scenario['total_count']

        # 4. Set status text based on deduced legality
        if scenario['crossing_signal'] == 1:
            scenario['status_text'] = "who are crossing legally"
        elif scenario['crossing_signal'] == -1:
            scenario['status_text'] = "who are crossing illegally"
        else:
            scenario['status_text'] = ""

    return scenario

def generate_text_pair(row, stay_scenario, swerve_scenario):
    """
    Applies our chosen template to the two parsed scenarios.
    """
    # Starter for all senarios
    context = "A self-driving car is in an unavoidable crash scenario."

    # --- Generate Text for STAY THE COURSE (innaction) ---
    stay_description = describe_group(
        stay_scenario['chars'],
        stay_scenario['status_text'],
        stay_scenario['is_passenger']
    )
    text_stay = f"{context} The car stays on its course. This action results in the deaths of {stay_description}."

    # --- Generate Text for SWERVE (action) ---
    swerve_description = describe_group(
        swerve_scenario['chars'],
        swerve_scenario['status_text'],
        swerve_scenario['is_passenger']
    )
    text_swerve = f"{context} The car swerves. This action results in the deaths of {swerve_description}."

    return text_stay, text_swerve

print("Helper functions defined for parsing and text generation.")


Helper functions defined for parsing and text generation.


## Ethics Framework

In [8]:
def get_utilitarian_choice(stay_info, swerve_info):
    """
    Applies the Utilitarian rule: save the most lives.
    Returns 'stay', 'swerve', or 'equal'.
    """
    stay_deaths = stay_info['total_count']
    swerve_deaths = swerve_info['total_count']

    if stay_deaths < swerve_deaths:
        return 'stay'
    elif swerve_deaths < stay_deaths:
        return 'swerve'
    else:
        return 'equal'

## Steering Prompt Specific Helpers

In [9]:
# --- New helper functions for this notebook ---

def get_deontological_choice_with_reason(row, stay_info, swerve_info):
    """
    Applies the hierarchical Deontological rules AND returns the reason code.
    Returns: ('stay'/'swerve', 'ReasonCode')
    Includes robust checks for dictionary keys.
    """
    try:
        # Rule 1: Anti-Sacrifice (Barrier)
        if int(row['Barrier']) == 1:
            return 'stay', 'Anti-Sacrifice'

        # Safely get crossing signals, default to 0 if missing
        stay_legal = stay_info.get('crossing_signal', 0)
        swerve_legal = swerve_info.get('crossing_signal', 0)

        # Rule 2: Rule of Law (Legality) - only applies if signals are opposite and non-zero
        if stay_legal == 1 and swerve_legal == -1: return 'stay', 'Rule of Law'
        if swerve_legal == 1 and stay_legal == -1: return 'swerve', 'Rule of Law'

        # Safely get criminal status, default to False if missing
        stay_has_criminals = stay_info.get('has_criminals', False)
        swerve_has_criminals = swerve_info.get('has_criminals', False)

        # Rule 3: Principle of Innocence (Criminals) - only applies if one has criminals and the other doesn't
        if not stay_has_criminals and swerve_has_criminals: return 'stay', 'Innocence'
        if not swerve_has_criminals and stay_has_criminals: return 'swerve', 'Innocence'

        # Rule 4: Inaction (Omission) - Default fallback
        return 'stay', 'Inaction'

    except (ValueError, KeyError, TypeError) as e:
         print(f"WARNING: Error during deontological classification for row index {row.name if hasattr(row, 'name') else 'unknown'}. Details: {e}")
         # Fallback on error - treat as inaction needed
         return 'stay', 'Inaction_ErrorFallback'

# Define reasoning for model
def lookup_deontological_reason_text(reason_code):
    """
    Maps the reason code to a human-readable "because..." clause.
    """
    if reason_code == 'Anti-Sacrifice':
        return " because intentionally sacrificing uninvolved passengers as a means to an end is morally wrong."
    elif reason_code == 'Rule of Law':
        return " because sparing those who are following traffic laws takes moral priority over those who are not."
    elif reason_code == 'Innocence':
        return " because choosing to harm individuals identified as criminals over non-criminals is the morally preferable action when harm is unavoidable."
    elif reason_code == 'Inaction':
        return " because actively intervening to cause harm is considered morally worse than allowing harm to occur through inaction when other rules don't apply."
    elif reason_code == 'Inaction_ErrorFallback':
        return " because in this ambiguous situation, avoiding direct action is the default choice." # Fallback text
    else:
        print(f"   WARNING: Unknown reason code '{reason_code}' encountered.")
        return " for complex reasons." # Generic fallback

# --- Generic Utilitarian Reason ---
REASON_CLAUSE_U = " because saving the maximum number of lives leads to the best overall outcome according to utilitarian principles."

print("Helper functions defined.")

Helper functions defined.


## Prompt Generation Loop

In [10]:
print("Starting steering prompt generation...")
if df_full is None:
    print("ERROR: DataFrame not loaded. Cannot proceed.")
else:
    augmented_prompt_pairs = []
    processed_count = 0
    skipped_count = 0
    total_scenarios = len(df_full)
    start_loop_time = time.time()

    # Iterate through the DataFrame
    # Using itertuples over iterrows
    for i, row_tuple in enumerate(df_full.itertuples(index=False)):
        # Convert tuple to Series temporarily for easier access by name
        # Note: Less efficient than direct tuple access but maintains compatibility
        # with functions expecting row['ColumnName']
        # Adjust functions for tuples if performance isses.
        row = pd.Series(row_tuple, index=df_full.columns)
        row.name = i # Add index info for error messages


        # Progress indicator
        if (i + 1) % 5000 == 0:
            elapsed_time = time.time() - start_loop_time
            print(f"   Processed {i+1}/{total_scenarios} scenarios... ({processed_count} pairs generated so far). Elapsed time: {elapsed_time:.1f}s")

        # Check if we have enough samples
        if processed_count >= NUM_PROMPT_PAIRS_TO_GENERATE:
            print(f"\nTarget number of {NUM_PROMPT_PAIRS_TO_GENERATE} prompt pairs reached. Stopping processing.")
            break

        # --- Parse & Classify ---
        stay_details = parse_stay_scenario(row)
        # Check if parsing failed (indicated by empty chars typically)
        if not stay_details['chars'] and stay_details['total_count'] > 0 and not stay_details['is_passenger']:
             # Heuristic: if total_count > 0 but chars is empty and not passenger, parsing might have had issues
             # This check might need refinement based on data patterns
             # print(f"   Skipping row index {i} due to potential stay parsing issue.")
             skipped_count +=1
             continue


        swerve_details = parse_swerve_scenario(row, stay_details['total_count'])
        if not swerve_details['chars'] and swerve_details['total_count'] > 0 and not swerve_details['is_passenger']:
             # Similar check for swerve details
             # print(f"Skipping row index {i} due to potential swerve parsing issue.")
             skipped_count +=1
             continue


        U_choice_action = get_utilitarian_choice(stay_details, swerve_details)
        D_choice_action, D_reason_code = get_deontological_choice_with_reason(row, stay_details, swerve_details)

        # --- Filter for U vs D Conflict ---
        if U_choice_action == 'equal' or U_choice_action == D_choice_action:
            skipped_count += 1
            continue # Skip if no conflict or U choice is unclear

        # --- Generate Base Texts ---
        text_stay, text_swerve = generate_text_pair(row, stay_details, swerve_details)

        # --- Select Base Texts for Steering Augmentation ---
        base_text_D = text_stay if D_choice_action == 'stay' else text_swerve
        base_text_U = text_stay if U_choice_action == 'stay' else text_swerve

        # --- Get Reasoning Clauses ---
        reason_clause_D = lookup_deontological_reason_text(D_reason_code)

        # --- Construct Augmented Prompts ---
        prompt_D = base_text_D + reason_clause_D
        prompt_U = base_text_U + REASON_CLAUSE_U # Use the generic Utilitarian reason

        # --- Store the Pair ---
        augmented_prompt_pairs.append({
            'DeontologicalPrompt': prompt_D,
            'UtilitarianPrompt': prompt_U,
            'Original_ResponseID': row['ResponseID'], # Keep track if needed
            'DeontologicalReasonCode': D_reason_code # Store the reason code
        })
        processed_count += 1

    end_loop_time = time.time()
    print(f"\nProcessing complete.")
    print(f"Total time taken: {end_loop_time - start_loop_time:.2f} seconds.")
    print(f"Generated {processed_count} augmented prompt pairs.")
    print(f"Skipped {skipped_count} scenarios (no conflict or parsing issues).")

    # --- Save to JSON ---
    if processed_count > 0:
        print(f"\nSaving generated prompts to: {OUTPUT_FILE_PATH}...")
        try:
            with open(OUTPUT_FILE_PATH, 'w', encoding='utf-8') as f:
                json.dump(augmented_prompt_pairs, f, indent=4, ensure_ascii=False)
            print(f"Successfully saved {processed_count} pairs.")
        except Exception as e:
            print(f"ERROR: Failed to save JSON file. Error: {e}")
    else:
        print("No prompt pairs were generated, nothing to save.")

Starting steering prompt generation...

Target number of 500 prompt pairs reached. Stopping processing.

Processing complete.
Total time taken: 0.34 seconds.
Generated 500 augmented prompt pairs.
Skipped 1775 scenarios (no conflict or parsing issues).

Saving generated prompts to: ./data/processed/steering_prompts/steer_prompts_UvD.json...
Successfully saved 500 pairs.


## Review Output

In [11]:
print(f"Loading generated prompts from {OUTPUT_FILE_PATH} for review...")

try:
    with open(OUTPUT_FILE_PATH, 'r', encoding='utf-8') as f:
        loaded_prompts = json.load(f)

    if loaded_prompts:
        print(f"Successfully loaded {len(loaded_prompts)} prompt pairs.")
        print("\n--- Example Prompt Pair 1 ---")
        print("Deontological Prompt:")
        print(loaded_prompts[0]['DeontologicalPrompt'])
        print("\nUtilitarian Prompt:")
        print(loaded_prompts[0]['UtilitarianPrompt'])
        print("-" * 30)

        if len(loaded_prompts) > 1:
            print("\n--- Example Prompt Pair 2 ---")
            print("Deontological Prompt:")
            print(loaded_prompts[1]['DeontologicalPrompt'])
            print("\nUtilitarian Prompt:")
            print(loaded_prompts[1]['UtilitarianPrompt'])
            print("-" * 30)
    else:
        print("Loaded file is empty.")

except FileNotFoundError:
    print(f"ERROR: Output file not found at '{OUTPUT_FILE_PATH}'")
except json.JSONDecodeError:
    print(f"ERROR: Could not decode JSON from '{OUTPUT_FILE_PATH}'")
except Exception as e:
    print(f"ERROR: An unexpected error occurred while reading the JSON file: {e}")

Loading generated prompts from ./data/processed/steering_prompts/steer_prompts_UvD.json for review...
Successfully loaded 500 prompt pairs.

--- Example Prompt Pair 1 ---
Deontological Prompt:
A self-driving car is in an unavoidable crash scenario. The car stays on its course. This action results in the deaths of 1 Stroller, 1 FemaleAthlete, 1 FemaleDoctor and 1 Cat . because intentionally sacrificing uninvolved passengers as a means to an end is morally wrong.

Utilitarian Prompt:
A self-driving car is in an unavoidable crash scenario. The car swerves. This action results in the deaths of 3 Passengers. because saving the maximum number of lives leads to the best overall outcome according to utilitarian principles.
------------------------------

--- Example Prompt Pair 2 ---
Deontological Prompt:
A self-driving car is in an unavoidable crash scenario. The car stays on its course. This action results in the deaths of 1 Old Woman, 1 Homeless, 1 MaleAthlete, 1 Dog and 1 Cat . because int