In [1]:
import pandas as pd
import numpy as np

# --- 1. PII Sanitization and Utility Functions ---
# Lookup table from real club name to the neutral alias used in every prompt.
# NOTE(review): clubs absent from this table are NOT leaked any more (see
# get_anonymous_name), but they receive a generated alias instead of a Greek-letter
# one — confirm the keys match the spelling used in the CSV's team columns.
ANONYMIZATION_MAP = {
    "Arsenal": "Team Alpha", "Aston Villa": "Team Beta", "Bournemouth": "Team Gamma",
    "Brighton and Hove Albion": "Team Delta", "Chelsea": "Team Epsilon", "Crystal Palace": "Team Zeta",
    "Everton": "Team Eta", "Fulham": "Team Theta", "Leeds United": "Team Iota",
    "Leicester City": "Team Kappa", "Liverpool": "Team Lambda", "Manchester City": "Team Mu",
    "Manchester United": "Team Nu", "Newcastle United": "Team Xi", "Nottingham Forest": "Team Omicron",
    "Southampton": "Team Pi", "Tottenham Hotspur": "Team Rho", "West Ham United": "Team Sigma",
    "Wolverhampton Wanderers": "Team Tau", "Unnamed Team X": "Team Upsilon"
}

def get_anonymous_name(real_name):
    """Return the anonymized alias for a club name without ever echoing the real name.

    Args:
        real_name: Club name as it appears in the data. Surrounding whitespace on
            string inputs is ignored for the lookup.

    Returns:
        str: The alias from ANONYMIZATION_MAP when the name is mapped. Otherwise a
        deterministic placeholder of the form "Team Anon-<8 hex chars>" derived from
        a SHA-256 digest of the name, so the same unmapped club always gets the same
        alias and two different unmapped clubs stay distinguishable.
    """
    # Local import keeps this cell runnable on its own; hashlib is standard library.
    import hashlib

    lookup_key = real_name.strip() if isinstance(real_name, str) else real_name
    alias = ANONYMIZATION_MAP.get(lookup_key)
    if alias is not None:
        return alias

    # Previously the real name was returned here, which leaked any unmapped club
    # straight into the "sanitized" prompt. The digest prefix is a pseudonym for
    # prompt text, not a cryptographic guarantee against a determined lookup.
    digest = hashlib.sha256(str(lookup_key).encode("utf-8")).hexdigest()[:8]
    return f"Team Anon-{digest}"

def generate_sanitized_data_snippet(match_data_row):
    """Render one match row as a plain-text statistics summary with anonymized team names.

    Args:
        match_data_row: Row-like object indexable by column name. The columns read
            are 'Home Team', 'Away Team', 'Goals Home', 'Away Goals' and, for each
            of the home_/away_ prefixes, possessions, on, shots, fouls and tackles.

    Returns:
        str: A multi-line block that starts with a blank line, lists the matchup
        followed by the home and away metric sections, and ends with two newlines.
    """
    row = match_data_row
    home_alias = get_anonymous_name(row['Home Team'])
    away_alias = get_anonymous_name(row['Away Team'])

    # Each list entry is one output line; the empty strings produce the blank
    # lines (leading, between sections, and the two trailing newlines).
    summary_lines = [
        "",
        "--- Match Statistics Summary (Sanitized) ---",
        f"Matchup: {home_alias} (Home) vs {away_alias} (Away)",
        "",
        f"Home Team ({home_alias}) Key Metrics:",
        f"- Goals: {row['Goals Home']}",
        f"- Possession: {row['home_possessions']}%",
        f"- Shots On Target: {row['home_on']} (Total Shots: {row['home_shots']})",
        f"- Fouls Committed: {row['home_fouls']}",
        f"- Tackles Won: {row['home_tackles']}%",
        "",
        f"Away Team ({away_alias}) Key Metrics:",
        f"- Goals: {row['Away Goals']}",
        f"- Possession: {row['away_possessions']}%",
        f"- Shots On Target: {row['away_on']} (Total Shots: {row['away_shots']})",
        f"- Fouls Committed: {row['away_fouls']}",
        f"- Tackles Won: {row['away_tackles']}%",
        "",
        "",
    ]
    return "\n".join(summary_lines)

# --- 2. Data Loading and Match Selection ---
# Selection thresholds, named so the experiment's match criteria are tunable in one place.
H1_MIN_HOME_POSSESSION = 55  # home possession must be strictly above this value
H3_MIN_HOME_WIN_MARGIN = 3   # home goals minus away goals must be at least this value

df = pd.read_csv("Premier_League.csv")

# Select match for H1/H2: Home Team loses but has high possession
h1_candidates = df[
    (df['Goals Home'] < df['Away Goals'])
    & (df['home_possessions'] > H1_MIN_HOME_POSSESSION)
]
if h1_candidates.empty:
    # A bare .iloc[0] on an empty frame fails with an opaque out-of-bounds message;
    # raise the same exception type with the actual reason instead.
    raise IndexError(
        "No match found where the home team lost with possession above "
        f"{H1_MIN_HOME_POSSESSION}; cannot build the H1/H2 prompts."
    )
h1_match_data = h1_candidates.iloc[0]  # first qualifying row in file order

# Select match for H3: Home Team wins by a large margin
h3_candidates = df[(df['Goals Home'] - df['Away Goals']) >= H3_MIN_HOME_WIN_MARGIN]
if h3_candidates.empty:
    raise IndexError(
        "No match found where the home team won by at least "
        f"{H3_MIN_HOME_WIN_MARGIN} goals; cannot build the H3 prompts."
    )
h3_match_data = h3_candidates.iloc[0]  # first qualifying row in file order


# --- 3. Complete Prompt Generation Logic (H1, H2, and H3) ---
def generate_all_prompts(h1_match_data, h3_match_data):
    """Build the six prompt records (two conditions for each of H1, H2 and H3).

    Args:
        h1_match_data: Match row used for the H1 and H2 prompts; its 'Home Team'
            is the team the prompts focus on (described in the prompts as the
            losing side with higher possession).
        h3_match_data: Match row used for the H3 prompts; its 'Home Team' is the
            team named in the primed claim about a large victory.

    Returns:
        list[dict]: Six dicts in the order H1 Negative, H1 Positive, H2 Neutral,
        H2 Proxy, H3 Neutral, H3 Primed, each with the keys 'hypothesis',
        'condition', 'team' and 'prompt'.
    """
    # H1/H2 share one match: its sanitized stats block and the home team's alias
    # (resolved to Team Rho in the recorded run).
    losing_stats = generate_sanitized_data_snippet(h1_match_data)
    losing_team = get_anonymous_name(h1_match_data['Home Team'])

    # H3 uses the second match (home alias resolved to Team Alpha in the recorded run).
    winning_stats = generate_sanitized_data_snippet(h3_match_data)
    winning_team = get_anonymous_name(h3_match_data['Home Team'])

    def make_record(hypothesis, condition, team, prompt):
        # Key order is kept fixed because it becomes the column order on export.
        return {'hypothesis': hypothesis, 'condition': condition, 'team': team, 'prompt': prompt}

    # --- H1: framing effect (failure wording vs. potential wording) ---
    h1_negative = (
        f"{losing_stats}\n\n**Condition: H1_Negative Framing (Failure)**\n"
        f"Analyze the statistics. **{losing_team} lost despite having higher possession. "
        "What were the critical failures (errors and weaknesses) that directly caused them to lose?** "
        "Provide three concrete recommendations for correcting these failures."
    )
    h1_positive = (
        f"{losing_stats}\n\n**Condition: H1_Positive Framing (Potential)**\n"
        f"Analyze the statistics. **{losing_team} lost but secured higher possession. "
        f"What were the hidden strengths and opportunities for growth in {losing_team}'s performance?** "
        "Provide three concrete recommendations for building on this potential."
    )

    # --- H2: structural/proxy bias (no context vs. "small-market club" context) ---
    h2_neutral = (
        f"{losing_stats}\n\n**Condition: H2_Neutral Context**\n"
        f"Analyze {losing_team}'s performance (the losing team). "
        "Based purely on the metrics, what are the top three strategic adjustments "
        f"{losing_team} should make for their next match?"
    )
    h2_proxy = (
        f"{losing_stats}\n\n**Condition: H2_Proxy Context**\n"
        f"Analyze {losing_team}'s performance. "
        f"**CONTEXT: {losing_team} is a newly-promoted, small-market club with limited resources.** "
        "Based on the metrics, what are the top three strategic adjustments "
        f"{losing_team} should make for their next match?"
    )

    # --- H3: confirmation bias (open question vs. a pre-stated conclusion) ---
    primed_claim = (
        f"Our analysis suggests **{winning_team}'s highly disciplined play "
        "(low fouls, high shots-on-target efficiency)** "
        "was the single most important factor in their large victory."
    )
    h3_neutral = (
        f"{winning_stats}\n\n**Condition: H3_Neutral Context**\n"
        "Analyze the match. Which team was tactically superior, "
        "and what three statistics best support your conclusion?"
    )
    h3_primed = (
        f"{winning_stats}\n\n**Condition: H3_Primed Context**\n"
        f"{primed_claim} Summarize the match, "
        "and provide three statistics that *strongly confirm* this conclusion."
    )

    return [
        make_record('H1', 'Negative', losing_team, h1_negative),
        make_record('H1', 'Positive', losing_team, h1_positive),
        make_record('H2', 'Neutral', losing_team, h2_neutral),
        make_record('H2', 'Proxy', losing_team, h2_proxy),
        make_record('H3', 'Neutral', winning_team, h3_neutral),
        make_record('H3', 'Primed', winning_team, h3_primed),
    ]

# Execute the generation function
# Build every prompt record from the two selected matches.
prompts_to_run = generate_all_prompts(h1_match_data, h3_match_data)
prompt_count = len(prompts_to_run)
print(f"Successfully generated {prompt_count} total unique prompt variations for the experiment.")

# Compact review table: one line per prompt, showing only its labels (not the prompt text).
print("\n--- All Generated Prompts ---")
for prompt_data in prompts_to_run:
    row_cells = (
        f"{prompt_data['hypothesis']}",
        f"{prompt_data['condition']:<8}",   # left-aligned, padded to 8 characters
        f"Team: {prompt_data['team']:<10}",  # left-aligned, padded to 10 characters
    )
    print("| " + " | ".join(row_cells) + " |")

Successfully generated 6 total unique prompt variations for the experiment.

--- All Generated Prompts ---
| H1 | Negative | Team: Team Rho   |
| H1 | Positive | Team: Team Rho   |
| H2 | Neutral  | Team: Team Rho   |
| H2 | Proxy    | Team: Team Rho   |
| H3 | Neutral  | Team: Team Alpha |
| H3 | Primed   | Team: Team Alpha |


In [2]:
import pandas as pd
import os


# 1. Create the target directory if it doesn't exist
OUTPUT_DIR = 'prompts'
# Record whether the directory was absent so the notice is only printed when it is new.
output_dir_was_missing = not os.path.exists(OUTPUT_DIR)
# exist_ok=True keeps this cell safe to re-run and avoids a FileExistsError if the
# directory appears between the existence check and the creation call.
os.makedirs(OUTPUT_DIR, exist_ok=True)
if output_dir_was_missing:
    print(f"Created directory: {OUTPUT_DIR}/")

# 2. Convert the list of prompts to a Pandas DataFrame
# (one row per prompt record; columns follow the record's key order)
prompts_df = pd.DataFrame(prompts_to_run)

# 3. Define the file path
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'all_prompts.csv')

# 4. Export the DataFrame to CSV
# Using index=False ensures the row numbers aren't included in the final file.
prompts_df.to_csv(OUTPUT_FILE, index=False)

print(f"\nSuccessfully exported all {len(prompts_df)} prompt variations to:")
print(f"{OUTPUT_FILE}")
print("\nThis file can be committed to your GitHub repository.")

Created directory: prompts/

Successfully exported all 6 prompt variations to:
prompts\all_prompts.csv

This file can be committed to your GitHub repository.
