# In [6]:  (Jupyter notebook cell marker left over from the export)
"""
=============================================================================
GENDER BIAS PATTERN ANALYSIS IN NARRATIVE TEXT
=============================================================================
Author: Data Research Engineer
Purpose: Detect and quantify gender representation and trait attribution bias
         in story narratives using LLM-based extraction

Key Features:
- Adaptive character extraction (not limited to 4 slots)
- Comprehensive trait analysis
- Statistical aggregation
- Robust error handling
- Detailed logging

Research Questions:
1. Are male/female characters represented equally?
2. Do male/female characters receive different trait attributions?
3. Are power/intelligence/bravery scored differently by gender?
=============================================================================
"""

import json
import traceback
from datetime import datetime
from typing import Any, Dict, List, Optional

import numpy as np
import pandas as pd
from openai import OpenAI

# =====================================================
# CONFIGURATION
# =====================================================
class Config:
    """
    Static settings for a single analysis run.

    All values are plain class attributes and are read as ``Config.NAME``.
    TIMESTAMP is evaluated once, when the class body executes, so the three
    output paths derived from it always share the same suffix.
    """

    # ---- Input -----------------------------------------------------------
    INPUT_CSV: str = "samentic_chunk.csv"
    # Number of distinct story titles taken from the input.
    # Start with 1, can increase to 2, 5, 10, 60, etc.
    NUM_STORIES: int = 1

    # ---- LLM request -----------------------------------------------------
    MODEL: str = "gpt-4.1-nano"
    TEMPERATURE: float = 0.1
    MAX_TOKENS: int = 3500

    # ---- Scored dimensions (one score_<name> column per entry) -----------
    DIMENSIONS: List[str] = ["intelligence", "bravery", "power", "agency", "emotionality"]

    # ---- Output ----------------------------------------------------------
    OUTPUT_DIR: str = "analysis_results"
    TIMESTAMP: str = datetime.now().strftime("%Y%m%d_%H%M%S")
    RAW_OUTPUT: str = f"{OUTPUT_DIR}/raw_characters_{TIMESTAMP}.csv"
    AGGREGATED_OUTPUT: str = f"{OUTPUT_DIR}/bias_summary_{TIMESTAMP}.csv"
    LOG_FILE: str = f"{OUTPUT_DIR}/processing_log_{TIMESTAMP}.txt"

# =====================================================
# LOGGING
# =====================================================
class Logger:
    """Minimal logger that prints each message and appends it to a text file."""

    def __init__(self, log_file: str) -> None:
        """
        Remember the log file path and make sure its parent directory exists.

        Args:
            log_file: Path of the file that log() appends to. May be a bare
                filename, in which case no directory is created.
        """
        import os  # local import: the module header does not import os

        parent_dir = os.path.dirname(log_file)
        # dirname() returns "" for a bare filename and os.makedirs("") raises
        # FileNotFoundError, so only create a directory when one is named.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        self.log_file = log_file

    def log(self, message: str, level: str = "INFO") -> None:
        """
        Print a timestamped line and append the same line to the log file.

        Args:
            message: Text to record.
            level: Severity label placed in square brackets (default "INFO").
        """
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_message = f"[{timestamp}] [{level}] {message}"
        print(log_message)

        # Opened in append mode on every call, so no handle is held between calls.
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(log_message + "\n")

# =====================================================
# OPENAI CLIENT
# =====================================================
# Module-level API client shared by call_llm(). It is constructed with no
# arguments, so presumably the SDK reads credentials from the environment
# (e.g. OPENAI_API_KEY) — verify against the deployment setup.
client = OpenAI()

# =====================================================
# IMPROVED PROMPT - RESEARCH-GRADE
# =====================================================
def build_analysis_prompt(title: str, chunk_id: int, text: str) -> str:
    """
    Build the character-extraction prompt sent to the LLM for one text chunk.

    The returned string contains, in order: the extraction rules, the
    per-character fields to report, the -2..+2 scoring scale, the five
    dimensions to score, the expected JSON shape, output constraints, and
    finally the chunk's title, id and text.

    Args:
        title: Story title, interpolated after "Title:".
        chunk_id: Chunk identifier, interpolated after "Chunk ID:".
        text: Chunk text to analyze, interpolated after "TEXT:".

    Returns:
        The complete prompt string, ending with "JSON OUTPUT:".

    Notes:
        - The template is an f-string, so the doubled braces in the JSON
          example are emitted as single literal braces.
        - The dimension names are written out literally in the prompt rather
          than generated from Config.DIMENSIONS; the two have to be kept in
          sync by hand.
    """
    
    return f"""You are analyzing narrative text for gender representation research.

CRITICAL RULES:
1. ONLY analyze the text provided below - no external knowledge
2. Gender identification: ONLY if explicitly stated (he/she/him/her or direct statements like "the woman" or "the man")
3. Count each distinct person separately
4. If gender is ambiguous or unstated, mark as "unknown"

TASK: Extract ALL characters mentioned in this text chunk.

For EACH character, provide:
- name: Character's name or description (e.g., "the merchant", "Anna", "unnamed soldier")
- gender: "male", "female", or "unknown" (ONLY based on explicit evidence)
- traits: List of character traits explicitly shown (e.g., ["brave", "intelligent", "cruel"])
- scores: Rate these dimensions based on EXPLICIT evidence in the text:

SCORING SYSTEM:
+2 = Strongly demonstrated (multiple clear examples)
+1 = Demonstrated (one clear example)
 0 = No evidence or neutral
-1 = Contradicted (shown to lack this trait)
-2 = Strongly contradicted (multiple examples of lacking trait)

DIMENSIONS TO SCORE:
- intelligence: Problem-solving, knowledge, wisdom, strategic thinking
- bravery: Courage, facing danger, taking risks
- power: Authority, control, influence over others or events
- agency: Making decisions, driving the plot, acting independently
- emotionality: Expressing feelings, being emotional (not negative - just presence)

OUTPUT FORMAT (JSON):
{{
  "chunk_summary": "Brief 1-sentence summary",
  "total_characters": number,
  "characters": [
    {{
      "name": "character name",
      "gender": "male" | "female" | "unknown",
      "traits": ["trait1", "trait2"],
      "scores": {{
        "intelligence": -2 to +2,
        "bravery": -2 to +2,
        "power": -2 to +2,
        "agency": -2 to +2,
        "emotionality": -2 to +2
      }}
    }}
  ]
}}

CRITICAL OUTPUT RULES:
- MUST return complete, valid JSON only
- If 0 characters: "characters": []
- Keep chunk_summary under 15 words
- Keep name under 20 characters  
- Maximum 5 traits per character
- NO evidence field - removed to prevent truncation
- NO markdown, NO explanations
- MUST close all brackets and braces
- When uncertain, score 0

STORY CHUNK TO ANALYZE:
Title: {title}
Chunk ID: {chunk_id}

TEXT:
{text}

JSON OUTPUT:"""

# =====================================================
# LLM INTERACTION
# =====================================================
def call_llm(prompt: str, logger: Logger) -> str:
    """
    Send *prompt* to the configured model and return the reply text.

    The request goes through the module-level ``client`` using the model,
    temperature and output-token limit from Config.

    Args:
        prompt: Full prompt text passed as the request input.
        logger: Receives a progress line before the call and an ERROR line
            if anything raises.

    Returns:
        The ``output_text`` attribute of the API response.

    Raises:
        Exception: Whatever was raised inside the call is logged and then
            re-raised unchanged.
    """
    try:
        logger.log("Calling OpenAI API...")

        request_options = dict(
            model=Config.MODEL,
            input=prompt,
            temperature=Config.TEMPERATURE,
            max_output_tokens=Config.MAX_TOKENS,
        )
        reply = client.responses.create(**request_options)
        return reply.output_text

    except Exception as e:
        logger.log(f"API Error: {str(e)}", "ERROR")
        raise

# =====================================================
# JSON EXTRACTION & VALIDATION
# =====================================================
_VALID_GENDERS = ("male", "female", "unknown")
_SCORE_MIN = -2.0
_SCORE_MAX = 2.0


def _coerce_score(value: Any) -> float:
    """Return *value* as a number clamped to the prompt's -2..+2 scale (0 if unusable)."""
    try:
        number = float(value)  # accepts ints, floats and strings such as "+2"
    except (TypeError, ValueError):
        return 0
    if number != number:  # NaN is the only value unequal to itself
        return 0
    number = max(_SCORE_MIN, min(_SCORE_MAX, number))
    return int(number) if number.is_integer() else number


def _recover_truncated_payload(raw: str) -> Dict[str, Any]:
    """Rebuild a payload from truncated JSON, keeping every fully decoded character object."""
    fallback_header = {"chunk_summary": "Parse error", "total_characters": 0}
    recovered: List[Any] = []

    key_pos = raw.find('"characters"')
    array_open = raw.find("[", key_pos) if key_pos != -1 else -1
    if array_open == -1:
        return {**fallback_header, "characters": recovered}

    # Everything before the array holds the header fields; closing the array
    # and the object right there makes that prefix parseable on its own.
    try:
        header = json.loads(raw[: array_open + 1] + "]}")
    except json.JSONDecodeError:
        header = None
    used_fallback = not isinstance(header, dict)
    if used_fallback:
        header = dict(fallback_header)

    # Decode array elements one at a time; the first element that fails to
    # decode is the truncated one, and nothing after it can be complete.
    decoder = json.JSONDecoder()
    pos = array_open + 1
    size = len(raw)
    while True:
        while pos < size and raw[pos] in " \t\r\n,":
            pos += 1
        if pos >= size or raw[pos] != "{":
            break
        try:
            element, pos = decoder.raw_decode(raw, pos)
        except json.JSONDecodeError:
            break
        recovered.append(element)

    header["characters"] = recovered
    if used_fallback:
        header["total_characters"] = len(recovered)
    return header


def extract_and_validate_json(
    text: str,
    logger: "Logger",
    dimensions: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """
    Extract the JSON payload from an LLM response and normalise its contents.

    Steps:
    1. Strip a leading markdown code fence (and a "json" language tag).
    2. Parse the text from the first "{" to the last "}".
    3. If that fails and the text contains a "characters" key, treat the
       response as truncated and keep every character object that decodes
       completely. The previous repair dropped the last character that
       started before the error position, which was usually a complete one.
    4. Validate the top-level keys, fill missing character fields with
       defaults, normalise gender and coerce scores.

    Args:
        text: Raw response text from the model.
        logger: Object with a ``log(message, level)`` method.
        dimensions: Score names every character must carry. Defaults to
            ``Config.DIMENSIONS`` when omitted.

    Returns:
        A dict with "chunk_summary", "total_characters" and "characters".
        Each character is a dict with "name", "gender" (one of "male",
        "female", "unknown"), "traits" and "scores" (one number in -2..+2
        per dimension). Non-object character entries are dropped.

    Raises:
        ValueError: No "{" in the response, the payload is not an object,
            required top-level keys are missing, or "characters" is not a
            list.
        json.JSONDecodeError: The payload cannot be parsed and contains no
            "characters" key to recover from (a ValueError subclass).
    """
    try:
        # Remove markdown if present
        text = text.strip()
        if text.startswith("```"):
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]

        # Find JSON boundaries
        start = text.find("{")
        if start == -1:
            raise ValueError("No JSON found in response")
        raw = text[start:]
        end = raw.rfind("}") + 1
        # A truncated reply may contain no "}" at all; keep the whole tail so
        # the repair path still gets a chance.
        candidate = raw[:end] if end else raw

        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError as e:
            logger.log(f"JSON truncated at position {e.pos}, attempting repair...", "WARNING")
            if '"characters"' not in raw:
                raise
            parsed = _recover_truncated_payload(raw)
            if parsed["characters"]:
                logger.log(
                    f"Repaired: kept {len(parsed['characters'])} complete character entries",
                    "WARNING",
                )
            else:
                logger.log("No complete characters found, returning empty", "WARNING")

        # Validate structure
        if not isinstance(parsed, dict):
            raise ValueError(f"Expected a JSON object, got {type(parsed).__name__}")
        required_keys = {"chunk_summary", "total_characters", "characters"}
        if not required_keys.issubset(parsed.keys()):
            raise ValueError(f"Missing required keys. Expected {required_keys}, got {parsed.keys()}")
        if not isinstance(parsed["characters"], list):
            raise ValueError('"characters" must be a list')

        dims = Config.DIMENSIONS if dimensions is None else dimensions
        char_keys = {"name", "gender", "traits", "scores"}
        cleaned = []

        for i, char in enumerate(parsed["characters"]):
            if not isinstance(char, dict):
                logger.log(f"Character {i} is not an object. Skipping.", "WARNING")
                continue

            if not char_keys.issubset(char.keys()):
                logger.log(f"Character {i} missing keys. Filling with defaults.", "WARNING")
            char.setdefault("name", f"character_{i}")
            char.setdefault("gender", "unknown")
            char.setdefault("traits", [])
            char.setdefault("scores", {})

            # Downstream filtering matches the exact strings "male"/"female",
            # so variants such as "Male" must be normalised here.
            gender = str(char["gender"]).strip().lower()
            if gender not in _VALID_GENDERS:
                logger.log(f"Character {i} has unrecognised gender. Using 'unknown'.", "WARNING")
                gender = "unknown"
            char["gender"] = gender

            if not isinstance(char["scores"], dict):
                logger.log(f"Character {i} has invalid scores. Using defaults.", "WARNING")
                char["scores"] = {}
            for dim in dims:
                char["scores"][dim] = _coerce_score(char["scores"].get(dim, 0))

            cleaned.append(char)

        parsed["characters"] = cleaned
        return parsed

    except json.JSONDecodeError as e:
        logger.log(f"JSON Parse Error: {e}", "ERROR")
        logger.log(f"Response preview: {text[:500]}", "ERROR")
        raise
    except Exception as e:
        logger.log(f"Validation Error: {e}", "ERROR")
        raise

# =====================================================
# DATA FLATTENING
# =====================================================
def flatten_to_rows(result: Dict[str, Any], title: str, chunk_id: int) -> List[Dict[str, Any]]:
    """
    Turn one validated chunk result into flat, CSV-ready records.

    Args:
        result: Parsed chunk payload; its "characters" list and
            "chunk_summary" are read.
        title: Story title copied into every record.
        chunk_id: Chunk identifier copied into every record.

    Returns:
        One dict per character with the keys title, chunk_id, chunk_summary,
        character_name, gender, traits (JSON-encoded string) and one
        ``score_<dimension>`` entry per Config.DIMENSIONS item (0 when the
        character has no score for that dimension).
    """

    def _record_for(character: Dict[str, Any]) -> Dict[str, Any]:
        identity = {
            "title": title,
            "chunk_id": chunk_id,
            "chunk_summary": result["chunk_summary"],
            "character_name": character["name"],
            "gender": character["gender"],
            "traits": json.dumps(character["traits"]),
        }
        score_columns = {
            f"score_{dim}": character["scores"].get(dim, 0)
            for dim in Config.DIMENSIONS
        }
        return {**identity, **score_columns}

    return [_record_for(character) for character in result["characters"]]

# =====================================================
# BIAS ANALYSIS
# =====================================================
def analyze_bias(df: pd.DataFrame, logger: Logger) -> pd.DataFrame:
    """
    Summarise character counts and dimension scores per gender.

    Only rows whose gender is "male" or "female" are considered, so the
    percentages are relative to gendered characters, not to all characters.

    Args:
        df: One row per character with a "gender" column and a
            ``score_<dimension>`` column for every Config.DIMENSIONS entry.
        logger: Receives progress and warning lines.

    Returns:
        A frame with one row per gender present (count, percentage, and
        mean/std/median per dimension). When both genders are present, a
        third "difference (male - female)" row is appended whose std cells
        are None. An empty frame is returned when no gendered rows exist.
    """
    logger.log("Computing bias statistics...")

    genders = ["male", "female"]

    # Characters of unknown gender take no part in the comparison.
    gendered = df[df["gender"].isin(genders)]
    total_gendered = len(gendered)

    if total_gendered == 0:
        logger.log("No gendered characters found", "WARNING")
        return pd.DataFrame()

    summaries = []
    for label in genders:
        subset = gendered[gendered["gender"] == label]
        count = len(subset)
        if count == 0:
            continue

        summary = {
            "gender": label,
            "character_count": count,
            "percentage": count / total_gendered * 100,
        }
        for dim in Config.DIMENSIONS:
            column = subset[f"score_{dim}"]
            summary[f"{dim}_mean"] = column.mean()
            summary[f"{dim}_std"] = column.std()
            summary[f"{dim}_median"] = column.median()

        summaries.append(summary)

    stats_df = pd.DataFrame(summaries)

    # A difference row only makes sense when both genders produced a row.
    if len(stats_df) != 2:
        return stats_df

    male = stats_df[stats_df["gender"] == "male"].iloc[0]
    female = stats_df[stats_df["gender"] == "female"].iloc[0]

    delta = {
        "gender": "difference (male - female)",
        "character_count": male["character_count"] - female["character_count"],
        "percentage": male["percentage"] - female["percentage"],
    }
    for dim in Config.DIMENSIONS:
        delta[f"{dim}_mean"] = male[f"{dim}_mean"] - female[f"{dim}_mean"]
        delta[f"{dim}_std"] = None
        delta[f"{dim}_median"] = male[f"{dim}_median"] - female[f"{dim}_median"]

    return pd.concat([stats_df, pd.DataFrame([delta])], ignore_index=True)

# =====================================================
# MAIN PIPELINE
# =====================================================
def process_stories():
    """
    Run the whole pipeline: load chunks, query the LLM, save and summarise.

    Flow:
    1. Create the output directory and open the run log.
    2. Read Config.INPUT_CSV and keep the chunks of the first
       Config.NUM_STORIES distinct titles, ordered by title and chunk_id.
    3. For every chunk: build the prompt, call the model, parse the reply
       and collect one row per character. A failing chunk is logged with
       its traceback and skipped.
    4. Write the per-character rows to Config.RAW_OUTPUT, the gender
       statistics to Config.AGGREGATED_OUTPUT, and log the key counts.

    Raises:
        Exception: Any error outside the per-chunk handling is logged as
            CRITICAL ERROR and re-raised.
    """
    import os

    heavy_rule = "=" * 70
    light_rule = "-" * 70

    os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

    logger = Logger(Config.LOG_FILE)
    logger.log(heavy_rule)
    logger.log("GENDER BIAS ANALYSIS - STARTED")
    logger.log(f"Configuration: {Config.NUM_STORIES} stories")
    logger.log(heavy_rule)

    try:
        # ---- Load and select ------------------------------------------
        logger.log(f"Loading data from {Config.INPUT_CSV}")
        chunks = pd.read_csv(Config.INPUT_CSV)

        selected_titles = chunks["title"].unique()[:Config.NUM_STORIES]
        chunks = chunks[chunks["title"].isin(selected_titles)]
        chunks = chunks.sort_values(["title", "chunk_id"])

        logger.log(f"✓ Stories selected: {len(selected_titles)}")
        logger.log(f"✓ Total chunks: {len(chunks)}")
        logger.log(f"Story titles: {list(selected_titles)}")

        # ---- Per-chunk extraction --------------------------------------
        collected_rows = []

        for _, record in chunks.iterrows():
            logger.log(light_rule)
            logger.log(f"Processing: {record['title']} | Chunk {record['chunk_id']}")

            try:
                prompt = build_analysis_prompt(
                    title=record["title"],
                    chunk_id=record["chunk_id"],
                    text=record["chunk_text"],
                )
                reply = call_llm(prompt, logger)
                parsed = extract_and_validate_json(reply, logger)

                chunk_rows = flatten_to_rows(parsed, record["title"], record["chunk_id"])
                collected_rows.extend(chunk_rows)

                logger.log(f"✓ Extracted {len(chunk_rows)} characters")

            except Exception as e:
                # One bad chunk must not abort the run; record it and move on.
                logger.log(f"✗ Failed to process chunk: {str(e)}", "ERROR")
                logger.log(traceback.format_exc(), "ERROR")

        # ---- Persist and summarise -------------------------------------
        if not collected_rows:
            logger.log("No characters extracted", "WARNING")
        else:
            characters_df = pd.DataFrame(collected_rows)
            characters_df.to_csv(Config.RAW_OUTPUT, index=False)
            logger.log(f"✓ Raw results saved: {Config.RAW_OUTPUT}")

            bias_stats = analyze_bias(characters_df, logger)

            if len(bias_stats) > 0:
                bias_stats.to_csv(Config.AGGREGATED_OUTPUT, index=False)
                logger.log(f"✓ Bias analysis saved: {Config.AGGREGATED_OUTPUT}")

                logger.log(heavy_rule)
                logger.log("BIAS SUMMARY:")
                logger.log(heavy_rule)
                logger.log("\n" + bias_stats.to_string(index=False))

            logger.log(heavy_rule)
            logger.log("KEY FINDINGS:")
            logger.log(f"Total characters analyzed: {len(characters_df)}")

            # Shares here are relative to all characters, unknown included.
            total_characters = len(characters_df)
            for gender, count in characters_df["gender"].value_counts().items():
                logger.log(f"  {gender}: {count} ({count/total_characters*100:.1f}%)")

        logger.log(heavy_rule)
        logger.log("PROCESSING COMPLETE")
        logger.log(heavy_rule)

    except Exception as e:
        logger.log(f"CRITICAL ERROR: {str(e)}", "ERROR")
        logger.log(traceback.format_exc(), "ERROR")
        raise

# =====================================================
# RUN
# =====================================================
# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    process_stories()

# --- Captured notebook output from a sample run (not code) ---
# [2026-02-09 14:55:22] [INFO] GENDER BIAS ANALYSIS - STARTED
# [2026-02-09 14:55:22] [INFO] Configuration: 1 stories
# [2026-02-09 14:55:22] [INFO] Loading data from samentic_chunk.csv
# [2026-02-09 14:55:22] [INFO] ✓ Stories selected: 1
# [2026-02-09 14:55:22] [INFO] ✓ Total chunks: 61
# [2026-02-09 14:55:22] [INFO] Story titles: ['A DESCENT INTO THE MAELSTRÖM']
# [2026-02-09 14:55:22] [INFO] ----------------------------------------------------------------------
# [2026-02-09 14:55:22] [INFO] Processing: A DESCENT INTO THE MAELSTRÖM | Chunk 0
# [2026-02-09 14:55:22] [INFO] Calling OpenAI API...
# [2026-02-09 14:55:28] [INFO] ✓ Extracted 1 characters
# [2026-02-09 14:55:28] [INFO] ----------------------------------------------------------------------
# [2026-02-09 14:55:28] [INFO] Processing: A DESCENT INTO THE MAELSTRÖM | Chunk 1
# [2026-02-09 14:55:28] [INFO] Calling OpenAI API...
# [2026-02-09 14:55:30] [INFO] ✓ Extracted 1 characters
# [2026-02-09 14:55:30] [INFO] ----------------------------------------------