# Base vs Steering 詳細分析

**目的**: Baseとの比較で90.2%が引き分けとなり、84.6%のpersonaでは全く差が出なかったSteeringについて、その原因を分析する

**分析内容**:
1. 最適化された重みの分布と特徴
2. 差が出たpersona vs 出なかったpersonaの重みパターン比較
3. 重みの大きさとSteering効果の関係
4. Trait別の重み分析

In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict

# Style setup. This runs BEFORE the font settings on purpose: seaborn's style
# presets carry their own "font.family" / "font.sans-serif" entries, so calling
# set_style() after assigning the font would overwrite the font choice below.
sns.set_style("whitegrid")
sns.set_palette("husl")

# Font setup (applied after the seaborn style so it is not overridden).
# NOTE(review): DejaVu Sans has no Japanese glyphs; this is adequate only while
# plot labels stay ASCII, as they are in the cells of this notebook.
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
# Draw minus signs with the ASCII hyphen rather than U+2212.
plt.rcParams['axes.unicode_minus'] = False

print("✓ Libraries loaded")

ModuleNotFoundError: No module named 'seaborn'

## 1. データロード

In [None]:
# Read the aggregate summary and the per-comparison records produced by the
# base-vs-steering validation run.
summary = json.loads(
    Path("../results/base_vs_steering/summary.json").read_text()
)
comparison_data = json.loads(
    Path("../results/base_vs_steering/comparison_results.json").read_text()
)

# One record per judged comparison; the classification cell reads the
# "persona_id" and "winner" fields of each record.
results = comparison_data["results"]

# Headline numbers taken straight from the summary file.
print(f"Total comparisons: {summary['total_comparisons']}")

tie_share = summary['win_rates']['tie']
print(f"Tie rate: {tie_share*100:.1f}%")

steering_share = summary['decisive_win_rates']['steering']
print(f"Steering win rate (decisive only): {steering_share*100:.1f}%")

In [None]:
# Load the optimized trait weights of every persona.
# Each "<persona_id>_best_weights.json" found under gpu0/ and gpu1/ is parsed
# into all_weights[persona_id].
weights_dir = Path("../optimization_results_26personas")
weights_suffix = "_best_weights"

all_weights = {}
for gpu_dir in ["gpu0", "gpu1"]:
    gpu_path = weights_dir / gpu_dir
    if not gpu_path.exists():
        # Still a best-effort skip, but reported: a missing directory would
        # otherwise silently shrink the persona set analysed below.
        print(f"⚠ Weights directory not found, skipping: {gpu_path}")
        continue

    # Sorted so that dict insertion order (and the row order of everything
    # built from it) does not depend on filesystem listing order.
    for weight_file in sorted(gpu_path.glob(f"*{weights_suffix}.json")):
        # Strip only the trailing marker; str.replace() would also remove it
        # if it happened to occur inside the persona id itself.
        persona_id = weight_file.stem[: -len(weights_suffix)]
        if persona_id in all_weights:
            # Same precedence as before (last file wins), now made visible.
            print(f"⚠ Duplicate weights for {persona_id}; using {weight_file}")
        with open(weight_file) as f:
            all_weights[persona_id] = json.load(f)

print(f"Loaded weights for {len(all_weights)} personas")
print(f"Personas: {sorted(all_weights.keys())[:5]}...")

## 2. Persona分類: 差が出た vs 出なかった

In [None]:
# Tally, per persona, how many comparisons ended as tie / steering / base.
persona_outcomes = defaultdict(lambda: {"tie": 0, "steering": 0, "base": 0})
for record in results:
    pid, verdict = record["persona_id"], record["winner"]
    persona_outcomes[pid][verdict] += 1

# "All-tie" personas: every comparison was a tie.
# "Mixed" personas: at least one comparison had a verdict other than tie.
all_tie_personas = [
    pid
    for pid, counts in persona_outcomes.items()
    if counts["tie"] == sum(counts.values())
]
mixed_personas = [
    pid
    for pid, counts in persona_outcomes.items()
    if counts["tie"] != sum(counts.values())
]

print(f"All-tie personas (no effect): {len(all_tie_personas)} ({100*len(all_tie_personas)/len(persona_outcomes):.1f}%)")
print(f"Mixed personas (some effect): {len(mixed_personas)} ({100*len(mixed_personas)/len(persona_outcomes):.1f}%)")

# Per-persona breakdown for the mixed group, in persona-id order.
print("\nMixed personas (steering had some effect):")
breakdown_rows = (("Tie", "tie"), ("Steering", "steering"), ("Base", "base"))
for pid in sorted(mixed_personas):
    counts = persona_outcomes[pid]
    n_comparisons = sum(counts.values())
    print(f"  {pid}:")
    for label, key in breakdown_rows:
        share = 100 * counts[key] / n_comparisons
        print(f"    {label}: {counts[key]}/{n_comparisons} ({share:.1f}%)")

## 3. 重みの分布分析

In [None]:
# Build one row per persona: the five trait weights (missing traits count as
# 0.0), a flag for whether steering changed any verdict, and summary
# statistics of the weight vector.
trait_columns = [f"R{i}" for i in range(1, 6)]

weights_data = []
for pid, persona_weights in all_weights.items():
    vector = [persona_weights.get(col, 0.0) for col in trait_columns]
    weights_data.append(
        {
            "persona_id": pid,
            # True when the persona appears in the mixed (non-all-tie) group
            "had_effect": pid in mixed_personas,
            **dict(zip(trait_columns, vector)),
            "weight_mean": np.mean(vector),
            "weight_std": np.std(vector),  # population std (numpy default ddof=0)
            "weight_abs_mean": np.mean(np.abs(vector)),
            "weight_l2_norm": np.linalg.norm(vector),
        }
    )

df_weights = pd.DataFrame(weights_data)
df_weights.head(10)

In [None]:
# Descriptive statistics of the weight-vector summaries, split by had_effect.
stat_columns = ["weight_mean", "weight_std", "weight_abs_mean", "weight_l2_norm"]

print("Weight statistics by effect group:\n")
grouped_stats = df_weights.groupby("had_effect")[stat_columns].describe()
print(grouped_stats)

## 4. 可視化: 重みの分布

In [None]:
# Plot 1: per-trait weight histograms, split by whether steering had an effect.
traits = ["R1", "R2", "R3", "R4", "R5"]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

effect_mask = df_weights["had_effect"]

for ax, trait in zip(axes, traits):
    no_effect = df_weights.loc[~effect_mask, trait]
    had_effect = df_weights.loc[effect_mask, trait]

    # Both groups share one set of bin edges computed from the pooled values.
    # With bins=15 passed to each hist() call separately, every group would get
    # its own edges and the overlaid bars would not be comparable.
    bin_edges = np.histogram_bin_edges(df_weights[trait], bins=15)

    ax.hist(no_effect, alpha=0.5, label=f"No effect (n={len(no_effect)})", bins=bin_edges, color="blue")
    ax.hist(had_effect, alpha=0.5, label=f"Had effect (n={len(had_effect)})", bins=bin_edges, color="red")

    # Reference line at zero weight
    ax.axvline(0, color='black', linestyle='--', linewidth=1, alpha=0.5)
    ax.set_xlabel(f"{trait} Weight")
    ax.set_ylabel("Count")
    ax.set_title(f"{trait} Weight Distribution")
    ax.legend()
    ax.grid(alpha=0.3)

# Drop every subplot that has no trait assigned (derived from the trait count
# instead of a hard-coded index).
for unused_ax in axes[len(traits):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.savefig("../results/base_vs_steering/weight_distributions.png", dpi=150, bbox_inches='tight')
plt.show()

print("✓ Saved: results/base_vs_steering/weight_distributions.png")

In [None]:
# Plot 2: box plot of the weight-vector L2 norm, no-effect vs had-effect group.
fig, ax = plt.subplots(figsize=(10, 6))

effect_mask = df_weights["had_effect"]
no_effect_norm = df_weights.loc[~effect_mask, "weight_l2_norm"]
had_effect_norm = df_weights.loc[effect_mask, "weight_l2_norm"]

bp = ax.boxplot([no_effect_norm, had_effect_norm],
                patch_artist=True,
                showmeans=True)

# Tick labels carry the actual group sizes instead of hard-coded counts, so the
# figure stays correct when the data changes. They are set on the axis rather
# than through boxplot's `labels=` keyword, which newer Matplotlib releases
# deprecate in favour of `tick_labels=`.
ax.set_xticklabels([
    f"No Effect\n(n={len(no_effect_norm)})",
    f"Had Effect\n(n={len(had_effect_norm)})",
])

bp['boxes'][0].set_facecolor('lightblue')
bp['boxes'][1].set_facecolor('lightcoral')

ax.set_ylabel("L2 Norm of Weights")
ax.set_title("Weight Vector Magnitude: Effect vs No Effect")
ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig("../results/base_vs_steering/weight_l2_norm_comparison.png", dpi=150, bbox_inches='tight')
plt.show()

print("✓ Saved: results/base_vs_steering/weight_l2_norm_comparison.png")

## 5. 重みパターンの可視化

In [None]:
# Heatmap of the optimized weights: one row per persona, one column per trait.
# Personas for which steering changed at least one verdict get a "*" suffix.
trait_cols = ["R1", "R2", "R3", "R4", "R5"]
weight_matrix = df_weights[trait_cols].values

row_labels = [
    f"{pid}{'*' if flagged else ''}"
    for pid, flagged in zip(df_weights["persona_id"].values, df_weights["had_effect"])
]

heatmap_options = dict(
    xticklabels=trait_cols,
    yticklabels=row_labels,
    cmap="RdBu_r",
    center=0,  # diverging colormap centred on zero weight
    annot=True,
    fmt=".2f",
    cbar_kws={'label': 'Weight Value'},
)

fig, ax = plt.subplots(figsize=(8, 14))
sns.heatmap(weight_matrix, ax=ax, **heatmap_options)

ax.set_title("Optimized Weights Heatmap\n(* = Had effect on generation)")
ax.set_xlabel("Trait")
ax.set_ylabel("Persona")

plt.tight_layout()
plt.savefig("../results/base_vs_steering/weights_heatmap.png", dpi=150, bbox_inches='tight')
plt.show()

print("✓ Saved: results/base_vs_steering/weights_heatmap.png")

## 6. 相関分析: 重みの大きさ vs Steering効果

In [None]:
# Attach per-persona outcome rates to df_weights:
#   steering_win_rate = steering wins / all comparisons
#   decisive_rate     = (steering wins + base wins) / all comparisons
# Personas without any comparison get 0.0 for both rates.
#
# persona_outcomes is a defaultdict, so plain indexing would insert an empty
# tally for every persona that has weights but no comparisons; .get() reads
# without mutating it.
no_comparisons = {"tie": 0, "steering": 0, "base": 0}

steering_rates = []
decisive_rates = []
for persona_id in df_weights["persona_id"]:
    outcomes = persona_outcomes.get(persona_id, no_comparisons)
    total = sum(outcomes.values())
    if total > 0:
        steering_rates.append(outcomes["steering"] / total)
        decisive_rates.append((outcomes["steering"] + outcomes["base"]) / total)
    else:
        steering_rates.append(0.0)
        decisive_rates.append(0.0)

# Whole-column assignment instead of writing cell by cell inside iterrows().
df_weights["steering_win_rate"] = steering_rates
df_weights["decisive_rate"] = decisive_rates

# Correlation analysis
print("Correlation between weight magnitude and steering effect:\n")
print(df_weights[["weight_l2_norm", "weight_abs_mean", "weight_std", "steering_win_rate", "decisive_rate"]].corr())

In [None]:
# Two scatter plots relating weight magnitude to steering outcome.
from matplotlib.patches import Patch


def _effect_colors(flags):
    """Return one marker colour per flag: red = had effect, blue = no effect."""
    return ['red' if flag else 'blue' for flag in flags]


# Shared legend entries (colour patches, since scatter points carry no labels)
legend_elements = [Patch(facecolor='red', label='Had effect'),
                   Patch(facecolor='blue', label='No effect')]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
left_ax, right_ax = axes

# Left panel: L2 norm vs decisive rate, all personas
left_ax.scatter(
    df_weights["weight_l2_norm"],
    df_weights["decisive_rate"],
    c=_effect_colors(df_weights["had_effect"]),
    alpha=0.6,
    s=100,
)
left_ax.set_xlabel("L2 Norm of Weights")
left_ax.set_ylabel("Decisive Rate (non-tie proportion)")
left_ax.set_title("Weight Magnitude vs Steering Effect")
left_ax.grid(alpha=0.3)
left_ax.legend(handles=legend_elements)

# Right panel: mean absolute weight vs steering win rate, restricted to
# personas with at least one decisive comparison. The rate itself is steering
# wins over ALL comparisons of the persona, as the axis label states.
decisive_df = df_weights[df_weights["decisive_rate"] > 0]
right_ax.scatter(
    decisive_df["weight_abs_mean"],
    decisive_df["steering_win_rate"],
    c=_effect_colors(decisive_df["had_effect"]),
    alpha=0.6,
    s=100,
)
right_ax.set_xlabel("Mean Absolute Weight")
right_ax.set_ylabel("Steering Win Rate (among all comparisons)")
right_ax.set_title("Weight Magnitude vs Steering Win Rate")
right_ax.grid(alpha=0.3)
right_ax.legend(handles=legend_elements)

plt.tight_layout()
plt.savefig("../results/base_vs_steering/weight_effect_correlation.png", dpi=150, bbox_inches='tight')
plt.show()

print("✓ Saved: results/base_vs_steering/weight_effect_correlation.png")

## 7. 効果があったPersonasの詳細分析

In [None]:
# Detailed report for every persona where steering changed at least one verdict:
# its optimized weights, weight-vector statistics and outcome breakdown.
print("="*80)
print("DETAILED ANALYSIS: Personas where steering had effect")
print("="*80)

trait_names = [f"R{i}" for i in range(1, 6)]

for persona_id in sorted(mixed_personas):
    print(f"\n{'='*60}")
    print(f"Persona: {persona_id}")
    print(f"{'='*60}")

    # Weights. A persona can appear in the comparison results without a loaded
    # weights file; report that instead of aborting the whole cell on KeyError.
    weights = all_weights.get(persona_id)
    if weights is None:
        print("\nOptimized Weights: not available (no weights loaded for this persona)")
    else:
        print("\nOptimized Weights:")
        for trait, weight in sorted(weights.items()):
            print(f"  {trait}: {weight:+.3f}")

        # Missing traits count as 0.0, matching how the weights DataFrame cell
        # builds its rows (weights.get(..., 0.0)).
        weight_values = [weights.get(name, 0.0) for name in trait_names]
        print(f"\nWeight Statistics:")
        print(f"  L2 norm: {np.linalg.norm(weight_values):.3f}")
        print(f"  Mean abs: {np.mean(np.abs(weight_values)):.3f}")
        print(f"  Std: {np.std(weight_values):.3f}")

    # Outcomes. Mixed personas always have at least one comparison, so the
    # divisions below cannot hit a zero total.
    outcomes = persona_outcomes[persona_id]
    total = sum(outcomes.values())
    print(f"\nOutcomes (n={total}):")
    print(f"  Tie: {outcomes['tie']} ({100*outcomes['tie']/total:.1f}%)")
    print(f"  Steering wins: {outcomes['steering']} ({100*outcomes['steering']/total:.1f}%)")
    print(f"  Base wins: {outcomes['base']} ({100*outcomes['base']/total:.1f}%)")
    print(f"  Decisive rate: {100*(outcomes['steering']+outcomes['base'])/total:.1f}%")

## 8. Trait別の効果分析

In [None]:
# Compare each trait's weight between the had-effect and no-effect groups:
# first a mean/std/median table, then a two-sample t-test per trait.
from scipy import stats

trait_cols = ["R1", "R2", "R3", "R4", "R5"]

trait_comparison = df_weights.groupby("had_effect")[trait_cols].agg(['mean', 'std', 'median'])

print("Trait weight comparison: Effect vs No Effect")
print(trait_comparison)

# Welch's t-test (equal_var=False): the two groups differ a lot in size, and
# nothing here establishes equal variances, which the pooled-variance Student
# test assumes.
# NOTE(review): five traits are tested without multiple-comparison correction;
# treat individual p-values as exploratory.
print("\n" + "="*60)
print("Welch's t-test: Effect vs No Effect (for each trait)")
print("="*60)

effect_mask = df_weights["had_effect"]

for trait in trait_cols:
    no_effect_vals = df_weights.loc[~effect_mask, trait]
    had_effect_vals = df_weights.loc[effect_mask, trait]

    print(f"\n{trait}:")
    print(f"  No effect: mean={no_effect_vals.mean():.3f}, std={no_effect_vals.std():.3f}")
    print(f"  Had effect: mean={had_effect_vals.mean():.3f}, std={had_effect_vals.std():.3f}")

    # A variance estimate needs at least two observations per group; without
    # it the test statistic is undefined, so say so instead of printing NaN.
    if min(len(no_effect_vals), len(had_effect_vals)) < 2:
        print("  t-test skipped: each group needs at least 2 personas")
        continue

    t_stat, p_value = stats.ttest_ind(no_effect_vals, had_effect_vals, equal_var=False)

    print(f"  t-statistic: {t_stat:.3f}")
    print(f"  p-value: {p_value:.4f}")
    print(f"  Significant (p<0.05): {p_value < 0.05}")

## 9. まとめと考察

In [None]:
# Final summary: group sizes, weight-magnitude comparison, and the fixed list
# of hypotheses / recommendations.
n_personas = len(df_weights)


def _pct(count):
    """Return count as a percentage of n_personas (0.0 when there are none)."""
    return 100 * count / n_personas if n_personas else 0.0


no_effect_pct = _pct(len(all_tie_personas))
had_effect_pct = _pct(len(mixed_personas))

print("="*80)
# The headline rate is computed from the data so it cannot drift away from the
# statistics printed right below it (it used to be a hard-coded 84.6%).
print(f"SUMMARY: Why Steering Failed for {no_effect_pct:.1f}% of Personas")
print("="*80)

print("\n1. Overall Statistics:")
print(f"   - Total personas: {n_personas}")
print(f"   - No effect: {len(all_tie_personas)} ({no_effect_pct:.1f}%)")
print(f"   - Had effect: {len(mixed_personas)} ({had_effect_pct:.1f}%)")

print("\n2. Weight Magnitude Comparison:")
no_effect_norm_mean = df_weights[~df_weights["had_effect"]]["weight_l2_norm"].mean()
had_effect_norm_mean = df_weights[df_weights["had_effect"]]["weight_l2_norm"].mean()
print(f"   - No effect group L2 norm: {no_effect_norm_mean:.3f}")
print(f"   - Had effect group L2 norm: {had_effect_norm_mean:.3f}")
print(f"   - Difference: {had_effect_norm_mean - no_effect_norm_mean:.3f}")

# NOTE(review): the explanations and recommendations below are static text;
# they are not derived from the numbers computed in this notebook.
print("\n3. Possible Explanations:")
print("   a) Optimization converged to near-zero weights for most personas")
print("   b) Trait vectors don't capture persona-specific characteristics")
print("   c) Layer 20 may not be the optimal intervention point")
print("   d) Alpha=2.0 scaling may be insufficient to produce observable effects")
print("   e) Judge (GPT-4o) may not detect subtle stylistic differences")

print("\n4. Recommendations:")
print("   - Investigate why optimization produces small weights")
print("   - Try different layers and larger alpha values")
print("   - Use more sensitive evaluation metrics")
print("   - Consider persona-specific trait vector construction")

print("\n" + "="*80)

## 10. エクスポート: 分析結果を保存

In [None]:
# Persist the analysis: a JSON digest plus the full per-persona DataFrame.


def _group_weight_stats(rows):
    """Summarise the weight-magnitude columns of one effect group."""
    return {
        "l2_norm_mean": float(rows["weight_l2_norm"].mean()),
        "l2_norm_std": float(rows["weight_l2_norm"].std()),
        "abs_mean_mean": float(rows["weight_abs_mean"].mean()),
    }


def _effective_persona_entry(pid):
    """Collect weights, outcome counts and rates for one had-effect persona."""
    first_match = df_weights[df_weights["persona_id"] == pid].iloc[0]
    return {
        "persona_id": pid,
        "weights": all_weights[pid],
        "outcomes": dict(persona_outcomes[pid]),
        "steering_win_rate": float(first_match["steering_win_rate"]),
        "decisive_rate": float(first_match["decisive_rate"]),
    }


effect_mask = df_weights["had_effect"]

analysis_results = {
    "summary": {
        "total_personas": len(df_weights),
        "no_effect_personas": len(all_tie_personas),
        "had_effect_personas": len(mixed_personas),
        "no_effect_rate": len(all_tie_personas) / len(df_weights),
    },
    "weight_statistics": {
        "no_effect_group": _group_weight_stats(df_weights[~effect_mask]),
        "had_effect_group": _group_weight_stats(df_weights[effect_mask]),
    },
    "effective_personas": [
        _effective_persona_entry(pid) for pid in sorted(mixed_personas)
    ],
}

with open("../results/base_vs_steering/weight_analysis.json", "w") as f:
    json.dump(analysis_results, f, indent=2)

print("✓ Saved: results/base_vs_steering/weight_analysis.json")

# Full per-persona table (weights, statistics, outcome rates) as CSV
df_weights.to_csv("../results/base_vs_steering/weights_dataframe.csv", index=False)
print("✓ Saved: results/base_vs_steering/weights_dataframe.csv")

banner = "=" * 80
print("\n" + banner)
print("Analysis complete! All results saved.")
print(banner)