# Persona Profiling: From Cluster Centroids to Behavioral Personas

**Objective**: Transform K-means cluster centroids into interpretable behavioral personas with natural language descriptions.

## Workflow
1. Load clustering outputs and compute centroids in original (unscaled) space
2. Generate statistical profiles comparing each cluster to the population
3. Map statistics to descriptive behavioral labels
4. Create natural language persona descriptions
5. Export personas for agent instantiation (Phase 3)

## 1. Environment Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import warnings

# Suppress every warning for the rest of the session. This keeps cell output
# clean but also hides deprecation and runtime warnings raised by the libraries.
warnings.filterwarnings('ignore')

# Style settings
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)  # never truncate columns when a frame is displayed
pd.set_option('display.float_format', '{:.4f}'.format)  # render floats with 4 decimals

# Paths
# Inputs and exported artifacts live in the same directory; figures go to a
# notebook-local folder that is created here if it does not exist yet.
DATA_DIR = Path('../data/processed')
OUTPUT_DIR = Path('../data/processed')
FIG_DIR = Path('./outputs/04_persona_profiling')
FIG_DIR.mkdir(parents=True, exist_ok=True)

print(f"Data directory: {DATA_DIR.resolve()}")
print(f"Figure directory: {FIG_DIR.resolve()}")

## 2. Load Clustering Outputs

In [None]:
# Load cluster assignments
# All three CSVs are indexed by customer_unique_id so they can be aligned by customer.
clusters = pd.read_csv(DATA_DIR / 'customer_clusters.csv', index_col='customer_unique_id')

# Load raw (non-log) features
features_raw = pd.read_csv(DATA_DIR / 'customer_features_raw.csv', index_col='customer_unique_id')

# Load transformed (log) features
# NOTE(review): in the cells visible here only the shape of this frame is
# printed; confirm whether it is still needed in this notebook.
features_transformed = pd.read_csv(DATA_DIR / 'customer_features_transformed.csv', index_col='customer_unique_id')

# Load metadata
# The JSON must provide metadata['clustering']['n_clusters'] (read in the print below).
with open(DATA_DIR / 'feature_metadata.json', 'r') as f:
    metadata = json.load(f)

print(f"Cluster assignments: {clusters.shape}")
print(f"Raw features: {features_raw.shape}")
print(f"Transformed features: {features_transformed.shape}")
print(f"\nNumber of clusters: {metadata['clustering']['n_clusters']}")

In [None]:
# Merge cluster labels with raw features
# DataFrame.join aligns on the index (customer_unique_id) and defaults to a
# left join, so every row of features_raw is kept.
# NOTE(review): a customer without a cluster assignment would get NaN in
# 'cluster' -- confirm both files cover the same customers.
df = features_raw.join(clusters)

print(f"Combined dataframe: {df.shape}")
print(f"\nCluster distribution:")
# Customers per cluster, ordered by cluster id; reused later as the cluster sizes.
cluster_counts = df['cluster'].value_counts().sort_index()
for cluster_id, count in cluster_counts.items():
    # Share is relative to all rows of df.
    pct = count / len(df) * 100
    print(f"  Cluster {cluster_id}: {count:,} customers ({pct:.1f}%)")

## 3. Compute Cluster Centroids (Original Scale)

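The clustering was performed on log-transformed, standardized features. To interpret the clusters, we need centroids in the original scale (e.g., R$ for monetary values, counts for frequency). Here we obtain them as the per-cluster means of the raw features, rather than by inverse-transforming the K-means centroids; the two are not identical, because the mean of log-transformed values is not the log of the mean.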

In [None]:
# Define feature groups
# Columns of the raw feature table that are profiled; this list also fixes the
# column order of the centroid, population-statistics and z-score tables.
RAW_FEATURES = [
    'frequency',
    'monetary_total',
    'monetary_avg_item',
    'avg_items_per_order',
    'avg_installments',
    'pct_credit_card',
    'category_diversity',
    'is_positive_reviewer',
    'is_weekend_shopper'
]

# Compute cluster means in original scale
# One row per cluster id, one column per feature.
cluster_centroids = df.groupby('cluster')[RAW_FEATURES].mean()

print("Cluster Centroids (Original Scale):")
# Trailing expression: rendered as the cell output in the notebook.
cluster_centroids.round(2)

In [None]:
# Compute population statistics for comparison
# Rows are 'mean', 'median' and 'std'; columns are the features, computed over
# all customers in df regardless of cluster.
population_stats = df[RAW_FEATURES].agg(['mean', 'median', 'std'])

print("Population Statistics:")
# Trailing expression: rendered as the cell output in the notebook.
population_stats.round(2)

In [None]:
# Compute z-scores: how many std deviations each cluster is from the population mean
# This helps identify which features distinguish each cluster
pop_mean = population_stats.loc['mean']
pop_std = population_stats.loc['std']

# The Series are aligned to the centroid columns by feature name, so each
# cluster mean is compared against the spread of individual customers.
# NOTE(review): a feature with zero population std would yield inf/NaN here.
cluster_zscores = (cluster_centroids - pop_mean) / pop_std

print("Cluster Z-Scores (deviation from population mean in std units):")
# Trailing expression: rendered as the cell output in the notebook.
cluster_zscores.round(2)

## 4. Visualize Cluster Profiles

In [None]:
# Heatmap of z-scores
fig, ax = plt.subplots(figsize=(12, 7))
# Transposed so that features are rows and clusters are columns; the diverging
# colormap is centred on 0, i.e. on the population mean.
sns.heatmap(
    cluster_zscores.T, 
    annot=True, 
    cmap='RdBu_r', 
    center=0, 
    fmt='.2f',
    linewidths=0.5, 
    ax=ax, 
    cbar_kws={'label': 'Z-Score (std from population mean)'}
)
ax.set_title('Cluster Profiles: Deviation from Population Mean', fontsize=14)
ax.set_xlabel('Cluster', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
plt.tight_layout()
# Save before show() so the figure is written to disk as well as displayed.
plt.savefig(FIG_DIR / 'cluster_zscore_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"Saved: {FIG_DIR / 'cluster_zscore_heatmap.png'}")

In [None]:
# Radar chart for each cluster
from math import pi

def create_radar_chart(cluster_zscores, cluster_id, ax):
    """Draw one cluster's z-score profile as a closed outline on ``ax``.

    Args:
        cluster_zscores: Table of z-scores with one row per cluster and one
            column per feature.
        cluster_id: Index label of the row to draw.
        ax: Axes to draw on; the calling cell creates it with ``polar=True``.
    """
    feature_names = list(cluster_zscores.columns)
    spoke_count = len(feature_names)

    # One evenly spaced angle (in radians) per feature.
    spokes = [k / float(spoke_count) * 2 * pi for k in range(spoke_count)]
    profile = cluster_zscores.loc[cluster_id].values.tolist()

    # Repeat the first point at the end so the outline closes on itself.
    outline_angles = spokes + spokes[:1]
    outline_values = profile + profile[:1]

    ax.plot(outline_angles, outline_values, 'o-', linewidth=2, label=f'Cluster {cluster_id}')
    ax.fill(outline_angles, outline_values, alpha=0.25)

    # One tick per feature (the duplicated closing point gets no tick).
    ax.set_xticks(spokes)
    ax.set_xticklabels(feature_names, size=8)
    ax.set_title(f'Cluster {cluster_id}', fontsize=12, fontweight='bold')

    # Fixed radial range so all cluster charts share the same scale.
    ax.set_ylim(-2.5, 2.5)

# Create radar charts for all clusters
n_clusters = len(cluster_zscores)
n_cols = 4
n_rows = (n_clusters + n_cols - 1) // n_cols  # ceiling division: rows needed at n_cols charts per row

fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4*n_rows), 
                         subplot_kw=dict(polar=True))
axes = axes.flatten()  # address the grid as a flat sequence of axes

for i, cluster_id in enumerate(cluster_zscores.index):
    create_radar_chart(cluster_zscores, cluster_id, axes[i])

# Hide unused subplots
# Relies on `i` keeping its last value from the loop above, so at least one
# cluster must exist.
for j in range(i+1, len(axes)):
    axes[j].set_visible(False)

plt.suptitle('Cluster Behavioral Profiles (Z-Scores)', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig(FIG_DIR / 'cluster_radar_charts.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"Saved: {FIG_DIR / 'cluster_radar_charts.png'}")

## 5. Statistical Profile → Descriptive Labels

Map quantitative centroid values to human-readable behavioral descriptors.

In [None]:
def describe_feature(feature_name: str, value: float, zscore: float, pop_stats: pd.DataFrame) -> str:
    """
    Generate a descriptive label for a feature value.

    Known features get a formatted metric plus a behavior label chosen from
    feature-specific thresholds on the raw value or on the z-score. Any other
    feature falls back to ``str(value)`` plus a generic magnitude label
    derived from the z-score.

    Args:
        feature_name: Name of the feature
        value: Raw feature value (cluster centroid)
        zscore: Z-score relative to population
        pop_stats: Population statistics DataFrame. Not read by this function;
            kept so existing callers do not have to change.

    Returns:
        Human-readable description of the form "<metric> (<behavior>)"
    """
    # NOTE: an earlier version looked up the population mean here without using
    # it; that lookup raised KeyError for features absent from pop_stats and
    # made the generic fallback below unreachable.

    # Generic magnitude label, used only for features without a specific entry.
    if abs(zscore) < 0.5:
        magnitude = "average"
    elif abs(zscore) < 1.0:
        magnitude = "slightly " + ("above" if zscore > 0 else "below") + " average"
    elif abs(zscore) < 1.5:
        magnitude = "moderately " + ("high" if zscore > 0 else "low")
    elif abs(zscore) < 2.0:
        magnitude = "high" if zscore > 0 else "low"
    else:
        magnitude = "very " + ("high" if zscore > 0 else "low")

    # Feature-specific descriptions
    descriptions = {
        'frequency': {
            'metric': f"{value:.1f} orders",
            'behavior': "repeat buyer" if value > 1.5 else "one-time buyer" if value < 1.1 else "occasional repeat buyer"
        },
        'monetary_total': {
            'metric': f"R${value:.0f} lifetime",
            'behavior': "high-value" if zscore > 1 else "budget-conscious" if zscore < -1 else "moderate spender"
        },
        'monetary_avg_item': {
            'metric': f"R${value:.0f}/item",
            'behavior': "premium buyer" if zscore > 1 else "bargain hunter" if zscore < -1 else "mid-range buyer"
        },
        'avg_items_per_order': {
            'metric': f"{value:.1f} items/order",
            'behavior': "bulk buyer" if zscore > 1 else "single-item buyer" if zscore < -0.5 else "typical basket"
        },
        'avg_installments': {
            'metric': f"{value:.1f} installments",
            'behavior': "heavy financing" if value > 5 else "cash/single payment" if value < 1.5 else "moderate financing"
        },
        'pct_credit_card': {
            'metric': f"{value*100:.0f}% credit card",
            'behavior': "credit card exclusive" if value > 0.95 else "cash/boleto preference" if value < 0.3 else "mixed payment"
        },
        'category_diversity': {
            'metric': f"{value:.1f} categories",
            'behavior': "category explorer" if value > 1.5 else "category focused" if value <= 1 else "slight explorer"
        },
        'is_positive_reviewer': {
            'metric': f"{value*100:.0f}% positive",
            'behavior': "satisfied customer" if value > 0.85 else "critical reviewer" if value < 0.5 else "mixed satisfaction"
        },
        'is_weekend_shopper': {
            'metric': f"{value*100:.0f}% weekend",
            'behavior': "weekend shopper" if value > 0.5 else "weekday shopper" if value < 0.15 else "any-day shopper"
        }
    }

    desc = descriptions.get(feature_name, {'metric': str(value), 'behavior': magnitude})
    return f"{desc['metric']} ({desc['behavior']})"


# Test the function
# Smoke check on the cluster labelled 0 (assumes such a cluster exists).
print("Example descriptions for Cluster 0:")
for feature in RAW_FEATURES:
    val = cluster_centroids.loc[0, feature]
    z = cluster_zscores.loc[0, feature]
    print(f"  {feature}: {describe_feature(feature, val, z, population_stats)}")

In [None]:
def identify_distinguishing_features(cluster_id: int, zscores: pd.DataFrame, threshold: float = 0.75) -> dict:
    """
    Pick out the features on which a cluster deviates most from the population.

    Args:
        cluster_id: Index label of the cluster row in ``zscores``
        zscores: DataFrame of z-scores (clusters as rows, features as columns)
        threshold: A feature counts as distinguishing when its z-score is
            strictly above ``threshold`` or strictly below ``-threshold``

    Returns:
        Dict with
        - 'high' / 'low': feature names, strongest deviation first
        - 'high_zscores' / 'low_zscores': the same features mapped to their z-scores
    """
    row = zscores.loc[cluster_id]

    # Strongly positive features, largest z-score first.
    above = row.loc[row > threshold].sort_values(ascending=False)
    # Strongly negative features, most negative z-score first.
    below = row.loc[row < -threshold].sort_values(ascending=True)

    result = {}
    result['high'] = list(above.index)
    result['low'] = list(below.index)
    result['high_zscores'] = above.to_dict()
    result['low_zscores'] = below.to_dict()
    return result


# Identify distinguishing features for each cluster
# Maps cluster id -> result dict, using the function's default threshold.
distinguishing_features = {}
for cluster_id in cluster_zscores.index:
    distinguishing_features[cluster_id] = identify_distinguishing_features(cluster_id, cluster_zscores)
    print(f"\nCluster {cluster_id}:")
    print(f"  High: {distinguishing_features[cluster_id]['high']}")
    print(f"  Low: {distinguishing_features[cluster_id]['low']}")

## 6. Generate Cluster Summaries

In [None]:
def generate_cluster_summary(cluster_id: int,
                             centroids: pd.DataFrame,
                             zscores: pd.DataFrame,
                             pop_stats: pd.DataFrame,
                             cluster_sizes: pd.Series) -> dict:
    """
    Collect size, per-feature descriptions and distinguishing features for one cluster.

    Args:
        cluster_id: Index label of the cluster in ``centroids``, ``zscores`` and ``cluster_sizes``
        centroids: Per-cluster feature means
        zscores: Per-cluster z-scores of those means
        pop_stats: Population statistics, forwarded to ``describe_feature``
        cluster_sizes: Number of customers per cluster

    Returns:
        Dictionary with keys 'cluster_id', 'size', 'percentage', 'features',
        'distinguishing_high' and 'distinguishing_low'
    """
    row_values = centroids.loc[cluster_id]
    row_z = zscores.loc[cluster_id]
    n_members = cluster_sizes[cluster_id]
    # Share of this cluster among all customers counted in cluster_sizes.
    share = n_members / cluster_sizes.sum() * 100

    # Raw value, z-score and readable label for every profiled feature.
    per_feature = {
        name: {
            'value': row_values[name],
            'zscore': row_z[name],
            'description': describe_feature(name, row_values[name], row_z[name], pop_stats),
        }
        for name in RAW_FEATURES
    }

    standout = identify_distinguishing_features(cluster_id, zscores)

    return {
        'cluster_id': cluster_id,
        'size': int(n_members),
        'percentage': share,
        'features': per_feature,
        'distinguishing_high': standout['high'],
        'distinguishing_low': standout['low'],
    }


# Generate summaries for all clusters
# Maps cluster id -> summary dict; cluster_counts supplies the cluster sizes.
cluster_summaries = {}
for cluster_id in cluster_zscores.index:
    cluster_summaries[cluster_id] = generate_cluster_summary(
        cluster_id, cluster_centroids, cluster_zscores, population_stats, cluster_counts
    )

# Display summary for cluster 0 as example
# (assumes a cluster labelled 0 exists)
print("Example: Cluster 0 Summary")
print("=" * 50)
summary = cluster_summaries[0]
print(f"Size: {summary['size']:,} customers ({summary['percentage']:.1f}%)")
print(f"\nDistinguishing HIGH features: {summary['distinguishing_high']}")
print(f"Distinguishing LOW features: {summary['distinguishing_low']}")
print(f"\nFeature breakdown:")
for feature, desc in summary['features'].items():
    print(f"  {feature}: {desc['description']}")

In [None]:
# Create a summary table for all clusters
# Rows are built from the actual keys/values of cluster_summaries rather than
# from range(len(...)), so the table stays correct (and does not raise
# KeyError) when cluster ids are not exactly 0..n-1. Only the first three
# distinguishing features per direction are listed to keep the table compact.
summary_table = pd.DataFrame({
    'Cluster': list(cluster_summaries),
    'Size': [s['size'] for s in cluster_summaries.values()],
    'Pct': [f"{s['percentage']:.1f}%" for s in cluster_summaries.values()],
    'Distinguishing High': [', '.join(s['distinguishing_high'][:3]) for s in cluster_summaries.values()],
    'Distinguishing Low': [', '.join(s['distinguishing_low'][:3]) for s in cluster_summaries.values()]
})

print("Cluster Summary Table:")
# Trailing expression: rendered as the cell output in the notebook.
summary_table

## 7. Descriptive Labels → Natural-Language Personas

Transform statistical profiles into natural language persona descriptions suitable for LLM agent instantiation.

In [None]:
def infer_decision_heuristics(summary: dict) -> list:
    """
    Turn a cluster summary into first-person decision heuristics.

    Each heuristic is a hypothesis about WHY customers in the cluster behave
    as they do, selected by fixed thresholds on the cluster's feature values
    and z-scores. Within a theme (payment, spending, reviews, timing,
    frequency, categories) at most one sentence is emitted; a theme with no
    matching rule contributes nothing.

    Args:
        summary: Cluster summary whose 'features' entry maps each feature name
            to a dict with 'value' and 'zscore'

    Returns:
        List of heuristic sentences, in theme order
    """
    stats = summary['features']
    notes = []
    add = notes.append

    # -- Payment: installment count combined with credit-card share
    n_installments = stats['avg_installments']['value']
    card_share = stats['pct_credit_card']['value']

    if n_installments > 5:
        add("I evaluate purchases by monthly payment size, not total cost. Spreading payments makes expensive items accessible.")
    elif n_installments < 1.5 and card_share < 0.3:
        add("I avoid debt and prefer to pay upfront. If I can't afford it now, I'll wait or find an alternative.")
    elif n_installments < 2 and card_share > 0.9:
        add("I use credit cards for convenience and rewards, but pay off balances quickly.")

    # -- Spending: lifetime spend and item price relative to the population
    spend_z = stats['monetary_total']['zscore']
    item_price_z = stats['monetary_avg_item']['zscore']

    if spend_z > 1 and item_price_z > 1:
        add("Quality matters more than price. I'm willing to pay premium for better products.")
    elif spend_z < -0.5 and item_price_z < -0.5:
        add("I'm price-conscious and actively seek deals. I compare prices before purchasing.")
    elif item_price_z > 1 and stats['avg_items_per_order']['zscore'] < 0:
        add("I make deliberate, considered purchases. Each buy is a decision, not an impulse.")

    # -- Reviews: share of positive reviews
    positive_share = stats['is_positive_reviewer']['value']

    if positive_share > 0.9:
        add("I'm generally satisfied with my purchases and appreciate when things work as expected.")
    elif positive_share < 0.5:
        add("I have high standards and will voice concerns when products don't meet expectations.")
    elif 0.5 <= positive_share <= 0.7:
        add("I'm discerning but fair. I'll praise good experiences and critique poor ones.")

    # -- Shopping pattern: weekend share, then order frequency
    weekend_share = stats['is_weekend_shopper']['value']
    order_count = stats['frequency']['value']

    if weekend_share > 0.5:
        add("I shop during leisure time, often browsing before buying.")
    elif weekend_share < 0.15:
        add("I shop with purpose during the workweek, often for specific needs.")

    if order_count > 1.5:
        add("I'm comfortable with online shopping and return when I have a good experience.")
    elif order_count < 1.05:
        add("Online shopping is transactional for me—I buy what I need and move on.")

    # -- Categories: number of distinct categories bought
    n_categories = stats['category_diversity']['value']
    if n_categories > 1.5:
        add("I treat this marketplace as a one-stop shop for various needs.")
    elif n_categories <= 1:
        add("I come here for specific product types—I know what I'm looking for.")

    return notes


# Test on cluster 0
# (assumes a cluster labelled 0 exists in cluster_summaries)
print("Inferred heuristics for Cluster 0:")
for h in infer_decision_heuristics(cluster_summaries[0]):
    print(f"  • {h}")

In [None]:
def generate_persona_name(summary: dict) -> str:
    """
    Generate a descriptive persona name based on distinguishing features.

    The name is made of at most two descriptors followed by an archetype
    suffix. Descriptors are collected in priority order (spending level,
    payment style, review tendency, shopping pattern, basket behavior) and
    only the first two are kept; "Mainstream" is used when none apply.

    Args:
        summary: Cluster summary with 'features' (must contain the
            'monetary_total' z-score), 'distinguishing_high',
            'distinguishing_low' and 'cluster_id'

    Returns:
        Persona name such as "Premium Financing Shopper". Names are not
        guaranteed to be unique across clusters.
    """
    features = summary['features']
    high = summary['distinguishing_high']
    low = summary['distinguishing_low']

    # Name components based on strongest signals
    name_parts = []

    # Spending level
    monetary_z = features['monetary_total']['zscore']
    if monetary_z > 1.5:
        name_parts.append("Premium")
    elif monetary_z > 0.75:
        name_parts.append("High-Value")
    elif monetary_z < -0.75:
        name_parts.append("Budget")

    # Payment style
    if 'avg_installments' in high:
        name_parts.append("Financing")
    elif 'pct_credit_card' in low:
        name_parts.append("Cash")

    # Review tendency
    if 'is_positive_reviewer' in high:
        name_parts.append("Satisfied")
    elif 'is_positive_reviewer' in low:
        name_parts.append("Critical")

    # Shopping pattern
    if 'is_weekend_shopper' in high:
        name_parts.append("Weekend")
    elif 'frequency' in high:
        name_parts.append("Loyal")

    # Basket behavior
    if 'avg_items_per_order' in high:
        name_parts.append("Bulk")
    if 'category_diversity' in high:
        name_parts.append("Explorer")

    # Fallback
    if not name_parts:
        name_parts.append("Mainstream")

    # Add archetype suffix, rotated by cluster id purely for variety.
    suffixes = ["Shopper", "Buyer", "Customer"]
    # Cluster ids that went through pandas can arrive as floats (e.g. 2.0 when
    # the label column was upcast); a float cannot index a list, so coerce to
    # int before picking the suffix.
    suffix = suffixes[int(summary['cluster_id']) % len(suffixes)]

    return " ".join(name_parts[:2]) + " " + suffix


# Generate names for all clusters
# Preview only: the names are printed here and not stored by this cell.
print("Generated Persona Names:")
for cluster_id, summary in cluster_summaries.items():
    name = generate_persona_name(summary)
    print(f"  Cluster {cluster_id}: {name}")

In [None]:
def generate_persona_description(summary: dict) -> dict:
    """
    Build the full persona record for one cluster summary.

    Combines the generated persona name, the inferred decision heuristics and
    a list of plain-language behavioral statements into a markdown
    description, and returns it together with the structured data.

    Args:
        summary: Cluster summary with 'cluster_id', 'size', 'percentage' and
            'features' (each feature mapping to 'value' and 'zscore')

    Returns:
        Dictionary with keys 'cluster_id', 'persona_name',
        'description_markdown', 'behavioral_profile', 'decision_heuristics',
        'raw_statistics', 'size' and 'percentage'
    """
    cluster_id = summary['cluster_id']
    features = summary['features']

    persona_name = generate_persona_name(summary)
    heuristics = infer_decision_heuristics(summary)

    def sigma(name: str) -> str:
        """Format a feature's z-score for the table, e.g. '+1.2σ' or '-0.4σ'."""
        z = features[name]['zscore']
        prefix = '+' if z > 0 else ''
        return f"{prefix}{z:.1f}σ"

    # --- Frequency and spending -------------------------------------------
    freq = features['frequency']['value']
    monetary = features['monetary_total']['value']
    avg_item = features['monetary_avg_item']['value']

    profile = [
        f"Repeat customer with {freq:.1f} orders on average" if freq > 1.5
        else "Typically makes a single purchase",
        f"Average lifetime spend of R${monetary:.0f}",
        f"Typical item price around R${avg_item:.0f}",
    ]

    # --- Basket size ---------------------------------------------------------
    basket = features['avg_items_per_order']['value']
    profile.append(
        f"Buys multiple items per order ({basket:.1f} items on average)" if basket > 1.5
        else "Usually buys one item per order"
    )

    # --- Payment: method clause followed by installment clause --------------
    installments = features['avg_installments']['value']
    cc_pct = features['pct_credit_card']['value']

    if cc_pct > 0.9:
        method_clause = "Almost exclusively uses credit card"
    elif cc_pct < 0.2:
        method_clause = "Prefers boleto/debit over credit"
    else:
        method_clause = f"Uses credit card for {cc_pct*100:.0f}% of purchases"

    if installments > 4:
        terms_clause = f", typically in {installments:.0f} installments"
    elif installments < 1.5:
        terms_clause = ", usually paying in full"
    else:
        terms_clause = f", averaging {installments:.1f} installments"

    profile.append(method_clause + terms_clause)

    # --- Category breadth ----------------------------------------------------
    cat_div = features['category_diversity']['value']
    profile.append(
        f"Explores multiple product categories ({cat_div:.1f} on average)" if cat_div > 1.5
        else "Focused on specific product categories"
    )

    # --- Review behavior -----------------------------------------------------
    positive_pct = features['is_positive_reviewer']['value']
    if positive_pct > 0.9:
        review_line = "Highly satisfied—reviews are consistently positive"
    elif positive_pct > 0.7:
        review_line = "Generally satisfied, with occasional concerns"
    elif positive_pct > 0.5:
        review_line = "Mixed satisfaction—reviews reflect both positive and negative experiences"
    else:
        review_line = "Often critical in reviews—holds products to high standards"
    profile.append(review_line)

    # --- Shopping timing -----------------------------------------------------
    weekend_pct = features['is_weekend_shopper']['value']
    if weekend_pct > 0.5:
        timing_line = "Shops primarily on weekends"
    elif weekend_pct < 0.15:
        timing_line = "Shops primarily on weekdays"
    else:
        timing_line = "No strong weekday/weekend preference"
    profile.append(timing_line)

    # --- Markdown description ------------------------------------------------
    profile_sentence = "; ".join(profile) + "."
    heuristic_bullets = chr(10).join('- ' + h for h in heuristics)

    description = f"""
## {persona_name}

**Cluster {cluster_id}** | {summary['size']:,} customers ({summary['percentage']:.1f}% of population)

### Behavioral Profile
{profile_sentence}

### Decision Heuristics
{heuristic_bullets}

### Key Statistics
| Metric | Value | vs. Population |
|--------|-------|----------------|
| Lifetime Spend | R${monetary:.0f} | {sigma('monetary_total')} |
| Avg Item Price | R${avg_item:.0f} | {sigma('monetary_avg_item')} |
| Installments | {installments:.1f} | {sigma('avg_installments')} |
| Credit Card % | {cc_pct*100:.0f}% | {sigma('pct_credit_card')} |
| Positive Reviews | {positive_pct*100:.0f}% | {sigma('is_positive_reviewer')} |
""".strip()

    # Value and z-score for every profiled feature, without the text labels.
    raw_statistics = {
        feature: {
            'value': features[feature]['value'],
            'zscore': features[feature]['zscore'],
        }
        for feature in RAW_FEATURES
    }

    return {
        'cluster_id': cluster_id,
        'persona_name': persona_name,
        'description_markdown': description,
        'behavioral_profile': profile,
        'decision_heuristics': heuristics,
        'raw_statistics': raw_statistics,
        'size': summary['size'],
        'percentage': summary['percentage'],
    }


# Generate personas for all clusters
# Maps cluster id -> persona dict; later cells read this for display, prompt
# generation and export.
personas = {}
for cluster_id, summary in cluster_summaries.items():
    personas[cluster_id] = generate_persona_description(summary)

print(f"Generated {len(personas)} personas")

In [None]:
# Display all persona descriptions
from IPython.display import Markdown, display

# Render each persona's markdown description, followed by a horizontal rule.
for cluster_id, persona in personas.items():
    display(Markdown(persona['description_markdown']))
    display(Markdown("---"))

## 8. Generate Agent System Prompts

Create the system prompts that will be used to instantiate Claude agents in Phase 3.

In [None]:
def generate_agent_system_prompt(persona: dict) -> str:
    """
    Generate a system prompt for Claude agent instantiation.

    Args:
        persona: Persona dict with 'persona_name', 'percentage', 'size',
            'raw_statistics' (feature name -> {'value', 'zscore'}) and
            'decision_heuristics' (list of sentences)

    Returns:
        The prompt text with surrounding whitespace stripped
    """
    stats = persona['raw_statistics']

    # Shopping-timing label. The weekday cut-off is 0.15, the same value the
    # feature descriptions, decision heuristics and behavioral profile use, so
    # a persona is never "no strong preference" in its profile and
    # "weekday-oriented" in its prompt.
    weekend_share = stats['is_weekend_shopper']['value']
    if weekend_share > 0.5:
        timing = "weekend-oriented"
    elif weekend_share < 0.15:
        timing = "weekday-oriented"
    else:
        timing = "no strong preference"

    prompt = f'''You are simulating a customer from the "{persona['persona_name']}" behavioral segment.

## Context
- Brazilian e-commerce customer (Olist marketplace, 2016-2018)
- This persona represents {persona['percentage']:.1f}% of the customer base ({persona['size']:,} customers)

## Behavioral Profile
- Purchase frequency: {stats['frequency']['value']:.1f} orders (average)
- Average lifetime spend: R${stats['monetary_total']['value']:.0f}
- Typical item price: R${stats['monetary_avg_item']['value']:.0f}
- Basket size: {stats['avg_items_per_order']['value']:.1f} items per order
- Payment: {stats['pct_credit_card']['value']*100:.0f}% credit card, {stats['avg_installments']['value']:.1f} installments avg
- Category diversity: {stats['category_diversity']['value']:.1f} distinct categories
- Satisfaction: {stats['is_positive_reviewer']['value']*100:.0f}% positive reviews
- Shopping timing: {timing}

## Decision Heuristics
{chr(10).join("- " + h for h in persona['decision_heuristics'])}

## Instructions
When presented with product scenarios, purchasing decisions, or marketplace situations:

1. Respond as this customer persona would, based on the behavioral profile above
2. Your preferences should reflect:
   - The economic constraints implied by your spending patterns
   - The risk tolerance implied by your payment preferences
   - The satisfaction threshold implied by your review behavior
3. Stay in character throughout the conversation
4. When making decisions, briefly explain your reasoning in a way consistent with your persona

Do not break character or acknowledge that you are an AI simulating a customer.
'''

    return prompt.strip()


# Generate system prompts for all personas
# Maps cluster id -> prompt text; read again by the export cell.
agent_prompts = {}
for cluster_id, persona in personas.items():
    agent_prompts[cluster_id] = generate_agent_system_prompt(persona)

# Display example
# (assumes a cluster labelled 0 exists)
print("Example Agent System Prompt (Cluster 0):")
print("=" * 60)
print(agent_prompts[0])

## 9. Export Personas and Prompts

In [None]:
# Prepare export data
# Top-level layout: run metadata, population statistics per feature, and one
# entry per persona (filled in by the loop below).
export_data = {
    'metadata': {
        'n_clusters': len(personas),
        'total_customers': int(cluster_counts.sum()),
        'features_used': RAW_FEATURES,
        'generated_from': 'notebooks/04_persona_profiling.ipynb'
    },
    'population_statistics': {
        'mean': population_stats.loc['mean'].to_dict(),
        'median': population_stats.loc['median'].to_dict(),
        'std': population_stats.loc['std'].to_dict()
    },
    'personas': {}
}

# Cluster ids and statistics are cast to built-in int/float before
# serialisation. Note that JSON object keys are always strings, so the integer
# persona keys are written as "0", "1", ... and must be looked up as strings
# (or converted back) when the file is loaded.
for cluster_id, persona in personas.items():
    export_data['personas'][int(cluster_id)] = {
        'persona_name': persona['persona_name'],
        'size': persona['size'],
        'percentage': persona['percentage'],
        'behavioral_profile': persona['behavioral_profile'],
        'decision_heuristics': persona['decision_heuristics'],
        'raw_statistics': {
            k: {'value': float(v['value']), 'zscore': float(v['zscore'])}
            for k, v in persona['raw_statistics'].items()
        },
        'agent_system_prompt': agent_prompts[cluster_id]
    }

# Save to JSON
with open(OUTPUT_DIR / 'personas.json', 'w') as f:
    json.dump(export_data, f, indent=2)

print(f"Saved: {OUTPUT_DIR / 'personas.json'}")

In [None]:
# Save cluster centroids (original scale)
# The cluster id index is written as the first CSV column.
cluster_centroids.to_csv(OUTPUT_DIR / 'cluster_centroids.csv')
print(f"Saved: {OUTPUT_DIR / 'cluster_centroids.csv'}")

# Save cluster z-scores
# Same layout as the centroids file: one row per cluster, one column per feature.
cluster_zscores.to_csv(OUTPUT_DIR / 'cluster_zscores.csv')
print(f"Saved: {OUTPUT_DIR / 'cluster_zscores.csv'}")

In [None]:
# Save persona descriptions as markdown for easy reading
# The encoding is set explicitly: every description contains non-ASCII
# characters (the "σ" in the statistics table, and em dashes in some
# sentences), and without it open() falls back to the locale's default
# encoding, which raises UnicodeEncodeError on systems where that default
# cannot represent them.
with open(OUTPUT_DIR / 'persona_descriptions.md', 'w', encoding='utf-8') as f:
    f.write("# Customer Personas\n\n")
    f.write(f"Generated from K-means clustering with {len(personas)} clusters.\n\n")
    f.write("---\n\n")
    
    # One section per persona, each followed by a horizontal rule.
    for cluster_id, persona in personas.items():
        f.write(persona['description_markdown'])
        f.write("\n\n---\n\n")

print(f"Saved: {OUTPUT_DIR / 'persona_descriptions.md'}")

## 10. Summary

In [None]:
# Closing banner.
print("\n".join([
    "\n" + "=" * 60,
    "PERSONA PROFILING COMPLETE",
    "=" * 60,
]))

# One line per persona: id, name, size and share of the customer base.
print(f"\nGenerated {len(personas)} customer personas:")
for cluster_id, persona in personas.items():
    print(
        f"  Cluster {cluster_id}: {persona['persona_name']} "
        f"({persona['size']:,} customers, {persona['percentage']:.1f}%)"
    )

# Artifacts written by the export cells.
print("\n".join([
    "\nOutput files:",
    "  1. personas.json - Complete persona data with agent prompts",
    "  2. cluster_centroids.csv - Centroids in original scale",
    "  3. cluster_zscores.csv - Cluster deviations from population",
    "  4. persona_descriptions.md - Human-readable persona descriptions",
]))

# Figures written by the visualization cells.
print("\n".join([
    "\nFigures:",
    "  1. cluster_zscore_heatmap.png",
    "  2. cluster_radar_charts.png",
]))

# Pointers for the follow-up phase.
print("\n".join([
    "\nNext Steps (Phase 3):",
    "  1. Load personas.json",
    "  2. Instantiate Claude agents using agent_system_prompt",
    "  3. Design product scenarios for testing",
    "  4. Run simulations and compare responses across personas",
]))