# Character Frequency Analysis

Comprehensive analysis of character appearances, frequencies, and interaction patterns.

## Analysis Components:
1. **Character Frequency**: How often each character appears and speaks
2. **Character Types**: Main, recurring, guest, and minor characters
3. **Co-occurrence Patterns**: Which characters appear together
4. **Network Analysis**: Character interaction networks
5. **Timeline Analysis**: Character appearance patterns over episodes

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import networkx as nx
from IPython.display import display, HTML, Markdown

# Add src to path
# The relative '../src' directory is resolved to an absolute path and appended
# to sys.path; this must happen before the project import below.
sys.path.append(str(Path('../src').resolve()))

# Import character frequency analyzer
# Project-local module; the import relies on the sys.path entry added above.
from analysis.character_frequency_analyzer import CharacterFrequencyAnalyzer

# Set style
# NOTE(review): the palette is set after the style sheet is applied; presumably
# the order matters for the resulting color cycle - keep it unless verified.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Visible confirmation that the setup cell ran to completion.
print("Setup complete!")

## 1. Initialize Analyzer and Load Data

In [None]:
# Build the analyzer on top of the original dataset directory
analyzer = CharacterFrequencyAnalyzer(data_dir=Path('../data/original'))

# Mapping of character type -> characters of that type
char_types = analyzer.identify_character_types()

banner = "=" * 50
print("Character Database Overview:")
print(banner)
print(f"Total unique characters: {len(analyzer.character_profiles)}")
print(f"Total episodes analyzed: {len(analyzer.episodes)}")
print("\nCharacter Type Breakdown:")
for type_name, members in char_types.items():
    print(f"  {type_name.capitalize()}: {len(members)} characters")
    # Only a non-empty 'main' group gets a short sample of names
    if type_name != 'main' or not members:
        continue
    print(f"    Examples: {', '.join(members[:5])}")

## 2. Character Frequency Distributions

In [None]:
# Let the analyzer draw its distribution plots, limited to the top 20 characters
fig = analyzer.plot_character_distributions(top_n=20)

# Headline numbers computed from the per-character profiles
profiles = list(analyzer.character_profiles.values())
frequent_count = sum(1 for profile in profiles if profile.episode_count >= 10)
one_off_count = sum(1 for profile in profiles if profile.episode_count == 1)

print("\nKey Statistics:")
print(f"• Characters appearing in 10+ episodes: {frequent_count}")
print(f"• Characters appearing in only 1 episode: {one_off_count}")
print(f"• Average sentences per character: {np.mean([profile.total_sentences for profile in profiles]):.1f}")
print(f"• Max sentences by one character: {max(profile.total_sentences for profile in profiles)}")

## 3. Top Characters Analysis

In [None]:
# Top-15 (name, value) rankings, one list per metric, fetched in a fixed order
by_sentences, by_episodes, by_intensity = (
    analyzer.get_character_rankings(metric)[:15]
    for metric in ('total_sentences', 'episode_count', 'avg_sentences')
)

# Side-by-side table: one column per metric, each cell is "name (value)"
ranking_columns = {}
ranking_columns['By Total Sentences'] = [f"{who} ({value})" for who, value in by_sentences]
ranking_columns['By Episode Count'] = [f"{who} ({value})" for who, value in by_episodes]
ranking_columns['By Avg Sentences/Episode'] = [f"{who} ({value:.1f})" for who, value in by_intensity]
rankings_df = pd.DataFrame(ranking_columns)

display(HTML("<h3>Character Rankings by Different Metrics</h3>"))
left_aligned = rankings_df.head(10).style.set_properties(**{'text-align': 'left'})
display(left_aligned)

# CSI team lookup: a profile qualifies when its key contains one of the
# keywords (case-insensitive substring match) and it has at least 5 episodes
csi_team_keywords = ['grissom', 'sara', 'nick', 'warrick', 'catherine', 'greg', 'brass']
csi_team = [
    (profile.name, profile.episode_count, profile.total_sentences)
    for key, profile in analyzer.character_profiles.items()
    if any(keyword in key.lower() for keyword in csi_team_keywords)
    and profile.episode_count >= 5
]

if csi_team:
    print("\nIdentified CSI Team Members:")
    # Most talkative first (third tuple field is the sentence total)
    for name, eps, sents in sorted(csi_team, key=lambda member: member[2], reverse=True):
        print(f"  {name}: {eps} episodes, {sents} sentences")

## 4. Character Co-occurrence Network

In [None]:
# Co-occurrence statistics from the analyzer.
# NOTE(review): min_weight=2 presumably drops pairs seen together fewer than
# two times - confirm against the analyzer implementation.
co_occur = analyzer.analyze_co_occurrences(min_weight=2)


def _display_name(char_key):
    """Return the ``name`` of the profile registered under *char_key*."""
    return analyzer.character_profiles[char_key].name


# The result is only reported when it carries no 'error' key
if 'error' not in co_occur:
    print("Co-occurrence Network Statistics:")
    print("=" * 50)
    print(f"Network nodes: {co_occur['nodes']}")
    print(f"Network edges: {co_occur['edges']}")
    print(f"Network density: {co_occur['density']:.3f}")
    print(f"Average clustering: {co_occur['avg_clustering']:.3f}")
    print(f"Connected components: {co_occur['connected_components']}")

    # Up to eight (character key, centrality) entries
    if 'most_connected' in co_occur:
        print("\nMost Connected Characters (Hub Characters):")
        for hub_key, score in co_occur['most_connected'][:8]:
            print(f"  {_display_name(hub_key)}: {score:.3f}")

    # Up to ten (character key, character key, weight) entries
    if 'frequent_pairs' in co_occur:
        print("\nMost Frequent Character Pairs:")
        for left_key, right_key, weight in co_occur['frequent_pairs'][:10]:
            print(f"  {_display_name(left_key)} & {_display_name(right_key)}: {weight} episodes together")

In [None]:
# Draw the co-occurrence network via the analyzer (min_weight=3 for the plot)
fig = analyzer.plot_co_occurrence_network(min_weight=3)

# The reading guide is only printed when the analyzer returned a truthy figure
if fig:
    reading_guide = (
        "\nNetwork Visualization Notes:",
        "• Node size = character speaking frequency",
        "• Node color: Gold=Main, Blue=Recurring, Gray=Guest/Minor",
        "• Edge thickness = frequency of co-appearance",
        "• Closer nodes = more frequent co-appearances",
    )
    for note in reading_guide:
        print(note)

## 5. Character Timeline Analysis

In [None]:
# Analyze timeline for main characters (at most the first five)
main_chars = char_types['main'][:5] if 'main' in char_types else []

if main_chars:
    # squeeze=False always returns a 2-D array of axes, so a single main
    # character needs no special-casing.
    fig, axes = plt.subplots(len(main_chars), 1, figsize=(14, 3*len(main_chars)),
                             squeeze=False)

    for ax, char_name in zip(axes[:, 0], main_chars):
        # Get timeline
        timeline = analyzer.get_character_timeline(char_name)

        # Extract episode numbers and sentence counts as two aligned lists
        episodes = []
        sentences = []
        if 'error' not in timeline and 'episode_details' in timeline:
            for detail in timeline['episode_details']:
                ep = detail['episode']
                if 'e' not in ep:
                    continue
                try:
                    # The episode number is the text between the first and
                    # second 'e' of the episode id.
                    ep_num = int(ep.split('e')[1])
                    n_sentences = detail['sentences']
                except (ValueError, KeyError):
                    # Best effort: skip ids without a numeric part and entries
                    # lacking a sentence count. Both values are read before
                    # either list is touched, so the lists stay aligned.
                    continue
                episodes.append(ep_num)
                sentences.append(n_sentences)

        if not episodes:
            # Label the panel instead of leaving a blank, untitled axes behind
            ax.set_title(f'{char_name}: no timeline data available')
            ax.axis('off')
            continue

        ax.bar(episodes, sentences, color='steelblue', alpha=0.7)
        ax.set_xlabel('Episode Number')
        ax.set_ylabel('Sentences')
        ax.set_title(f'{timeline["character"]} Speaking Pattern ({timeline["total_episodes"]} episodes, {timeline["total_sentences"]} total sentences)')
        ax.grid(True, alpha=0.3)

        # Add average line
        avg_sentences = np.mean(sentences)
        ax.axhline(y=avg_sentences, color='red', linestyle='--', alpha=0.5,
                   label=f'Average: {avg_sentences:.1f}')
        ax.legend()

    plt.suptitle('Main Character Speaking Patterns Across Episodes', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()
else:
    print("No main characters identified for timeline analysis")

## 6. Character Similarity Analysis

In [None]:
# Calculate character similarity based on co-occurrence and cluster the
# characters hierarchically.
from scipy.cluster.hierarchy import dendrogram, linkage

# Keep only characters with at least 5 episodes
significant_chars = [
    name for name, profile in analyzer.character_profiles.items()
    if profile.episode_count >= 5
]

if len(significant_chars) >= 10:
    n_chars = len(significant_chars)
    sig_profiles = [analyzer.character_profiles[name] for name in significant_chars]

    # Similarity = co-appearances / episodes of the less frequent character.
    # Only the upper triangle feeds the clustering, so each pair is computed
    # once and mirrored; the diagonal is 1 (self-similarity).
    similarity_matrix = np.eye(n_chars)
    upper = np.triu_indices(n_chars, k=1)
    for i, j in zip(*upper):
        co_count = sig_profiles[i].co_appearances.get(significant_chars[j], 0)
        min_eps = min(sig_profiles[i].episode_count, sig_profiles[j].episode_count)
        rate = co_count / max(1, min_eps)
        similarity_matrix[i, j] = rate
        similarity_matrix[j, i] = rate

    # Convert similarity to distance; clip so a rate above 1 can never
    # produce a negative distance.
    distance_matrix = np.clip(1 - similarity_matrix, 0.0, 1.0)

    # Perform hierarchical clustering on the condensed (upper-triangle)
    # distances. Average linkage is used because this distance is not
    # Euclidean; SciPy documents Ward linkage as correctly defined only for
    # Euclidean distances.
    linkage_matrix = linkage(distance_matrix[upper], method='average')

    # Create dendrogram labelled with the profiles' display names
    fig, ax = plt.subplots(figsize=(12, 8))
    char_labels = [profile.name for profile in sig_profiles]
    dendrogram(linkage_matrix, labels=char_labels, ax=ax, orientation='right')

    ax.set_xlabel('Distance (1 - Co-appearance Rate)')
    ax.set_title('Character Clustering Based on Co-appearances', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("Dendrogram Interpretation:")
    print("• Characters that frequently appear together cluster together")
    print("• Shorter branches = more similar co-appearance patterns")
    print("• Distinct clusters may represent different storylines or teams")
else:
    print("Not enough significant characters for clustering analysis")

## 7. Episode Character Diversity

In [None]:
# Analyze character diversity per episode: one record per episode with its
# character count, sentence total and sentences-per-character ratio.
episode_diversity = [
    {
        'episode': ep_id,
        'character_count': stats.character_count,
        'total_sentences': stats.total_sentences,
        # max(1, ...) avoids dividing by zero for an episode without characters
        'avg_sentences_per_char': stats.total_sentences / max(1, stats.character_count),
    }
    for ep_id, stats in analyzer.episode_stats.items()
]

diversity_df = pd.DataFrame(episode_diversity)

if diversity_df.empty:
    # Without episode statistics the frame has no columns; report it instead
    # of failing with a KeyError further down.
    print("No episode statistics available for diversity analysis")
else:
    char_counts = diversity_df['character_count']
    sentence_totals = diversity_df['total_sentences']

    # Visualize episode diversity
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    # The two histogram panels share their layout: (column, color, x label, title)
    histogram_panels = [
        ('character_count', 'green', 'Number of Characters', 'Characters per Episode Distribution'),
        ('total_sentences', 'blue', 'Total Sentences', 'Dialogue Density Distribution'),
    ]
    for ax, (column, color, xlabel, title) in zip(axes[:2], histogram_panels):
        values = diversity_df[column]
        mean_value = values.mean()
        ax.hist(values, bins=15, edgecolor='black', alpha=0.7, color=color)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of Episodes')
        ax.set_title(title)
        ax.axvline(x=mean_value, color='red', linestyle='--',
                   label=f'Mean: {mean_value:.1f}')
        ax.legend()

    # Relationship between characters and sentences
    ax = axes[2]
    ax.scatter(char_counts, sentence_totals, alpha=0.6, s=50)
    ax.set_xlabel('Number of Characters')
    ax.set_ylabel('Total Sentences')
    ax.set_title('Characters vs Dialogue Volume')

    # Add trend line. A degree-1 fit needs at least two distinct x values;
    # otherwise the least-squares problem is rank-deficient.
    if char_counts.nunique() >= 2:
        z = np.polyfit(char_counts, sentence_totals, 1)
        p = np.poly1d(z)
        x_trend = np.linspace(char_counts.min(), char_counts.max(), 100)
        # Render the intercept with an explicit sign so a negative value
        # reads "- 3.0" rather than "+ -3.0".
        intercept_sign = '+' if z[1] >= 0 else '-'
        ax.plot(x_trend, p(x_trend), 'r--', alpha=0.5,
                label=f'Trend: {z[0]:.1f}x {intercept_sign} {abs(z[1]):.1f}')
        ax.legend()
    ax.grid(True, alpha=0.3)

    plt.suptitle('Episode Character Diversity Analysis', fontsize=14, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.show()

    # Episode ids of the rows holding the maxima
    most_crowded = diversity_df.loc[char_counts.idxmax(), 'episode']
    most_dialogue = diversity_df.loc[sentence_totals.idxmax(), 'episode']

    print("\nEpisode Diversity Statistics:")
    print(f"• Average characters per episode: {char_counts.mean():.1f}")
    print(f"• Average sentences per episode: {sentence_totals.mean():.1f}")
    print(f"• Average sentences per character: {diversity_df['avg_sentences_per_char'].mean():.1f}")
    print(f"• Most crowded episode: {most_crowded} ({char_counts.max()} characters)")
    print(f"• Most dialogue: {most_dialogue} ({sentence_totals.max()} sentences)")

## 8. Character Communities Detection

In [None]:
# Report the communities stored in the co-occurrence result, if any
if 'communities' not in co_occur or not co_occur['communities']:
    print("No significant character communities detected")
else:
    print("Detected Character Communities:")
    print("=" * 50)

    # At most five communities are listed, numbered from 1
    for number, community in enumerate(co_occur['communities'][:5], 1):
        print(f"\nCommunity {number} ({community['size']} members):")
        members = community['members']
        # Show the first 10 members, then summarize how many were left out
        for member in members[:10]:
            print(f"  • {member}")
        hidden = len(members) - 10
        if hidden > 0:
            print(f"  ... and {hidden} more")

    interpretation = (
        "\nCommunity Interpretation:",
        "• Communities represent groups of characters that frequently appear together",
        "• May correspond to different storylines, teams, or social groups",
        "• Larger communities indicate core character groups",
    )
    for line in interpretation:
        print(line)

## 9. Summary Report

In [None]:
# Generate and save comprehensive report
output_dir = Path('../experiments/character_analysis')
output_dir.mkdir(parents=True, exist_ok=True)

report = analyzer.generate_report(save_path=output_dir / 'character_frequency.json')

overview = report['overview']
ep_summary = report['episode_statistics']

# Numbered list of the five characters with the most sentences. Explicit
# ranks keep the saved raw Markdown readable (previously every item was "1.").
top_lines = "".join(
    f"{rank}. **{name}**: {count} sentences\n"
    for rank, (name, count) in enumerate(report['top_characters']['by_sentences'][:5], start=1)
)

# Show 'N/A' for a missing density, matching the other network fields,
# instead of a misleading 0.000.
density = co_occur.get('density')
density_text = f"{density:.3f}" if density is not None else 'N/A'

# Create summary markdown
summary_md = f"""
# Character Frequency Analysis Summary

## Dataset Overview
- **Total Unique Characters**: {overview['total_characters']}
- **Total Episodes**: {overview['total_episodes']}
- **Main Characters**: {overview['main_characters']}
- **Recurring Characters**: {overview['recurring_characters']}

## Episode Statistics
- **Avg Characters per Episode**: {ep_summary['avg_characters_per_episode']:.1f}
- **Character Range**: {ep_summary['min_characters']}-{ep_summary['max_characters']}
- **Avg Sentences per Episode**: {ep_summary['avg_sentences_per_episode']:.1f}

## Top 5 Characters by Total Sentences
"""

summary_md += top_lines

summary_md += f"""

## Network Analysis
- **Network Nodes**: {co_occur.get('nodes', 'N/A')}
- **Network Edges**: {co_occur.get('edges', 'N/A')}
- **Network Density**: {density_text}
- **Connected Components**: {co_occur.get('connected_components', 'N/A')}

## Implications for Character Embeddings
1. **Main characters** provide most training data ({overview['main_characters']} characters)
2. **Character communities** suggest natural embedding clusters
3. **Co-occurrence patterns** will influence embedding similarity
4. **Episode diversity** ({ep_summary['avg_characters_per_episode']:.1f} chars/episode) ensures varied contexts
"""

display(Markdown(summary_md))

# Save summary. The encoding is explicit because character names may contain
# non-ASCII text and the platform default encoding is not guaranteed to cope.
with open(output_dir / 'character_summary.md', 'w', encoding='utf-8') as f:
    f.write(summary_md)

print(f"\nResults saved to {output_dir}")
print("Files created:")
# glob() yields entries in arbitrary order; sort for a stable listing
for file in sorted(output_dir.glob('*')):
    print(f"  • {file.name}")