# Exploratory Data Analysis
## 18th-Century Novel Corpus for Burney Attribution

This notebook provides an overview of the corpus assembled for training the Burney authorship attribution model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import re

# Plot appearance: seaborn grid theme plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Read the metadata table that the cells below refer to as `metadata`
metadata = pd.read_csv('../data/metadata.csv')
n_loaded = len(metadata)
print(f"Loaded metadata for {n_loaded} texts")

# Kept as the final expression so the notebook renders the preview as a table
metadata.head()

## Corpus Overview

In [None]:
# Headline corpus figures: gather the values first, then print the banner
banner_rule = "=" * 60
word_counts = metadata['word_count']
n_works = metadata.groupby(['author', 'title']).ngroups  # distinct (author, title) pairs
first_year = metadata['year'].min()
last_year = metadata['year'].max()

print(banner_rule)
print("CORPUS STATISTICS")
print(banner_rule)
print(f"\nTotal texts: {len(metadata)}")
print(f"Total words: {word_counts.sum():,}")
print(f"Unique authors: {metadata['author'].nunique()}")
print(f"Unique works: {n_works}")
print(f"\nDate range: {first_year} - {last_year}")
print(f"\nAverage words per text: {word_counts.mean():,.0f}")
print(f"Median words per text: {word_counts.median():,.0f}")

## Distribution by Author

In [None]:
# Words per author: summed word count and number of texts, largest total first
author_words = metadata.groupby('author')['word_count'].agg(['sum', 'count'])
author_words.columns = ['total_words', 'num_texts']
author_words = author_words.sort_values('total_words', ascending=False)

print("\nWords by Author:")
print(author_words.to_string())

# Visualization: two horizontal bar charts side by side, one bar per author
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Total words by author (x ticks are rendered in thousands, e.g. "250K")
author_words['total_words'].plot(kind='barh', ax=ax1, color='steelblue')
ax1.set_xlabel('Total Words')
ax1.set_title('Total Words by Author')
ax1.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x/1000)}K'))

# Number of texts by author
author_words['num_texts'].plot(kind='barh', ax=ax2, color='coral')
ax2.set_xlabel('Number of Texts')
ax2.set_title('Number of Texts by Author')

plt.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so this cell also runs on a fresh checkout.
figure_path = Path('../outputs/figures/author_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

# Class balance analysis: each author's share of the total word count
print(f"\n{'='*60}")
print("CLASS BALANCE ANALYSIS")
print(f"{'='*60}")
total_words = author_words['total_words'].sum()
author_words['percentage'] = (author_words['total_words'] / total_words * 100).round(1)
print("\nPercentage of corpus by author:")
print(author_words[['percentage']].to_string())
# The lookup below requires an author labelled exactly 'burney' in the index
print(f"\nBurney represents {author_words.loc['burney', 'percentage']:.1f}% of the corpus")

## Temporal Distribution

In [None]:
# Aggregate multi-volume works for this analysis: one row per
# (author, title, year) with the word counts of its files summed.
# NOTE(review): volumes of one title are merged only if they carry the same
# `year` value — confirm against the metadata.
works = metadata.groupby(['author', 'title', 'year']).agg({
    'word_count': 'sum'
}).reset_index()

# Timeline plot
fig, ax = plt.subplots(figsize=(14, 6))

# Color by author: one palette entry per distinct author
authors = works['author'].unique()
colors = sns.color_palette('tab10', len(authors))
author_colors = dict(zip(authors, colors))

# One scatter call per author so each author gets its own legend entry
for author in authors:
    author_data = works[works['author'] == author]
    ax.scatter(author_data['year'], 
              author_data['word_count'], 
              s=author_data['word_count']/500,  # Size by word count
              alpha=0.6,
              label=author.title(),
              color=author_colors[author])

ax.set_xlabel('Publication Year')
ax.set_ylabel('Word Count')
ax.set_title('Corpus Timeline (bubble size = word count)')
ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x/1000)}K'))
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')  # legend outside the axes
ax.grid(True, alpha=0.3)

plt.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so this cell also runs on a fresh checkout.
figure_path = Path('../outputs/figures/temporal_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

# Burney's career trajectory: her aggregated works in chronological order
burney_works = works[works['author'] == 'burney'].sort_values('year')
print("\nBurney's Career:")
print(burney_works[['title', 'year', 'word_count']].to_string(index=False))

## Text Length Distribution

In [None]:
# Distribution of text lengths: overall histogram (left) and per-author box plot (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Histogram of word counts over all texts (x ticks in thousands)
metadata['word_count'].hist(bins=20, ax=ax1, color='steelblue', edgecolor='black')
ax1.set_xlabel('Word Count')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Text Lengths')
ax1.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x/1000)}K'))

# Box plot by author; labels and title are set after the call so they
# replace whatever the plotting helper put on the axes
metadata.boxplot(column='word_count', by='author', ax=ax2)
ax2.set_xlabel('Author')
ax2.set_ylabel('Word Count')
ax2.set_title('Text Length by Author')
ax2.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'{int(x/1000)}K'))
plt.suptitle('')  # Remove automatic title

plt.tight_layout()

# savefig does not create missing directories; make sure the target folder
# exists so this cell also runs on a fresh checkout.
figure_path = Path('../outputs/figures/length_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()

## Sample Text Inspection

In [None]:
# Load and display samples from each author
def show_sample(author, n_words=200):
    """Print the opening words of the first text listed for ``author``.

    The first ``metadata`` row whose ``author`` column equals ``author`` is
    used; its ``file_path`` is resolved under ``../data/processed``. The
    file's first ``n_words`` whitespace-separated tokens are joined with
    single spaces and printed beneath a banner naming the author, title and
    year, followed by a trailing ``...``.
    """
    first_entry = metadata.loc[metadata['author'] == author].iloc[0]
    source = Path('../data/processed') / first_entry['file_path']

    # Only the leading tokens are kept; the rest of the file is discarded
    with open(source, 'r', encoding='utf-8') as handle:
        opening_tokens = handle.read().split()[:n_words]
    excerpt = ' '.join(opening_tokens)

    rule = '=' * 60
    print(f"\n{rule}")
    print(f"{author.upper()}: {first_entry['title']} ({first_entry['year']})")
    print(f"{rule}")
    print(excerpt + "...")

# Print an excerpt for every author, visiting the names in sorted order
author_names = sorted(metadata['author'].unique())
for author in author_names:
    show_sample(author, n_words=150)

## Data Quality Check

In [None]:
def check_text_quality(file_path, max_special_ratio=0.01, min_words=20000):
    """Check a text file for potential quality issues.

    Three checks are run on the file's full contents:

    * the word "gutenberg" appears anywhere (case-insensitive);
    * the share of characters outside ASCII letters, digits, whitespace and
      the punctuation ``.,;:!?'"-`` exceeds ``max_special_ratio``;
    * the number of whitespace-separated words is below ``min_words``.

    Args:
        file_path: Path of the UTF-8 text file to inspect.
        max_special_ratio: Highest tolerated ratio of special characters to
            total characters before an issue is reported.
        min_words: Smallest word count that is not reported as a short text.

    Returns:
        A list of human-readable issue strings; empty when nothing was found.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    issues = []

    # Check for remaining Gutenberg markers
    if 'gutenberg' in text.lower():
        issues.append('Contains Gutenberg text')

    # Check for excessive special characters (OCR errors).
    # An empty file has no characters to take a ratio over, so treat its
    # ratio as zero instead of dividing by zero.
    special_chars = re.findall(r'[^a-zA-Z0-9\s.,;:!?\'"\-]', text)
    special_char_ratio = len(special_chars) / len(text) if text else 0.0
    if special_char_ratio > max_special_ratio:
        issues.append(f'High special char ratio: {special_char_ratio:.2%}')

    # Check for very short texts (possible incomplete); an empty file is
    # reported here with a word count of 0
    word_count = len(text.split())
    if word_count < min_words:
        issues.append(f'Short text: {word_count:,} words')

    return issues

# Check all texts
print("Data Quality Report:")
print("=" * 60)

issue_count = 0  # number of texts with at least one reported issue
for _, row in metadata.iterrows():
    file_path = Path('../data/processed') / row['file_path']
    try:
        issues = check_text_quality(file_path)
    except (OSError, UnicodeDecodeError) as exc:
        # A missing or undecodable file is itself a quality problem: list it
        # with the others instead of aborting the whole report.
        issues = [f'Could not read file: {exc}']
    
    if issues:
        issue_count += 1
        print(f"\n{row['author']}/{row['title']}:")
        for issue in issues:
            print(f"  ⚠ {issue}")

if issue_count == 0:
    print("\n✓ No significant quality issues detected")
else:
    print(f"\n{issue_count} texts with potential issues (review recommended)")

## Summary Statistics Table

In [None]:
# Create comprehensive summary table: one row per author. This frame stays
# numeric so that the CSV written below contains real numbers.
summary = metadata.groupby('author').agg({
    'title': 'count',
    'word_count': ['sum', 'mean', 'min', 'max'],
    'year': ['min', 'max']
})

# Flatten the two-level column index; the order matches the agg spec above
summary.columns = ['Texts', 'Total Words', 'Avg Words', 'Min Words', 'Max Words', 'First Year', 'Last Year']
summary = summary.sort_values('Total Words', ascending=False)

# Format numbers on a display-only copy: thousands-separated strings read
# well on screen but would turn the saved columns into text.
summary_display = summary.copy()
for col in ['Total Words', 'Avg Words', 'Min Words', 'Max Words']:
    summary_display[col] = summary_display[col].apply(lambda x: f"{int(x):,}")

print("\nComprehensive Summary by Author:")
print(summary_display.to_string())

# Save to CSV (numeric values); create the output folder first because
# to_csv does not create missing directories.
summary_path = Path('../outputs/corpus_summary.csv')
summary_path.parent.mkdir(parents=True, exist_ok=True)
summary.to_csv(summary_path)
print("\n✓ Summary saved to outputs/corpus_summary.csv")

## Conclusions

### Corpus Strengths
- Substantial Burney corpus (1.1M words across 4 major works)
- Good temporal coverage (1740-1814)
- Mix of male and female authors
- Contemporary comparison authors in similar genres

### Considerations for Model Training
1. **Class imbalance**: Burney represents ~44% of the corpus. We may need to:
   - Use stratified sampling
   - Apply class weights during training
   - Consider data augmentation for minority classes

2. **Text length variation**: Some texts are much shorter than others (Edgeworth, Burney volumes). We should:
   - Split longer texts into comparable chunks
   - Ensure train/val/test split by work (not by chunk)

3. **Multi-volume works**: Burney's Cecilia and The Wanderer span multiple files
   - Treat as single works for splitting
   - Consider whether to train on individual volumes or on the concatenated work

4. **Temporal span**: 74-year range
   - Language evolution may affect results
   - Consider temporal stratification in experiments