# Exploratory Data Analysis - Child Speech Recognition Challenge

This notebook explores the training data to understand:
- Audio duration distributions
- Age bucket distributions
- Transcription length statistics
- Potential outliers
- Audio characteristics via spectrograms

In [None]:
import json
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn's white-grid theme and a 12x6 inch default figure
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Training Data

In [None]:
# Load training manifest (one JSON object per line)
data_dir = Path('../data')
manifest_path = data_dir / 'train_word_transcripts.jsonl'

# Read JSONL file. Blank lines (for example a trailing newline at the end of
# the file) are skipped, because json.loads('') raises JSONDecodeError.
# json.loads ignores surrounding whitespace, so no explicit strip is needed
# on the lines that are parsed.
with open(manifest_path, 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Convert to DataFrame: one row per manifest record
df = pd.DataFrame(records)

print(f"Total samples: {len(df):,}")
print(f"\nDataset shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
df.head()

In [None]:
# Headline counts, followed by a per-column tally of missing values
print("Dataset Summary:")
id_cardinality = df[['child_id', 'session_id']].nunique()
print(f"Total utterances: {df.shape[0]:,}")
print(f"Unique children: {id_cardinality['child_id']:,}")
print(f"Unique sessions: {id_cardinality['session_id']:,}")
print("\nMissing values:")
print(df.isna().sum())

## 2. Audio Duration Analysis

In [None]:
# Summary of clip lengths; the total is converted from seconds to hours
durations = df['audio_duration_sec']

print("Audio Duration Statistics (seconds):")
print(durations.describe())

total_hours = durations.sum() / 3600
print(f"\nTotal audio duration: {total_hours:.2f} hours")
print(f"Median duration: {durations.median():.2f} seconds")

In [None]:
# Clip-length distribution: histogram (left) and box plot (right)
durations = df['audio_duration_sec']
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Left panel: histogram with dashed reference lines at the mean and median
hist_ax.hist(durations, bins=100, edgecolor='black', alpha=0.7)
reference_lines = (
    ('Mean', durations.mean(), 'red'),
    ('Median', durations.median(), 'green'),
)
for stat_name, stat_value, line_color in reference_lines:
    hist_ax.axvline(
        stat_value,
        color=line_color,
        linestyle='--',
        linewidth=2,
        label=f'{stat_name}: {stat_value:.2f}s',
    )
hist_ax.set(
    xlabel='Duration (seconds)',
    ylabel='Count',
    title='Audio Duration Distribution',
)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: vertical box plot of the same values
box_ax.boxplot(durations, vert=True)
box_ax.set(ylabel='Duration (seconds)', title='Audio Duration Box Plot')
box_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Duration bins.
# The top edge is open-ended so clips longer than 100 s are still counted;
# with a finite top edge pd.cut maps them to NaN and value_counts drops them.
# include_lowest=True keeps zero-length clips in the first bin instead of
# excluding them from the half-open interval (0, 2].
duration_bins = pd.cut(
    df['audio_duration_sec'],
    bins=[0, 2, 5, 10, 15, np.inf],
    include_lowest=True,
)
print("\nDuration distribution by bins:")
print(duration_bins.value_counts().sort_index())

## 3. Age Bucket Distribution

In [None]:
# Samples per age bucket (sorted by bucket label) and their share of all rows
age_counts = df['age_bucket'].value_counts().sort_index()
age_share_pct = age_counts.div(len(df)).mul(100).round(2)

print("Age Bucket Distribution:")
print(age_counts)
print("\nPercentage distribution:")
print(age_share_pct)

In [None]:
# Visualize age bucket distribution: bar chart (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar chart
age_counts.plot(kind='bar', ax=axes[0], color='steelblue', edgecolor='black')
axes[0].set_xlabel('Age Bucket')
axes[0].set_ylabel('Count')
axes[0].set_title('Samples per Age Bucket')
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(True, alpha=0.3)

# Add counts on bars. The gap above each bar is given in screen points rather
# than data units, so the labels sit just above the bars whatever the size of
# the dataset (a fixed data-unit offset only looks right for one y-range).
for i, v in enumerate(age_counts):
    axes[0].annotate(
        str(v),
        xy=(i, v),
        xytext=(0, 3),
        textcoords='offset points',
        ha='center',
        va='bottom',
    )

# Pie chart
axes[1].pie(age_counts, labels=age_counts.index, autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set3'))
axes[1].set_title('Age Bucket Distribution (%)')

plt.tight_layout()
plt.show()

In [None]:
# Clip length broken down by age bucket: box plot, then a describe() table
fig, ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='audio_duration_sec', by='age_bucket', ax=ax)

# pandas adds its own figure-level title for grouped box plots; blank it
plt.suptitle('')
ax.set(
    xlabel='Age Bucket',
    ylabel='Duration (seconds)',
    title='Audio Duration by Age Bucket',
)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Statistics by age
duration_by_age = df.groupby('age_bucket')['audio_duration_sec']
print("\nDuration statistics by age bucket:")
print(duration_by_age.describe())

## 4. Transcription Length Analysis

In [None]:
# Add per-utterance transcript length columns: characters and
# whitespace-separated words
transcripts = df['orthographic_text']
df['text_length_chars'] = transcripts.str.len()
df['text_length_words'] = transcripts.str.split().str.len()

print("Transcription Length Statistics:")
length_sections = (
    ("\nCharacter length:", 'text_length_chars'),
    ("\nWord count:", 'text_length_words'),
)
for heading, length_column in length_sections:
    print(heading)
    print(df[length_column].describe())

In [None]:
# Transcription length distributions (top row) and their relation to clip
# duration (bottom row)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
durations = df['audio_duration_sec']

# Top row: one histogram per length measure, with a dashed line at the mean
histogram_specs = (
    (axes[0, 0], 'text_length_chars', 100, 'coral',
     'Character Length', 'Transcription Character Length Distribution'),
    (axes[0, 1], 'text_length_words', 50, 'skyblue',
     'Word Count', 'Transcription Word Count Distribution'),
)
for panel, length_column, n_bins, fill_color, x_label, panel_title in histogram_specs:
    lengths = df[length_column]
    mean_length = lengths.mean()
    panel.hist(lengths, bins=n_bins, edgecolor='black', alpha=0.7, color=fill_color)
    panel.axvline(mean_length, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_length:.1f}')
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

# Bottom row: duration on the x-axis against each length measure
scatter_specs = (
    (axes[1, 0], 'text_length_chars', 'Character Length', 'Duration vs Character Length'),
    (axes[1, 1], 'text_length_words', 'Word Count', 'Duration vs Word Count'),
)
for panel, length_column, y_label, panel_title in scatter_specs:
    panel.scatter(durations, df[length_column], alpha=0.3, s=10)
    panel.set_xlabel('Audio Duration (seconds)')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Correlation of duration with each length measure (Series.corr default method)
char_corr = durations.corr(df['text_length_chars'])
word_corr = durations.corr(df['text_length_words'])
print(f"\nCorrelation between duration and character length: {char_corr:.3f}")
print(f"Correlation between duration and word count: {word_corr:.3f}")

In [None]:
# Words per utterance broken down by age bucket: box plot, then a table
fig, ax = plt.subplots(figsize=(12, 6))
df.boxplot(column='text_length_words', by='age_bucket', ax=ax)

# Blank the figure-level title pandas adds for grouped box plots
plt.suptitle('')
ax.set(xlabel='Age Bucket', ylabel='Word Count', title='Word Count by Age Bucket')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

words_by_age = df.groupby('age_bucket')['text_length_words']
print("\nWord count statistics by age bucket:")
print(words_by_age.describe())

## 5. Outlier Analysis

In [None]:
# Identify outliers using IQR method
def find_outliers_iqr(data, column):
    """Select the rows of ``data`` whose ``column`` value is an IQR outlier.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``data[column]``. A value is an outlier only
    if it lies strictly outside the fences.

    Args:
        data: DataFrame to filter.
        column: Name of the column to test.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)`` where ``outliers``
        is the subset of ``data`` outside the fences.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width
    # Strict comparisons: missing values compare False and are never flagged
    is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
    return data[is_outlier], lower_bound, upper_bound

# Duration outliers (IQR fences), followed by the ten longest recordings
duration_outliers, dur_lower, dur_upper = find_outliers_iqr(df, 'audio_duration_sec')
duration_outlier_pct = len(duration_outliers) / len(df) * 100
print(f"Duration outliers: {len(duration_outliers):,} samples ({duration_outlier_pct:.2f}%)")
print(f"Valid range: {dur_lower:.2f}s to {dur_upper:.2f}s")

longest_view_columns = [
    'utterance_id',
    'audio_duration_sec',
    'age_bucket',
    'text_length_words',
    'orthographic_text',
]
print("\nTop 10 longest recordings:")
print(df.nlargest(10, 'audio_duration_sec')[longest_view_columns])

In [None]:
# Word count outliers (IQR fences), followed by the ten wordiest transcripts
word_outliers, word_lower, word_upper = find_outliers_iqr(df, 'text_length_words')
word_outlier_pct = len(word_outliers) / len(df) * 100
print(f"Word count outliers: {len(word_outliers):,} samples ({word_outlier_pct:.2f}%)")
print(f"Valid range: {word_lower:.1f} to {word_upper:.1f} words")

wordiest_view_columns = [
    'utterance_id',
    'audio_duration_sec',
    'text_length_words',
    'orthographic_text',
]
print("\nTop 10 longest transcriptions by word count:")
print(df.nlargest(10, 'text_length_words')[wordiest_view_columns])

In [None]:
# Very short utterances: transcripts of at most two whitespace-separated words
short_utterances = df[df['text_length_words'] <= 2]
# The "less than or equal" sign was stored as mis-decoded bytes; restored here.
print(f"\nUtterances with ≤2 words: {len(short_utterances):,} samples ({len(short_utterances)/len(df)*100:.2f}%)")
print("\nMost common short transcriptions:")
print(short_utterances['orthographic_text'].value_counts().head(20))

In [None]:
# Speech rate in words per second. A zero duration divides to +/-inf, which
# is stored as NaN so it does not distort the statistics below.
words_per_second = df['text_length_words'] / df['audio_duration_sec']
df['speech_rate'] = words_per_second.replace([np.inf, -np.inf], np.nan)

print("Speech Rate Statistics (words/second):")
print(df['speech_rate'].describe())

# Speech rate: overall histogram (left) and per-age box plot (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
rate_hist_ax, rate_age_ax = axes
rates = df['speech_rate']
mean_rate = rates.mean()

# Overall distribution (missing rates are dropped before binning)
rate_hist_ax.hist(rates.dropna(), bins=100, edgecolor='black', alpha=0.7, color='lightgreen')
rate_hist_ax.axvline(mean_rate, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_rate:.2f}')
rate_hist_ax.set(
    xlabel='Speech Rate (words/second)',
    ylabel='Count',
    title='Speech Rate Distribution',
)
rate_hist_ax.legend()
rate_hist_ax.grid(True, alpha=0.3)

# By age bucket; the figure-level title pandas adds is blanked afterwards
df.boxplot(column='speech_rate', by='age_bucket', ax=rate_age_ax)
plt.suptitle('')
rate_age_ax.set(
    xlabel='Age Bucket',
    ylabel='Speech Rate (words/second)',
    title='Speech Rate by Age Bucket',
)
rate_age_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Extremely slow or fast speakers, using the same IQR fences as above
speech_outliers, sr_lower, sr_upper = find_outliers_iqr(df, 'speech_rate')
speech_outlier_pct = len(speech_outliers) / len(df) * 100
print(f"\nSpeech rate outliers: {len(speech_outliers):,} samples ({speech_outlier_pct:.2f}%)")
print(f"Valid range: {sr_lower:.2f} to {sr_upper:.2f} words/second")

## 6. Audio Quality Analysis

### File Size Analysis

In [None]:
# File sizes in megabytes, taking 1 MB as 1024 * 1024 bytes
df['filesize_mb'] = df['filesize_bytes'] / (1024 * 1024)
total_size_mb = df['filesize_mb'].sum()

print("File Size Statistics:")
print(df['filesize_mb'].describe())
print(f"\nTotal dataset size: {total_size_mb:.2f} MB ({total_size_mb/1024:.2f} GB)")

# Histogram of per-file sizes
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df['filesize_mb'], bins=100, edgecolor='black', alpha=0.7, color='mediumpurple')
ax.set(xlabel='File Size (MB)', ylabel='Count', title='Audio File Size Distribution')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Compression ratio (bytes per second): file bytes divided by clip seconds,
# i.e. an average on-disk byte rate. A zero-length clip divides to +/-inf,
# which would make the mean infinite, so those values are stored as NaN in
# the same way as the speech-rate column.
df['compression_ratio'] = (
    df['filesize_bytes'] / df['audio_duration_sec']
).replace([np.inf, -np.inf], np.nan)
print(f"\nAverage compression ratio: {df['compression_ratio'].mean():.0f} bytes/second")

### Sample Audio Spectrograms

Load and visualize spectrograms for sample audio files to understand audio characteristics.

In [None]:
# librosa is optional: record whether it can be imported so the spectrogram
# cell can be skipped when it is missing
try:
    import librosa
    import librosa.display
except ImportError:
    librosa_available = False
    print("Warning: librosa not installed. Skipping spectrogram analysis.")
    print("Install with: pip install librosa")
else:
    librosa_available = True

In [None]:
if librosa_available:
    def find_audio_file(audio_path, base_dirs=('audio_0', 'audio_1', 'audio_2')):
        """Return the first existing location of an audio file, or None.

        Args:
            audio_path: Path string from the manifest; a leading ``audio/``
                (or the backslash form) is dropped before searching.
            base_dirs: Sub-directories of ``data_dir`` to try, in order.
                The default is a tuple so it cannot be mutated between calls.

        Returns:
            ``data_dir / base_dir / filename`` for the first ``base_dir``
            where that path exists, otherwise ``None``.
        """
        # Strip the prefix only at the start of the path; a blanket replace
        # would also cut "audio/" out of the middle of a longer path.
        filename = audio_path
        for prefix in ('audio/', 'audio\\'):
            if filename.startswith(prefix):
                filename = filename[len(prefix):]
                break

        for base_dir in base_dirs:
            full_path = data_dir / base_dir / filename
            if full_path.exists():
                return full_path
        return None

    # Sample different age buckets and durations
    sample_criteria = [
        ('Short (3-4 years)', (df['age_bucket'] == '3-4') & (df['audio_duration_sec'] < 3)),
        ('Medium (5-7 years)', (df['age_bucket'] == '5-7') & (df['audio_duration_sec'].between(5, 8))),
        ('Long (8-11 years)', (df['age_bucket'] == '8-11') & (df['audio_duration_sec'] > 10)),
    ]

    # squeeze=False keeps `axes` two-dimensional even when only one criterion
    # is listed, so the row indexing below always works.
    fig, axes = plt.subplots(
        len(sample_criteria), 2,
        figsize=(15, 4 * len(sample_criteria)),
        squeeze=False,
    )

    for idx, (label, condition) in enumerate(sample_criteria):
        wave_ax, spec_ax = axes[idx]

        candidates = df[condition]
        if candidates.empty:
            print(f"No samples found for: {label}")
            # Hide the unused row instead of leaving two empty panels
            wave_ax.set_visible(False)
            spec_ax.set_visible(False)
            continue

        # Get a random sample
        sample = candidates.sample(1).iloc[0]
        audio_path = find_audio_file(sample['audio_path'])

        if audio_path is None:
            print(f"✗ Audio file not found for {label}: {sample['audio_path']}")
            wave_ax.set_visible(False)
            spec_ax.set_visible(False)
            continue

        # Best effort: a file that fails to decode or plot is reported and
        # the loop moves on to the next criterion.
        try:
            # Load audio, resampled to 16 kHz
            y, sr = librosa.load(audio_path, sr=16000)

            # Waveform; sample i is drawn at exactly i / sr seconds
            wave_ax.plot(np.arange(len(y)) / sr, y)
            wave_ax.set_xlabel('Time (s)')
            wave_ax.set_ylabel('Amplitude')
            wave_ax.set_title(f'{label} - Waveform\nDuration: {sample["audio_duration_sec"]:.2f}s')
            wave_ax.grid(True, alpha=0.3)

            # Spectrogram: STFT magnitude in dB relative to its maximum
            D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
            img = librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='hz', ax=spec_ax)

            # Append an ellipsis only when the transcript was actually cut
            text = sample['orthographic_text']
            snippet = text if len(text) <= 50 else text[:50] + '...'
            spec_ax.set_title(f'{label} - Spectrogram\nText: "{snippet}"')
            fig.colorbar(img, ax=spec_ax, format='%+2.0f dB')

            print(f"✓ Loaded sample for {label}: {sample['utterance_id']}")
        except Exception as e:
            print(f"✗ Error loading {label}: {e}")

    plt.tight_layout()
    plt.show()
else:
    print("Skipping spectrogram analysis - librosa not available")

## 7. Vocabulary Analysis

In [None]:
# Tokenize all transcriptions: lower-case, then split on whitespace.
# Rows with a missing transcript are skipped; a NaN has no .lower() and
# would otherwise stop the cell with an AttributeError.
all_words = []
for text in df['orthographic_text'].dropna():
    all_words.extend(text.lower().split())

word_freq = Counter(all_words)
total_tokens = len(all_words)
print(f"Total vocabulary size: {len(word_freq):,} unique words")
print(f"Total word tokens: {total_tokens:,}")

print("\nTop 30 most common words:")
for word, count in word_freq.most_common(30):
    print(f"  {word:20s} {count:6,} ({count/total_tokens*100:.2f}%)")

In [None]:
# Vocabulary distribution: rank/frequency curve (left), top-30 words (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
rank_ax, top_ax = axes

# Word frequency against rank, both axes logarithmic
word_counts = sorted(word_freq.values(), reverse=True)
word_ranks = range(1, len(word_counts) + 1)
rank_ax.plot(word_ranks, word_counts)
rank_ax.set_xscale('log')
rank_ax.set_yscale('log')
rank_ax.set_xlabel('Word Rank (log scale)')
rank_ax.set_ylabel('Frequency (log scale)')
rank_ax.set_title("Word Frequency Distribution (Zipf's Law)")
rank_ax.grid(True, alpha=0.3)

# Top 30 words as horizontal bars, most frequent at the top
top_words = word_freq.most_common(30)
words, counts = zip(*top_words)
bar_positions = range(len(words))
top_ax.barh(bar_positions, counts, color='teal', alpha=0.7)
top_ax.set_yticks(bar_positions)
top_ax.set_yticklabels(words)
top_ax.set_xlabel('Frequency')
top_ax.set_title('Top 30 Most Common Words')
top_ax.invert_yaxis()
top_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Rare words: those seen exactly once in the whole corpus
rare_words = [word for word, count in word_freq.items() if count == 1]
rare_share_pct = len(rare_words) / len(word_freq) * 100
print(f"\nWords appearing only once (hapax legomena): {len(rare_words):,} ({rare_share_pct:.2f}% of vocabulary)")
print(f"Sample rare words: {', '.join(rare_words[:20])}")

## 8. Summary Statistics

In [None]:
# Final recap of the figures computed in the cells above. The section icons
# and the plus-minus / less-or-equal signs were stored as mis-decoded bytes;
# they are restored to the intended Unicode characters here.
print("="*60)
print("DATASET SUMMARY")
print("="*60)

print("\n📊 OVERALL STATISTICS")
print(f"  Total samples: {len(df):,}")
print(f"  Unique children: {df['child_id'].nunique():,}")
print(f"  Unique sessions: {df['session_id'].nunique():,}")
print(f"  Total audio duration: {df['audio_duration_sec'].sum() / 3600:.2f} hours")
print(f"  Total dataset size: {df['filesize_mb'].sum() / 1024:.2f} GB")

print("\n🎙️ AUDIO CHARACTERISTICS")
print(f"  Duration range: {df['audio_duration_sec'].min():.2f}s - {df['audio_duration_sec'].max():.2f}s")
print(f"  Average duration: {df['audio_duration_sec'].mean():.2f}s (±{df['audio_duration_sec'].std():.2f}s)")
print(f"  Median duration: {df['audio_duration_sec'].median():.2f}s")

print("\n👶 AGE DISTRIBUTION")
for age, count in age_counts.items():
    print(f"  {age:12s}: {count:6,} samples ({count/len(df)*100:5.2f}%)")

print("\n📝 TRANSCRIPTION CHARACTERISTICS")
print(f"  Word count range: {df['text_length_words'].min():.0f} - {df['text_length_words'].max():.0f} words")
print(f"  Average words per utterance: {df['text_length_words'].mean():.2f} (±{df['text_length_words'].std():.2f})")
print(f"  Median words per utterance: {df['text_length_words'].median():.0f}")
print(f"  Total vocabulary size: {len(word_freq):,} unique words")
print(f"  Average speech rate: {df['speech_rate'].mean():.2f} words/second")

print("\n⚠️ POTENTIAL ISSUES")
print(f"  Duration outliers: {len(duration_outliers):,} samples ({len(duration_outliers)/len(df)*100:.2f}%)")
print(f"  Very short utterances (≤2 words): {len(short_utterances):,} samples ({len(short_utterances)/len(df)*100:.2f}%)")
print(f"  Speech rate outliers: {len(speech_outliers):,} samples ({len(speech_outliers)/len(df)*100:.2f}%)")

print("\n" + "="*60)

## Key Findings

1. **Dataset Size**: The dataset contains ~95K utterances from children across different age groups
2. **Age Distribution**: Most samples are from the 8-11 age bucket, with representation across all age groups
3. **Audio Duration**: Most utterances are between 2-10 seconds, with a median around 5-6 seconds
4. **Transcription Length**: Average utterance length is around 10-15 words, with significant variation
5. **Speech Rate**: Children's speech rate varies by age, with older children typically speaking faster
6. **Vocabulary**: Large vocabulary with many rare words (hapax legomena), indicating diverse content

## Recommendations for Training

1. **Stratified Splitting**: Ensure validation set has proportional representation from all age buckets
2. **Duration Handling**: Consider a padding/truncation strategy for very long or very short utterances
3. **Data Quality**: Monitor outliers (very fast/slow speech rates) during training
4. **Age-Specific Analysis**: Track model performance separately by age bucket to identify weaknesses
5. **Vocabulary Coverage**: Ensure tokenizer handles children's speech patterns and common words effectively