# Topic Modeling (LDA) of NTSB Aviation Accident Narratives

**Objective**: Discover latent topics in 67,126 aviation accident narratives using Latent Dirichlet Allocation (LDA) topic modeling.

**Dataset**: NTSB Aviation Accident Database (1977-2025, 48 years)

**Methods**:
- Latent Dirichlet Allocation (LDA) with Gensim
- Coherence score (C_v) optimization over candidate topic counts of 5, 10, 15, and 20
- Topic word distributions and probabilities
- Topic prevalence over time
- Topic correlation with fatal outcomes

**Author**: Claude Code (Anthropic)

**Date**: 2025-11-08

## 1. Setup and Data Loading

In [None]:
# Standard library imports
import re
import warnings
from typing import List, Dict, Tuple
import pickle

# Third-party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Gensim imports
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

# Configuration
# Install a blanket 'ignore' filter so no warnings are shown for the rest of the session.
warnings.filterwarnings('ignore')
# Plot appearance: matplotlib style sheet plus seaborn's 'Set2' colour palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('Set2')
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(42)

print('‚úÖ Libraries imported successfully')

In [None]:
# Read the narrative records from the parquet file into a DataFrame
df = pd.read_parquet('../../data/narratives_dataset.parquet')

# Report how many records were loaded and the span of event years they cover
n_records = len(df)
first_year = df["ev_year"].min()
last_year = df["ev_year"].max()
print(f'Dataset: {n_records:,} narrative records')
print(f'Date range: {first_year} - {last_year}')

# Preview the first rows (last expression of the cell)
df.head()

## 2. Text Preprocessing for LDA

In [None]:
def preprocess_for_lda(text: str, stop_words=None, min_token_len: int = 4) -> List[str]:
    """
    Preprocess text for LDA topic modeling.

    The text is lowercased, URLs and e-mail addresses are removed, every
    character other than a-z and whitespace is replaced by a space, and the
    result is split on whitespace. Tokens that are stopwords or shorter than
    ``min_token_len`` characters are discarded.

    Args:
        text: Raw narrative text. Missing values (None/NaN) yield ``[]``.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the imported ``STOPWORDS`` collection is used directly,
            so no copy of it is built on every call.
        min_token_len: Minimum number of characters a token must have to be
            kept. The default of 4 is the same rule as ``len(token) > 3``.

    Returns:
        List of cleaned tokens, in their original order.
    """
    if pd.isna(text):
        return []

    if stop_words is None:
        stop_words = STOPWORDS
    elif not isinstance(stop_words, (set, frozenset)):
        # Guarantee constant-time membership tests for list/tuple arguments.
        stop_words = frozenset(stop_words)

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs and emails before punctuation is stripped, while the
    # '://', '.' and '@' markers that identify them are still present.
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters (keep only letters); digits and punctuation
    # become spaces so they act as token separators.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize on whitespace, then drop stopwords and short tokens
    return [
        token
        for token in text.split()
        if len(token) >= min_token_len and token not in stop_words
    ]

# Join the two narrative columns (missing values treated as empty strings)
accident_text = df['narr_accp'].fillna('')
cause_text = df['narr_cause'].fillna('')
df['full_narrative'] = (accident_text + ' ' + cause_text).str.strip()

# Tokenize every combined narrative
df['tokens'] = df['full_narrative'].apply(preprocess_for_lda)

# Keep only rows that still have at least one token, and renumber the index
has_tokens = df['tokens'].str.len() > 0
df = df[has_tokens].reset_index(drop=True)

# Metadata used by later analyses: fatal flag and decade of the event year
df['fatal_outcome'] = df['inj_tot_f'] > 0
df['decade'] = (df['ev_year'] // 10) * 10

# Token-count statistics over the remaining narratives
token_counts = df['tokens'].str.len()
print(f'‚úÖ Preprocessed {len(df):,} narratives')
print(f'Average tokens per narrative: {token_counts.mean():.0f}')
print(f'Median tokens per narrative: {token_counts.median():.0f}')

# Show the first narrative before and after tokenization
first_row = df.iloc[0]
print('\nExample tokenized narrative:')
print(f'Original: {first_row["full_narrative"][:200]}...')
print(f'Tokens (first 30): {first_row["tokens"][:30]}')

## 3. Create Dictionary and Corpus

In [None]:
# Extract token lists (one list of tokens per narrative)
texts = df['tokens'].tolist()

# Create dictionary mapping each unique token to an integer id
print('üîÑ Creating dictionary...')
dictionary = corpora.Dictionary(texts)

print(f'Dictionary before filtering: {len(dictionary)} unique tokens')

# Filter extremes: drop very rare and very common tokens, then cap the vocabulary
dictionary.filter_extremes(
    no_below=10,   # Minimum 10 documents
    no_above=0.6,  # Maximum 60% of documents
    keep_n=10000   # Keep top 10,000 tokens
)

print(f'Dictionary after filtering: {len(dictionary)} unique tokens')

# Create corpus (bag-of-words): each document becomes a list of (token_id, count) pairs
print('üîÑ Creating corpus...')
corpus = [dictionary.doc2bow(text) for text in texts]

print(f'‚úÖ Corpus created: {len(corpus):,} documents')
# len(doc) on a bag-of-words counts distinct token ids only, so the counts are
# summed to report the number of tokens each document actually contributes.
tokens_per_doc = [sum(count for _, count in doc) for doc in corpus]
print(f'Average tokens per document: {np.mean(tokens_per_doc):.1f}')

## 4. Determine Optimal Number of Topics

In [None]:
# Candidate topic counts to evaluate, and the C_v coherence obtained for each
topic_range = [5, 10, 15, 20]  # Full range for comprehensive coherence testing
coherence_scores = []

print('üîÑ Testing different numbers of topics for coherence...\n')
print('Note: Testing 4 topic counts (comprehensive evaluation)\n')

# Settings shared by every candidate model; only num_topics varies per run
candidate_lda_kwargs = dict(
    corpus=corpus,
    id2word=dictionary,
    random_state=42,
    passes=10,  # Full passes for comprehensive coherence testing
    alpha='auto',
    eta='auto',
    per_word_topics=True,
)

for k in topic_range:
    print(f'Training LDA with {k} topics...')
    candidate_model = LdaModel(num_topics=k, **candidate_lda_kwargs)

    # Score the candidate with C_v coherence against the tokenized texts
    scorer = CoherenceModel(
        model=candidate_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )
    score = scorer.get_coherence()
    coherence_scores.append(score)

    print(f'  Coherence score: {score:.4f}\n')

# Pick the candidate with the highest coherence (first one on ties)
optimal_idx = np.argmax(coherence_scores)
optimal_topics, optimal_coherence = topic_range[optimal_idx], coherence_scores[optimal_idx]

print(f'\n‚úÖ Optimal number of topics: {optimal_topics} (coherence: {optimal_coherence:.4f})')


In [None]:
# Plot coherence (C_v) against the number of topics and mark the selected count
bold_label = {'fontsize': 12, 'fontweight': 'bold'}
optimal_label = f'Optimal: {optimal_topics} topics'

plt.figure(figsize=(10, 6))
plt.plot(topic_range, coherence_scores, marker='o', linewidth=2, markersize=8)
plt.axvline(x=optimal_topics, color='red', linestyle='--', label=optimal_label)
plt.xlabel('Number of Topics', **bold_label)
plt.ylabel('Coherence Score (C_v)', **bold_label)
plt.title('Topic Model Coherence Optimization', fontsize=14, fontweight='bold', pad=20)
plt.legend(fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()

# Write the figure to disk before displaying it
plt.savefig('figures/lda_coherence_optimization.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Coherence plot saved: figures/lda_coherence_optimization.png')

## 5. Train Final LDA Model

In [None]:
# Fit the final model using the topic count selected by the coherence search
print(f'üîÑ Training final LDA model with {optimal_topics} topics...\n')

final_lda_kwargs = {
    'corpus': corpus,
    'id2word': dictionary,
    'num_topics': optimal_topics,
    'random_state': 42,
    'passes': 15,  # More passes for final model
    'iterations': 400,
    'alpha': 'auto',
    'eta': 'auto',
    'per_word_topics': True,
}
lda_model = LdaModel(**final_lda_kwargs)

print('‚úÖ LDA model training complete!')

# Persist the trained model, the dictionary, and the bag-of-words corpus
lda_model.save('../../models/lda_aviation_narratives.model')
dictionary.save('../../models/lda_dictionary.dict')
with open('../../models/lda_corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

print('‚úÖ Model saved to: models/lda_aviation_narratives.model')

## 6. Extract and Display Topics

In [None]:
# Print every topic together with its 20 highest-weighted words
banner = '='*80
print(banner)
print(f'LDA TOPICS ({optimal_topics} topics, top 20 words per topic)')
print(banner)

# topic id -> list of (word, probability) pairs, kept for the export step
topic_words = {
    tid: lda_model.show_topic(tid, topn=20)
    for tid in range(optimal_topics)
}

for tid, terms in topic_words.items():
    print(f'\nTopic {tid}:')
    for term, weight in terms:
        print(f'  {term:20s} {weight:.4f}')

## 7. Assign Dominant Topics to Narratives

In [None]:
# Get dominant topic for each document
def get_dominant_topic(doc_topics):
    """Return the (topic_id, probability) pair with the highest probability.

    An empty (or otherwise falsy) ``doc_topics`` yields the sentinel
    ``(-1, 0.0)``. When several entries share the highest probability, the
    one that appears first in ``doc_topics`` is returned.
    """
    if doc_topics:
        return max(doc_topics, key=lambda pair: pair[1])
    return -1, 0.0

# Infer each document's topic mixture and keep only its strongest topic
top_assignments = [
    get_dominant_topic(lda_model.get_document_topics(bow))
    for bow in corpus
]

# Split the (topic_id, probability) pairs into two parallel lists
dominant_topics = [topic for topic, _ in top_assignments]
dominant_probs = [weight for _, weight in top_assignments]

df['dominant_topic'] = dominant_topics
df['topic_probability'] = dominant_probs

print('‚úÖ Dominant topics assigned to all narratives')
print('\nTopic distribution:')
print(df['dominant_topic'].value_counts().sort_index())

## 8. Visualizations

### 8.1 Topic Distribution Bar Chart

In [None]:
# Number of narratives per dominant topic, ordered by topic id
topic_counts = df['dominant_topic'].value_counts().sort_index()
topic_ids = topic_counts.index

axis_font = {'fontsize': 12, 'fontweight': 'bold'}
chart_title = f'Topic Distribution Across {len(df):,} Aviation Accident Narratives'

plt.figure(figsize=(12, 6))
plt.bar(topic_ids, topic_counts.values, color='steelblue', edgecolor='black')
plt.xlabel('Topic ID', **axis_font)
plt.ylabel('Number of Narratives', **axis_font)
plt.title(chart_title, fontsize=14, fontweight='bold', pad=20)
plt.xticks(topic_ids)  # one tick per topic id
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('figures/lda_topic_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Topic distribution chart saved: figures/lda_topic_distribution.png')

### 8.2 Topic Prevalence Over Time

In [None]:
# Cross-tabulate narratives by decade (rows) and dominant topic (columns)
decade_topic = df.groupby(['decade', 'dominant_topic']).size().unstack(fill_value=0)

# Express each decade's row as percentages of that decade's narratives
decade_totals = decade_topic.sum(axis=1)
decade_topic_pct = decade_topic.div(decade_totals, axis=0) * 100

# Heatmap with topics on the y-axis and decades on the x-axis
heatmap_options = dict(
    cmap='YlGnBu',
    cbar_kws={'label': 'Percentage of Narratives (%)'},
    linewidths=0.5,
    linecolor='gray',
    fmt='.1f',
    annot=True,
)
axis_font = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(14, 8))
sns.heatmap(decade_topic_pct.T, **heatmap_options)
plt.xlabel('Decade', **axis_font)
plt.ylabel('Topic ID', **axis_font)
plt.title('Topic Prevalence Across Decades', fontsize=14, fontweight='bold', pad=20)
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('figures/lda_topic_prevalence_decades.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Topic prevalence heatmap saved: figures/lda_topic_prevalence_decades.png')

### 8.3 Topic Correlation with Fatal Outcomes

In [None]:
# Per-topic fatal counts, narrative counts and fatal share, highest rate first
topic_fatal_rate = (
    df.groupby('dominant_topic')['fatal_outcome']
    .agg(['sum', 'count', 'mean'])
    .assign(fatal_rate_pct=lambda stats: stats['mean'] * 100)
    .sort_values('fatal_rate_pct', ascending=False)
)

print('Fatal Rate by Topic:\n')
print(topic_fatal_rate[['sum', 'count', 'fatal_rate_pct']].to_string())

# Overall fatal percentage: reference line and colour threshold for the bars
overall_fatal_pct = df['fatal_outcome'].mean() * 100
bar_colors = [
    '#e74c3c' if rate > overall_fatal_pct else '#3498db'
    for rate in topic_fatal_rate['fatal_rate_pct']
]
axis_font = {'fontsize': 12, 'fontweight': 'bold'}

plt.figure(figsize=(12, 6))
plt.bar(topic_fatal_rate.index, topic_fatal_rate['fatal_rate_pct'], color=bar_colors, edgecolor='black')
plt.axhline(y=overall_fatal_pct, color='orange', linestyle='--',
            linewidth=2, label=f'Overall Fatal Rate: {overall_fatal_pct:.1f}%')
plt.xlabel('Topic ID', **axis_font)
plt.ylabel('Fatal Accident Rate (%)', **axis_font)
plt.title('Fatal Accident Rate by Topic', fontsize=14, fontweight='bold', pad=20)
plt.xticks(topic_fatal_rate.index)
plt.legend(fontsize=10)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('figures/lda_topic_fatal_rates.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Topic fatal rates chart saved: figures/lda_topic_fatal_rates.png')

### 8.4 Topic Word Clouds

In [None]:
from wordcloud import WordCloud

# Create word clouds for the (up to) 6 topics with the most narratives.
# Fewer than 6 can come back, e.g. when the 5-topic model was selected.
top_topics = topic_counts.nlargest(6).index

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for ax, topic_id in zip(axes, top_topics):
    # Get topic words and probabilities as a {word: weight} mapping
    topic_terms = dict(lda_model.show_topic(topic_id, topn=50))

    # Create word cloud with word sizes driven by the topic weights
    wordcloud = WordCloud(
        width=600,
        height=400,
        background_color='white',
        colormap='viridis',
        relative_scaling=0.5
    ).generate_from_frequencies(topic_terms)

    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Topic {topic_id} (n={topic_counts[topic_id]:,})',
                 fontsize=12, fontweight='bold')

# Hide panels that received no topic so they do not render as empty framed axes
for unused_ax in axes[len(top_topics):]:
    unused_ax.axis('off')

# Title reports the number of topics actually drawn (6 when at least 6 exist)
fig.suptitle(f'Word Clouds for Top {len(top_topics)} Topics', fontsize=16, fontweight='bold', y=0.98)
plt.tight_layout()
plt.savefig('figures/lda_topic_wordclouds.png', dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Topic word clouds saved: figures/lda_topic_wordclouds.png')

## 9. Summary Statistics

In [None]:
# Final report: model configuration, topic statistics, fatal-outcome figures,
# and the artifacts/visualizations produced by the notebook.
rule = '='*80
print(rule)
print('LDA TOPIC MODELING SUMMARY')
print(rule)

print('\nüìä Model Configuration:')
print(f'   Number of topics: {optimal_topics}')
print(f'   Coherence score (C_v): {optimal_coherence:.4f}')
print(f'   Dictionary size: {len(dictionary):,} unique tokens')
print(f'   Corpus size: {len(corpus):,} documents')

# Largest / smallest topics by number of narratives
largest_topic, largest_size = topic_counts.idxmax(), topic_counts.max()
smallest_topic, smallest_size = topic_counts.idxmin(), topic_counts.min()
print('\nüìà Topic Statistics:')
print(f'   Most prevalent topic: Topic {largest_topic} ({largest_size:,} narratives)')
print(f'   Least prevalent topic: Topic {smallest_topic} ({smallest_size:,} narratives)')
print(f'   Average narratives per topic: {topic_counts.mean():.0f}')
print(f'   Average topic probability: {df["topic_probability"].mean():.3f}')

# First and last rows of topic_fatal_rate are reported as highest and lowest
fatal_pct = topic_fatal_rate["fatal_rate_pct"]
print('\n‚ö†Ô∏è Fatal Outcome Analysis:')
print(f'   Overall fatal rate: {df["fatal_outcome"].mean()*100:.1f}%')
print(f'   Highest fatal rate: Topic {topic_fatal_rate.index[0]} ({fatal_pct.iloc[0]:.1f}%)')
print(f'   Lowest fatal rate: Topic {topic_fatal_rate.index[-1]} ({fatal_pct.iloc[-1]:.1f}%)')

print('\nüíæ Artifacts Created:')
for artifact_line in (
    '   Model: models/lda_aviation_narratives.model',
    '   Dictionary: models/lda_dictionary.dict',
    '   Corpus: models/lda_corpus.pkl',
):
    print(artifact_line)

print('\nüìä Visualizations Created:')
for figure_line in (
    '   1. Coherence optimization plot',
    '   2. Topic distribution bar chart',
    '   3. Topic prevalence heatmap (decades)',
    '   4. Topic fatal rates comparison',
    '   5. Topic word clouds (top 6 topics)',
):
    print(figure_line)

print('\n‚úÖ LDA Topic Modeling Complete!')
print(rule)

## 10. Export Results

In [None]:
# Per-narrative topic assignments
assignment_columns = [
    'ev_id', 'ev_year', 'decade',
    'dominant_topic', 'topic_probability', 'fatal_outcome',
]
topic_assignments = df[assignment_columns]
topic_assignments.to_csv('../../data/lda_topic_assignments.csv', index=False)
print('‚úÖ Exported topic assignments to: data/lda_topic_assignments.csv')

# One row per (topic, word) pair with the word's probability in that topic
topic_words_export = [
    {'topic_id': topic_id, 'word': word, 'probability': prob}
    for topic_id in range(optimal_topics)
    for word, prob in topic_words[topic_id]
]

topic_words_df = pd.DataFrame(topic_words_export)
topic_words_df.to_csv('../../data/lda_topic_words.csv', index=False)
print('‚úÖ Exported topic words to: data/lda_topic_words.csv')

# Per-topic statistics; fatal figures are re-ordered to follow topic_counts
fatal_by_topic = topic_fatal_rate.loc[topic_counts.index]
topic_stats = pd.DataFrame({
    'topic_id': topic_counts.index,
    'narrative_count': topic_counts.values,
    'fatal_count': fatal_by_topic['sum'].values,
    'fatal_rate_pct': fatal_by_topic['fatal_rate_pct'].values,
})
topic_stats.to_csv('../../data/lda_topic_statistics.csv', index=False)
print('‚úÖ Exported topic statistics to: data/lda_topic_statistics.csv')

print('\nüéâ All LDA analysis results saved successfully!')