# Word2Vec Embeddings for Aviation Accident Narratives

**Objective**: Train Word2Vec embeddings on aviation accident narratives to capture semantic relationships between aviation-related terms.

**Methods**: Word2Vec Skip-gram, t-SNE visualization, similarity queries

**Date**: 2025-11-08

In [None]:
import re
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from sklearn.manifold import TSNE

# Notebook-wide setup: silence warnings, select the plot theme, seed NumPy.
warnings.filterwarnings('ignore')  # hides every warning category from here on
plt.style.use('seaborn-v0_8-darkgrid')
np.random.seed(42)  # seeds NumPy's global RNG only; Word2Vec and TSNE are given their own seeds below
print('âœ… Libraries imported')

In [None]:
# Load the narratives dataset and merge its two text columns into one field.
# NOTE(review): `narr_accp` / `narr_cause` are presumably the accident
# narrative and the probable-cause text -- confirm against the dataset schema.
df = pd.read_parquet('../../data/narratives_dataset.parquet')
accident_text = df['narr_accp'].fillna('')
cause_text = df['narr_cause'].fillna('')
# Missing values become empty strings, so stripping removes the lone
# separator space left over when either side is empty.
df['full_narrative'] = (accident_text + ' ' + cause_text).str.strip()

def tokenize(text):
    """Split *text* into lowercase alphabetic tokens of three or more letters.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty list.
    Every character other than ``a``-``z`` and whitespace is replaced by a
    space before splitting, so digits, punctuation and non-ASCII letters act
    as token separators.
    """
    if pd.isna(text):
        return []
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    tokens = []
    for candidate in letters_only.split():
        # Discard one- and two-letter fragments (e.g. the "s" left by "pilot's").
        if len(candidate) > 2:
            tokens.append(candidate)
    return tokens

# Tokenize every narrative and keep only those that produced at least one token.
texts = [tokens for tokens in map(tokenize, df['full_narrative']) if tokens]
print(f'âœ… Preprocessed {len(texts):,} narratives')

In [None]:
# Train a skip-gram Word2Vec model on the tokenized narratives and save it.
model_path = Path('../../models/word2vec_narratives.model')
# Create the output directory before training so a missing folder cannot
# cause the save to fail after all epochs have already run.
model_path.parent.mkdir(parents=True, exist_ok=True)

print('ðŸ”„ Training Word2Vec model...')
model = Word2Vec(
    sentences=texts,
    vector_size=200,  # embedding dimensionality
    window=5,         # context window size
    min_count=10,     # words seen fewer than 10 times are left out of the vocabulary
    workers=4,
    sg=1,             # 1 selects skip-gram
    epochs=15,
    # NOTE(review): per the gensim docs a seed only yields fully reproducible
    # vectors with workers=1 (and a fixed PYTHONHASHSEED); with workers=4 the
    # results can vary between runs. Confirm whether that matters here.
    seed=42,
)
model.save(str(model_path))
print(f'âœ… Model trained: {len(model.wv)} words in vocabulary')

In [None]:
# Test semantic similarity: list the five nearest neighbours of a few probe words.
test_words = ['engine', 'pilot', 'fuel', 'landing', 'weather', 'control']
print('Semantic Similarities:\n')
for word in test_words:
    if word not in model.wv:
        # Say so explicitly instead of silently dropping the probe word.
        print(f'{word:12s} -> (not in vocabulary)')
        continue
    similar = model.wv.most_similar(word, topn=5)
    # Format as plain text; wrapping the joined string in a list would print
    # its repr, i.e. with surrounding brackets and quotes.
    neighbours = ', '.join(f'{w} ({s:.3f})' for w, s in similar)
    print(f'{word:12s} -> {neighbours}')

In [None]:
# t-SNE visualization: project the most frequent words' vectors to 2D and plot them.
print('ðŸ”„ Computing t-SNE projection...')
# Select most frequent 300 words (fewer if the vocabulary is smaller).
word_freq = {word: model.wv.get_vecattr(word, "count") for word in model.wv.index_to_key[:300]}
top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:300]
words_to_plot = [w for w, _ in top_words]

vectors = np.array([model.wv[word] for word in words_to_plot])
# scikit-learn requires perplexity < number of samples, so clamp the usual
# value of 30 when the vocabulary is small.
perplexity = min(30, max(1, len(words_to_plot) - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, max_iter=1000)
coords = tsne.fit_transform(vectors)

plt.figure(figsize=(16, 12))
plt.scatter(coords[:, 0], coords[:, 1], alpha=0.6, s=50, c='steelblue')
# Label only every third point to keep the figure readable.
for word, point in zip(words_to_plot[::3], coords[::3]):
    plt.annotate(word, point, fontsize=8, alpha=0.7)
plt.title('Word2Vec Embeddings (t-SNE 2D Projection)', fontsize=14, fontweight='bold')
plt.xlabel('t-SNE Dimension 1', fontsize=12)
plt.ylabel('t-SNE Dimension 2', fontsize=12)
plt.tight_layout()
# Make sure the target folder exists; savefig does not create it.
figure_path = Path('figures/word2vec_tsne_projection.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()
print('âœ… t-SNE plot saved')

In [None]:
# Final report: vocabulary size, vector width and corpus size between two rules.
rule = '='*70
summary_lines = [
    rule,
    'WORD2VEC EMBEDDING SUMMARY',
    rule,
    f'Vocabulary size: {len(model.wv):,} words',
    f'Vector dimensions: {model.wv.vector_size}',
    f'Training corpus: {len(texts):,} narratives',
    '\nâœ… Word2Vec analysis complete!',
    rule,
]
for line in summary_lines:
    print(line)