In [8]:
import json

with open("../data/speeches_lemmatized.json") as f:
    speeches = json.load(f)


# Preprocessing the corpus

In [10]:
from collections import defaultdict
from datetime import datetime

def group_speeches_by_quarter_century(speeches_list):
    """
    Group speeches into quarter centuries and create text files.

    Args:
        speeches_list: List of dictionaries containing speech data

    Returns:
        Dictionary with quarter century ranges as keys and filenames as values
    """
    # Group speeches by quarter century
    quarter_groups = defaultdict(list)

    for speech in speeches_list:
        year = int(speech['date'][:4])  # Get year from date
        # Calculate quarter century (1800-1824, 1825-1849, etc.)
        quarter_start = year - (year % 25)
        quarter_end = quarter_start + 24
        quarter_key = f"{quarter_start}-{quarter_end}"

        quarter_groups[quarter_key].append(speech)

    # Create text files for each quarter century
    file_paths = {}
    for quarter, speeches in quarter_groups.items():
        # Sort speeches by date
        speeches.sort(key=lambda x: x['date'])

        # Create lemmatized version
        lemma_text = "\n".join(s['lemmatized'] for s in speeches)
        lemma_filename = f"../data/speeches_{quarter}_lemmatized.txt"
        with open(lemma_filename, 'w', encoding='utf-8') as f:
            f.write(lemma_text)

        # Create transcript version
        transcript_text = "\n".join(s['transcript'] for s in speeches)
        transcript_filename = f"../data/speeches_{quarter}_transcript.txt"
        with open(transcript_filename, 'w', encoding='utf-8') as f:
            f.write(transcript_text)

        file_paths[quarter] = {
            'lemmatized': lemma_filename,
            'transcript': transcript_filename,
            'count': len(speeches)
        }

    # Print summary
    print("Quarter Century Statistics:")
    for quarter, info in file_paths.items():
        print(f"{quarter}: {info['count']} speeches")

    return file_paths


file_paths = group_speeches_by_quarter_century(speeches)

Quarter Century Statistics:
1800-1824: 59 speeches
1900-1924: 91 speeches
1975-1999: 150 speeches
1825-1849: 90 speeches
2000-2024: 164 speeches
1950-1974: 165 speeches
1925-1949: 103 speeches
1875-1899: 99 speeches
1775-1799: 28 speeches
1850-1874: 108 speeches
2025-2049: 1 speeches


# Building language models

In [11]:
from chronowords.algebra.svd import SVDAlgebra
from pathlib import Path

def create_embeddings_for_periods(file_paths, use_lemmatized=True):
    """
    Create word embeddings for each quarter-century period.

    Args:
        file_paths: Dictionary from group_speeches_by_quarter_century
        use_lemmatized: Whether to use lemmatized or transcript texts

    Returns:
        Dictionary of {period: SVDAlgebra model}
    """
    models = {}

    for period, info in file_paths.items():
        print(f"\nProcessing period {period}")

        # Choose which text version to use
        filename = info['lemmatized' if use_lemmatized else 'transcript']

        # Create generator for the corpus
        def read_corpus():
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():  # Skip empty lines
                        yield line.strip().lower()  # Lowercase everything

        # Initialize and train model
        model = SVDAlgebra(
            n_components=100,     # Smaller dimension for historical texts
            window_size=5,
            min_word_length=3,
            cms_width=1_000_000,  # 1M width should be enough for this corpus
            cms_depth=5
        )

        try:
            # Train model
            print(f"Training model for {period}...")
            model.train(read_corpus())

            # Print some statistics
            print(f"Vocabulary size: {len(model.vocabulary)}")
            print(f"Sample words: {model.vocabulary[:10]}")

            models[period] = model

        except Exception as e:
            print(f"Error processing period {period}: {str(e)}")

    return models

# Create models for both lemmatized and transcript versions
print("Creating models for lemmatized texts...")
lemma_models = create_embeddings_for_periods(file_paths, use_lemmatized=True)

print("\nCreating models for transcript texts...")
transcript_models = create_embeddings_for_periods(file_paths, use_lemmatized=False)

# Save models for later use
for period, model in lemma_models.items():
    save_path = Path(f"../models/lemmatized/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

for period, model in transcript_models.items():
    save_path = Path(f"../models/transcript/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

Creating models for lemmatized texts...

Processing period 1800-1824
Training model for 1800-1824...
Counting words and skipgrams...
Total words: 97700
Total skipgrams: 584554
Building vocabulary...
Vocabulary size: 1331
Computing PPMI matrix...
PPMI matrix shape: (1331, 1331)
PPMI matrix non-zeros: 0
Computing SVD...
Singular values: [7.27026383e-09 7.23292930e-09 7.18736310e-09 7.16579320e-09
 7.14742474e-09 7.13339457e-09 7.10757300e-09 7.10055834e-09
 7.07637587e-09 7.06131469e-09]
Final embeddings shape: (1331, 100)
Embeddings non-zeros: 133100
Min norm: 1.7573036827643673e-05
Max norm: 2.8281866194714123e-05
Mean norm: 2.221218214258625e-05
Vocabulary size: 1331
Sample words: ['the', 'and', 'have', 'that', 'which', 'our', 'with', 'for', 'they', 'their']

Processing period 1900-1924
Training model for 1900-1924...
Counting words and skipgrams...
Total words: 360878
Total skipgrams: 2120636
Building vocabulary...
Vocabulary size: 1383
Computing PPMI matrix...
PPMI matrix shape: (13

# Semantic shift