In [1]:
import json

with open("../data/speeches_lemmatized.json") as f:
    speeches = json.load(f)


# Preprocessing the corpus

In [2]:
from collections import defaultdict


def group_speeches_by_quarter_century(speeches_list):
    """
    Group speeches into quarter centuries and create text files.

    Args:
        speeches_list: List of dictionaries containing speech data

    Returns:
        Dictionary with quarter century ranges as keys and filenames as values
    """
    # Group speeches by quarter century
    quarter_groups = defaultdict(list)

    for speech in speeches_list:
        year = int(speech['date'][:4])  # Get year from date
        # Calculate quarter century (1800-1824, 1825-1849, etc.)
        quarter_start = year - (year % 25)
        quarter_end = quarter_start + 24
        quarter_key = f"{quarter_start}-{quarter_end}"

        quarter_groups[quarter_key].append(speech)

    # Create text files for each quarter century
    file_paths = {}
    for quarter, speeches in quarter_groups.items():
        # Sort speeches by date
        speeches.sort(key=lambda x: x['date'])

        # Create lemmatized version
        lemma_text = "\n".join(s['lemmatized'] for s in speeches)
        lemma_filename = f"../data/speeches_{quarter}_lemmatized.txt"
        with open(lemma_filename, 'w', encoding='utf-8') as f:
            f.write(lemma_text)

        # Create transcript version
        transcript_text = "\n".join(s['transcript'] for s in speeches)
        transcript_filename = f"../data/speeches_{quarter}_transcript.txt"
        with open(transcript_filename, 'w', encoding='utf-8') as f:
            f.write(transcript_text)

        file_paths[quarter] = {
            'lemmatized': lemma_filename,
            'transcript': transcript_filename,
            'count': len(speeches)
        }

    # Print summary
    print("Quarter Century Statistics:")
    for quarter, info in file_paths.items():
        print(f"{quarter}: {info['count']} speeches")

    return file_paths


file_paths = group_speeches_by_quarter_century(speeches)

Quarter Century Statistics:
1800-1824: 59 speeches
1900-1924: 91 speeches
1975-1999: 150 speeches
1825-1849: 90 speeches
2000-2024: 164 speeches
1950-1974: 165 speeches
1925-1949: 103 speeches
1875-1899: 99 speeches
1775-1799: 28 speeches
1850-1874: 108 speeches
2025-2049: 1 speeches


# Building language models

In [11]:
from chronowords.algebra.svd import SVDAlgebra
from pathlib import Path

def create_embeddings_for_periods(file_paths, use_lemmatized=True):
    """
    Create word embeddings for each quarter-century period.

    Args:
        file_paths: Dictionary from group_speeches_by_quarter_century
        use_lemmatized: Whether to use lemmatized or transcript texts

    Returns:
        Dictionary of {period: SVDAlgebra model}
    """
    models = {}

    for period, info in file_paths.items():
        print(f"\nProcessing period {period}")

        # Choose which text version to use
        filename = info['lemmatized' if use_lemmatized else 'transcript']

        # Create generator for the corpus
        def read_corpus():
            with open(filename, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.strip():  # Skip empty lines
                        yield line.strip().lower()  # Lowercase everything

        # Initialize and train model
        model = SVDAlgebra(
            n_components=100,     # Smaller dimension for historical texts
            window_size=5,
            min_word_length=3,
            cms_width=1_000_000,  # 1M width should be enough for this corpus
            cms_depth=5
        )

        try:
            # Train model
            print(f"Training model for {period}...")
            model.train(read_corpus())

            # Print some statistics
            print(f"Vocabulary size: {len(model.vocabulary)}")
            print(f"Sample words: {model.vocabulary[:10]}")

            models[period] = model

        except Exception as e:
            print(f"Error processing period {period}: {str(e)}")

    return models

# Create models for both lemmatized and transcript versions
print("Creating models for lemmatized texts...")
lemma_models = create_embeddings_for_periods(file_paths, use_lemmatized=True)

print("\nCreating models for transcript texts...")
transcript_models = create_embeddings_for_periods(file_paths, use_lemmatized=False)

# Save models for later use
for period, model in lemma_models.items():
    save_path = Path(f"../models/lemmatized/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

for period, model in transcript_models.items():
    save_path = Path(f"../models/transcript/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

Creating models for lemmatized texts...

Processing period 1800-1824
Training model for 1800-1824...
Counting words and skipgrams...
Total words: 97700
Total skipgrams: 584554
Building vocabulary...
Vocabulary size: 1331
Computing PPMI matrix...
PPMI matrix shape: (1331, 1331)
PPMI matrix non-zeros: 0
Computing SVD...
Singular values: [7.27026383e-09 7.23292930e-09 7.18736310e-09 7.16579320e-09
 7.14742474e-09 7.13339457e-09 7.10757300e-09 7.10055834e-09
 7.07637587e-09 7.06131469e-09]
Final embeddings shape: (1331, 100)
Embeddings non-zeros: 133100
Min norm: 1.7573036827643673e-05
Max norm: 2.8281866194714123e-05
Mean norm: 2.221218214258625e-05
Vocabulary size: 1331
Sample words: ['the', 'and', 'have', 'that', 'which', 'our', 'with', 'for', 'they', 'their']

Processing period 1900-1924
Training model for 1900-1924...
Counting words and skipgrams...
Total words: 360878
Total skipgrams: 2120636
Building vocabulary...
Vocabulary size: 1383
Computing PPMI matrix...
PPMI matrix shape: (13

# Semantic shift

In [22]:
import numpy as np
import pickle
from pathlib import Path

def load_and_sort_models(base_path="../models/lemmatized"):
   models = {}
   for period_path in Path(base_path).iterdir():
       if period_path.is_dir():
           period = period_path.name
           embeddings = np.load(f"{period_path}/embeddings.npy")
           with open(f"{period_path}/vocabulary.pkl", 'rb') as f:
               vocabulary = pickle.load(f)
           models[period] = {'embeddings': embeddings, 'vocabulary': vocabulary}

   # Sort by start year
   sorted_periods = sorted(models.keys(), key=lambda x: int(x.split('-')[0]))
   return {period: models[period] for period in sorted_periods}

models = load_and_sort_models()
print("Periods in chronological order:", list(models.keys()))

target_words = [
   "freedom", "democracy", "constitution", "justice",
   "government", "power", "law", "authority",
   "america", "union", "state", "nation"
]


Periods in chronological order: ['1775-1799', '1800-1824', '1825-1849', '1850-1874', '1875-1899', '1900-1924', '1925-1949', '1950-1974', '1975-1999', '2000-2024', '2025-2049']


['the', 'and', 'have', 'that', 'which', 'with', 'for', 'our', 'will', 'they']

In [25]:
from chronowords.alignment.procrustes import ProcustesAligner


def analyze_shifts(models, target_words=None):
    """Analyze semantic shifts between consecutive periods."""
    periods = sorted(models.keys())
    results = {}

    for i in range(len(periods) - 1):
        period1, period2 = periods[i], periods[i + 1]
        model1, model2 = models[period1], models[period2]

        # Align embeddings
        aligner = ProcustesAligner()
        metrics = aligner.fit(
            model1["embeddings"],
            model2["embeddings"],
            model1["vocabulary"],
            model2["vocabulary"]
        )

        print(f"\nAligned {period1} -> {period2}")
        print(f"Aligned words: {metrics.num_aligned_words}")
        print(f"Average similarity: {metrics.average_cosine_similarity:.3f}")

        # Analyze specific words
        if target_words:
            shifts = []
            for word in target_words:
                sim = aligner.get_word_similarity(
                    word,
                    model1["embeddings"],
                    model2["embeddings"]
                )
                if sim is not None:
                    shifts.append((word, 1 - sim))  # Convert to distance

            # Sort by shift magnitude
            shifts.sort(key=lambda x: x[1], reverse=True)
            results[f"{period1}->{period2}"] = shifts

            print("\nTop shifted words:")
            for word, shift in shifts[:5]:
                print(f"{word}: {shift:.3f}")

    return results


# Analyze key political concepts
target_words = [
    "freedom", "democracy", "government", "power",
    "war", "peace", "america", "union", "state",
    "constitution", "rights", "justice", "law"
]

shifts = analyze_shifts(models, target_words)


Aligned 1775-1799 -> 1800-1824
Aligned words: 727
Average similarity: 0.318

Top shifted words:
freedom: 0.998
state: 0.724
america: 0.721
constitution: 0.707
law: 0.658

Aligned 1800-1824 -> 1825-1849
Aligned words: 798
Average similarity: 0.255

Top shifted words:
power: 0.885
law: 0.800
america: 0.797
peace: 0.762
union: 0.751

Aligned 1825-1849 -> 1850-1874
Aligned words: 823
Average similarity: 0.240

Top shifted words:
state: 1.006
union: 0.988
government: 0.859
freedom: 0.846
america: 0.838

Aligned 1850-1874 -> 1875-1899
Aligned words: 779
Average similarity: 0.233

Top shifted words:
america: 1.027
freedom: 1.024
war: 0.808
peace: 0.764
union: 0.715

Aligned 1875-1899 -> 1900-1924
Aligned words: 783
Average similarity: 0.219

Top shifted words:
government: 0.957
freedom: 0.948
war: 0.841
law: 0.806
union: 0.792

Aligned 1900-1924 -> 1925-1949
Aligned words: 782
Average similarity: 0.238

Top shifted words:
america: 0.947
war: 0.908
freedom: 0.877
democracy: 0.838
state: 0.816

In [34]:
# Load the data you already processed
periods = sorted(models.keys(), key=lambda x: int(x.split('-')[0]))

shift_data = []
# Extract start year from period for better x-axis
df['year'] = df['period'].apply(lambda x: int(x.split('→')[0].strip().split('-')[0]))

timeline = alt.Chart(df).mark_line(point=True).encode(
   x=alt.X('year:Q', title='Year'),
   y=alt.Y('shift:Q', title='Semantic Change'),
   color='word:N',
   tooltip=['word', 'year', 'shift']
).properties(
   width=800,
   height=400,
   title='Semantic Shifts Over Time'
).interactive()

timeline


In [41]:
df

Unnamed: 0,period,word,shift,year
0,1775-1799 → 1800-1824,freedom,0.998267,1775
1,1775-1799 → 1800-1824,government,0.521551,1775
2,1775-1799 → 1800-1824,power,0.588788,1775
3,1775-1799 → 1800-1824,war,0.619324,1775
4,1775-1799 → 1800-1824,peace,0.578794,1775
...,...,...,...,...
107,2000-2024 → 2025-2049,america,0.823258,2000
108,2000-2024 → 2025-2049,state,0.760452,2000
109,2000-2024 → 2025-2049,constitution,0.826764,2000
110,2000-2024 → 2025-2049,justice,0.768187,2000


In [42]:
shift_data = []
# Use first period as reference
base_period = periods[0]
base_model = models[base_period]

for current_period in periods[1:]:
    current_model = models[current_period]
    aligner = ProcustesAligner()
    metrics = aligner.fit(
        base_model['embeddings'],
        current_model['embeddings'],
        base_model['vocabulary'],
        current_model['vocabulary']
    )

    for word in target_words:
        sim = aligner.get_word_similarity(word, base_model['embeddings'], current_model['embeddings'])
        if sim is not None:
            shift_data.append({
                'period': current_period,
                'word': word,
                'cumulative_shift': 1 - sim
            })

df_cumulative = pd.DataFrame(shift_data)
df_cumulative['year'] = df_cumulative['period'].apply(lambda x: int(x.split('-')[0]))

cumulative_timeline = alt.Chart(df_cumulative).mark_line(point=True).encode(
    x=alt.X('year:Q', title='Year'),
    y=alt.Y('cumulative_shift:Q', title='Cumulative Semantic Change from 1775'),
    color='word:N',
    tooltip=['word', 'year', 'cumulative_shift']
).properties(
    width=800,
    height=400,
    title='Cumulative Semantic Shifts from First Period'
).interactive()

cumulative_timeline

In [44]:
df_cumulative

Unnamed: 0,period,word,cumulative_shift,year
0,1800-1824,freedom,0.998267,1800
1,1800-1824,government,0.521551,1800
2,1800-1824,power,0.588788,1800
3,1800-1824,war,0.619324,1800
4,1800-1824,peace,0.578794,1800
...,...,...,...,...
102,2025-2049,america,0.469035,2025
103,2025-2049,state,0.659055,2025
104,2025-2049,constitution,0.562302,2025
105,2025-2049,justice,0.471516,2025
