In [1]:
import json


with open("../data/speeches_lemmatized.json") as f:
    speeches = json.load(f)

# Preprocessing the corpus

In [2]:
from collections import defaultdict


def group_speeches_by_quarter_century(speeches_list):
    """Group speeches into quarter centuries and create text files.

    Args:
        speeches_list: List of dictionaries containing speech data

    Returns:
        Dictionary with quarter century ranges as keys and filenames as values

    """
    # Group speeches by quarter century
    quarter_groups = defaultdict(list)

    for speech in speeches_list:
        year = int(speech["date"][:4])  # Get year from date
        # Calculate quarter century (1800-1824, 1825-1849, etc.)
        quarter_start = year - (year % 25)
        quarter_end = quarter_start + 24
        quarter_key = f"{quarter_start}-{quarter_end}"

        quarter_groups[quarter_key].append(speech)

    # Create text files for each quarter century
    file_paths = {}
    for quarter, speeches in quarter_groups.items():
        # Sort speeches by date
        speeches.sort(key=lambda x: x["date"])

        # Create lemmatized version
        lemma_text = "\n".join(s["lemmatized"] for s in speeches)
        lemma_filename = f"../data/speeches_{quarter}_lemmatized.txt"
        with open(lemma_filename, "w", encoding="utf-8") as f:
            f.write(lemma_text)

        # Create transcript version
        transcript_text = "\n".join(s["transcript"] for s in speeches)
        transcript_filename = f"../data/speeches_{quarter}_transcript.txt"
        with open(transcript_filename, "w", encoding="utf-8") as f:
            f.write(transcript_text)

        file_paths[quarter] = {
            "lemmatized": lemma_filename,
            "transcript": transcript_filename,
            "count": len(speeches),
        }

    # Print summary
    print("Quarter Century Statistics:")
    for quarter, info in file_paths.items():
        print(f"{quarter}: {info['count']} speeches")

    return file_paths


file_paths = group_speeches_by_quarter_century(speeches)

Quarter Century Statistics:
1800-1824: 59 speeches
1900-1924: 91 speeches
1975-1999: 150 speeches
1825-1849: 90 speeches
2000-2024: 164 speeches
1950-1974: 165 speeches
1925-1949: 103 speeches
1875-1899: 99 speeches
1775-1799: 28 speeches
1850-1874: 108 speeches
2025-2049: 2 speeches


# Building language models

In [3]:
from pathlib import Path

from chronowords.algebra.svd import SVDAlgebra


def create_embeddings_for_periods(file_paths, use_lemmatized=True):
    """Create word embeddings for each quarter-century period.

    Args:
        file_paths: Dictionary from group_speeches_by_quarter_century
        use_lemmatized: Whether to use lemmatized or transcript texts

    Returns:
        Dictionary of {period: SVDAlgebra model}

    """
    models = {}

    for period, info in file_paths.items():
        print(f"\nProcessing period {period}")

        # Choose which text version to use
        filename = info["lemmatized" if use_lemmatized else "transcript"]

        # Create generator for the corpus
        def read_corpus():
            with open(filename, encoding="utf-8") as f:
                for line in f:
                    if line.strip():  # Skip empty lines
                        yield line.strip().lower()  # Lowercase everything

        # Initialize and train model
        model = SVDAlgebra(
            n_components=100,  # Smaller dimension for historical texts
            window_size=5,
            min_word_length=3,
            cms_width=1_000_000,  # 1M width should be enough for this corpus
            cms_depth=5,
        )

        try:
            # Train model
            # print(f"Training model for {period}...")
            model.train(read_corpus())

            # Print some statistics
            # print(f"Vocabulary size: {len(model.vocabulary)}")
            # print(f"Sample words: {model.vocabulary[:10]}")

            models[period] = model
        except Exception as e:
            print(f"Error processing period {period}: {e!s}")

    return models


# Create models for both lemmatized and transcript versions
print("Creating models for lemmatized texts...")
lemma_models = create_embeddings_for_periods(file_paths, use_lemmatized=True)

print("\nCreating models for transcript texts...")
transcript_models = create_embeddings_for_periods(file_paths, use_lemmatized=False)

# Save models for later use
for period, model in lemma_models.items():
    save_path = Path(f"../models/lemmatized/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

for period, model in transcript_models.items():
    save_path = Path(f"../models/transcript/{period}")
    save_path.mkdir(parents=True, exist_ok=True)
    model.save_model(save_path)

Creating models for lemmatized texts...

Processing period 1800-1824
Counting words and skipgrams...
Building vocabulary...
Final embeddings shape: (1331, 100)
Embeddings non-zeros: 133100
Min norm: 1.7154927366809808e-05

Processing period 1900-1924
Counting words and skipgrams...
Building vocabulary...
Final embeddings shape: (1383, 100)
Embeddings non-zeros: 138300
Min norm: 1.7672577773045495e-05

Processing period 1975-1999
Counting words and skipgrams...
Building vocabulary...
Final embeddings shape: (1190, 100)
Embeddings non-zeros: 119000
Min norm: 1.8160011020070682e-05

Processing period 1825-1849
Counting words and skipgrams...
Building vocabulary...
Final embeddings shape: (1298, 100)
Embeddings non-zeros: 129800
Min norm: 1.7878387962087844e-05

Processing period 2000-2024
Counting words and skipgrams...
Building vocabulary...
Final embeddings shape: (1188, 100)
Embeddings non-zeros: 118800
Min norm: 1.802807329202471e-05

Processing period 1950-1974
Counting words and ski

# Semantic shift

In [4]:
from pathlib import Path


def load_and_sort_models(base_path="../models/lemmatized"):
    models = {}
    for period_path in Path(base_path).iterdir():
        if period_path.is_dir():
            period = period_path.name
            loaded_model = SVDAlgebra()
            loaded_model.load_model(period_path)
            models[period] = loaded_model

    # Sort by start year
    sorted_periods = sorted(models.keys(), key=lambda x: int(x.split("-")[0]))
    return {period: models[period] for period in sorted_periods}


models = load_and_sort_models()
print("Periods in chronological order:", list(models.keys()))

target_words = [
    "freedom",
    "democracy",
    "constitution",
    "justice",
    "government",
    "power",
    "law",
    "authority",
    "america",
    "union",
    "state",
    "nation",
]

Periods in chronological order: ['1775-1799', '1800-1824', '1825-1849', '1850-1874', '1875-1899', '1900-1924', '1925-1949', '1950-1974', '1975-1999', '2000-2024', '2025-2049']


In [5]:
from chronowords.alignment.procrustes import ProcustesAligner


def analyze_shifts(models, target_words=None):
    """Analyze semantic shifts between consecutive periods."""
    periods = sorted(models.keys())
    results = {}

    for i in range(len(periods) - 1):
        period1, period2 = periods[i], periods[i + 1]
        model1, model2 = models[period1], models[period2]

        # Align embeddings
        aligner = ProcustesAligner()
        metrics = aligner.fit(
            model1.embeddings,
            model2.embeddings,
            model1.vocabulary,
            model2.vocabulary,
        )

        print(f"\nAligned {period1} -> {period2}")
        print(f"Aligned words: {metrics.num_aligned_words}")
        print(f"Average similarity: {metrics.average_cosine_similarity:.3f}")

        # Analyze specific words
        if target_words:
            shifts = []
            for word in target_words:
                sim = aligner.get_word_similarity(
                    word, model1.embeddings, model2.embeddings
                )
                if sim is not None:
                    shifts.append((word, 1 - sim))  # Convert to distance

            # Sort by shift magnitude
            shifts.sort(key=lambda x: x[1], reverse=True)
            results[f"{period1}->{period2}"] = shifts

            print("\nTop shifted words:")
            for word, shift in shifts[:5]:
                print(f"{word}: {shift:.3f}")

    return results


# Analyze key political concepts
target_words = [
    "freedom",
    "democracy",
    "government",
    "power",
    "war",
    "peace",
    "america",
    "union",
    "state",
    "constitution",
    "rights",
    "justice",
    "law",
]

shifts = analyze_shifts(models, target_words)


Aligned 1775-1799 -> 1800-1824
Aligned words: 727
Average similarity: 0.318

Top shifted words:
freedom: 1.026
union: 0.772
power: 0.765
constitution: 0.725
law: 0.708

Aligned 1800-1824 -> 1825-1849
Aligned words: 791
Average similarity: 0.301

Top shifted words:
state: 0.809
union: 0.797
war: 0.785
freedom: 0.720
law: 0.720

Aligned 1825-1849 -> 1850-1874
Aligned words: 825
Average similarity: 0.299

Top shifted words:
union: 0.774
constitution: 0.765
war: 0.751
power: 0.749
government: 0.742

Aligned 1850-1874 -> 1875-1899
Aligned words: 780
Average similarity: 0.304

Top shifted words:
freedom: 1.143
state: 0.866
law: 0.826
peace: 0.799
government: 0.757

Aligned 1875-1899 -> 1900-1924
Aligned words: 781
Average similarity: 0.305

Top shifted words:
freedom: 0.954
war: 0.849
state: 0.782
constitution: 0.728
peace: 0.711

Aligned 1900-1924 -> 1925-1949
Aligned words: 781
Average similarity: 0.302

Top shifted words:
democracy: 1.081
law: 0.911
state: 0.796
power: 0.771
union: 0.756

In [6]:
from pathlib import Path

import altair as alt
import pandas as pd


# Load preprocessed models
models = load_and_sort_models()
periods = sorted(models.keys(), key=lambda x: int(x.split("-")[0]))

shift_data = []
# For each period transition and its shifts
for i in range(len(periods) - 1):
    period1, period2 = periods[i], periods[i + 1]
    period_pair = f"{period1} → {period2}"

    # Get the shift data for this period transition
    model1, model2 = models[period1], models[period2]
    aligner = ProcustesAligner()
    metrics = aligner.fit(
        model1.embeddings,
        model2.embeddings,
        model1.vocabulary,
        model2.vocabulary,
    )

    # Calculate shifts for target words
    for word in target_words:
        sim = aligner.get_word_similarity(word, model1.embeddings, model2.embeddings)
        if sim is not None:
            shift_data.append(
                {
                    "period": period_pair,
                    "word": word,
                    "shift": 1 - sim,  # Convert similarity to distance
                    "year": int(period1.split("-")[0]),
                }
            )

# Create DataFrame
df = pd.DataFrame(shift_data)

timeline = (
    alt.Chart(df)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:Q", title="Year"),
        y=alt.Y("shift:Q", title="Semantic Change"),
        color="word:N",
        tooltip=["word", "year", "shift"],
    )
    .properties(width=800, height=400, title="Semantic Shifts Over Time")
    .interactive()
)

# Display the visualization
timeline

In [7]:
df

Unnamed: 0,period,word,shift,year
0,1775-1799 → 1800-1824,freedom,1.026043,1775
1,1775-1799 → 1800-1824,government,0.699798,1775
2,1775-1799 → 1800-1824,power,0.765479,1775
3,1775-1799 → 1800-1824,war,0.555635,1775
4,1775-1799 → 1800-1824,peace,0.663436,1775
...,...,...,...,...
109,2000-2024 → 2025-2049,america,0.799971,2000
110,2000-2024 → 2025-2049,state,0.660920,2000
111,2000-2024 → 2025-2049,constitution,0.704149,2000
112,2000-2024 → 2025-2049,justice,0.655690,2000


In [8]:
shift_data = []
# Use first period as reference
base_period = periods[0]
base_model = models[base_period]

for current_period in periods[1:]:
    current_model = models[current_period]
    aligner = ProcustesAligner()
    metrics = aligner.fit(
        base_model.embeddings,
        current_model.embeddings,
        base_model.vocabulary,
        current_model.vocabulary,
    )

    for word in target_words:
        sim = aligner.get_word_similarity(
            word, base_model.embeddings, current_model.embeddings
        )
        if sim is not None:
            shift_data.append(
                {"period": current_period, "word": word, "cumulative_shift": 1 - sim}
            )

df_cumulative = pd.DataFrame(shift_data)
df_cumulative["year"] = df_cumulative["period"].apply(lambda x: int(x.split("-")[0]))

cumulative_timeline = (
    alt.Chart(df_cumulative)
    .mark_line(point=True)
    .encode(
        x=alt.X("year:Q", title="Year"),
        y=alt.Y("cumulative_shift:Q", title="Cumulative Semantic Change from 1775"),
        color="word:N",
        tooltip=["word", "year", "cumulative_shift"],
    )
    .properties(
        width=800, height=400, title="Cumulative Semantic Shifts from First Period"
    )
    .interactive()
)

cumulative_timeline

In [9]:
df_cumulative

Unnamed: 0,period,word,cumulative_shift,year
0,1800-1824,freedom,1.026043,1800
1,1800-1824,government,0.699798,1800
2,1800-1824,power,0.765479,1800
3,1800-1824,war,0.555635,1800
4,1800-1824,peace,0.663436,1800
...,...,...,...,...
104,2025-2049,america,0.722062,2025
105,2025-2049,state,0.577215,2025
106,2025-2049,constitution,0.432402,2025
107,2025-2049,justice,0.613866,2025


# Topic modeling with NMF

In [10]:
from chronowords.topics.nmf import TopicModel


def analyze_topics_by_period(models, n_topics=10):
    """Create topic models for each time period using existing PPMI matrices.

    Args:
        models: Dictionary of period -> model data from load_and_sort_models()
        n_topics: Number of topics to extract

    """
    topic_models = {}

    for period, model_data in models.items():
        print(f"\nAnalyzing topics for {period}")

        # Get embeddings and vocabulary
        embeddings = model_data.M_dense
        embeddings[embeddings < 0.0] = 0.0  # quick fix for negative values
        vocabulary = model_data.vocabulary

        # Create topic model on existing embeddings
        topic_model = TopicModel(n_topics=n_topics)
        topic_model.fit(embeddings, vocabulary)

        # Print topics
        print("\nTop topics:")
        topic_model.print_topics()

        topic_models[period] = topic_model

    return topic_models


# Create topic models using existing data
topic_models = analyze_topics_by_period(models)


Analyzing topics for 1775-1799





Top topics:

Topic 0:
  inconvenience: 0.0026
  over: 0.0026
  take: 0.0025
  amount: 0.0025
  obey: 0.0025
  judiciary: 0.0025
  progressive: 0.0024
  embarrassment: 0.0024
  remove: 0.0024
  particular: 0.0023

Topic 1:
  history: 0.0034
  punish: 0.0029
  lay: 0.0028
  make: 0.0024
  inform: 0.0023
  accommodation: 0.0023
  subscribe: 0.0023
  compel: 0.0022
  grant: 0.0022
  weaken: 0.0022

Topic 2:
  executive: 0.0029
  election: 0.0029
  domestic: 0.0029
  month: 0.0028
  seem: 0.0026
  humble: 0.0026
  commission: 0.0026
  conviction: 0.0025
  impose: 0.0025
  public: 0.0024

Topic 3:
  maxim: 0.0031
  population: 0.0030
  secure: 0.0027
  retirement: 0.0027
  adams: 0.0027
  john: 0.0025
  fund: 0.0025
  capture: 0.0024
  disclose: 0.0023
  consistent: 0.0023

Topic 4:
  competent: 0.0034
  wait: 0.0030
  way: 0.0029
  all: 0.0028
  moderation: 0.0026
  shew: 0.0026
  more: 0.0025
  authorize: 0.0024
  habit: 0.0023
  states: 0.0023

Topic 5:
  friendly: 0.0031
  amicable: 0.0




Top topics:

Topic 0:
  private: 0.0029
  rise: 0.0029
  restriction: 0.0029
  peculiarly: 0.0028
  destiny: 0.0026
  present: 0.0026
  late: 0.0026
  proclamation: 0.0026
  piracy: 0.0026
  water: 0.0025

Topic 1:
  brother: 0.0029
  precede: 0.0029
  involve: 0.0028
  rule: 0.0027
  similar: 0.0027
  nevertheless: 0.0027
  rely: 0.0026
  conviction: 0.0025
  sudden: 0.0025
  ocean: 0.0025

Topic 2:
  supply: 0.0029
  give: 0.0028
  defect: 0.0028
  erect: 0.0027
  prayer: 0.0026
  proof: 0.0026
  essentially: 0.0026
  defense: 0.0025
  execute: 0.0025
  intention: 0.0025

Topic 3:
  neither: 0.0030
  hope: 0.0030
  taxis: 0.0030
  desire: 0.0030
  public: 0.0028
  proposition: 0.0027
  home: 0.0026
  capital: 0.0025
  behold: 0.0025
  resolution: 0.0024

Topic 4:
  foundation: 0.0031
  importance: 0.0029
  regulate: 0.0027
  witness: 0.0026
  than: 0.0025
  official: 0.0025
  ohio: 0.0024
  loan: 0.0024
  seaport: 0.0023
  prohibit: 0.0023

Topic 5:
  open: 0.0044
  russia: 0.0029
 




Top topics:

Topic 0:
  matter: 0.0033
  statement: 0.0032
  necessity: 0.0032
  war: 0.0032
  people: 0.0032
  president: 0.0030
  assent: 0.0030
  price: 0.0028
  with: 0.0026
  large: 0.0026

Topic 1:
  practical: 0.0030
  also: 0.0029
  add: 0.0028
  unite: 0.0027
  assert: 0.0026
  house: 0.0025
  confide: 0.0025
  sincere: 0.0024
  out: 0.0024
  constantly: 0.0024

Topic 2:
  disregard: 0.0029
  apparent: 0.0028
  side: 0.0027
  anxious: 0.0027
  humanity: 0.0026
  discrimination: 0.0026
  prosecute: 0.0025
  position: 0.0025
  beyond: 0.0025
  calculate: 0.0025

Topic 3:
  copy: 0.0031
  five: 0.0029
  adequate: 0.0029
  since: 0.0028
  society: 0.0026
  million: 0.0026
  apply: 0.0025
  ordinance: 0.0025
  exercise: 0.0025
  what: 0.0024

Topic 4:
  note: 0.0029
  directly: 0.0029
  exception: 0.0029
  same: 0.0028
  title: 0.0028
  troop: 0.0026
  convention: 0.0026
  meeting: 0.0026
  reflection: 0.0026
  safety: 0.0026

Topic 5:
  speculation: 0.0035
  private: 0.0031
  des




Top topics:

Topic 0:
  ten: 0.0028
  burden: 0.0027
  repeal: 0.0026
  reference: 0.0026
  origin: 0.0025
  clear: 0.0025
  diminish: 0.0025
  afford: 0.0025
  suppose: 0.0025
  consider: 0.0025

Topic 1:
  truth: 0.0038
  operation: 0.0029
  new: 0.0029
  settlement: 0.0028
  still: 0.0028
  human: 0.0027
  try: 0.0027
  put: 0.0027
  census: 0.0026
  destruction: 0.0026

Topic 2:
  passage: 0.0031
  exclude: 0.0030
  whereas: 0.0030
  command: 0.0029
  object: 0.0029
  postal: 0.0027
  possession: 0.0026
  largely: 0.0026
  taxation: 0.0026
  enterprise: 0.0025

Topic 3:
  hereafter: 0.0029
  hostile: 0.0028
  pension: 0.0027
  senator: 0.0026
  agitation: 0.0026
  favorable: 0.0025
  regulate: 0.0025
  republican: 0.0025
  bring: 0.0025
  empire: 0.0025

Topic 4:
  safety: 0.0032
  proclamation: 0.0027
  slavery: 0.0026
  seize: 0.0026
  immediate: 0.0025
  see: 0.0025
  louisiana: 0.0024
  insist: 0.0024
  popular: 0.0024
  1859: 0.0024

Topic 5:
  civilization: 0.0030
  washingt




Top topics:

Topic 0:
  draw: 0.0029
  field: 0.0029
  provide: 0.0029
  payment: 0.0028
  more: 0.0027
  propose: 0.0027
  correct: 0.0025
  restriction: 0.0025
  troop: 0.0024
  rise: 0.0024

Topic 1:
  position: 0.0031
  difference: 0.0027
  occasion: 0.0026
  properly: 0.0025
  building: 0.0025
  delegate: 0.0025
  state: 0.0023
  continue: 0.0023
  spain: 0.0023
  concern: 0.0023

Topic 2:
  dependent: 0.0027
  dangerous: 0.0027
  engage: 0.0026
  struggle: 0.0026
  doubt: 0.0025
  existence: 0.0025
  certainly: 0.0025
  raise: 0.0025
  indeed: 0.0023
  therefore: 0.0023

Topic 3:
  deficiency: 0.0028
  frontier: 0.0028
  entitle: 0.0027
  own: 0.0026
  police: 0.0025
  city: 0.0025
  committee: 0.0024
  register: 0.0024
  ought: 0.0024
  1st: 0.0024

Topic 4:
  original: 0.0029
  progress: 0.0027
  spirit: 0.0027
  realize: 0.0027
  express: 0.0027
  practical: 0.0026
  possible: 0.0025
  degree: 0.0025
  work: 0.0024
  every: 0.0024

Topic 5:
  this: 0.0028
  such: 0.0026
  mak




Top topics:

Topic 0:
  executive: 0.0034
  ask: 0.0033
  considerable: 0.0026
  definite: 0.0026
  mankind: 0.0025
  russia: 0.0025
  direction: 0.0025
  make: 0.0025
  total: 0.0024
  position: 0.0024

Topic 1:
  senate: 0.0030
  actual: 0.0027
  joint: 0.0027
  secretary: 0.0026
  vast: 0.0025
  perform: 0.0025
  region: 0.0025
  end: 0.0024
  adequate: 0.0024
  amendment: 0.0024

Topic 2:
  meet: 0.0031
  respective: 0.0030
  sufficient: 0.0029
  just: 0.0026
  furnish: 0.0025
  british: 0.0025
  communication: 0.0025
  thirty: 0.0024
  ideal: 0.0024
  fashion: 0.0023

Topic 3:
  division: 0.0030
  fiscal: 0.0029
  corps: 0.0027
  add: 0.0026
  porto: 0.0026
  than: 0.0026
  cent: 0.0024
  expenditure: 0.0024
  least: 0.0024
  fact: 0.0024

Topic 4:
  thus: 0.0028
  house: 0.0027
  precede: 0.0027
  burden: 0.0026
  progress: 0.0025
  europe: 0.0025
  forward: 0.0025
  coast: 0.0025
  accept: 0.0024
  necessity: 0.0024

Topic 5:
  judgment: 0.0030
  advantage: 0.0027
  hour: 0.002




Top topics:

Topic 0:
  matter: 0.0029
  sufficient: 0.0028
  assume: 0.0028
  billion: 0.0027
  eight: 0.0027
  mine: 0.0026
  bureau: 0.0026
  promise: 0.0026
  around: 0.0025
  her: 0.0024

Topic 1:
  home: 0.0034
  alone: 0.0033
  modern: 0.0029
  enterprise: 0.0029
  careful: 0.0029
  weapon: 0.0027
  organize: 0.0027
  love: 0.0026
  report: 0.0025
  represent: 0.0025

Topic 2:
  recently: 0.0030
  good: 0.0029
  commission: 0.0028
  locality: 0.0026
  fight: 0.0026
  name: 0.0026
  middle: 0.0026
  fact: 0.0025
  hemisphere: 0.0025
  pass: 0.0025

Topic 3:
  rome: 0.0040
  ship: 0.0038
  limitation: 0.0030
  territory: 0.0029
  administration: 0.0029
  moral: 0.0027
  movement: 0.0027
  civilization: 0.0026
  doubt: 0.0024
  forth: 0.0024

Topic 4:
  attain: 0.0031
  not: 0.0029
  threaten: 0.0029
  purchase: 0.0027
  well: 0.0027
  make: 0.0027
  land: 0.0026
  chance: 0.0026
  machine: 0.0026
  economic: 0.0026

Topic 5:
  nor: 0.0035
  island: 0.0034
  end: 0.0033
  represen




Top topics:

Topic 0:
  conversation: 0.0034
  division: 0.0034
  collective: 0.0030
  society: 0.0030
  period: 0.0030
  move: 0.0030
  dangerous: 0.0029
  against: 0.0029
  express: 0.0029
  deserve: 0.0028

Topic 1:
  even: 0.0038
  profit: 0.0032
  whatever: 0.0031
  their: 0.0031
  flight: 0.0030
  response: 0.0030
  mr.: 0.0029
  formosa: 0.0029
  out: 0.0029
  around: 0.0028

Topic 2:
  something: 0.0031
  charge: 0.0030
  leave: 0.0029
  protection: 0.0028
  territory: 0.0028
  die: 0.0028
  terror: 0.0027
  high: 0.0027
  heart: 0.0027
  career: 0.0027

Topic 3:
  protection: 0.0032
  too: 0.0031
  entire: 0.0031
  safe: 0.0030
  examine: 0.0030
  divide: 0.0030
  khrushchev: 0.0030
  color: 0.0029
  incentive: 0.0028
  easy: 0.0028

Topic 4:
  republic: 0.0030
  substantial: 0.0027
  revenue: 0.0027
  worker: 0.0027
  word: 0.0027
  now: 0.0027
  rather: 0.0026
  production: 0.0025
  something: 0.0025
  hemisphere: 0.0025

Topic 5:
  continue: 0.0031
  occasion: 0.0030
  kin




Top topics:

Topic 0:
  natural: 0.0034
  housing: 0.0032
  very: 0.0031
  rule: 0.0031
  four: 0.0030
  national: 0.0030
  enough: 0.0029
  never: 0.0028
  receive: 0.0028
  abuse: 0.0027

Topic 1:
  foreign: 0.0033
  belong: 0.0032
  objective: 0.0032
  liberal: 0.0031
  wage: 0.0031
  join: 0.0031
  restore: 0.0030
  training: 0.0028
  will: 0.0028
  lot: 0.0028

Topic 2:
  aim: 0.0037
  serve: 0.0033
  remove: 0.0033
  nicaragua: 0.0033
  quarter: 0.0032
  demonstrate: 0.0032
  write: 0.0031
  100: 0.0030
  transportation: 0.0030
  which: 0.0029

Topic 3:
  assistance: 0.0039
  bush: 0.0036
  current: 0.0032
  exchange: 0.0031
  capability: 0.0030
  afghanistan: 0.0030
  key: 0.0030
  you: 0.0029
  when: 0.0028
  law: 0.0028

Topic 4:
  learn: 0.0038
  fast: 0.0036
  review: 0.0035
  reduction: 0.0033
  especially: 0.0032
  dignity: 0.0032
  place: 0.0031
  teacher: 0.0029
  somebody: 0.0028
  would: 0.0028

Topic 5:
  why: 0.0033
  important: 0.0033
  address: 0.0032
  rest: 0.00




Top topics:

Topic 0:
  fire: 0.0037
  organization: 0.0034
  source: 0.0033
  force: 0.0032
  accountable: 0.0030
  lift: 0.0030
  plan: 0.0030
  each: 0.0030
  urge: 0.0029
  stability: 0.0029

Topic 1:
  development: 0.0040
  shape: 0.0035
  actually: 0.0034
  immediately: 0.0033
  which: 0.0031
  eliminate: 0.0031
  stem: 0.0031
  consequence: 0.0030
  fuel: 0.0029
  national: 0.0029

Topic 2:
  research: 0.0036
  official: 0.0036
  open: 0.0035
  america: 0.0033
  class="s1">i: 0.0032
  civil: 0.0032
  religious: 0.0031
  allow: 0.0030
  additional: 0.0029
  spending: 0.0029

Topic 3:
  republicans: 0.0041
  person: 0.0033
  use: 0.0031
  mayor: 0.0031
  follow: 0.0030
  event: 0.0030
  sanction: 0.0030
  commit: 0.0029
  set: 0.0028
  bear: 0.0028

Topic 4:
  refuse: 0.0037
  resource: 0.0035
  choose: 0.0035
  possibility: 0.0033
  difficult: 0.0033
  transition: 0.0031
  stake: 0.0030
  address: 0.0029
  perhaps: 0.0029
  within: 0.0029

Topic 5:
  remove: 0.0037
  industry: 0



In [11]:
def process_topic_evolution(topic_models):
    """Process topic models into evolution chains for visualization.

    Args:
        topic_models: Dictionary mapping periods to TopicModel instances

    Returns:
        List of dicts containing evolution data for visualization

    """
    all_data = []
    # Sort periods chronologically
    periods = sorted(topic_models.keys(), key=lambda x: int(x.split("-")[0]))

    # Initialize chains from first period
    first_period = periods[0]
    first_model = topic_models[first_period]

    # Create initial topic chains
    chains = {}
    for idx, topic in enumerate(first_model.topics):
        chain_id = f"Chain_{idx}"
        top_words = [word for word, _ in topic.words[:3]]
        topic_label = ", ".join(top_words)

        chains[chain_id] = {
            "id": chain_id,
            "current_topic": topic,
            "label": topic_label,
        }

        # Add first period data point
        all_data.append(
            {
                "year": int(first_period.split("-")[0]),
                "period": first_period,
                "chain_id": chain_id,
                "topic_label": topic_label,
                "weight": topic.words[0][1],  # Use weight of top word
                "words": topic_label,
            }
        )

    # Follow alignments through subsequent periods
    for i in range(len(periods) - 1):
        period1, period2 = periods[i], periods[i + 1]
        model1 = topic_models[period1]
        model2 = topic_models[period2]

        # Get aligned topics
        aligned_topics = model1.align_with(model2)

        # Update chains based on alignments
        new_chains = {}
        used_target_topics = set()

        for aligned in aligned_topics:
            # Find chain containing source topic
            source_chain = None
            for chain_id, chain in chains.items():
                if chain["current_topic"].id == aligned.source_topic.id:
                    source_chain = chain
                    break

            if source_chain and aligned.target_topic.id not in used_target_topics:
                chain_id = source_chain["id"]
                top_words = [word for word, _ in aligned.target_topic.words[:3]]
                topic_label = ", ".join(top_words)

                # Add data point for this period
                all_data.append(
                    {
                        "year": int(period2.split("-")[0]),
                        "period": period2,
                        "chain_id": chain_id,
                        "topic_label": source_chain[
                            "label"
                        ],  # Keep original label for continuity
                        "weight": aligned.target_topic.words[0][1],
                        "words": topic_label,  # Current words for tooltip
                    }
                )

                # Update chain
                new_chains[chain_id] = {
                    "id": chain_id,
                    "current_topic": aligned.target_topic,
                    "label": source_chain["label"],
                }
                used_target_topics.add(aligned.target_topic.id)

        chains = new_chains

    return all_data


def plot_topic_trends(topic_models):
    """Create interactive visualization of topic evolution over time.

    Args:
        topic_models: Dictionary mapping periods to TopicModel instances

    Returns:
        altair.Chart: Interactive visualization

    """
    import altair as alt
    import pandas as pd

    # Process data
    all_data = process_topic_evolution(topic_models)
    df = pd.DataFrame(all_data)

    # Create base chart
    base = alt.Chart(df).encode(
        x=alt.X(
            "year:Q",
            title="Year",
            scale=alt.Scale(domain=[df.year.min(), df.year.max()]),
        ),
        color=alt.Color(
            "topic_label:N",
            legend=alt.Legend(title="Topics", orient="bottom", columns=2),
        ),
        tooltip=[
            alt.Tooltip("year:Q", title="Year"),
            alt.Tooltip("words:N", title="Top Words"),
            alt.Tooltip("weight:Q", title="Weight", format=".3f"),
        ],
    )

    # Create line chart with points
    lines = base.mark_line(size=2).encode(
        y=alt.Y("weight:Q", title="Topic Weight"), detail="chain_id:N"
    )

    points = base.mark_circle(size=60).encode(y=alt.Y("weight:Q", title="Topic Weight"))

    # Combine charts
    chart = (
        (lines + points)
        .properties(width=800, height=400, title="Topic Evolution Over Time")
        .interactive()
    )

    # Add selection
    topic_selection = alt.selection_point(fields=["topic_label"], bind="legend")

    chart = chart.add_params(topic_selection).encode(
        opacity=alt.condition(topic_selection, alt.value(1), alt.value(0.2))
    )

    return chart


# Example usage:
topic_models = analyze_topics_by_period(models)
chart = plot_topic_trends(topic_models)
chart
# chart.save('topic_evolution.html')  # Save interactive chart


Analyzing topics for 1775-1799





Top topics:

Topic 0:
  punishment: 0.0029
  reason: 0.0027
  mint: 0.0027
  health: 0.0026
  people: 0.0026
  know: 0.0025
  expect: 0.0025
  liberal: 0.0025
  convenience: 0.0025
  competent: 0.0024

Topic 1:
  surrender: 0.0030
  accelerate: 0.0029
  further: 0.0027
  encouragement: 0.0026
  assist: 0.0026
  humane: 0.0026
  half: 0.0025
  gratitude: 0.0025
  diffuse: 0.0025
  day: 0.0024

Topic 2:
  executive: 0.0029
  service: 0.0027
  benevolence: 0.0027
  month: 0.0027
  offender: 0.0026
  check: 0.0026
  commensurate: 0.0026
  george: 0.0026
  partial: 0.0025
  election: 0.0025

Topic 3:
  report: 0.0032
  limit: 0.0028
  appropriation: 0.0027
  continuance: 0.0027
  universal: 0.0026
  answer: 0.0026
  other: 0.0026
  minister: 0.0026
  river: 0.0025
  reside: 0.0024

Topic 4:
  ambition: 0.0030
  renew: 0.0027
  island: 0.0026
  forbid: 0.0026
  mark: 0.0025
  board: 0.0025
  some: 0.0025
  character: 0.0025
  facilitate: 0.0025
  fishery: 0.0024

Topic 5:
  implore: 0.0029





Top topics:

Topic 0:
  rather: 0.0027
  late: 0.0027
  derive: 0.0026
  previous: 0.0026
  line: 0.0026
  current: 0.0026
  station: 0.0025
  order: 0.0024
  continue: 0.0024
  themselves: 0.0024

Topic 1:
  impose: 0.0038
  man: 0.0031
  island: 0.0027
  april: 0.0027
  south: 0.0026
  individual: 0.0026
  loan: 0.0025
  national: 0.0024
  majesty: 0.0024
  purchase: 0.0023

Topic 2:
  resource: 0.0033
  public: 0.0033
  loss: 0.0032
  scene: 0.0028
  involve: 0.0028
  question: 0.0027
  desire: 0.0027
  neither: 0.0027
  let: 0.0027
  proposition: 0.0027

Topic 3:
  open: 0.0043
  accrue: 0.0028
  regulate: 0.0027
  trust: 0.0026
  sustain: 0.0026
  suppression: 0.0025
  same: 0.0025
  dollar: 0.0025
  convention: 0.0025
  well: 0.0024

Topic 4:
  pass: 0.0040
  pay: 0.0031
  similar: 0.0030
  earth: 0.0029
  promote: 0.0028
  coast: 0.0028
  participation: 0.0028
  attention: 0.0027
  revolution: 0.0026
  forbid: 0.0025

Topic 5:
  delay: 0.0028
  prayer: 0.0028
  these: 0.0028
  




Top topics:

Topic 0:
  title: 0.0036
  territory: 0.0035
  energy: 0.0030
  export: 0.0027
  enterprise: 0.0026
  troop: 0.0025
  magistrate: 0.0025
  present: 0.0025
  constituent: 0.0025
  add: 0.0025

Topic 1:
  position: 0.0031
  distribution: 0.0029
  aggregate: 0.0027
  judicial: 0.0026
  check: 0.0026
  beyond: 0.0025
  for: 0.0025
  discretion: 0.0025
  blessing: 0.0023
  effectually: 0.0023

Topic 2:
  1832: 0.0030
  speculation: 0.0029
  active: 0.0029
  capacity: 0.0029
  indulge: 0.0027
  prosperous: 0.0027
  and: 0.0027
  particularly: 0.0027
  sign: 0.0026
  character: 0.0026

Topic 3:
  copy: 0.0030
  province: 0.0029
  when: 0.0029
  exceed: 0.0028
  merchant: 0.0028
  community: 0.0028
  performance: 0.0026
  map: 0.0026
  reason: 0.0026
  happily: 0.0026

Topic 4:
  desirable: 0.0029
  force: 0.0028
  reserve: 0.0026
  suggestion: 0.0026
  injury: 0.0026
  consequence: 0.0026
  essential: 0.0026
  representative: 0.0025
  december: 0.0025
  state: 0.0025

Topic 5:
 




Top topics:

Topic 0:
  command: 0.0031
  passage: 0.0031
  nearly: 0.0028
  several: 0.0028
  distant: 0.0027
  tariff: 0.0027
  control: 0.0025
  commencement: 0.0025
  you: 0.0025
  disturb: 0.0025

Topic 1:
  measure: 0.0032
  official: 0.0031
  fair: 0.0029
  reject: 0.0027
  urge: 0.0027
  exception: 0.0026
  tend: 0.0025
  raise: 0.0025
  many: 0.0025
  within: 0.0025

Topic 2:
  rapidly: 0.0029
  increase: 0.0029
  manufacture: 0.0027
  manner: 0.0027
  such: 0.0026
  therefore: 0.0026
  have: 0.0026
  civilized: 0.0025
  divide: 0.0024
  ask: 0.0024

Topic 3:
  conduct: 0.0034
  embrace: 0.0030
  institution: 0.0030
  want: 0.0028
  responsible: 0.0027
  forth: 0.0026
  actual: 0.0026
  house: 0.0026
  rebellion: 0.0026
  complete: 0.0025

Topic 4:
  island: 0.0032
  immediate: 0.0031
  harbor: 0.0031
  obvious: 0.0028
  consist: 0.0028
  neither: 0.0027
  enable: 0.0027
  reference: 0.0027
  pende: 0.0027
  furnish: 0.0026

Topic 5:
  four: 0.0029
  vacancy: 0.0029
  pacific




Top topics:

Topic 0:
  bond: 0.0032
  gratifying: 0.0027
  represent: 0.0027
  willing: 0.0026
  serious: 0.0025
  reciprocal: 0.0025
  paper: 0.0025
  often: 0.0025
  safety: 0.0024
  1880: 0.0024

Topic 1:
  manner: 0.0029
  supply: 0.0028
  voter: 0.0027
  entire: 0.0026
  route: 0.0026
  certainly: 0.0026
  consent: 0.0025
  executive: 0.0025
  raise: 0.0025
  merchandise: 0.0024

Topic 2:
  united: 0.0028
  directly: 0.0027
  report: 0.0027
  and: 0.0027
  gain: 0.0027
  section: 0.0026
  law: 0.0025
  recent: 0.0025
  language: 0.0024
  export: 0.0024

Topic 3:
  sentiment: 0.0032
  management: 0.0030
  our: 0.0029
  delegate: 0.0028
  completion: 0.0027
  kind: 0.0027
  engagement: 0.0027
  from: 0.0026
  january: 0.0026
  knowledge: 0.0025

Topic 4:
  progress: 0.0029
  surrender: 0.0029
  keep: 0.0027
  deem: 0.0027
  enforce: 0.0027
  belong: 0.0024
  violence: 0.0024
  expenditure: 0.0024
  1890: 0.0023
  impartial: 0.0023

Topic 5:
  deficiency: 0.0031
  register: 0.0030





Top topics:

Topic 0:
  settle: 0.0033
  attend: 0.0032
  alien: 0.0029
  your: 0.0025
  west: 0.0025
  against: 0.0025
  provide: 0.0024
  simple: 0.0024
  mountain: 0.0023
  recently: 0.0023

Topic 1:
  fiscal: 0.0036
  division: 0.0027
  community: 0.0027
  raise: 0.0026
  acquire: 0.0025
  reference: 0.0025
  she: 0.0024
  intelligent: 0.0024
  add: 0.0024
  advance: 0.0024

Topic 2:
  show: 0.0035
  when: 0.0033
  center: 0.0028
  east: 0.0028
  domain: 0.0028
  say: 0.0028
  offer: 0.0026
  age: 0.0026
  hay: 0.0026
  americans: 0.0025

Topic 3:
  secretary: 0.0029
  fundamental: 0.0028
  payment: 0.0028
  investigation: 0.0027
  number: 0.0026
  house: 0.0026
  rich: 0.0026
  representatives: 0.0026
  really: 0.0026
  occur: 0.0025

Topic 4:
  competent: 0.0028
  address: 0.0027
  appropriate: 0.0027
  dominion: 0.0027
  easy: 0.0027
  union: 0.0025
  word: 0.0025
  aim: 0.0025
  against: 0.0024
  support: 0.0024

Topic 5:
  and: 0.0031
  manner: 0.0027
  constitute: 0.0026
  p




Top topics:

Topic 0:
  vast: 0.0035
  june: 0.0032
  rich: 0.0029
  particularly: 0.0027
  arise: 0.0025
  remedy: 0.0025
  grant: 0.0025
  board: 0.0024
  standard: 0.0024
  right: 0.0024

Topic 1:
  rome: 0.0044
  atlantic: 0.0030
  month: 0.0028
  now: 0.0028
  ship: 0.0028
  conception: 0.0026
  fully: 0.0026
  none: 0.0025
  forth: 0.0025
  civilization: 0.0024

Topic 2:
  attain: 0.0028
  not: 0.0028
  meet: 0.0028
  threaten: 0.0028
  union: 0.0027
  again: 0.0027
  information: 0.0027
  practice: 0.0026
  machine: 0.0025
  influence: 0.0025

Topic 3:
  north: 0.0031
  employer: 0.0029
  fact: 0.0028
  middle: 0.0028
  join: 0.0028
  credit: 0.0027
  today: 0.0026
  completely: 0.0026
  pension: 0.0025
  skill: 0.0025

Topic 4:
  financial: 0.0031
  whole: 0.0030
  careful: 0.0029
  1928: 0.0028
  come: 0.0027
  solve: 0.0026
  effort: 0.0026
  destructive: 0.0025
  wage: 0.0024
  broad: 0.0024

Topic 5:
  burden: 0.0031
  island: 0.0031
  respect: 0.0030
  defend: 0.0029
  re




Top topics:

Topic 0:
  move: 0.0049
  priority: 0.0032
  division: 0.0032
  accept: 0.0031
  dangerous: 0.0027
  problem: 0.0026
  viet: 0.0026
  research: 0.0025
  morning: 0.0025
  school: 0.0025

Topic 1:
  department: 0.0039
  freedom: 0.0032
  carry: 0.0032
  flight: 0.0032
  progress: 0.0030
  hunger: 0.0028
  formosa: 0.0028
  join: 0.0026
  special: 0.0026
  profit: 0.0026

Topic 2:
  protection: 0.0032
  day: 0.0029
  territory: 0.0029
  heart: 0.0028
  something: 0.0027
  give: 0.0027
  yet: 0.0027
  leave: 0.0026
  die: 0.0025
  charge: 0.0025

Topic 3:
  khrushchev: 0.0032
  free: 0.0031
  along: 0.0029
  crime: 0.0028
  disarmament: 0.0027
  value: 0.0027
  adversary: 0.0027
  lady: 0.0027
  connection: 0.0026
  taxpayer: 0.0026

Topic 4:
  population: 0.0036
  prosperous: 0.0032
  value: 0.0031
  resolve: 0.0029
  determination: 0.0029
  belief: 0.0029
  complex: 0.0028
  emerge: 0.0028
  support: 0.0027
  affair: 0.0027

Topic 5:
  demand: 0.0034
  foundation: 0.0033
 




Top topics:

Topic 0:
  listen: 0.0033
  end: 0.0032
  presidency: 0.0032
  most: 0.0031
  low: 0.0030
  coal: 0.0029
  ten: 0.0029
  solution: 0.0029
  affect: 0.0028
  size: 0.0028

Topic 1:
  and: 0.0034
  air: 0.0032
  wonderful: 0.0031
  friendship: 0.0030
  hope: 0.0030
  inflation: 0.0029
  income: 0.0028
  same: 0.0028
  appropriate: 0.0028
  independence: 0.0027

Topic 2:
  enact: 0.0039
  president: 0.0035
  bridge: 0.0032
  response: 0.0031
  discuss: 0.0030
  demonstrate: 0.0030
  court: 0.0030
  table: 0.0030
  which: 0.0030
  once: 0.0029

Topic 3:
  belong: 0.0041
  rebuttal: 0.0039
  entire: 0.0036
  stay: 0.0033
  ability: 0.0032
  announce: 0.0032
  police: 0.0032
  only: 0.0029
  enforce: 0.0029
  die: 0.0027

Topic 4:
  create: 0.0031
  company: 0.0031
  favor: 0.0030
  being: 0.0029
  rather: 0.0029
  themselves: 0.0029
  already: 0.0028
  hand: 0.0028
  chemical: 0.0028
  lesson: 0.0028

Topic 5:
  kuwait: 0.0034
  mistake: 0.0030
  eight: 0.0030
  mean: 0.0029
 




Top topics:

Topic 0:
  n’t: 0.0032
  refuse: 0.0030
  principle: 0.0030
  whether: 0.0029
  lot: 0.0029
  week: 0.0028
  measure: 0.0028
  above: 0.0027
  important: 0.0027
  represent: 0.0027

Topic 1:
  class="s1">i: 0.0035
  official: 0.0035
  additional: 0.0032
  research: 0.0032
  open: 0.0031
  america: 0.0031
  civil: 0.0030
  white: 0.0029
  religious: 0.0029
  allow: 0.0029

Topic 2:
  republicans: 0.0035
  use: 0.0033
  commit: 0.0033
  mayor: 0.0032
  sanction: 0.0032
  bear: 0.0029
  mind: 0.0028
  save: 0.0028
  university: 0.0028
  threat: 0.0028

Topic 3:
  fake: 0.0037
  effective: 0.0035
  speech: 0.0034
  choose: 0.0033
  stem: 0.0033
  fully: 0.0032
  liberty: 0.0031
  stake: 0.0031
  science: 0.0029
  rebuild: 0.0029

Topic 4:
  fire: 0.0044
  urge: 0.0036
  african: 0.0033
  law: 0.0033
  organization: 0.0030
  brave: 0.0030
  win: 0.0030
  deep: 0.0030
  anyone: 0.0029
  private: 0.0028

Topic 5:
  remove: 0.0040
  industry: 0.0034
  organization: 0.0033
  testi




Top topics:

Topic 0:
  ’ll: 0.0037
  california: 0.0033
  good: 0.0032
  need: 0.0032
  bold: 0.0031
  team: 0.0031
  tonight: 0.0030
  watch: 0.0030
  pillar: 0.0030
  each: 0.0029

Topic 1:
  our: 0.0037
  toxic: 0.0035
  economy: 0.0033
  try: 0.0032
  inform: 0.0031
  exceptional: 0.0031
  badly: 0.0029
  technology: 0.0029
  voice: 0.0028
  trump: 0.0026

Topic 2:
  people: 0.0040
  2025: 0.0040
  never: 0.0033
  oil: 0.0032
  plant: 0.0030
  purpose: 0.0029
  policy: 0.0029
  more: 0.0029
  hurricane: 0.0028
  delaware: 0.0028

Topic 3:
  mental: 0.0037
  flourish: 0.0034
  wealth: 0.0033
  presidential: 0.0032
  stars: 0.0032
  engineer: 0.0031
  presence: 0.0030
  work: 0.0030
  happen: 0.0030
  liquid: 0.0029

Topic 4:
  possibility: 0.0037
  beginning: 0.0035
  abuse: 0.0033
  police: 0.0031
  address: 0.0031
  peril: 0.0030
  start: 0.0030
  influence: 0.0030
  bullet: 0.0029
  panama: 0.0029

Topic 5:
  expectation: 0.0040
  cost: 0.0038
  some: 0.0038
  los: 0.0032
  mod