# Semantic Change Analysis Interface

Use this notebook to interactively analyze semantic changes of words over time.

In [None]:
import os
import numpy as np
import ipywidgets as widgets
from IPython.display import display, Markdown, clear_output, HTML

# Import our library modules
from semantic_change.corpus import CorpusManager
from semantic_change.embedding import BertEmbedder
from semantic_change.wsi import WordSenseInductor
from semantic_change.visualization import Visualizer

# Helper to display documentation
def show_docs():
    """Render ``user_guide.md`` from the working directory as Markdown.

    If the file does not exist, print a short notice instead of raising.
    """
    guide_path = "user_guide.md"
    if not os.path.exists(guide_path):
        print("User guide not found.")
        return
    # Read explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) can fail or garble non-ASCII characters in a Markdown file.
    with open(guide_path, "r", encoding="utf-8") as f:
        display(Markdown(f.read()))

# Render the user guide (or the "not found" notice) as soon as this cell runs.
show_docs()

## 1. Configuration & Setup
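Configure your data paths below. By default, the analysis reads from the `data/` directory created by the demo, which is expected to contain one subdirectory per corpus (e.g., `data/corpus_1990`, `data/corpus_2000`).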

In [None]:
# UI Widgets
# Input controls; their current values are read each time the analysis runs.
path_input = widgets.Text(
    value="data",
    description="Data Root:",
    placeholder="Path to data folder",
)
word_input = widgets.Text(
    value="bank",
    description="Focus Word:",
    placeholder="e.g., apple",
)
sample_slider = widgets.IntSlider(
    value=50,
    min=10,
    max=500,
    step=10,
    description="Samples:",
)
cluster_slider = widgets.IntSlider(
    value=2,
    min=2,
    max=10,
    step=1,
    description="Clusters:",
)
run_btn = widgets.Button(
    description="Run Analysis",
    button_style="success",
    icon="play",
)
# Sink for everything the analysis prints or plots.
output_area = widgets.Output()

# Stack the controls vertically in the order the user fills them in.
display(
    widgets.VBox(
        [path_input, word_input, sample_slider, cluster_slider, run_btn]
    )
)

# Module-level embedder slot, left empty here and filled on the first analysis
# run so the model is loaded only once (re-creating it on every run would be
# the safer choice for memory in a notebook).
embedder = None

def run_analysis(b):
    """Run the semantic-change pipeline for the word currently set in the UI.

    Reads the widget values, registers one corpus per subdirectory of the data
    root, collects embeddings of sampled occurrences of the focus word from
    every corpus, clusters them, and draws the plots. Everything printed or
    displayed is routed into ``output_area``.

    Args:
        b: Argument supplied by the button click callback; not used.

    Returns:
        None. Returns early, after printing a message, when the focus word is
        empty, the data root is missing, it has no usable subdirectories, or
        no embeddings could be collected.
    """
    global embedder
    with output_area:
        clear_output()
        root_path = path_input.value
        # Surrounding whitespace typed into the text box would otherwise
        # become part of the query and silently match nothing.
        target_word = word_input.value.strip()
        n_samples = sample_slider.value
        n_clusters = cluster_slider.value

        if not target_word:
            print("Error: Please enter a focus word.")
            return

        print(f"Processing '{target_word}' from '{root_path}'...")

        if not os.path.exists(root_path):
            print(f"Error: Path '{root_path}' does not exist.")
            return

        # 1. Load Corpora
        manager = CorpusManager()
        # Skip hidden directories (e.g. Jupyter's ".ipynb_checkpoints") so
        # they are not mistaken for corpora. Sorting keeps the order stable.
        subdirs = sorted(
            d
            for d in os.listdir(root_path)
            if not d.startswith('.') and os.path.isdir(os.path.join(root_path, d))
        )

        if not subdirs:
            print("No subdirectories found in data path. Expected format: data/corpus_1990, data/corpus_2000 etc.")
            return

        for d in subdirs:
            manager.add_corpus(d, os.path.join(root_path, d))
            print(f"Loaded corpus: {d}")

        # 2. Extract Embeddings
        # The embedder is kept in a module-level variable so that repeated
        # clicks reuse the already-loaded model.
        if embedder is None:
            print("Loading BERT model (this may take a moment)...\n")
            embedder = BertEmbedder()

        all_embeddings = []  # one embedding batch per corpus that had hits
        all_sentences = []   # sentences, in the same order as the stacked rows
        time_labels = []     # corpus name repeated once per embedding row

        for name in manager.corpora:
            print(f"Querying '{name}'...")
            samples = manager.get_corpus(name).query_samples(target_word, n=n_samples)
            if not samples:
                print(f"  - No samples found for '{target_word}' in {name}")
                continue

            embs, sents = embedder.get_embeddings(samples, target_word)
            if len(embs) > 0:
                all_embeddings.append(embs)
                all_sentences.extend(sents)
                time_labels.extend([name] * len(embs))
                print(f"  - Retrieved {len(embs)} embeddings.")

        if not all_embeddings:
            print("No data found for analysis. Check your word spelling or data path.")
            return

        # Stack the per-corpus batches into a single matrix, one row per sample.
        X = np.vstack(all_embeddings)

        # 3. WSI Clustering
        print(f"\nRunning WSI (k={n_clusters})...")
        wsi = WordSenseInductor(n_clusters=n_clusters)
        # Coerce to an ndarray so the element-wise comparison below yields a
        # boolean mask even if fit_predict returns a plain list.
        sense_labels = np.asarray(wsi.fit_predict(X))

        # 4. Visualization
        viz = Visualizer(method='pca')

        print("\n--- Visualizing by Time Period ---")
        viz.plot_clustering(X, time_labels, all_sentences, title=f"'{target_word}' by Time Period")

        print("\n--- Visualizing by Sense Cluster ---")
        viz.plot_clustering(X, sense_labels, all_sentences, title=f"'{target_word}' by Sense Cluster")

        print("\n--- Semantic Neighbors (MLM Projection) ---")
        # np.unique returns the distinct labels already sorted.
        for cluster_id in np.unique(sense_labels):
            mask = sense_labels == cluster_id
            # Mean embedding of the cluster's members.
            centroid = np.mean(X[mask], axis=0)
            neighbors = embedder.get_nearest_neighbors(centroid, k=8)

            print(f"Cluster {cluster_id} top keywords: {list(neighbors.keys())}")
            viz.plot_neighbors(centroid, neighbors, title=f"Cluster {cluster_id} Context Projection")

# Register run_analysis as the click handler of the "Run Analysis" button.
run_btn.on_click(run_analysis)
# Show the output widget that run_analysis writes its messages and plots into.
display(output_area)