<a href="https://colab.research.google.com/github/cs-iuu/word-sense-2025-fall-ai/blob/main/notebooks/13.word_sense_induction_en.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Word Sense Induction

Word Sense Induction (WSI) using contextual word embeddings from BERT (the code below loads DistilBERT).

## Setup

In [None]:
import torch
# from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
import pandas as pd
import time
# --- NLTK imports and resource downloads ---
# The calls below fetch the NLTK data this notebook relies on later:
# tokenizer models (for nltk.word_tokenize), the English POS tagger
# (for nltk.pos_tag) and the stop-word lists (for stopwords.words).
# quiet=True suppresses the download progress output.
import nltk
from nltk.corpus import stopwords
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True) # Explicitly download the English tagger
nltk.download('stopwords', quiet=True)
# --------------------


## Load text

In [None]:
# --- 1. Corpus definition ---
# Small toy corpus, kept for quick experiments. To use it, uncomment the block
# below and skip the file-reading code.
# large_corpus = [
#     "I went to the bank to deposit a large sum of money.",
#     "The fisherman cast his line from the grassy river bank.",
#     "The central bank announced a new interest rate policy.",
#     "We use a construction crane to lift heavy steel beams.",
#     "A small red star twinkled brightly in the night sky.",
#     "The famous movie star walked the red carpet.",
#     "The huge crane bird waded through the shallow marsh.",
#     "The harbor crane loaded the containers onto the ship."
# ]
# clean_corpus = [s.replace('**', '') for s in large_corpus]

# Plain-text corpus with one sentence per line.
file_path = "/content/Bible_NT.en-kjv.txt"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding. Every line is kept (including blank ones) so that a
# sentence id always equals the line's position in the file.
with open(file_path, 'r', encoding='utf-8') as f:
    clean_corpus = [line.strip() for line in f]

# Number of sentences; used in the progress messages of later cells.
N_SENTENCES = len(clean_corpus)


## Define functions

### Corpus processing function using BERT

In [None]:
# Load the pretrained model and its tokenizer.
# The full-size BERT alternative is kept here for reference:
# MODEL_NAME = 'bert-base-uncased'
# tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# model = BertModel.from_pretrained(MODEL_NAME)
# DistilBERT is used instead because it is smaller and faster.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)
# Switch to inference mode (no dropout) since the model is only used for feature extraction.
model.eval()

# Module-level accumulators filled by process_corpus_general():
#   embeddings_store - one hidden-state array per sentence
#   index_data       - one dict per indexed content word
embeddings_store = []
index_data = []

print(f"--- Starting Stage 1: Indexing All Content Words for {N_SENTENCES} Sentences ---")

# Define which POS tags are considered content words (Nouns, Verbs, Adjectives, Adverbs)
# POS tags: NN (Noun), VB (Verb), JJ (Adjective), RB (Adverb)
# The tuple is passed to str.startswith, so e.g. 'NNS' and 'VBD' also match.
CONTENT_TAGS_PREFIX = ('NN', 'VB', 'JJ', 'RB')
# English stop words, stored as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))

# --- 2. The Modified Pre-calculation Function ---
def process_corpus_general(corpus, model, tokenizer):
    """Embed every sentence and index the occurrences of its content words.

    For each sentence in ``corpus`` the function:

    1. runs ``model`` once and appends the last hidden state, an array of shape
       ``(n_tokens, hidden_size)``, to the module-level ``embeddings_store``
       list (position in the list == sentence id);
    2. POS-tags the sentence with NLTK and keeps alphabetic, non-stop-word
       tokens whose tag starts with one of ``CONTENT_TAGS_PREFIX``;
    3. appends one record per occurrence of each such word to the module-level
       ``index_data`` list, with keys ``target_word``, ``sentence_id``,
       ``token_indices`` and ``sentence``.

    ``token_indices`` holds the positions of the WordPiece tokens that make up
    that single occurrence, so averaging those rows of the stored embedding
    yields the vector of the word in context.

    Args:
        corpus: iterable of sentence strings.
        model: a transformers encoder whose first output is the last hidden state.
        tokenizer: the WordPiece tokenizer matching ``model``.

    Returns:
        None. Results are appended to ``embeddings_store`` and ``index_data``.
    """
    special_tokens = set(tokenizer.all_special_tokens)
    # Run on whatever device the model currently lives on (CPU or GPU).
    model_device = next(model.parameters()).device

    for sent_idx, text in enumerate(corpus):
        # A. Get the hidden states.
        # A single sentence needs no padding; padding to max_length would store
        # 512 rows per sentence and, without an attention mask, would let real
        # tokens attend to [PAD] positions.
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True).to(model_device)
        input_ids = encoded_input['input_ids']

        with torch.no_grad():
            # Pass the whole encoding so the attention mask is used.
            outputs = model(**encoded_input)
            full_hidden_state = outputs[0].squeeze(0).cpu().numpy()

        embeddings_store.append(full_hidden_state)

        # B. Identify content words using NLTK tokenization and POS tagging.
        nltk_tokens = nltk.word_tokenize(text)
        tagged_tokens = nltk.pos_tag(nltk_tokens)
        content_words = [word.lower() for word, tag in tagged_tokens
                         if word.isalpha()
                         and word.lower() not in STOP_WORDS
                         and tag.startswith(CONTENT_TAGS_PREFIX)]

        # C. Rebuild whole words from the WordPiece tokens.
        # A token starting with '##' continues the previous token, so
        # ['fisher', '##man'] becomes the word 'fisherman' at positions [i, i+1].
        bert_tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
        word_spans = []  # items are [word_text, [token positions]]
        for pos, token in enumerate(bert_tokens):
            if token in special_tokens:
                continue
            continues_previous = (token.startswith('##')
                                  and word_spans
                                  and word_spans[-1][1][-1] == pos - 1)
            if continues_previous:
                word_spans[-1][0] += token[2:]
                word_spans[-1][1].append(pos)
            else:
                word_spans.append([token, [pos]])

        # D. Index each occurrence of each distinct content word.
        # Exact whole-word comparison avoids substring hits such as 'art' inside
        # 'heart'. dict.fromkeys de-duplicates while preserving order, so a word
        # repeated in the sentence is not indexed twice per occurrence.
        for word in dict.fromkeys(content_words):
            for span_text, positions in word_spans:
                if span_text.lower() == word:
                    index_data.append({
                        'target_word': word,
                        'sentence_id': sent_idx,
                        'token_indices': positions,
                        'sentence': text
                    })

### Batch version of corpus processing

In [None]:
# Default compute device. The next cell switches it to CUDA when a GPU is available.
device = torch.device("cpu")

In [None]:

# Select the compute device. `device` is assigned on both branches so this cell
# works on its own, whether or not a GPU is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("⚠️ Using CPU. Processing will be slower.")

In [None]:
# Load a fresh tokenizer/model pair for the batched pipeline.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)
# model.eval()  (called at the end of this cell, after the model is moved to `device`)

# Reset the module-level accumulators.
embeddings_store = []
index_data = []

print(f"--- Starting Stage 1: Indexing All Content Words for {N_SENTENCES} Sentences ---")

# Define which POS tags are considered content words (Nouns, Verbs, Adjectives, Adverbs)
# POS tags: NN (Noun), VB (Verb), JJ (Adjective), RB (Adverb)
# The tuple is passed to str.startswith, so e.g. 'NNS' and 'VBD' also match.
CONTENT_TAGS_PREFIX = ('NN', 'VB', 'JJ', 'RB')
# English stop words, stored as a set for fast membership tests.
STOP_WORDS = set(stopwords.words('english'))
# Move the weights to the selected device, then switch to inference mode.
model.to(device)
model.eval()

In [None]:
import h5py
import pandas as pd
import numpy as np
import os

# --- Define File Paths ---
HDF5_EMBEDDINGS_FILE = 'corpus_embeddings.h5'  # one dataset per sentence
INDEX_FILE = 'corpus_index.pkl'  # pickled index metadata (word -> sentence/token positions)
BATCH_SIZE = 16  # Use a low batch size to be safe

# Start fresh: delete outputs left over from a previous run.
# Each file is removed in its own try block; with a single shared try, a
# missing HDF5 file would skip the removal of a stale index file.
for stale_path in (HDF5_EMBEDDINGS_FILE, INDEX_FILE):
    try:
        os.remove(stale_path)
    except FileNotFoundError:
        pass  # nothing to clean up

# Initialize an empty list to collect index data (this stays small)
index_data = []

# Open the HDF5 file for writing. It stays open across cells and is closed by
# the cell that runs the processing.
h5f = h5py.File(HDF5_EMBEDDINGS_FILE, 'w')

In [None]:
# --- 2. The Modified Pre-calculation Function (Corrected and Robust) ---
def _group_wordpieces_into_words(bert_tokens, special_tokens):
    """Merge WordPiece tokens back into whole words.

    A token starting with '##' continues the token immediately before it, so
    ['fisher', '##man'] at positions 4 and 5 becomes ('fisherman', [4, 5]).
    Tokens listed in ``special_tokens`` (e.g. [CLS], [SEP]) are skipped.

    Returns:
        A list of ``(lowercased_word, token_positions)`` tuples in sentence order.
    """
    spans = []  # items are [word_text, [token positions]]
    for pos, token in enumerate(bert_tokens):
        if token in special_tokens:
            continue
        continues_previous = (token.startswith('##')
                              and spans
                              and spans[-1][1][-1] == pos - 1)
        if continues_previous:
            spans[-1][0] += token[2:]
            spans[-1][1].append(pos)
        else:
            spans.append([token, [pos]])
    return [(text.lower(), positions) for text, positions in spans]


def process_corpus_general_batched(corpus, model, tokenizer, batch_size, device, h5f):
    """
    Processes the corpus in batches for fast BERT inference and indexes
    all content words, writing embeddings directly to the h5f disk file.

    For every sentence, the last hidden state of its real (non-padding) tokens
    is written to ``h5f`` as dataset ``sent_<id>`` with shape
    ``(n_tokens, hidden_size)``. One record per occurrence of each content word
    is appended to the module-level ``index_data`` list with keys
    ``target_word``, ``sentence_id``, ``token_indices`` and ``sentence``.
    ``token_indices`` are row positions inside the stored dataset that cover
    the WordPiece tokens of that single occurrence.

    Args:
        corpus: list of sentence strings.
        model: transformers encoder already moved to ``device``.
        tokenizer: the WordPiece tokenizer matching ``model``.
        batch_size: number of sentences per forward pass.
        device: torch device the inputs are moved to.
        h5f: an h5py file object opened for writing.

    Returns:
        None.
    """

    # We rely on index_data being defined globally outside the function
    global index_data

    special_tokens = set(tokenizer.all_special_tokens)

    # 1. Chunk the entire corpus into batches
    batched_corpus = [corpus[i:i + batch_size] for i in range(0, len(corpus), batch_size)]

    # Track the global sentence index across all batches
    global_sent_idx = 0

    print(f"Processing {len(corpus)} sentences in {len(batched_corpus)} batches of size {batch_size}...")

    # 2. Process each batch
    for batch_texts in batched_corpus:

        # A. Get BERT hidden states for the entire batch.
        # padding=True pads only up to the longest sentence in the batch instead
        # of always padding to the model maximum (512 tokens).
        encoded_input = tokenizer(
            batch_texts,
            return_tensors='pt',
            padding=True,
            truncation=True
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded_input)
            full_hidden_states = outputs[0].cpu().numpy()  # (batch, seq_len, hidden_size)

        batch_ids = encoded_input['input_ids'].cpu().numpy()
        # True for real tokens, False for padding.
        batch_mask = encoded_input['attention_mask'].cpu().numpy().astype(bool)

        # 3. Process each sentence's result from the batch for indexing
        for sent_in_batch, text in enumerate(batch_texts):
            real_positions = batch_mask[sent_in_batch]

            # Drop padding rows from both the embeddings and the token ids with
            # the same mask, so token positions stay aligned with the stored rows.
            embedding_array = full_hidden_states[sent_in_batch][real_positions]
            h5f.create_dataset(f'sent_{global_sent_idx}', data=embedding_array, compression="gzip")

            # 4. Identify content words using NLTK (CPU-bound).
            # STOP_WORDS and CONTENT_TAGS_PREFIX are module-level constants.
            nltk_tokens = nltk.word_tokenize(text)
            tagged_tokens = nltk.pos_tag(nltk_tokens)
            content_words = [word.lower() for word, tag in tagged_tokens
                             if word.isalpha()
                             and word.lower() not in STOP_WORDS
                             and tag.startswith(CONTENT_TAGS_PREFIX)]

            bert_tokens = tokenizer.convert_ids_to_tokens(
                batch_ids[sent_in_batch][real_positions].tolist())
            word_spans = _group_wordpieces_into_words(bert_tokens, special_tokens)

            # 5. Build the index: one record per occurrence of each distinct
            # content word. Whole-word comparison avoids substring hits such as
            # 'art' inside 'heart' and also finds words split into several
            # WordPieces. dict.fromkeys de-duplicates while preserving order.
            for word in dict.fromkeys(content_words):
                for span_text, positions in word_spans:
                    if span_text == word:
                        index_data.append({
                            'target_word': word,
                            'sentence_id': global_sent_idx,
                            'token_indices': positions,
                            'sentence': text
                        })

            global_sent_idx += 1

## Functions for analysis

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import h5py
import numpy as np
import pandas as pd

# --- File paths (must match the names used when the files were written) ---
HDF5_EMBEDDINGS_FILE = 'corpus_embeddings.h5'
INDEX_FILE = 'corpus_index.pkl'

# --- 0. Load the index once, before calling get_target_vectors_from_store ---
# (left commented out here; the analysis cell further below loads it)
# index_df = pd.read_pickle(INDEX_FILE)

def get_target_vectors_from_store(target_word, index_df):
    """
    Collect one contextual vector per indexed occurrence of ``target_word``.

    The index rows whose 'target_word' equals the lowercased query are looked
    up, and for each of them the sentence embedding is read from the HDF5 file
    named by HDF5_EMBEDDINGS_FILE. The occurrence vector is the mean of the
    embedding rows listed in the row's 'token_indices'.

    Returns a list of 1-D arrays (empty when the word is not in the index).
    Rows whose dataset is missing from the file are reported and skipped.
    """
    occurrences = index_df[index_df['target_word'] == target_word.lower()]

    # Nothing indexed under this word: return before touching the file.
    if occurrences.empty:
        return []

    collected = []

    # The context manager closes the file even if reading fails part-way.
    with h5py.File(HDF5_EMBEDDINGS_FILE, 'r') as hf:
        pairs = zip(occurrences['sentence_id'], occurrences['token_indices'])
        for sent_id, token_indices in pairs:
            try:
                # [()] loads the whole dataset into memory as an array.
                sentence_matrix = hf[f'sent_{sent_id}'][()]
            except KeyError:
                print(f"Warning: Dataset 'sent_{sent_id}' not found in HDF5 file.")
                continue

            # Average the sub-token rows belonging to this word.
            sub_token_rows = sentence_matrix[token_indices]
            collected.append(np.mean(sub_token_rows, axis=0))

    return collected

# Example Call:
# all_embeddings_for_apple = get_target_vectors_from_store("apple", index_df)
# --- 2. Function to find the optimal K and perform clustering ---
def find_optimal_k_and_cluster(X, max_k=5):
    """
    Finds the optimal K using Silhouette Score and performs K-means,
    while safeguarding against having too few samples.

    Args:
        X: array-like of shape (n_instances, n_features); one row per word
           occurrence. Lists of vectors are accepted and converted.
        max_k: largest number of clusters to try (inclusive).

    Returns:
        (best_k, labels): the chosen number of clusters and an integer label
        per row of X. When clustering is not possible (fewer than 3 instances,
        or every K collapses to a single cluster) the result is K=1 with all
        labels equal to 0.
    """
    X = np.asarray(X)
    n_instances = X.shape[0]

    # --- Initial Checks ---
    if n_instances < 2:
        # If there's 0 or 1 instance, clustering is meaningless
        print(f"   --> Warning: Only {n_instances} instance(s) found. Cannot cluster.")
        return 1, np.zeros(n_instances, dtype=int)

    # Silhouette Score needs 2 <= K <= n_instances - 1.
    k_range = range(2, min(max_k, n_instances - 1) + 1)

    # An empty range (e.g. n_instances == 2) means no valid K >= 2 exists,
    # so fall back to a single sense.
    if len(k_range) == 0:
        print(f"   --> Warning: Only {n_instances} instances. Defaulting to K=1.")
        return 1, np.zeros(n_instances, dtype=int)

    best_k = None
    best_score = -1.0
    best_labels = None

    print(f"   --> Testing K in range {list(k_range)}...")

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(X)

        # With duplicate or near-identical vectors K-means can return a single
        # populated cluster; silhouette_score raises in that case, so skip it.
        if np.unique(labels).size < 2:
            print(f"   --> Skipping K={k}: K-means produced a single cluster.")
            continue

        score = silhouette_score(X, labels)

        if best_labels is None or score > best_score:
            best_score = score
            best_k = k
            best_labels = labels

    if best_labels is None:
        print("   --> Warning: No K produced distinct clusters. Defaulting to K=1.")
        return 1, np.zeros(n_instances, dtype=int)

    # The labels from the best fit are reused: with a fixed random_state a
    # second fit with the same K would reproduce them.
    print(f"   --> Optimal K determined: {best_k} (Silhouette: {best_score:.4f})")
    return best_k, best_labels

# Worked example: a word with n_instances = 3 (the situation seen earlier with 'deposit').
# Testing K=3 fails there, because silhouette_score requires K <= n_instances - 1.
# The range is therefore capped at min(max_k, 3 - 1) = min(5, 2) = 2, so only K=2 is tested.

## Process & Save the index & embeddings

In [None]:
# --- Run the pre-calculation ---
start_time = time.time()

# h5f must already be open for writing (it is opened in the cell that defines
# the file paths). try/finally guarantees the file is closed, and the data
# written so far flushed, even if processing stops with an error.
try:
    process_corpus_general_batched(clean_corpus, model, tokenizer, BATCH_SIZE, device, h5f)
finally:
    h5f.close()

# --- CRITICAL: SAVE THE INDEX HERE ---
# The embeddings are already on disk; the index is what maps words to them.
import pandas as pd
index_df = pd.DataFrame(index_data)
INDEX_FILE = 'corpus_index.pkl'
index_df.to_pickle(INDEX_FILE)

print(f"Processing complete. Index saved to {INDEX_FILE}")
print(f"Time taken for Stage 1 (BERT Inference): {time.time() - start_time:.2f} seconds.")
print(f"Index created for {len(index_df)} instances of ALL content words.")

## Test & Analysis

In [None]:
import time
import pandas as pd
import numpy as np
# Assuming find_optimal_k_and_cluster is available

# --- File Paths (Must match where you saved them) ---
HDF5_EMBEDDINGS_FILE = 'corpus_embeddings.h5'
INDEX_FILE = 'corpus_index.pkl'

# --- 0. Load the Index and Ensure Files Exist ---
try:
    # Load the index DataFrame from the saved pickle file.
    # NOTE: only unpickle files you created yourself; pickle can execute code.
    index_df = pd.read_pickle(INDEX_FILE)
except FileNotFoundError:
    print(f"🛑 Error: Index file '{INDEX_FILE}' not found. Did Stage 1 complete successfully?")
    # Re-raise instead of calling exit(): exit() would shut down the notebook
    # kernel, while an exception just stops this cell with a clear traceback.
    raise
else:
    print(f"✅ Loaded index with {len(index_df)} word occurrences.")

# Filter out words with too few instances to cluster (e.g., less than 2)
word_counts = index_df.groupby('target_word').size()
plausible_words = word_counts[word_counts >= 2].index.tolist()
print(f"Found {len(plausible_words)} words with 2 or more instances for clustering.")


# --- 1. Define Words to Analyze ---
# Use a specific list, or sample from the plausible_words list
WORDS_TO_ANALYZE = ["life", "light", "spirit"]

print("\n--- Starting Stage 2: Efficient Sense Induction from Disk ---")

for word in WORDS_TO_ANALYZE:
    run_start = time.time()

    # A. Retrieve vectors
    # The function reads the matching embedding arrays from the HDF5 file.
    X_list = get_target_vectors_from_store(word, index_df)

    if not X_list:
        print(f"  Skipping '{word}': No instances found or retrieval failed.")
        continue

    # Convert the list of vectors to a numpy array for K-means
    X = np.array(X_list)

    # Extract the original sentences for interpretation
    sentences = index_df[index_df['target_word'] == word.lower()]['sentence'].tolist()

    # The retrieval function skips occurrences whose embedding is missing. If
    # that happened, vectors and sentences no longer line up one-to-one and
    # zipping them would attach labels to the wrong sentences.
    if len(sentences) != len(X):
        print(f"  Skipping '{word}': {len(X)} vectors retrieved for {len(sentences)} indexed occurrences.")
        continue

    # B. Find optimal K and cluster
    # Note: max_k=5 is generally a good starting point for WSI
    optimal_k, labels = find_optimal_k_and_cluster(X, max_k=5)

    # C. Display Results
    sense_clusters = {i: [] for i in range(optimal_k)}
    for sentence, label in zip(sentences, labels):
        # Cast the numpy label to a plain int so it matches the dict keys.
        sense_clusters[int(label)].append(sentence)

    run_end = time.time()
    print(f"\n## 🎯 Induced Senses for '{word}' (Run Time: {run_end - run_start:.4f}s) ##")

    for i, sentences_in_sense in sense_clusters.items():
        print(f"--- Sense Cluster {i+1} ({len(sentences_in_sense)} instances) ---")

        # Display up to 3 example sentences for brevity
        for sentence in sentences_in_sense[:3]:
            print(f"  - {sentence}")
        if len(sentences_in_sense) > 3:
            print("  - ... (more instances)")

    print("-" * 20)