In [1]:
# Set project root directory and add `src` to path
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root.joinpath('src')

# Put the project's `src` directory at the front of the import path, exactly once.
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

# Notebook helpers from the project package (importable only after the path tweak above)
from word2gm_fast.utils.notebook_setup import setup_training_notebook, enable_autoreload, run_silent_subprocess

# Switch on autoreload while the package is under active development
enable_autoreload()

# Build the training environment for this project root.
# NOTE(review): an earlier comment here mentioned deterministic=True, but no such
# argument is passed; presumably it is a default of setup_training_notebook — confirm.
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Pull the frequently used handles out of the environment mapping
tf, np, pd = (env[key] for key in ('tensorflow', 'numpy', 'pandas'))
print_resource_summary = env['print_resource_summary']

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [2]:
# --- Robust diagnostic: print all model and optimizer variable names and shapes ---
import importlib
import word2gm_fast.training.train_loop as train_loop_mod
import word2gm_fast.training.notebook_training as notebook_training_mod
import tensorflow as tf
import numpy as np

# Re-import both training modules so on-disk edits are picked up before
# their `train_one_epoch` attributes are replaced further down in this cell.
for _module in (train_loop_mod, notebook_training_mod):
    importlib.reload(_module)

def diagnostic_train_one_epoch(model, optimizer, dataset, summary_writer=None, epoch=0):
    """Diagnostic stand-in for ``train_one_epoch``: dump state and run one step.

    Prints the identity, name and shape of every model trainable variable and
    every optimizer variable, then pulls at most one batch from ``dataset`` and
    runs ``train_loop_mod.train_step`` on it, printing the batch indices and
    the resulting loss.

    Args:
        model: object exposing ``trainable_variables``; each variable must have
            ``name`` and ``shape`` attributes.
        optimizer: object exposing ``variables`` either as an attribute/property
            or as a zero-argument method; both forms are accepted.
        dataset: object whose ``take(1)`` yields at most one ``(w, p, n)`` batch
            of tensors supporting ``.numpy()``.
        summary_writer: unused; accepted so the signature matches the function
            this one replaces.
        epoch: unused; accepted for the same reason.

    Returns:
        Always ``0.0`` (placeholder value; no real epoch loss is computed).

    Raises:
        Exception: whatever ``train_step`` raises is re-raised after its
            traceback has been printed.
    """
    import traceback

    print("\n=== DIAGNOSTIC: train_one_epoch called ===")
    print(f"Model id: {id(model)}  Optimizer id: {id(optimizer)}")

    print("Model trainable variables:")
    for v in model.trainable_variables:
        print(f"  {v.name} shape={v.shape} id={id(v)}")

    # `variables` is a plain attribute/property on some optimizer classes and a
    # method on others; normalise to an iterable before looping.
    optimizer_vars = optimizer.variables
    if callable(optimizer_vars):
        optimizer_vars = optimizer_vars()
    print("Optimizer variables (including slots):")
    for v in optimizer_vars:
        print(f"  {v.name} shape={v.shape} id={id(v)}")

    # take(1) yields zero or one batch, so a single next() is sufficient.
    first_batch = next(iter(dataset.take(1)), None)
    if first_batch is None:
        print("Dataset yielded no batches; skipping train_step.")
    else:
        w, p, n = first_batch
        # Convert once and reuse for both the preview and the max computation.
        w_np, p_np, n_np = w.numpy(), p.numpy(), n.numpy()
        print("Batch indices (first 10):", w_np[:10], p_np[:10], n_np[:10])
        print("Batch max index:", max(w_np.max(), p_np.max(), n_np.max()))
        try:
            loss = train_loop_mod.train_step(model, optimizer, w, p, n)
            print("train_step loss:", loss.numpy())
        except Exception:
            # Show the full traceback here, then let the caller see the error too.
            print("Exception during train_step:")
            traceback.print_exc()
            raise

    print("=== END DIAGNOSTIC ===\n")
    return 0.0

# Point the `train_one_epoch` attribute of both modules at the diagnostic stand-in.
# (notebook_training_mod presumably holds its own reference to the function,
# which is why it is patched as well — confirm against that module's imports.)
train_loop_mod.train_one_epoch = notebook_training_mod.train_one_epoch = diagnostic_train_one_epoch

In [3]:
# Print the resource report using the callable taken from the setup
# environment (env['print_resource_summary']).
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
============================================================
Hostname: cm048.hpc.nyu.edu

Job Allocation:
   CPUs: 14
   Memory: 125.0 GB
   Requested partitions: short,cs,cm,cpu_a100_2,cpu_a100_1,cpu_gpu
   Running on: SSH failed: Host key verification failed.
   Job ID: 63043526
   Node list: cm048

GPU Information:
   Error: NVML Shared Library Not Found

TensorFlow GPU Detection:
   TensorFlow detects 0 GPU(s)
   Built with CUDA: True
============================================================</pre>

In [4]:
import os
from pathlib import Path
from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Locations of the small-corpus artifacts (input) and of this test run's output
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'
# Ensure the output directory exists, creating missing parents; a no-op if already present.
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Build the dataset pipeline: cache → shuffle → batch → prefetch

# Set the path to your artifacts directory (already defined as artifacts_dir)
# Read the pre-built pipeline artifacts from artifacts_dir
artifacts = load_pipeline_artifacts(artifacts_dir)

# Pull out the pieces used by the rest of the notebook
vocab_table = artifacts['vocab_table']
raw_triplets_ds = artifacts['triplets_ds']
vocab_size = artifacts['vocab_size']

BATCH_SIZE = 256
SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 10

# cache -> shuffle -> batch -> prefetch, expressed as one chain
triplets_ds = (
    raw_triplets_ds
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [6]:
print("vocab_size:", vocab_size)

# Scan the dataset one batch at a time and reduce each batch with numpy,
# rather than unbatching and comparing one triplet per Python iteration.
# Dedicated loop names are used so the globals `w`, `p`, `n` that other
# diagnostic cells read are not overwritten by this check.
max_idx = -1
for w_batch, p_batch, n_batch in triplets_ds:
    batch_max = max(w_batch.numpy().max(), p_batch.numpy().max(), n_batch.numpy().max())
    max_idx = max(max_idx, int(batch_max))
print("max index in triplets:", max_idx)

# Every index must address a row of a vocab_size-row parameter table.
assert max_idx < vocab_size, f"triplet index {max_idx} is out of range for vocab_size {vocab_size}"

vocab_size: 5143
max index in triplets: 5142


In [None]:
# Full training run (5 epochs). Keyword arguments are grouped by theme; their
# exact semantics are defined by run_notebook_training, which is not shown here.
run_notebook_training(
    # data in, results out
    training_dataset=triplets_ds,
    save_path=output_dir,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    # model size and options
    vocab_size=vocab_size,
    embedding_size=30,
    num_mixtures=2,
    spherical=True,
    wout=False,
    # optimisation schedule
    learning_rate=0.01,
    epochs=5,
    adagrad=True,
    # clipping, bounds and numeric constants
    normclip=True,
    norm_cap=1.0,
    lower_sig=0.01,
    upper_sig=2.0,
    var_scale=10.0,
    loss_epsilon=1e-3,
    # monitoring
    monitor_interval=2,
    profile=False,
)

In [7]:
# Single-epoch run through the reloaded module, so the patched (diagnostic)
# train_one_epoch is exercised and the failure is reproduced with diagnostics.
# Keyword arguments are grouped by theme; their exact semantics are defined by
# run_notebook_training, which is not shown here.
notebook_training_mod.run_notebook_training(
    # data in, results out
    training_dataset=triplets_ds,
    save_path=output_dir,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    # model size and options
    vocab_size=vocab_size,
    embedding_size=30,
    num_mixtures=2,
    spherical=True,
    wout=False,
    # optimisation schedule
    learning_rate=0.01,
    epochs=1,
    adagrad=True,
    # clipping, bounds and numeric constants
    normclip=True,
    norm_cap=1.0,
    lower_sig=0.01,
    upper_sig=2.0,
    var_scale=10.0,
    loss_epsilon=1e-3,
    # monitoring
    monitor_interval=1,
    profile=False,
)


Dataset pipeline structure (oldest to newest):
  [0] TFRecordDatasetV2
  [1] _ParallelMapDataset
  [2] _ParallelMapDataset
  [3] CacheDataset
  [4] _ShuffleDataset
  [5] _BatchDataset
  [6] _PrefetchDataset


<pre>
Starting Word2GM training</pre>

<pre>Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard</pre>

Optimizer slot variables initialized.
All optimizer variables and shapes after init:
  iteration: shape ()
  learning_rate: shape ()
  word2gm_model_mus_accumulator: shape (5143, 2, 30)
  word2gm_model_logsigmas_accumulator: shape (5143, 2, 1)
  word2gm_model_mixture_accumulator: shape (5143, 2)

[Diagnostics] Model trainable variables:
  mus: shape (5143, 2, 30)
  logsigmas: shape (5143, 2, 1)
  mixture: shape (5143, 2)

[Diagnostics] Optimizer variables:
  iteration: shape ()
  learning_rate: shape ()
  word2gm_model_mus_accumulator: shape (5143, 2, 30)
  word2gm_model_logsigmas_accumulator: shape (5143, 2, 1)
  word2gm_model_mixture_accumulator: shape (5143, 2)

Training model vocab_size: 5143
model.mus.shape: (5143, 2, 30)
model.logsigmas.shape: (5143, 2, 1)
model.mixture.shape: (5143, 2)
id(model): 22755893395680
id(optimizer): 22755893305824
Batch min/max indices: 29 5108 38 5135 7 5140
All index and shape checks passed at training start.

=== DIAGNOSTIC: train_one_epoch called =

Traceback (most recent call last):
  File "/state/partition1/job-63043526/ipykernel_2194746/3796053952.py", line 25, in diagnostic_train_one_epoch
    loss = train_loop_mod.train_step(model, optimizer, w, p, n)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/edk202/word2gm-fast/src/word2gm_fast/training/training_utils.py", line 88, in train_step
    optimizer.apply_gradients(grads_and_vars)
  File "/ext3/miniforge3/envs/word2gm-fast2/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 463, in apply_gradients
    self.apply(grads, trainable_variables)
  File "/ext3/miniforge3/envs/word2gm-fast2/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 527, in apply
    self._backend_apply_gradients(grads, trainable_variables)
  File "/ext3/miniforge3/envs/word2gm-fast2/lib/python3.12/site-packages/keras/src/optimizers/base_optimizer.py", line 593, in _backend_apply_gradients
    self._backend_update_step(
  File "/

InvalidArgumentError: {{function_node __wrapped__GatherV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} indices[6] = 1421 is not in [0, 769) [Op:GatherV2] name: 

In [None]:
# Diagnostic: Inspect loss, gradients, and parameter stats after one batch
from word2gm_fast.models.word2gm_model import Word2GMModel
import tensorflow as tf
import numpy as np

# Get a single batch for diagnostics; fail loudly if the dataset is empty
# instead of hitting a NameError on `w` further down.
for batch in triplets_ds.take(1):
    w, p, n = batch
    break
else:
    raise RuntimeError('No batch found in triplets_ds!')

# Build a fresh model for diagnostics, mirroring the training cells' settings.
class Config:
    vocab_size = vocab_size
    embedding_size = 30
    num_mixtures = 2
    spherical = True
    lower_sig = 0.01
    upper_sig = 2.0
    var_scale = 10.0  # the training cells pass var_scale=10.0; keep the diagnostic in step
    loss_epsilon = 1e-3
    wout = False
    max_pe = False

diagnostic_model = Word2GMModel(Config())
diagnostic_model.build([(None,), (None,), (None,)])

# Margin used by the diagnostic loss below
margin = 0.1

def call_with_margin(inputs, training=None):
    """Max-margin loss on a (word, pos, neg) batch using the module-level `margin`.

    Returns the mean loss plus a dict holding the per-sample loss and the
    negative log energies of the positive and negative pairs.
    """
    word_ids, pos_ids, neg_ids = inputs
    word_mus, word_vars, word_weights = diagnostic_model.get_word_distributions(word_ids)
    pos_mus, pos_vars, pos_weights = diagnostic_model.get_word_distributions(pos_ids, use_output=diagnostic_model.config.wout)
    neg_mus, neg_vars, neg_weights = diagnostic_model.get_word_distributions(neg_ids, use_output=diagnostic_model.config.wout)
    pos_energy = diagnostic_model.expected_likelihood_kernel(word_mus, word_vars, word_weights, pos_mus, pos_vars, pos_weights)
    neg_energy = diagnostic_model.expected_likelihood_kernel(word_mus, word_vars, word_weights, neg_mus, neg_vars, neg_weights)
    # Clip into [loss_epsilon, 1e6] before taking the log
    pos_energy = tf.clip_by_value(pos_energy, diagnostic_model.config.loss_epsilon, 1e6)
    neg_energy = tf.clip_by_value(neg_energy, diagnostic_model.config.loss_epsilon, 1e6)
    pos_log_energy = -tf.math.log(pos_energy)
    neg_log_energy = -tf.math.log(neg_energy)
    zero = tf.constant(0.0, dtype=pos_log_energy.dtype)
    loss_per_sample = tf.maximum(zero, margin + pos_log_energy - neg_log_energy)
    return tf.reduce_mean(loss_per_sample), {
        'per_sample_loss': loss_per_sample,
        'log_pos': pos_log_energy,
        'log_neg': neg_log_energy,
    }

# Forward pass under the tape with the model's call temporarily replaced.
# try/finally guarantees the original call is restored even if the forward
# pass raises, so a failed run cannot leave the model patched.
with tf.GradientTape() as tape:
    orig_call = diagnostic_model.call
    diagnostic_model.call = call_with_margin
    try:
        loss, details = diagnostic_model((w, p, n), training=True)
    finally:
        diagnostic_model.call = orig_call

# Print loss and per-sample loss
print(f"Batch loss: {loss.numpy():.6f}")
print("Per-sample loss (first 10):", details['per_sample_loss'][:10].numpy())

# Print log energies (first 10)
print("Log energies (pos, first 10):", details['log_pos'][:10].numpy())
print("Log energies (neg, first 10):", details['log_neg'][:10].numpy())

# Check for NaNs/Infs in the per-sample loss
print("Any NaN in per-sample loss?", tf.math.reduce_any(tf.math.is_nan(details['per_sample_loss'])).numpy())
print("Any Inf in per-sample loss?", tf.math.reduce_any(tf.math.is_inf(details['per_sample_loss'])).numpy())

# Compute gradients and summarise each one (None means no gradient reached that variable)
grads = tape.gradient(loss, diagnostic_model.trainable_variables)
for i, (g, v) in enumerate(zip(grads, diagnostic_model.trainable_variables)):
    if g is not None:
        print(f"Grad {i} ({v.name}): min={tf.reduce_min(g).numpy():.4e}, max={tf.reduce_max(g).numpy():.4e}, mean={tf.reduce_mean(g).numpy():.4e}")
    else:
        print(f"Grad {i} ({v.name}): None")

# Print parameter samples for vocabulary row 0
mus = diagnostic_model.mus.numpy()
logsigmas = diagnostic_model.logsigmas.numpy()
mixture = diagnostic_model.mixture.numpy()
print("mus[0,0,:5]:", mus[0,0,:5])
print("logsigmas[0,0,:5]:", logsigmas[0,0,:5])
print("mixture[0,:]:", mixture[0,:])

In [None]:
# Diagnostic: show the raw expected_likelihood_kernel value, and its negative
# log, for the first few (word, positive, negative) triplets of the batch
# held in `w`, `p`, `n`.
num_samples = 5
w_np, p_np, n_np = (ids.numpy() for ids in (w, p, n))

for i in range(num_samples):
    idx_w, idx_p, idx_n = w_np[i], p_np[i], n_np[i]

    # (mus, vars, weights) triple for each word, looked up as a batch of one
    dist_w = diagnostic_model.get_word_distributions(tf.constant([idx_w]))
    dist_p = diagnostic_model.get_word_distributions(tf.constant([idx_p]))
    dist_n = diagnostic_model.get_word_distributions(tf.constant([idx_n]))

    # Kernel of the word against its positive and its negative partner
    k_pos = diagnostic_model.expected_likelihood_kernel(*dist_w, *dist_p).numpy()[0]
    k_neg = diagnostic_model.expected_likelihood_kernel(*dist_w, *dist_n).numpy()[0]

    print(f'Sample {i}:')
    print(f'  w={idx_w}, p={idx_p}, n={idx_n}')
    print(f'  kernel(w, p) = {k_pos}')
    print(f'  kernel(w, n) = {k_neg}')
    print(f'  -log(kernel(w, p)) = {-np.log(k_pos)}')
    print(f'  -log(kernel(w, n)) = {-np.log(k_neg)}')

In [None]:
# --- Diagnostic: Check model/loss behavior on a single batch and optimizer step (margin=0.1) ---

from word2gm_fast.models.word2gm_model import Word2GMModel
import tensorflow as tf
import numpy as np

# Get a single batch from the dataset (the for/else raises if it is empty)
for batch in triplets_ds.take(1):
    word_ids, pos_ids, neg_ids = batch
    break
else:
    raise RuntimeError('No batch found in triplets_ds!')

# Build a fresh model mirroring the training cells' settings. lower_sig,
# upper_sig and var_scale are set to the values the training cells pass.
model = Word2GMModel(type('Config', (), dict(
    vocab_size=vocab_size,
    embedding_size=30,
    num_mixtures=2,
    spherical=True,
    lower_sig=0.01,
    upper_sig=2.0,
    var_scale=10.0,
    loss_epsilon=1e-3,
    wout=False,
    max_pe=False
))())
model.build([(None,), (None,), (None,)])

# Snapshot the initial weights for the before/after comparison
init_weights = [weight.numpy().copy() for weight in model.trainable_weights]

# Margin used for BOTH loss evaluations below so they are directly comparable
margin = 0.1

def call_with_margin(inputs, training=None):
    """Mean max-margin loss on a (word, pos, neg) batch using the module-level `margin`."""
    word_ids, pos_ids, neg_ids = inputs
    word_mus, word_vars, word_weights = model.get_word_distributions(word_ids)
    pos_mus, pos_vars, pos_weights = model.get_word_distributions(pos_ids, use_output=model.config.wout)
    neg_mus, neg_vars, neg_weights = model.get_word_distributions(neg_ids, use_output=model.config.wout)
    pos_energy = model.expected_likelihood_kernel(word_mus, word_vars, word_weights, pos_mus, pos_vars, pos_weights)
    neg_energy = model.expected_likelihood_kernel(word_mus, word_vars, word_weights, neg_mus, neg_vars, neg_weights)
    # Clip into [loss_epsilon, 1e6] before taking the log
    pos_energy = tf.clip_by_value(pos_energy, model.config.loss_epsilon, 1e6)
    neg_energy = tf.clip_by_value(neg_energy, model.config.loss_epsilon, 1e6)
    pos_log_energy = -tf.math.log(pos_energy)
    neg_log_energy = -tf.math.log(neg_energy)
    zero = tf.constant(0.0, dtype=pos_log_energy.dtype)
    loss_per_sample = tf.maximum(zero, margin + pos_log_energy - neg_log_energy)
    return tf.reduce_mean(loss_per_sample)

def margin_loss(batch_inputs):
    """Evaluate `model(batch_inputs)` with `call` temporarily swapped for call_with_margin.

    The original `call` is restored in a finally block, so an exception in the
    forward pass cannot leave the model patched.
    """
    orig_call = model.call
    model.call = call_with_margin
    try:
        return model(batch_inputs, training=True)
    finally:
        model.call = orig_call

# Forward pass under the tape
with tf.GradientTape() as tape:
    loss = margin_loss((word_ids, pos_ids, neg_ids))
print(f'Initial loss for batch (margin={margin}): {loss.numpy()}')

# Compute gradients and apply one optimizer step
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
grads = tape.gradient(loss, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))

# Report, per weight, whether the step changed it and by how much
changed = [not np.allclose(before, after.numpy()) for before, after in zip(init_weights, model.trainable_weights)]
print('Any weights changed after one optimizer step?', any(changed))
for i, (before, after) in enumerate(zip(init_weights, model.trainable_weights)):
    if changed[i]:
        print(f'Weight {i} changed: max diff = {np.abs(before - after.numpy()).max()}')
    else:
        print(f'Weight {i} did NOT change.')

# Re-evaluate with the SAME margin-based loss; going through the model's own
# call here would use whatever margin that implementation applies and make the
# before/after numbers incomparable.
loss2 = margin_loss((word_ids, pos_ids, neg_ids))
print(f'Loss after one optimizer step: {loss2.numpy()}')

# Check for NaNs/Infs in loss and weights. The loop variable is deliberately not
# called `w`, so the batch tensor `w` used by the other diagnostic cells survives.
print('Loss has NaN:', np.isnan(loss2.numpy()).any(), 'Inf:', np.isinf(loss2.numpy()).any())
for i, weight in enumerate(model.trainable_weights):
    values = weight.numpy()
    print(f'Weight {i}: min={values.min()}, max={values.max()}, has NaN={np.isnan(values).any()}, has Inf={np.isinf(values).any()}')
