In [1]:
# Set project root directory and add `src` to path
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import the notebook setup utilities
from word2gm_fast.utils.notebook_setup import setup_training_notebook, enable_autoreload, run_silent_subprocess

# Enable autoreload for development
enable_autoreload()

# Set up environment (deterministic=True for reproducibility)
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Extract commonly used modules for convenience
tf = env['tensorflow']
np = env['numpy']
pd = env['pandas']
print_resource_summary = env['print_resource_summary']

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [2]:
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
============================================================
Hostname: cm040.hpc.nyu.edu

Job Allocation:
   CPUs: 14
   Memory: 125.0 GB
   Requested partitions: short,cs,cm,cpu_a100_2,cpu_a100_1,cpu_gpu
   Running on: SSH failed: Host key verification failed.
   Job ID: 63030133
   Node list: cm040

GPU Information:
   Error: NVML Shared Library Not Found

TensorFlow GPU Detection:
   TensorFlow detects 0 GPU(s)
   Built with CUDA: True
============================================================</pre>

In [3]:
import os
from pathlib import Path
from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Define paths for your small corpus aartifacts and output
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [4]:
# Build the dataset pipeline: cache → shuffle → batch → prefetch

# Set the path to your artifacts directory (already defined as artifacts_dir)
artifacts = load_pipeline_artifacts(artifacts_dir)

# Unpack the loaded artifacts
vocab_table = artifacts['vocab_table']
triplets_ds = artifacts['triplets_ds']
vocab_size = artifacts['vocab_size']

# Build the dataset pipeline: cache -> shuffle -> batch -> prefetch
triplets_ds = triplets_ds.cache()

BATCH_SIZE = 256
SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 10
triplets_ds = triplets_ds.shuffle(SHUFFLE_BUFFER_SIZE)
triplets_ds = triplets_ds.batch(BATCH_SIZE)
triplets_ds = triplets_ds.prefetch(tf.data.AUTOTUNE)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1750_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [5]:
print("vocab_size:", vocab_size)
max_idx = -1
for batch in triplets_ds.unbatch():
    w, p, n = batch
    max_idx = max(max_idx, w.numpy(), p.numpy(), n.numpy())
print("max index in triplets:", max_idx)

vocab_size: 5143
max index in triplets: 5142
max index in triplets: 5142


In [6]:
run_notebook_training(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,
    embedding_size=30,
    num_mixtures=2,
    spherical=True,
    learning_rate=0.01,
    epochs=5,
    adagrad=True,
    normclip=True,
    norm_cap=1.0,
    lower_sig=0.01,  # widened lower bound
    upper_sig=2.0,   # widened upper bound
    var_scale=1.0,   # larger initial variance
    loss_epsilon=1e-3,
    wout=False,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    monitor_interval=2,
    profile=False,
)

Dataset pipeline structure (oldest to newest):
  [0] TFRecordDatasetV2
  [1] _ParallelMapDataset
  [2] _ParallelMapDataset
  [3] CacheDataset
  [4] _ShuffleDataset
  [5] _BatchDataset
  [6] _PrefetchDataset


<pre>
Starting Word2GM training</pre>

<pre>Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard</pre>

<pre>Epoch 1/5 | Loss: 1.00000 | Time: 2.92 sec</pre>

<pre>Epoch 2/5 | Loss: 1.00000 | Time: 2.61 sec</pre>

<pre>Epoch 3/5 | Loss: 1.00000 | Time: 2.59 sec</pre>

<pre>Epoch 4/5 | Loss: 1.00000 | Time: 2.79 sec</pre>

<pre>Epoch 5/5 | Loss: 1.00000 | Time: 2.60 sec</pre>

<pre>Total training time: 13.67 seconds</pre>

In [11]:
# Diagnostic: Inspect loss, gradients, and parameter stats after one batch
from word2gm_fast.models.word2gm_model import Word2GMModel
import tensorflow as tf
import numpy as np

# Get a single batch for diagnostics
for batch in triplets_ds.take(1):
    w, p, n = batch
    break

# Build a fresh model for diagnostics (same config as training)
class Config:
    vocab_size = vocab_size
    embedding_size = 30
    num_mixtures = 2
    spherical = True
    lower_sig = 0.01
    upper_sig = 2.0
    var_scale = 1.0
    loss_epsilon = 1e-3
    wout = False
    max_pe = False

diagnostic_model = Word2GMModel(Config())
diagnostic_model.build([(None,), (None,), (None,)])

# Forward pass and loss computation (margin=0.1 for diagnostics)
margin = 0.1
with tf.GradientTape() as tape:
    # Patch the model's call to use a custom margin for this diagnostic
    orig_call = diagnostic_model.call
    def call_with_margin(inputs, training=None):
        word_ids, pos_ids, neg_ids = inputs
        word_mus, word_vars, word_weights = diagnostic_model.get_word_distributions(word_ids)
        pos_mus, pos_vars, pos_weights = diagnostic_model.get_word_distributions(pos_ids, use_output=diagnostic_model.config.wout)
        neg_mus, neg_vars, neg_weights = diagnostic_model.get_word_distributions(neg_ids, use_output=diagnostic_model.config.wout)
        pos_energy = diagnostic_model.expected_likelihood_kernel(word_mus, word_vars, word_weights, pos_mus, pos_vars, pos_weights)
        neg_energy = diagnostic_model.expected_likelihood_kernel(word_mus, word_vars, word_weights, neg_mus, neg_vars, neg_weights)
        pos_energy = tf.clip_by_value(pos_energy, diagnostic_model.config.loss_epsilon, 1e6)
        neg_energy = tf.clip_by_value(neg_energy, diagnostic_model.config.loss_epsilon, 1e6)
        pos_log_energy = -tf.math.log(pos_energy)
        neg_log_energy = -tf.math.log(neg_energy)
        zero = tf.constant(0.0, dtype=pos_log_energy.dtype)
        loss_per_sample = tf.maximum(zero, margin + pos_log_energy - neg_log_energy)
        return tf.reduce_mean(loss_per_sample), {
            'per_sample_loss': loss_per_sample,
            'log_pos': pos_log_energy,
            'log_neg': neg_log_energy,
        }
    diagnostic_model.call = call_with_margin
    loss, details = diagnostic_model((w, p, n), training=True)
    diagnostic_model.call = orig_call

# Print loss and per-sample loss
print(f"Batch loss: {loss.numpy():.6f}")
print("Per-sample loss (first 10):", details['per_sample_loss'][:10].numpy())

# Print log energies (first 10)
print("Log energies (pos, first 10):", details['log_pos'][:10].numpy())
print("Log energies (neg, first 10):", details['log_neg'][:10].numpy())

# Check for NaNs/Infs in loss and log energies
print("Any NaN in per-sample loss?", tf.math.reduce_any(tf.math.is_nan(details['per_sample_loss'])).numpy())
print("Any Inf in per-sample loss?", tf.math.reduce_any(tf.math.is_inf(details['per_sample_loss'])).numpy())

# Compute gradients
grads = tape.gradient(loss, diagnostic_model.trainable_variables)
for i, (g, v) in enumerate(zip(grads, diagnostic_model.trainable_variables)):
    if g is not None:
        print(f"Grad {i} ({v.name}): min={tf.reduce_min(g).numpy():.4e}, max={tf.reduce_max(g).numpy():.4e}, mean={tf.reduce_mean(g).numpy():.4e}")
    else:
        print(f"Grad {i} ({v.name}): None")

# Print parameter stats (first mixture mean, log-sigma, and mixture logits)
mus = diagnostic_model.mus.numpy()
logsigmas = diagnostic_model.logsigmas.numpy()
mixture = diagnostic_model.mixture.numpy()
print("mus[0,0,:5]:", mus[0,0,:5])
print("logsigmas[0,0,:5]:", logsigmas[0,0,:5])
print("mixture[0,:]:", mixture[0,:])

Batch loss: 0.100000
Per-sample loss (first 10): [0.0999999 0.0999999 0.0999999 0.0999999 0.0999999 0.0999999 0.0999999
 0.0999999 0.0999999 0.0999999]
Log energies (pos, first 10): [6.9077554 6.9077554 6.9077554 6.9077554 6.9077554 6.9077554 6.9077554
 6.9077554 6.9077554 6.9077554]
Log energies (neg, first 10): [6.9077554 6.9077554 6.9077554 6.9077554 6.9077554 6.9077554 6.9077554
 6.9077554 6.9077554 6.9077554]
Any NaN in per-sample loss? False
Any Inf in per-sample loss? False
Grad 0 (mus): min=0.0000e+00, max=0.0000e+00, mean=0.0000e+00
Grad 1 (logsigmas): min=0.0000e+00, max=0.0000e+00, mean=0.0000e+00
Grad 2 (mixture): min=0.0000e+00, max=0.0000e+00, mean=0.0000e+00
mus[0,0,:5]: [ 1.4378221 -0.7192576  1.562633  -1.2091137 -0.609277 ]
logsigmas[0,0,:5]: [0.00225813]
mixture[0,:]: [-0.00554704 -0.00301867]


In [13]:
# Diagnostic: Print raw expected_likelihood_kernel outputs for a few word pairs
num_samples = 5
w_np, p_np, n_np = w.numpy(), p.numpy(), n.numpy()
for i in range(num_samples):
    idx_w, idx_p, idx_n = w_np[i], p_np[i], n_np[i]
    # Get single-word distributions
    mus_w, vars_w, weights_w = diagnostic_model.get_word_distributions(tf.constant([idx_w]))
    mus_p, vars_p, weights_p = diagnostic_model.get_word_distributions(tf.constant([idx_p]))
    mus_n, vars_n, weights_n = diagnostic_model.get_word_distributions(tf.constant([idx_n]))
    # Compute kernel values
    k_pos = diagnostic_model.expected_likelihood_kernel(mus_w, vars_w, weights_w, mus_p, vars_p, weights_p).numpy()[0]
    k_neg = diagnostic_model.expected_likelihood_kernel(mus_w, vars_w, weights_w, mus_n, vars_n, weights_n).numpy()[0]
    print(f'Sample {i}:')
    print(f'  w={idx_w}, p={idx_p}, n={idx_n}')
    print(f'  kernel(w, p) = {k_pos}')
    print(f'  kernel(w, n) = {k_neg}')

    print(f'  -log(kernel(w, p)) = {-np.log(k_pos)}')
    print(f'  -log(kernel(w, n)) = {-np.log(k_neg)}')

Sample 0:
  w=3620, p=2028, n=2346
  kernel(w, p) = 1.3182880622634313e-13
  kernel(w, n) = 1.212661047966801e-12
  -log(kernel(w, p)) = 29.657272338867188
  -log(kernel(w, n)) = 27.438203811645508
Sample 1:
  w=1919, p=2028, n=3109
  kernel(w, p) = 1.6754596299760338e-13
  kernel(w, n) = 1.7411729881251087e-13
  -log(kernel(w, p)) = 29.417518615722656
  -log(kernel(w, n)) = 29.379047393798828
Sample 2:
  w=3164, p=2836, n=3654
  kernel(w, p) = 1.1842838341935558e-11
  kernel(w, n) = 1.9210861200436025e-12
  -log(kernel(w, p)) = 25.159297943115234
  -log(kernel(w, n)) = 26.978130340576172
Sample 3:
  w=3058, p=372, n=2497
  kernel(w, p) = 2.1205396650864766e-13
  kernel(w, n) = 1.7321638914880033e-12
  -log(kernel(w, p)) = 29.181936264038086
  -log(kernel(w, n)) = 27.081649780273438
Sample 4:
  w=3743, p=2028, n=2427
  kernel(w, p) = 3.81823817897399e-14
  kernel(w, n) = 8.794113576455001e-14
  -log(kernel(w, p)) = 30.89640235900879
  -log(kernel(w, n)) = 30.062108993530273


In [8]:
# --- Diagnostic: Check model/loss behavior on a single batch and optimizer step (margin=0.1) ---

from word2gm_fast.models.word2gm_model import Word2GMModel
import tensorflow as tf
import numpy as np

# Get a single batch from the dataset
for batch in triplets_ds.take(1):
    word_ids, pos_ids, neg_ids = batch
    break
else:
    raise RuntimeError('No batch found in triplets_ds!')

# Build a fresh model with the same config as training
model = Word2GMModel(type('Config', (), dict(
    vocab_size=vocab_size,
    embedding_size=30,  # match your training config
    num_mixtures=2,
    spherical=True,
    var_scale=1.0,  # match training config
    loss_epsilon=1e-3,
    wout=False,
    max_pe=False
))())
model.build([(None,), (None,), (None,)])

# Save initial weights for comparison
init_weights = [w.numpy().copy() for w in model.trainable_weights]

# Forward pass: compute loss for the batch with a smaller margin
margin = 0.1
with tf.GradientTape() as tape:
    # Patch the model's call to use a custom margin for this diagnostic
    orig_call = model.call
    def call_with_margin(inputs, training=None):
        word_ids, pos_ids, neg_ids = inputs
        word_mus, word_vars, word_weights = model.get_word_distributions(word_ids)
        pos_mus, pos_vars, pos_weights = model.get_word_distributions(pos_ids, use_output=model.config.wout)
        neg_mus, neg_vars, neg_weights = model.get_word_distributions(neg_ids, use_output=model.config.wout)
        pos_energy = model.expected_likelihood_kernel(word_mus, word_vars, word_weights, pos_mus, pos_vars, pos_weights)
        neg_energy = model.expected_likelihood_kernel(word_mus, word_vars, word_weights, neg_mus, neg_vars, neg_weights)
        pos_energy = tf.clip_by_value(pos_energy, model.config.loss_epsilon, 1e6)
        neg_energy = tf.clip_by_value(neg_energy, model.config.loss_epsilon, 1e6)
        pos_log_energy = -tf.math.log(pos_energy)
        neg_log_energy = -tf.math.log(neg_energy)
        zero = tf.constant(0.0, dtype=pos_log_energy.dtype)
        loss_per_sample = tf.maximum(zero, margin + pos_log_energy - neg_log_energy)
        return tf.reduce_mean(loss_per_sample)
    model.call = call_with_margin
    loss = model((word_ids, pos_ids, neg_ids), training=True)
    model.call = orig_call
print(f'Initial loss for batch (margin={margin}): {loss.numpy()}')

# Compute gradients and apply one optimizer step
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.01)
grads = tape.gradient(loss, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))

# Check if any weights changed
changed = [not np.allclose(w0, w1.numpy()) for w0, w1 in zip(init_weights, model.trainable_weights)]
print('Any weights changed after one optimizer step?', any(changed))
for i, (w0, w1) in enumerate(zip(init_weights, model.trainable_weights)):
    if not np.allclose(w0, w1.numpy()):
        print(f'Weight {i} changed: max diff = {np.abs(w0 - w1.numpy()).max()}')
    else:
        print(f'Weight {i} did NOT change.')

# Forward pass again to see if loss changes
loss2 = model((word_ids, pos_ids, neg_ids), training=True)
print(f'Loss after one optimizer step: {loss2.numpy()}')

# Check for NaNs/Infs in loss and weights
print('Loss has NaN:', np.isnan(loss2.numpy()).any(), 'Inf:', np.isinf(loss2.numpy()).any())
for i, w in enumerate(model.trainable_weights):
    print(f'Weight {i}: min={w.numpy().min()}, max={w.numpy().max()}, has NaN={np.isnan(w.numpy()).any()}, has Inf={np.isinf(w.numpy()).any()}')


Initial loss for batch (margin=0.1): 0.09999990463256836
Any weights changed after one optimizer step? False
Weight 0 did NOT change.
Weight 1 did NOT change.
Weight 2 did NOT change.
Loss after one optimizer step: 1.0
Loss has NaN: False Inf: False
Weight 0: min=-4.589941501617432, max=4.4528326988220215, has NaN=False, has Inf=False
Weight 1: min=-0.19939996302127838, max=0.19155453145503998, has NaN=False, has Inf=False
Weight 2: min=-0.03728317469358444, max=0.03973766416311264, has NaN=False, has Inf=False
