In [2]:
# Make the project's `src` directory importable from this notebook
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root.joinpath('src')

# Prepend only once so re-running this cell does not add duplicate entries
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# Notebook helpers shipped with the project (requires `src` on sys.path)
from word2gm_fast.utils.notebook_setup import (
    enable_autoreload,
    run_silent_subprocess,
    setup_training_notebook,
)

# Turn on autoreload for development
enable_autoreload()

# Prepare the training environment.
# NOTE(review): `deterministic` is not passed explicitly here, so the helper's
# default applies — confirm it if reproducibility matters.
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Pull the frequently used entries out of the returned mapping
tf, np, pd, print_resource_summary = (
    env[key] for key in ('tensorflow', 'numpy', 'pandas', 'print_resource_summary')
)

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [3]:
# Print the resource summary using the helper taken from the setup
# environment (env['print_resource_summary']).
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
============================================================
Hostname: gr038.hpc.nyu.edu

Job Allocation:
   CPUs: 14
   Memory: 125.0 GB
   Requested partitions: rtx8000,v100,a100_2,a100_1,h100_1
   Running on: SSH failed: Host key verification failed.
   Job ID: 63013420
   Node list: gr038

GPU Information:
   CUDA GPUs detected: 1
   GPU 0: Quadro RTX 8000
      Memory: 1.2/45.0 GB (43.8 GB free)
      Temperature: 37°C
      Utilization: GPU 0%, Memory 0%

TensorFlow GPU Detection:
   TensorFlow detects 1 GPU(s)
      /physical_device:GPU:0, Memory growth: True
   Built with CUDA: True
============================================================</pre>

In [10]:
import os
from pathlib import Path

from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Locations of the small-corpus artifacts (input) and of the training output
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1800_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'

# Create the output directory (and any missing parents); a no-op if it already exists
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the pipeline artifacts, then build the input pipeline:
# cache -> shuffle -> batch -> prefetch

# `artifacts_dir` is defined in an earlier cell
artifacts = load_pipeline_artifacts(artifacts_dir)

# Unpack the entries this notebook uses
vocab_table, triplets_ds, vocab_size = (
    artifacts[key] for key in ('vocab_table', 'triplets_ds', 'vocab_size')
)

# The shuffle buffer holds ten batches' worth of elements
BATCH_SIZE = 1024 * 8
SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 10

# Cache sits before shuffle, so the shuffle step is applied on top of the
# cached elements rather than being cached itself.
triplets_ds = (
    triplets_ds
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1800_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1800_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1800_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [27]:
# Sanity-check one batch: NaN/Inf flags first, then the min/max of each ID tensor.
# NOTE(review): these look like integer ID tensors, for which NaN/Inf checks are
# always False — a range check against vocab_size may be more telling; confirm dtype.
for batch in triplets_ds.take(1):
    word_ids, pos_ids, neg_ids = batch

    # Convert each tensor to NumPy once and reuse it for every check
    id_arrays = (
        ("word_ids", word_ids.numpy()),
        ("pos_ids", pos_ids.numpy()),
        ("neg_ids", neg_ids.numpy()),
    )

    for label, values in id_arrays:
        print(f"{label} NaN:", np.isnan(values).any(), "Inf:", np.isinf(values).any())

    for label, values in id_arrays:
        print(f"{label} min/max:", values.min(), values.max())

word_ids NaN: False Inf: False
pos_ids NaN: False Inf: False
neg_ids NaN: False Inf: False
word_ids min/max: 1 20673
pos_ids min/max: 1 20678
neg_ids min/max: 1 20683


In [26]:
# Gather every setting for this run in one mapping, then launch training with it.
# `triplets_ds`, `output_dir` and `vocab_size` come from earlier cells.
training_kwargs = dict(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,

    embedding_size=15,
    num_mixtures=1,
    spherical=True,

    learning_rate=0.001,
    epochs=10,
    adagrad=True,
    normclip=True,
    norm_cap=1.0,
    lower_sig=0.1,
    upper_sig=0.5,
    var_scale=0.2,
    loss_epsilon=1e-3,
    wout=False,

    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    monitor_interval=2,
    profile=False,
)

run_notebook_training(**training_kwargs)

Dataset pipeline structure (oldest to newest):
  [0] TFRecordDatasetV2
  [1] _ParallelMapDataset
  [2] _ParallelMapDataset
  [3] CacheDataset
  [4] _ShuffleDataset
  [5] _BatchDataset
  [6] _PrefetchDataset


<pre>
Starting Word2GM training</pre>

<pre>Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard</pre>

<pre>Epoch 1/10 | Loss: nan | Time: 20.07 sec</pre>

<pre>Epoch 2/10 | Loss: nan | Time: 15.13 sec</pre>

KeyboardInterrupt: 

In [24]:
# Diagnostic: reload the saved weights and report parameter and loss statistics
from word2gm_fast.models.word2gm_model import Word2GMModel
import numpy as np
import tensorflow as tf
import os

model_weights_path = os.path.join(output_dir, 'model_weights_epoch10.weights.h5')

if not os.path.exists(model_weights_path):
    print('Model weights not found for diagnostics.')
else:
    # Ad-hoc config object; the attribute values repeat those passed to the
    # training call so the rebuilt model matches the saved weights.
    config_attrs = dict(
        vocab_size=vocab_size,
        embedding_size=15,
        num_mixtures=1,
        spherical=True,
        var_scale=0.2,
        loss_epsilon=1e-3,
        wout=False,
        max_pe=False,  # not part of the training call; presumably read by the model — confirm
    )
    model = Word2GMModel(type('Config', (), config_attrs)())

    # The model is built before its weights are loaded
    model.build([(None,), (None,), (None,)])
    model.load_weights(model_weights_path)
    print('Loaded model weights.')

    # Summary statistics first, then NaN/Inf flags, for each parameter array
    mus = model.mus.numpy()
    logsigmas = model.logsigmas.numpy()
    named_params = (("mus", mus), ("logsigmas", logsigmas))

    for label, values in named_params:
        print(f"{label}: min={values.min()}, max={values.max()}, mean={values.mean()}, std={values.std()}")

    for label, values in named_params:
        print(f"{label} has NaN: {np.isnan(values).any()}, Inf: {np.isinf(values).any()}")

    # Push a single batch through the model and show what it returns
    for batch in triplets_ds.take(1):
        word_ids, pos_ids, neg_ids = batch
        triplet = (word_ids, pos_ids, neg_ids)
        out = model(triplet, training=False)
        print(f"Loss for batch: {out.numpy()}")

Loaded model weights.
mus: min=nan, max=nan, mean=nan, std=nan
logsigmas: min=nan, max=nan, mean=nan, std=nan
mus has NaN: True, Inf: False
logsigmas has NaN: True, Inf: False
Loss for batch: nan
Loss for batch: nan
