In [1]:
# Point at the project checkout and make its `src` directory importable
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'

# Prepend `src` only once so re-running this cell does not grow sys.path
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

# Project helpers can only be imported after the sys.path fix-up above
from word2gm_fast.utils.notebook_setup import (
    setup_training_notebook,
    enable_autoreload,
    run_silent_subprocess,
)

# Turn on autoreload (development convenience)
enable_autoreload()

# Build the training environment for this project root
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Bind the frequently used handles from `env` to short names
tf, np, pd = (env[key] for key in ('tensorflow', 'numpy', 'pandas'))
print_resource_summary = env['print_resource_summary']

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [2]:
import os
from pathlib import Path

from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Input: directory holding the small-corpus pipeline artifacts
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts'

# Output: directory for this test run; created here (with parents) if missing
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Number of examples grouped into each batch
BATCH_SIZE = 1024 * 32

# Load the pipeline artifacts from artifacts_dir (defined in an earlier cell)
artifacts = load_pipeline_artifacts(artifacts_dir)

# Pull out the pieces used by the rest of the notebook
vocab_table, vocab_size = artifacts['vocab_table'], artifacts['vocab_size']

# Cache the triplet dataset first, then batch it
triplets_ds = artifacts['triplets_ds'].cache().batch(BATCH_SIZE)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts/vocab.tfrecord</pre>

I0000 00:00:1751132431.006495 1766813 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30681 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability: 7.0


<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [4]:
# Hyperparameters and run options for this small test run, gathered in one
# place so they are easy to scan and tweak.
# NOTE(review): the recorded output of this cell reports "Loss: 1.00000" for
# both epochs - confirm that the model parameters are actually being updated.
training_options = dict(
    embedding_size=16,
    num_mixtures=2,
    spherical=True,
    learning_rate=0.1,
    epochs=2,
    adagrad=True,
    normclip=True,
    norm_cap=5.0,
    lower_sig=0.05,
    upper_sig=1.0,
    wout=False,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    monitor_interval=2,
    profile=False,
)

# Train on the batched triplets; results are saved under output_dir
run_notebook_training(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,
    **training_options,
)

Dataset pipeline structure (oldest to newest):
  [0] TFRecordDatasetV2
  [1] _ParallelMapDataset
  [2] _ParallelMapDataset
  [3] CacheDataset
  [4] _BatchDataset


<pre>
Starting Word2GM training</pre>

<pre>Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard</pre>

I0000 00:00:1751132440.667431 1767149 service.cc:152] XLA service 0x14e60c003b00 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751132440.667455 1767149 service.cc:160]   StreamExecutor device (0): Tesla V100-PCIE-32GB, Compute Capability 7.0
I0000 00:00:1751132440.694365 1767149 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1751132440.813662 1767149 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


<pre>Epoch 1/2 | Loss: 1.00000 | Time: 177.54 sec</pre>

<pre>Epoch 2/2 | Loss: 1.00000 | Time: 14.70 sec</pre>

<pre>Total training time: 194.61 seconds</pre>

In [None]:
# Inspect output files or model weights (optional).
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# displayed listing stable from one run to the next.
print('Output directory contents:', sorted(os.listdir(output_dir)))

In [None]:
# Launch TensorBoard inside the notebook for live monitoring
import os
import shlex

from tensorboard import notebook as tb_notebook

# Same log directory that was passed to the training call
tb_logdir = os.path.join(output_dir, 'tensorboard')
print(f"TensorBoard logdir: {tb_logdir}")

# notebook.start() takes ONE command-line string (it is split with shlex),
# not separate argv items; passing two positional arguments raises TypeError.
# Quote the path so a logdir containing spaces survives the split.
tb_notebook.start(f'--logdir {shlex.quote(tb_logdir)}')

In [None]:
# Report peak resource usage recorded during training, when a ResourceMonitor
# instance is available
from word2gm_fast.utils.resource_monitor import ResourceMonitor

# The class must expose get_last_instance() AND it must return an instance
monitor_available = (
    hasattr(ResourceMonitor, 'get_last_instance')
    and ResourceMonitor.get_last_instance() is not None
)

if not monitor_available:
    print('ResourceMonitor instance not found or not used.')
else:
    monitor = ResourceMonitor.get_last_instance()
    max_stats = monitor.get_max_stats()
    print('Max resource usage during training:')
    # One indented "name: value" line per recorded statistic
    for stat_name, stat_value in max_stats.items():
        print(f'  {stat_name}: {stat_value}')