In [1]:
# Set project root directory and add `src` to path
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'
 
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import the notebook setup utilities
from word2gm_fast.utils.notebook_setup import setup_training_notebook, enable_autoreload, run_silent_subprocess

# Enable autoreload for development
enable_autoreload()

# Set up environment
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Extract commonly used modules for convenience
tf = env['tensorflow']
np = env['numpy']
pd = env['pandas']
print_resource_summary = env['print_resource_summary']

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [6]:
import os
from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Define paths for your small corpus artifacts and output
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/2000_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [7]:
# Set the path to your artifacts directory (already defined as artifacts_dir)
artifacts = load_pipeline_artifacts(artifacts_dir)

# Unpack the loaded artifacts
vocab_table = artifacts['vocab_table']
triplets_ds = artifacts['triplets_ds']
vocab_size = artifacts['vocab_size']

triplets_ds = triplets_ds.cache()

BATCH_SIZE = 1024 * 32
triplets_ds = triplets_ds.batch(BATCH_SIZE)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/2000_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/2000_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/2000_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [None]:
run_notebook_training(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,
    embedding_size=16,
    num_mixtures=2,
    spherical=True,
    learning_rate=0.1,
    epochs=2,
    adagrad=True,
    normclip=True,
    norm_cap=5.0,
    lower_sig=0.05,
    upper_sig=1.0,
    wout=False,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    monitor_interval=2,
    profile=False
)

Dataset pipeline structure (oldest to newest):
  [0] TFRecordDatasetV2
  [1] _ParallelMapDataset
  [2] _ParallelMapDataset
  [3] CacheDataset
  [4] _BatchDataset


<pre>
Starting Word2GM training</pre>

<pre>Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard</pre>

In [5]:
# Print max resource usage after training (if ResourceMonitor was used)
from word2gm_fast.utils.resource_monitor import ResourceMonitor
if hasattr(ResourceMonitor, 'get_last_instance') and ResourceMonitor.get_last_instance() is not None:
    monitor = ResourceMonitor.get_last_instance()
    max_stats = monitor.get_max_stats()
    print('Max resource usage during training:')
    percent_keys = {'cpu_percent', 'mem_percent', 'gpu_util', 'gpu_mem_percent'}
    for k, v in max_stats.items():
        if k in percent_keys:
            print(f'  {k}: {v}%')
        else:
            print(f'  {k}: {v}')
else:
    print('ResourceMonitor instance not found or not used.')

Max resource usage during training:
  cpu_percent: 43.0%
  mem_percent: 28.8%
  gpu_util: 18%
  gpu_mem_percent: 3.8234710693359375%


In [14]:
# Inspect output files or model weights (optional)
print('Output directory contents:', os.listdir(output_dir))

Output directory contents: ['model_weights_epoch1.weights.h5', 'tensorboard', 'model_weights_epoch2.weights.h5']


In [None]:
# Launch TensorBoard inside the notebook for live monitoring
from tensorboard import notebook as tb_notebook
import os
tb_logdir = os.path.join(output_dir, 'tensorboard')
print(f"TensorBoard logdir: {tb_logdir}")
tb_notebook.start('--logdir', tb_logdir)