In [16]:
# Set project root directory and add `src` to path
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'
 
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import the notebook setup utilities
from word2gm_fast.utils.notebook_setup import setup_training_notebook, enable_autoreload, run_silent_subprocess

# Enable mixed precision for GPU training
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

# Enable autoreload for development
enable_autoreload()

# Set up environment
env = setup_training_notebook(project_root=PROJECT_ROOT)

# Extract commonly used modules for convenience
tf = env['tensorflow']
np = env['numpy']
pd = env['pandas']
print_resource_summary = env['print_resource_summary']

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Training environment ready!</pre>

In [24]:
import os
from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts

# Define paths for your small corpus artifacts and output
artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_small_corpus'
from pathlib import Path
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [31]:
# Set the path to your artifacts directory (already defined as artifacts_dir)
artifacts = load_pipeline_artifacts(artifacts_dir)

# Unpack the loaded artifacts
vocab_table = artifacts['vocab_table']
triplets_ds = artifacts['triplets_ds']
vocab_size = artifacts['vocab_size']

triplets_ds = triplets_ds.cache()

BATCH_SIZE = 5096
triplets_ds = triplets_ds.batch(BATCH_SIZE)

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1937_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

In [None]:
# Run the training pipeline
run_notebook_training(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,  # Set to your actual vocab size
    embedding_size=16,
    num_mixtures=2,
    spherical=True,
    learning_rate=0.1,
    epochs=2,
    adagrad=True,
    normclip=True,
    norm_cap=5.0,
    lower_sig=0.05,
    upper_sig=1.0,
    wout=False,
    tensorboard_log_path=os.path.join(output_dir, 'tensorboard'),
    monitor_interval=10,
    profile=False
)

🔍 Dataset pipeline structure:
🔹 _BatchDataset
  🔹 CacheDataset
    🔹 _ParallelMapDataset
      🔹 _ParallelMapDataset
        🔹 TFRecordDatasetV2

🚀 Starting Word2GM training
📝 Writing TensorBoard logs to /scratch/edk202/word2gm-fast/output/test_small_corpus/tensorboard

📘 Epoch 1/2
[Resource] Step 0: CPU 0.0% Mem 8.0% GPU 0.0% GPU Mem 1.8% 


2025-06-28 01:02:18.348590: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2025-06-28 01:02:19.930784: E tensorflow/core/framework/node_def_util.cc:680] NodeDef mentions attribute use_unbounded_threadpool which is not in the op definition: Op<name=MapDataset; signature=input_dataset:variant, other_arguments: -> handle:variant; attr=f:func; attr=Targuments:list(type),min=0; attr=output_types:list(type),min=1; attr=output_shapes:list(shape),min=1; attr=use_inter_op_parallelism:bool,default=true; attr=preserve_cardinality:bool,default=false; attr=force_synchronous:bool,default=false; attr=metadata:string,default=""> This may be expected if your g

[Resource] Step 1: CPU 10.9% Mem 8.0% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 2: CPU 12.3% Mem 8.1% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 2: CPU 12.3% Mem 8.1% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 3: CPU 12.1% Mem 8.1% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 3: CPU 12.1% Mem 8.1% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 4: CPU 12.2% Mem 7.9% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 4: CPU 12.2% Mem 7.9% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 5: CPU 12.1% Mem 8.2% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 5: CPU 12.1% Mem 8.2% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 6: CPU 12.3% Mem 8.2% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 6: CPU 12.3% Mem 8.2% GPU 3.0% GPU Mem 1.8% 
[Resource] Step 7: CPU 11.9% Mem 8.3% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 7: CPU 11.9% Mem 8.3% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 8: CPU 12.2% Mem 8.3% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 8: CPU 12.2% Mem 8.3% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 9: CPU 12.1% Mem 8.3% GPU 1.0% GPU Mem 1.8% 
[Resource] Step 9: CPU 1

In [None]:
# Inspect output files or model weights (optional)
print('Output directory contents:', os.listdir(output_dir))

In [None]:
# Launch TensorBoard inside the notebook for live monitoring
from tensorboard import notebook as tb_notebook
import os
tb_logdir = os.path.join(output_dir, 'tensorboard')
print(f"TensorBoard logdir: {tb_logdir}")
tb_notebook.start('--logdir', tb_logdir)