In [16]:
%load_ext autoreload
%autoreload 2

# Standard imports for TensorFlow and numpy
import tensorflow as tf
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Set project root directory and add `src` to path
import os
import sys
from pathlib import Path

# Default checkout location. Set the WORD2GM_PROJECT_ROOT environment variable
# to point the notebook at a different checkout without editing this cell;
# when the variable is unset the original hard-coded path is used.
PROJECT_ROOT = os.environ.get('WORD2GM_PROJECT_ROOT', '/scratch/edk202/word2gm-fast')
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'

# Insert at the front so this directory is searched first on import; the
# membership test keeps re-runs of the cell from adding duplicate entries.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Optionally, print sys.path for debugging
print('sys.path:', sys.path[:3], '...')

sys.path: ['/scratch/edk202/word2gm-fast/src', '/ext3/miniforge3/bin', '/ext3/miniforge3/condabin'] ...


In [18]:
# Data loading and pipeline setup
from word2gm_fast.utils.tfrecord_io import load_pipeline_artifacts
import os
from pathlib import Path

# Batching parameters: the shuffle buffer holds ten batches' worth of elements
BATCH_SIZE = 256
SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 10

# Location of the corpus artifacts to read, and the directory for output
dataset_artifacts_dir = '/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1850_artifacts'
output_dir = '/scratch/edk202/word2gm-fast/output/test_corpus'
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Load the artifact bundle and pull out the pieces used by later cells
artifacts = load_pipeline_artifacts(dataset_artifacts_dir)
vocab_table, triplets_ds, vocab_size = (
    artifacts[key] for key in ('vocab_table', 'triplets_ds', 'vocab_size')
)

# Input pipeline stages, applied in this order: cache -> shuffle -> batch -> prefetch
triplets_ds = (
    triplets_ds
    .cache()
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

print(f'Loaded vocab_size: {vocab_size}')

<pre>Loading pipeline artifacts from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1850_artifacts</pre>

<pre>Loading vocabulary TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1850_artifacts/vocab.tfrecord</pre>

<pre>Loading triplet TFRecord from: /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1850_artifacts/triplets.tfrecord</pre>

<pre>Triplet TFRecord loaded and parsed</pre>

<pre>All artifacts loaded successfully!</pre>

Loaded vocab_size: 33668


In [None]:
# Launch Word2GM training using notebook_training.py with standard hyperparameters
from word2gm_fast.training.notebook_training import run_notebook_training
from word2gm_fast.models.config import Word2GMConfig

# `config` is bound to the Word2GMConfig class itself (it is not instantiated),
# so every hyperparameter below is read from a class-level attribute.
# NOTE(review): confirm that using the class rather than an instance is intended.
config = Word2GMConfig

# Hyperparameters forwarded unchanged from the config, keyed by argument name
config_hyperparams = {
    field: getattr(config, field)
    for field in (
        'embedding_size', 'num_mixtures', 'spherical', 'learning_rate',
        'adagrad', 'normclip', 'norm_cap', 'lower_sig', 'upper_sig',
        'wout', 'var_scale', 'loss_epsilon',
    )
}

# Run-specific settings are given explicitly; the rest come from the config
run_notebook_training(
    training_dataset=triplets_ds,
    save_path=output_dir,
    vocab_size=vocab_size,
    epochs=10,
    tensorboard_log_path=None,
    monitor_interval=10,
    profile=False,
    **config_hyperparams,
)


**Word2GM Training Hyperparameters:**

| Parameter         | Value                |
|-------------------|----------------------|
| Vocab size        | `33668`  |
| Embedding size    | `200` |
| Mixtures          | `2` |
| Spherical         | `True`   |
| Learning rate     | `0.05` |
| Epochs            | `10` |
| Adagrad           | `True`     |
| Normclip          | `True`    |
| Norm cap          | `5.0`    |
| Lower sigma       | `0.05`   |
| Upper sigma       | `1.0`   |
| Wout              | `False`        |
| Var scale         | `0.05`   |
| Loss epsilon      | `1e-07`|


**Epoch 1 finished. Loss:** `0.247869`  | **Duration:** `62.19` seconds.

**Epoch 2 finished. Loss:** `0.168961`  | **Duration:** `102.76` seconds.

**Starting epoch 3/10...**