# Test Modules

In [1]:
# Notebook bootstrap: locate the project checkout, expose its `src` directory
# on sys.path, and build the shared testing environment used by later cells.
import os
import sys
from pathlib import Path

# Default to the cluster checkout, but let WORD2GM_PROJECT_ROOT point the
# notebook at a different clone without editing this cell.
PROJECT_ROOT = os.environ.get('WORD2GM_PROJECT_ROOT', '/scratch/edk202/word2gm-fast')
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'

# Guarded insert keeps the cell idempotent: re-running it never stacks
# duplicate entries on sys.path.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import the notebook setup utilities
from word2gm_fast.utils.notebook_setup import setup_testing_notebook, enable_autoreload, run_silent_subprocess

# Enable mixed precision for GPU training
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

# Enable autoreload for development
enable_autoreload()

# Set up environment
env = setup_testing_notebook(project_root=PROJECT_ROOT)

# Extract commonly used modules for convenience; `env` is indexed like a
# mapping keyed by these names.
tf = env['tensorflow']
np = env['numpy']
pd = env['pandas']
print_resource_summary = env['print_resource_summary']

2025-07-05 09:01:31.210611: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-05 09:01:31.227736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751720491.245777 4126438 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751720491.251079 4126438 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751720491.264863 4126438 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Testing environment ready!</pre>

In [2]:
# Show the system resource summary using the helper pulled out of the testing
# environment (`env['print_resource_summary']`) in the setup cell.
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
============================================================
Hostname: cm032.hpc.nyu.edu

Job Allocation:
   CPUs: 4
   Memory: 15.6 GB
   Requested partitions: short
   Running on: SSH failed: Host key verification failed.
   Job ID: 63398538
   Node list: cm032

GPU Information:
   Error: NVML Shared Library Not Found

TensorFlow GPU Detection:
   TensorFlow detects 0 GPU(s)
   Built with CUDA: True
============================================================</pre>

In [3]:
import subprocess

# Run the updated test suite with verbose output.
# The child is launched with sys.executable (sys is imported in the setup
# cell) so pytest runs under the same interpreter/environment as this kernel;
# a bare 'python' is resolved through PATH and may select a different install.
result = subprocess.run([
    sys.executable, '-m', 'pytest',
    'tests/',
    '-v',  # Verbose output
    '--tb=short',  # Short traceback format
    '-x'  # Stop on first failure
], capture_output=True, text=True, cwd=PROJECT_ROOT)

print("STDOUT:")
print(result.stdout)
# stderr is only shown when the child actually wrote something to it.
if result.stderr:
    print("\nSTDERR:")
    print(result.stderr)

# The return code is printed rather than raised on, so a failing suite does
# not abort the notebook run.
print(f"\nReturn code: {result.returncode}")

STDOUT:
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 69 items

tests/test_corpus_to_dataset.py::test_corpus_to_dataset [32mPASSED[0m[32m           [  1%][0m
tests/test_dataset_to_triplets.py::test_basic_triplet_generation [32mPASSED[0m[32m  [  2%][0m
tests/test_dataset_to_triplets.py::test_center_word_extraction [32mPASSED[0m[32m    [  4%][0m
tests/test_dataset_to_triplets.py::test_context_word_extraction [32mPASSED[0m[32m   [  5%][0m
tests/test_dataset_to_triplets.py::test_multiple_triplets_per_line [32mPASSED[0m[32m [  7%][0m
tests/test_dataset_to_triplets.py::test_negative_sampling_range [32mPASSED[0m[32m   [  8%][0m
tests/test_dataset_to_triplets.py::test_no_triplets_with_unk_context [32mPASSED[0m[32m [ 10%][0m
tests/test_dataset_to_triplets.py::test_frequency_

In [4]:
# Test the new IO modules specifically
print("Testing updated IO modules...")

# Run tests for the new modular IO structure.
# sys.executable (sys is imported in the setup cell) keeps the child pytest on
# the kernel's own interpreter instead of whatever 'python' is first on PATH.
io_tests = subprocess.run([
    sys.executable, '-m', 'pytest',
    'tests/test_tfrecord_io.py',
    'tests/test_io_integration.py',
    'tests/test_pipeline.py',
    '-v',
    '--tb=short'
], capture_output=True, text=True, cwd=PROJECT_ROOT)

print("IO Module Tests:")
print(io_tests.stdout)
# stderr is only shown when the child actually wrote something to it.
if io_tests.stderr:
    print("\nErrors:")
    print(io_tests.stderr)

print(f"IO Tests Return code: {io_tests.returncode}")
print("=" * 60)

Testing updated IO modules...
IO Module Tests:
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 35 items

tests/test_tfrecord_io.py::test_write_and_load_triplets_uncompressed [32mPASSED[0m[32m [  2%][0m
tests/test_tfrecord_io.py::test_write_and_load_triplets_compressed [32mPASSED[0m[32m [  5%][0m
tests/test_tfrecord_io.py::test_parse_triplet_example [32mPASSED[0m[32m             [  8%][0m
tests/test_tfrecord_io.py::test_write_and_load_vocab_uncompressed [32mPASSED[0m[32m [ 11%][0m
tests/test_tfrecord_io.py::test_write_and_load_vocab_with_frequencies [32mPASSED[0m[32m [ 14%][0m
tests/test_tfrecord_io.py::test_write_vocab_without_frequencies [32mPASSED[0m[32m   [ 17%][0m
tests/test_tfrecord_io.py::test_write_and_load_vocab_compressed [32mPASSED[0m[32m   [ 20%][0m
tests/t

In [5]:
# Manual verification of key IO functionality
import os
import tempfile
import traceback

print("Manual verification of new IO modules...")

# Path of the scratch TFRecord. It is tracked outside the try block so the
# finally clause can remove the file even when a step below raises.
tmp_vocab_path = None

try:
    # Test importing the new modular structure
    from word2gm_fast.io.vocab import write_vocab_to_tfrecord, parse_vocab_example
    from word2gm_fast.io.triplets import write_triplets_to_tfrecord, load_triplets_from_tfrecord
    from word2gm_fast.io.tables import create_token_to_index_table, create_index_to_token_table
    from word2gm_fast.io.artifacts import save_pipeline_artifacts, load_pipeline_artifacts
    print("✓ All new IO modules imported successfully")

    # Test that the old module is deprecated
    try:
        from word2gm_fast.utils.tfrecord_io import write_vocab_to_tfrecord as old_write_vocab
        print("⚠ Old tfrecord_io module still accessible (shows deprecation warning)")
    except ImportError:
        print("✓ Old tfrecord_io module properly removed")

    # Quick functionality test: a three-word vocabulary mapped to ids 0..2,
    # with unknown tokens falling back to id 0.
    vocab_words = ["UNK", "test", "words"]
    vocab_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(vocab_words),
            values=tf.constant([0, 1, 2], dtype=tf.int64)
        ),
        default_value=0
    )

    # Test vocab with frequencies.
    # NamedTemporaryFile is used only to reserve a unique path; the handle is
    # closed before the writer opens the same path itself.
    with tempfile.NamedTemporaryFile(suffix='.tfrecord', delete=False) as tmp:
        tmp_vocab_path = tmp.name

    frequencies = {"UNK": 100.0, "test": 50.0, "words": 25.0}
    write_vocab_to_tfrecord(vocab_table, tmp_vocab_path, frequencies=frequencies)

    # Test loading with table creation
    token_to_idx = create_token_to_index_table(tmp_vocab_path)
    test_idx = token_to_idx.lookup(tf.constant("test")).numpy()
    print(f"✓ Frequency-enabled vocab I/O working: 'test' -> {test_idx}")

    print("✓ Manual verification passed!")

except Exception as e:
    # Report the failure instead of aborting the notebook run.
    print(f"✗ Manual verification failed: {e}")
    traceback.print_exc()
finally:
    # Clean up on both the success and the failure path so a failed check
    # does not leave the delete=False temp file behind.
    if tmp_vocab_path is not None and os.path.exists(tmp_vocab_path):
        os.unlink(tmp_vocab_path)

Manual verification of new IO modules...
✓ All new IO modules imported successfully
✓ Old tfrecord_io module properly removed


  from word2gm_fast.utils.tfrecord_io import write_vocab_to_tfrecord as old_write_vocab


Writing vocabulary TFRecord to: /state/partition1/job-63398538/tmpw8ngy12q.tfrecord

<pre>Vocabulary write complete. Words written: 3</pre>

<pre>Loading token-to-index vocabulary TFRecord from: /state/partition1/job-63398538/tmpw8ngy12q.tfrecord</pre>

✓ Frequency-enabled vocab I/O working: 'test' -> 1
✓ Manual verification passed!


2025-07-05 09:02:44.503294: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:387] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 134217728
2025-07-05 09:02:44.507590: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [None]:
# Comprehensive test of all new IO modules
import os
import tempfile
import traceback

print("=" * 60)
print("COMPREHENSIVE IO MODULE TEST")
print("=" * 60)

# Every scratch file created below is recorded here so the finally clause can
# remove it regardless of which step fails.
temp_paths = []

try:
    # Test 1: Import all new modules
    from word2gm_fast.io.vocab import (
        write_vocab_to_tfrecord,
        parse_vocab_example
    )
    from word2gm_fast.io.triplets import (
        write_triplets_to_tfrecord,
        load_triplets_from_tfrecord
    )
    from word2gm_fast.io.tables import (
        create_token_to_index_table,
        create_index_to_token_table
    )
    from word2gm_fast.io.artifacts import (
        save_pipeline_artifacts,
        load_pipeline_artifacts
    )
    print("✓ All new IO modules imported successfully")

    # Test 2: Create sample data
    vocab_words = ["UNK", "the", "quick", "brown", "fox"]
    vocab_indices = list(range(len(vocab_words)))
    # Descending synthetic frequencies: 100.0, 90.0, ... one per word.
    vocab_frequencies = {word: 100.0 - i*10 for i, word in enumerate(vocab_words)}

    # Create vocab table (unknown tokens fall back to id 0)
    vocab_table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=tf.constant(vocab_words),
            values=tf.constant(vocab_indices, dtype=tf.int64)
        ),
        default_value=0
    )

    # Test 3: Vocab I/O with frequencies.
    # NamedTemporaryFile only reserves a unique path; the handle is closed
    # when the with-block exits, before the writer opens the path.
    with tempfile.NamedTemporaryFile(suffix='.tfrecord', delete=False) as vocab_file:
        vocab_path = vocab_file.name
        temp_paths.append(vocab_path)

    # Write vocab with frequencies
    write_vocab_to_tfrecord(vocab_table, vocab_path, frequencies=vocab_frequencies)

    # Read back and create tables
    token_to_idx = create_token_to_index_table(vocab_path)
    idx_to_token = create_index_to_token_table(vocab_path)

    # Test lookups (token -> index -> token round trip).
    test_token = "quick"
    test_idx = token_to_idx.lookup(tf.constant(test_token)).numpy()
    # The key dtype is forced to int64: the index-to-token table raises a
    # TypeError when the lookup keys arrive as int32.
    back_token = idx_to_token.lookup(tf.constant([test_idx], dtype=tf.int64)).numpy()[0].decode('utf-8')

    print(f"✓ Vocab I/O test: '{test_token}' -> {test_idx} -> '{back_token}'")

    # Test 4: Triplet I/O
    with tempfile.NamedTemporaryFile(suffix='.tfrecord', delete=False) as triplet_file:
        triplet_path = triplet_file.name
        temp_paths.append(triplet_path)

    # Create sample triplets
    sample_triplets = [
        (1, 2, 3),  # (target, context, negative)
        (2, 3, 4),
        (3, 4, 1),
    ]

    # Write triplets
    write_triplets_to_tfrecord(sample_triplets, triplet_path)

    # Load triplets back
    loaded_triplets = list(load_triplets_from_tfrecord(triplet_path))

    print(f"✓ Triplet I/O test: wrote {len(sample_triplets)}, loaded {len(loaded_triplets)}")

    # Test 5: Artifact I/O
    with tempfile.NamedTemporaryFile(suffix='.gz', delete=False) as artifact_file:
        artifact_path = artifact_file.name
        temp_paths.append(artifact_path)

    # Create sample artifacts
    artifacts = {
        'vocab_size': len(vocab_words),
        'total_tokens': sum(vocab_frequencies.values()),
        'model_config': {'embedding_dim': 128, 'epochs': 10},
        'metadata': {'version': '1.0', 'timestamp': '2025-01-01'}
    }

    # Save and load artifacts
    save_pipeline_artifacts(artifacts, artifact_path)
    loaded_artifacts = load_pipeline_artifacts(artifact_path)

    print(f"✓ Artifact I/O test: saved {len(artifacts)} items, loaded {len(loaded_artifacts)} items")

    # Test 6: Verify artifact contents
    assert loaded_artifacts['vocab_size'] == len(vocab_words)
    assert loaded_artifacts['model_config']['embedding_dim'] == 128
    print("✓ Artifact contents verified")

    # Test 7: Verify the old module shows deprecation warning
    try:
        from word2gm_fast.utils.tfrecord_io import write_vocab_to_tfrecord as old_write_vocab
        print("✓ Old module still accessible (with deprecation warning)")
    except ImportError:
        print("✓ Old module completely removed")

    print("=" * 60)
    print("ALL IO MODULE TESTS PASSED!")
    print("=" * 60)

except Exception as e:
    # Report the failure instead of aborting the notebook run.
    print(f"✗ IO module test failed: {e}")
    traceback.print_exc()
finally:
    # Clean up on both the success and the failure path; the existence check
    # tolerates files that were already removed.
    for path in temp_paths:
        if os.path.exists(path):
            os.unlink(path)

COMPREHENSIVE IO MODULE TEST
✓ All new IO modules imported successfully


Writing vocabulary TFRecord to: /state/partition1/job-63398538/tmp1q02mglm.tfrecord

<pre>Vocabulary write complete. Words written: 5</pre>

<pre>Loading token-to-index vocabulary TFRecord from: /state/partition1/job-63398538/tmp1q02mglm.tfrecord</pre>

2025-07-05 09:05:48.670182: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


<pre>Loading index-to-token vocab TFRecord from: /state/partition1/job-63398538/tmp1q02mglm.tfrecord</pre>

✗ IO module test failed: Dtype of argument `keys` must be <dtype: 'int64'>, received: <dtype: 'int32'>


Traceback (most recent call last):
  File "/state/partition1/job-63398538/ipykernel_4126438/1575288815.py", line 57, in <module>
    back_token = idx_to_token.lookup(tf.constant([test_idx])).numpy()[0].decode('utf-8')
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/ext3/miniforge3/envs/word2gm-fast2/lib/python3.12/site-packages/tensorflow/python/ops/lookup_ops.py", line 253, in lookup
    raise TypeError(f"Dtype of argument `keys` must be {self._key_dtype}, "
TypeError: Dtype of argument `keys` must be <dtype: 'int64'>, received: <dtype: 'int32'>
