In [1]:
# Set project root directory and add `src` to path
import sys
from pathlib import Path

PROJECT_ROOT = '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'
 
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import the notebook setup utilities
from word2gm_fast.utils.notebook_setup import setup_testing_notebook, enable_autoreload, run_silent_subprocess

# Enable mixed precision for GPU training
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('mixed_float16')

# Enable autoreload for development
enable_autoreload()

# Set up environment
env = setup_testing_notebook(project_root=PROJECT_ROOT)

# Extract commonly used modules for convenience
tf = env['tensorflow']
np = env['numpy']
pd = env['pandas']
print_resource_summary = env['print_resource_summary']

2025-07-07 08:08:20.735873: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-07 08:08:20.751553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751890100.768412  725669 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751890100.773528  725669 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751890100.786236  725669 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

<pre>Autoreload enabled</pre>

<pre>Project root: /scratch/edk202/word2gm-fast
TensorFlow version: 2.19.0
Device mode: GPU-enabled</pre>

<pre>Testing environment ready!</pre>

In [2]:
print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
============================================================
Hostname: cm045.hpc.nyu.edu

Job Allocation:
   CPUs: 14
   Memory: 125.0 GB
   Requested partitions: short,cs,cm,cpu_a100_2,cpu_a100_1,cpu_gpu
   Running on: SSH failed: Host key verification failed.
   Job ID: 63450075
   Node list: cm045

GPU Information:
   Error: NVML Shared Library Not Found

TensorFlow GPU Detection:
   TensorFlow detects 0 GPU(s)
   Built with CUDA: True
============================================================</pre>

In [3]:
import subprocess
import os

# Verify test directory exists and discover test files
tests_dir = os.path.join(PROJECT_ROOT, 'tests')
print(f"Project root: {PROJECT_ROOT}")
print(f"Tests directory: {tests_dir}")
print(f"Tests directory exists: {os.path.exists(tests_dir)}")

if os.path.exists(tests_dir):
    test_files = [f for f in os.listdir(tests_dir) 
                  if f.startswith('test_') and f.endswith('.py')]
    print(f"Found {len(test_files)} test files:")
    
    # Organize by category
    # I/O modules (in the io/ folder)
    io_modules = [f for f in test_files if 
                  any(module in f for module in ['vocab', 'triplets', 'tables', 'artifacts'])]
    
    # Corpus processing modules 
    corpus_modules = [f for f in test_files if 
                      any(module in f for module in ['corpus_to_dataset', 'dataset_to_triplets', 'index_vocab'])]
    
    # Integration tests
    integration_tests = [f for f in test_files if 'integration' in f or 'pipeline' in f]
    
    # Model training modules (training, model, utilities)
    training_modules = [f for f in test_files if f not in io_modules and 
                        f not in corpus_modules and f not in integration_tests]
    
    print(f"  I/O Modules: {io_modules}")
    print(f"  Corpus Processing Modules: {corpus_modules}")
    print(f"  Integration Tests: {integration_tests}")
    print(f"  Model Training Modules: {training_modules}")
else:
    print("WARNING: Tests directory not found!")
    exit(1)

# Import verification
print(f"\nImport verification...")
try:
    from word2gm_fast.io.vocab import write_vocab_to_tfrecord, parse_vocab_example
    from word2gm_fast.io.triplets import write_triplets_to_tfrecord, load_triplets_from_tfrecord
    from word2gm_fast.io.tables import create_token_to_index_table, create_index_to_token_table
    from word2gm_fast.io.artifacts import (save_pipeline_artifacts, load_pipeline_artifacts, 
                                         save_metadata, load_metadata)
    print("SUCCESS: All modules imported successfully")
except Exception as e:
    print(f"ERROR: Import verification failed: {e}")
    import traceback
    traceback.print_exc()
    exit(1)

# Run all tests in one comprehensive execution
print("\n" + "=" * 80)
print("RUNNING ALL TESTS")
print("=" * 80)

result = subprocess.run([
    'python', '-m', 'pytest', 
    'tests/',
    '-v',
    '--tb=short'
], capture_output=True, text=True, cwd=PROJECT_ROOT)

print("STDOUT:")
print(result.stdout)
if result.stderr:
    print("\nSTDERR:")
    print(result.stderr)

print(f"\nReturn code: {result.returncode}")

if result.returncode == 0:
    print("\n" + "=" * 80)
    print("SUCCESS: ALL TESTS PASSED!")
    print("The IO module refactoring is working correctly.")
    print("=" * 80)
else:
    print("\n" + "=" * 80)
    print("WARNING: Some tests failed.")
    print("Review the output above for details.")
    print("=" * 80)


Project root: /scratch/edk202/word2gm-fast
Tests directory: /scratch/edk202/word2gm-fast/tests
Tests directory exists: True
Found 16 test files:
  I/O Modules: ['test_index_vocab.py', 'test_artifacts.py', 'test_triplets.py', 'test_tables.py', 'test_filtered_tables.py', 'test_vocab.py', 'test_dataset_to_triplets.py']
  Corpus Processing Modules: ['test_index_vocab.py', 'test_corpus_to_dataset.py', 'test_dataset_to_triplets.py']
  Integration Tests: ['test_pipeline.py', 'test_io_integration.py']
  Model Training Modules: ['test_notebook_training.py', 'test_word2gm_model.py', 'test_training_utils.py', 'test_token_validation.py', 'test_train_loop.py', 'test_resource_monitor.py']

Import verification...
SUCCESS: All modules imported successfully

RUNNING ALL TESTS


STDOUT:
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 98 items

tests/test_artifacts.py::TestArtifactsModule::test_save_metadata_uncompressed [32mPASSED[0m[32m [  1%][0m
tests/test_artifacts.py::TestArtifactsModule::test_save_metadata_compressed [32mPASSED[0m[32m [  2%][0m
tests/test_artifacts.py::TestArtifactsModule::test_load_metadata_uncompressed [32mPASSED[0m[32m [  3%][0m
tests/test_artifacts.py::TestArtifactsModule::test_load_metadata_compressed [32mPASSED[0m[32m [  4%][0m
tests/test_artifacts.py::TestArtifactsModule::test_metadata_roundtrip [32mPASSED[0m[32m [  5%][0m
tests/test_artifacts.py::TestArtifactsModule::test_save_pipeline_artifacts [32mPASSED[0m[32m [  6%][0m
tests/test_artifacts.py::TestArtifactsModule::test_load_pipeline_artifacts [32mPASSED[0m[32m