# Test Modules

In [26]:
# Enable autoreload for development
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
# Set project root and add src to path
import sys
from pathlib import Path
import os

# Default checkout location. Set the WORD2GM_PROJECT_ROOT environment variable
# to run this notebook against a checkout elsewhere without editing the cell;
# an unset or empty variable falls back to the default.
PROJECT_ROOT = os.environ.get('WORD2GM_PROJECT_ROOT') or '/scratch/edk202/word2gm-fast'
project_root = Path(PROJECT_ROOT)
src_path = project_root / 'src'

# Prepend so the project's src directory is searched first; the membership
# check keeps repeated runs of this cell from adding duplicate entries.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

In [28]:
# Print a summary of the compute resources visible to this session; the
# captured output of this cell covers hostname, job allocation, and GPU
# visibility.
# NOTE(review): the import presumably relies on the project's `src` directory
# having been put on sys.path by the setup cell — confirm.
from word2gm_fast.utils.resource_summary import print_resource_summary

print_resource_summary()

<pre>SYSTEM RESOURCE SUMMARY
=============================================
Hostname: cm045.hpc.nyu.edu

Job Allocation:
   CPUs: 14
   Memory: 125.0 GB
   Partition: short
   Job ID: 63785051
   Node list: cm045

Physical GPU Hardware:
   No physical GPUs allocated to this job

TensorFlow GPU Recognition:
   TensorFlow can access 0 GPU(s)
   Built with CUDA support: True
=============================================</pre>

## Test `corpus_to_dataset.py`

In [29]:
import subprocess
import sys

# Run the corpus_to_dataset test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_corpus_to_dataset.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 12 items

tests/test_corpus_to_dataset.py::TestValidate5gramLine::test_valid_cases [32mPASSED[0m[32m [  8%][0m
tests/test_corpus_to_dataset.py::TestValidate5gramLine::test_invalid_cases [32mPASSED[0m[32m [ 16%][0m
tests/test_corpus_to_dataset.py::TestValidate5gramLine::test_validation_rules [32mPASSED[0m[32m [ 25%][0m
tests/test_corpus_to_dataset.py::TestMakeDataset::test_filtering_accuracy [32mPASSED[0m[32m [ 33%][0m
tests/test_corpus_to_dataset.py::TestMakeDataset::test_summary_statistics [32mPASSED[0m[32m [ 41%][0m
tests/test_corpus_to_dataset.py::TestMakeDataset::test_caching_functionality [32mPASSED[0m[32m [ 50%][0m
tests/test_corpus_to_dataset.py::TestMakeDataset::test_preview_integration [32mPASSED[0m[32m [ 58%][0

## Test `dataset_to_frequency.py`

In [30]:
import subprocess
import sys

# Run the dataset_to_frequency test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_dataset_to_frequency.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 11 items

tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_basic_frequency_counting [32mPASSED[0m[32m [  9%][0m
tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_frequency_counts [32mPASSED[0m[32m [ 18%][0m
tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_sorted_by_frequency [32mPASSED[0m[32m [ 27%][0m
tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_empty_dataset [32mPASSED[0m[32m [ 36%][0m
tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_single_line_dataset [32mPASSED[0m[32m [ 45%][0m
tests/test_dataset_to_frequency.py::TestDatasetToFrequency::test_repeated_tokens_same_line [32mPASSED[0m[32m [ 54%][0m
tests/test_dataset_to_frequency.py::TestData

## Test `dataset_to_triplets.py`

In [31]:
import subprocess
import sys

# Run the dataset_to_triplets test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_dataset_to_triplets.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 10 items

tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_basic_triplet_generation [32mPASSED[0m[32m [ 10%][0m
tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_triplet_structure [32mPASSED[0m[32m [ 20%][0m
tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_unk_exclusion [32mPASSED[0m[32m [ 30%][0m
tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_empty_dataset [32mPASSED[0m[32m [ 40%][0m
tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_caching_functionality [32mPASSED[0m[32m [ 50%][0m
tests/test_dataset_to_triplets.py::TestDatasetToTriplets::test_summary_statistics [32mPASSED[0m[32m [ 60%][0m
tests/test_dataset_to_triplets.py::TestEdgeCases::test_single_line

## Test `index_vocab.py`

In [32]:
import subprocess
import sys

# Run the index_vocab test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_index_vocab.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 18 items

tests/test_index_vocab.py::TestBuildVocabTable::test_basic_vocab_table_creation [32mPASSED[0m[32m [  5%][0m
tests/test_index_vocab.py::TestBuildVocabTable::test_unk_token_validation [32mPASSED[0m[32m [ 11%][0m
tests/test_index_vocab.py::TestBuildVocabTable::test_lookup_unknown_tokens [32mPASSED[0m[32m [ 16%][0m
tests/test_index_vocab.py::TestMakeVocab::test_basic_vocab_creation [32mPASSED[0m[32m [ 22%][0m
tests/test_index_vocab.py::TestMakeVocab::test_vocab_ordering [32mPASSED[0m[32m     [ 27%][0m
tests/test_index_vocab.py::TestMakeVocab::test_frequency_counting [32mPASSED[0m[32m [ 33%][0m
tests/test_index_vocab.py::TestMakeVocab::test_empty_dataset [32mPASSED[0m[32m      [ 38%][0m
tests/test_index_vocab.py::Te

## Test `io.triplets.py`

In [33]:
import subprocess
import sys

# Run the io.triplets test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_io_triplets.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 13 items

tests/test_io_triplets.py::TestWriteTripletsToTFRecord::test_basic_write_uncompressed [32mPASSED[0m[32m [  7%][0m
tests/test_io_triplets.py::TestWriteTripletsToTFRecord::test_basic_write_compressed [32mPASSED[0m[32m [ 15%][0m
tests/test_io_triplets.py::TestWriteTripletsToTFRecord::test_empty_dataset [32mPASSED[0m[32m [ 23%][0m
tests/test_io_triplets.py::TestWriteTripletsToTFRecord::test_large_integers [32mPASSED[0m[32m [ 30%][0m
tests/test_io_triplets.py::TestLoadTripletsFromTFRecord::test_basic_load_uncompressed [32mPASSED[0m[32m [ 38%][0m
tests/test_io_triplets.py::TestLoadTripletsFromTFRecord::test_basic_load_compressed [32mPASSED[0m[32m [ 46%][0m
tests/test_io_triplets.py::TestLoadTripletsFromTFRecord::test_loa

## Test `io.vocab.py`

In [34]:
import subprocess
import sys

# Run the io.vocab test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_io_vocab.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 16 items

tests/test_io_vocab.py::TestWriteVocabToTFRecord::test_basic_write_uncompressed [32mPASSED[0m[32m [  6%][0m
tests/test_io_vocab.py::TestWriteVocabToTFRecord::test_basic_write_compressed [32mPASSED[0m[32m [ 12%][0m
tests/test_io_vocab.py::TestWriteVocabToTFRecord::test_write_without_frequencies [32mPASSED[0m[32m [ 18%][0m
tests/test_io_vocab.py::TestWriteVocabToTFRecord::test_empty_vocab [32mPASSED[0m[32m [ 25%][0m
tests/test_io_vocab.py::TestLoadVocabFromTFRecord::test_basic_load_uncompressed [32mPASSED[0m[32m [ 31%][0m
tests/test_io_vocab.py::TestLoadVocabFromTFRecord::test_basic_load_compressed [32mPASSED[0m[32m [ 37%][0m
tests/test_io_vocab.py::TestLoadVocabFromTFRecord::test_load_without_frequencies [32mPASSED

## Test `io.artifacts.py`

In [35]:
import subprocess
import sys

# Run the io.artifacts test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_io_artifacts.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 13 items

tests/test_io_artifacts.py::TestSavePipelineArtifacts::test_basic_save_uncompressed [32mPASSED[0m[32m [  7%][0m
tests/test_io_artifacts.py::TestSavePipelineArtifacts::test_basic_save_compressed [32mPASSED[0m[32m [ 15%][0m
tests/test_io_artifacts.py::TestSavePipelineArtifacts::test_directory_creation [32mPASSED[0m[32m [ 23%][0m
tests/test_io_artifacts.py::TestSavePipelineArtifacts::test_empty_triplets [32mPASSED[0m[32m [ 30%][0m
tests/test_io_artifacts.py::TestLoadPipelineArtifacts::test_basic_load_uncompressed [32mPASSED[0m[32m [ 38%][0m
tests/test_io_artifacts.py::TestLoadPipelineArtifacts::test_basic_load_compressed [32mPASSED[0m[32m [ 46%][0m
tests/test_io_artifacts.py::TestLoadPipelineArtifacts::test_auto_detec

## Test `io.tables.py`

In [36]:
import subprocess
import sys

# Run the io.tables test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_io_tables.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 17 items

tests/test_io_tables.py::TestCreateTokenToIndexTable::test_basic_creation_uncompressed [32mPASSED[0m[32m [  5%][0m
tests/test_io_tables.py::TestCreateTokenToIndexTable::test_basic_creation_compressed [32mPASSED[0m[32m [ 11%][0m
tests/test_io_tables.py::TestCreateTokenToIndexTable::test_with_triplet_filtering [32mPASSED[0m[32m [ 17%][0m
tests/test_io_tables.py::TestCreateTokenToIndexTable::test_table_size [32mPASSED[0m[32m [ 23%][0m
tests/test_io_tables.py::TestCreateTokenToIndexTable::test_nonexistent_file [32mPASSED[0m[32m [ 29%][0m
tests/test_io_tables.py::TestCreateIndexToTokenTable::test_basic_creation_uncompressed [32mPASSED[0m[32m [ 35%][0m
tests/test_io_tables.py::TestCreateIndexToTokenTable::test_basic_crea

## Test `pipeline.py`

In [43]:
import subprocess
import sys

# Run the pipeline test suite.
# sys.executable is the interpreter running this kernel, so pytest executes in
# the same environment as the notebook instead of whichever `python` happens
# to be first on PATH.
result = subprocess.run(
    [sys.executable, '-m', 'pytest', 'tests/test_pipeline.py', '-v'],
    capture_output=True,
    text=True,
    cwd=PROJECT_ROOT,  # the test path above is relative to the project root
)

# Show pytest's report; stderr is printed only when it has content.
print(result.stdout)
if result.stderr:
    print(result.stderr)

platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /ext3/miniforge3/envs/word2gm-fast2/bin/python
cachedir: .pytest_cache
rootdir: /scratch/edk202/word2gm-fast
plugins: anyio-4.9.0, timeout-2.4.0
[1mcollecting ... [0mcollected 26 items

tests/test_pipeline.py::TestProcessSingleYear::test_successful_processing [32mPASSED[0m[32m [  3%][0m
tests/test_pipeline.py::TestProcessSingleYear::test_compressed_output [32mPASSED[0m[32m [  7%][0m
tests/test_pipeline.py::TestProcessSingleYear::test_nonexistent_file [32mPASSED[0m[32m [ 11%][0m
tests/test_pipeline.py::TestProcessSingleYear::test_invalid_corpus_directory [32mPASSED[0m[32m [ 15%][0m
tests/test_pipeline.py::TestProcessSingleYear::test_small_corpus_file [32mPASSED[0m[32m [ 19%][0m
tests/test_pipeline.py::TestProcessYearRange::test_single_year_range [32mPASSED[0m[32m [ 23%][0m
tests/test_pipeline.py::TestProcessYearRange::test_multiple_years_sequential [32mPASSED[0m[32m [ 26%][0m
tests/test_pipelin