In [1]:
import os
import sys
import subprocess
import time

# Change to project directory
os.chdir('/scratch/edk202/word2gm-fast/notebooks')
os.chdir("..")

# Clean TensorFlow import with complete silencing (non-deterministic by default)
from src.word2gm_fast.utils import (
    import_tensorflow_silently, 
    log_tf_to_file, 
    run_silent_subprocess
)

tf = import_tensorflow_silently(deterministic=False)
print(f"✅ TensorFlow {tf.__version__} imported silently (non-deterministic mode)")

# Import data pipeline modules
from src.word2gm_fast.dataprep.corpus_to_dataset import make_dataset
from src.word2gm_fast.dataprep.index_vocab import make_vocab
from src.word2gm_fast.dataprep.dataset_to_triplets import build_skipgram_triplets
from src.word2gm_fast.dataprep.tfrecord_io import (
    save_pipeline_artifacts,
    load_pipeline_artifacts,
    write_triplets_to_tfrecord,
    load_triplets_from_tfrecord,
    write_vocab_to_tfrecord,
    load_vocab_from_tfrecord
)

✅ TensorFlow 2.19.0 imported silently (non-deterministic mode)


In [2]:
# Run tests with TensorFlow silencing
result = run_silent_subprocess(
    ['python', '-m', 'unittest', '-b', 'tests.test_corpus_to_dataset'], 
    capture_output=True, text=True
)
if result.returncode == 0:
    print("✅ test_corpus_to_dataset: PASSED")
else:
    print("❌ test_corpus_to_dataset: FAILED")
    print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

✅ test_corpus_to_dataset: PASSED


In [3]:
# Run tests with TensorFlow silencing
result = run_silent_subprocess(
    ['python', '-m', 'unittest', '-b', 'tests.test_index_vocab'], 
    capture_output=True, text=True
)
if result.returncode == 0:
    print("✅ test_index_vocab: PASSED")
else:
    print("❌ test_index_vocab: FAILED")
    print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

✅ test_index_vocab: PASSED


In [4]:
# Run tests with TensorFlow silencing (non-deterministic mode)
result = run_silent_subprocess(
    ['python', '-m', 'unittest', '-b', 'tests.test_dataset_to_triplets'], 
    deterministic=False,
    capture_output=True, text=True
)
if result.returncode == 0:
    print("✅ test_dataset_to_triplets: PASSED")
else:
    print("❌ test_dataset_to_triplets: FAILED")
    print("STDOUT:", result.stdout)
    if result.stderr:
        print("STDERR:", result.stderr)

✅ test_dataset_to_triplets: PASSED


In [5]:
# Corpus file information
corpus_file = "1800.txt"
corpus_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data"
corpus_path = os.path.join(corpus_dir, corpus_file)
file_size = os.path.getsize(corpus_path) / 1024 / 1024
print("CORPUS FILE: ", corpus_path, "\n")

# Load and filter the corpus
start = time.time()
dataset, _ = make_dataset(corpus_path)
duration_load = time.time() - start
rate_load = file_size / duration_load

# Cache the dataset
dataset = dataset.cache()

# Build the vocabulary from the dataset
start = time.time()
vocab_table = make_vocab(dataset)
duration_vocab = time.time() - start
rate_vocab = file_size / duration_vocab

# Make triplets from the dataset and vocabulary table
start = time.time()
triplets_ds = build_skipgram_triplets(dataset, vocab_table)
duration_triplets = time.time() - start
rate_triplets = file_size / duration_triplets

# Benchmarking output
print("[--    Benchmarks    --]\n")
print(f"{'Step':<35}{'Duration':>10}{'Quantity':>21}{'Rate':>21}")
print("-" * 87)
print(f"{'Corpus loading and filtering (Lazy)':<35}{duration_load:8,.2f}{'s':>2}{file_size:18,.2f}{'MB':>3}{rate_load:16,.2f}{'MB/s':>5}")
print(f"{'Vocabulary creation':<35}{duration_vocab:8,.2f}{'s':>2}{file_size:18,.2f}{'MB':>3}{rate_vocab:16,.2f}{'MB/s':>5}")
print(f"{'Triplet generation (Lazy)':<35}{duration_triplets:8,.2f}{'s':>2}{file_size:18,.2f}{'MB':>3}{rate_triplets:16,.2f}{'MB/s':>5}")

# Create reverse lookup from vocab table once at the beginning
vocab_export = vocab_table.export()
vocab_keys = vocab_export[0].numpy()
vocab_values = vocab_export[1].numpy()
index_to_word = {idx: word.decode('utf-8') for word, idx in zip(vocab_keys, vocab_values)}

# Show sample lines
print("\n[--   Sample Lines   --]\n")
sample_lines = list(dataset.shuffle(1000, seed=42).take(5).as_numpy_iterator())
for line_bytes in sample_lines:
    print(line_bytes.decode("utf-8"))

# Test the vocab table with example words
print("\n[--    Test Words    --]\n")
test_words = ["UNK", "man", "king", "nonexistentword"]
ids = vocab_table.lookup(tf.constant(test_words)).numpy()
print(f"{'Word':<18} {'ID':>6}")
print("-" * 25)
for word, idx in zip(test_words, ids):
    print(f"{word:<18} {idx:>6}")

# Show sample triplets
print("\n[--  Sample Triplets  --]\n")
print(f"{'Center':<8} {'Center Word':<12} {'Positive':<8} {'Pos Word':<12} {'Negative':<8} {'Neg Word':<12}")
print("-" * 75)

sample_triplets = list(triplets_ds.shuffle(1000, seed=123).take(5).as_numpy_iterator())
for triplet in sample_triplets:
    center, positive, negative = triplet
    center_word = index_to_word.get(center, f"ID_{center}")
    pos_word = index_to_word.get(positive, f"ID_{positive}")
    neg_word = index_to_word.get(negative, f"ID_{negative}")
    print(f"{center:<8} {center_word:<12} {positive:<8} {pos_word:<12} {negative:<8} {neg_word:<12}")


CORPUS FILE:  /vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data/1800.txt 

[--    Benchmarks    --]

Step                                 Duration             Quantity                 Rate
---------------------------------------------------------------------------------------
Corpus loading and filtering (Lazy)    0.13 s             31.49 MB          240.73 MB/s
Vocabulary creation                   18.71 s             31.49 MB            1.68 MB/s
Triplet generation (Lazy)              0.16 s             31.49 MB          199.70 MB/s

[--   Sample Lines   --]

UNK one chance UNK UNK
UNK one pair front UNK
UNK one brave action UNK
UNK one another UNK since
UNK one could imagine UNK

[--    Test Words    --]

Word                   ID
-------------------------
UNK                     0
man                 11276
king                10355
nonexistentword         0

[--  Sample Triplets  --]

Center   Center Word  Positive Pos Word     Negativ

In [None]:
# Demonstrate TFRecord functionality
print("\n[-- TFRecord Demo (Fast Pipeline Serialization) --]\n")

# Example usage:
output_dir = "./pipeline_tfrecords"

# Save all pipeline artifacts to TFRecords (one-time cost)
print("Saving pipeline artifacts...")
artifacts = save_pipeline_artifacts(
    dataset, vocab_table, triplets_ds, 
    output_dir=output_dir, 
    compress=True
)
print(f"Saved to: {artifacts['output_dir']}")




[-- TFRecord Demo (Fast Pipeline Serialization) --]

Saving pipeline artifacts...
Saving pipeline artifacts to: ./pipeline_tfrecords
Writing vocabulary TFRecord to: ./pipeline_tfrecords/vocab.tfrecord.gz
Vocabulary write complete. Words written: 20,685
Write time: 0.21 sec
Writing TFRecord to: ./pipeline_tfrecords/triplets.tfrecord.gz
Vocabulary write complete. Words written: 20,685
Write time: 0.21 sec
Writing TFRecord to: ./pipeline_tfrecords/triplets.tfrecord.gz
Write complete. Triplets written: 794,296
Write time: 202.15 sec
All artifacts saved successfully!
Saved to: ./pipeline_tfrecords
Write complete. Triplets written: 794,296
Write time: 202.15 sec
All artifacts saved successfully!
Saved to: ./pipeline_tfrecords


In [None]:
# Demonstrate TFRecord loading speed
print("\n[-- TFRecord Loading Speed Demo --]\n")

output_dir = "./pipeline_tfrecords"

# Check if TFRecords exist
import os
vocab_file = os.path.join(output_dir, "vocab.tfrecord.gz")
triplets_file = os.path.join(output_dir, "triplets.tfrecord.gz")

if os.path.exists(vocab_file) and os.path.exists(triplets_file):
    print("TFRecord files found! Benchmarking load times...\n")
    
    # Time vocabulary loading
    print("Loading vocabulary from TFRecord...")
    start = time.time()
    loaded_vocab_table = load_vocab_from_tfrecord(vocab_file)
    vocab_load_time = time.time() - start
    
    # Time triplet dataset loading
    print("Loading triplets dataset from TFRecord...")
    start = time.time()
    loaded_triplets_ds = load_triplets_from_tfrecord(triplets_file)
    triplets_load_time = time.time() - start
    
    # Quick verification by taking a sample
    print("Verifying loaded data...")
    start = time.time()
    sample_triplets = list(loaded_triplets_ds.take(5).as_numpy_iterator())
    verification_time = time.time() - start
    
    # Display results
    print("\n[--  Loading Benchmarks  --]\n")
    print(f"{'Operation':<25}{'Duration':>10}{'Rate/Notes':>25}")
    print("-" * 60)
    print(f"{'Vocabulary loading':<25}{vocab_load_time:8,.2f}{'s':>2}{'~instant':>23}")
    print(f"{'Triplets dataset loading':<25}{triplets_load_time:8,.2f}{'s':>2}{'lazy - very fast':>23}")
    print(f"{'Sample verification':<25}{verification_time:8,.2f}{'s':>2}{'5 triplets':>23}")
    
    # Check if original timing variables are available for comparison
    try:
        print(f"\n💡 Compare with original pipeline times:")
        print(f"   • Original vocab creation: {duration_vocab:.2f}s")
        print(f"   • Original triplet generation: {duration_triplets:.2f}s")
        print(f"   • TFRecord vocab loading: {vocab_load_time:.2f}s")
        print(f"   • TFRecord triplet loading: {triplets_load_time:.2f}s")
        
        # Calculate speedup
        vocab_speedup = duration_vocab / vocab_load_time if vocab_load_time > 0 else float('inf')
        triplet_speedup = duration_triplets / triplets_load_time if triplets_load_time > 0 else float('inf')
        
        print(f"\n🚀 Speedup factors:")
        print(f"   • Vocabulary: {vocab_speedup:.1f}x faster")
        print(f"   • Triplets: {triplet_speedup:.1f}x faster")
        
    except NameError:
        print(f"\n💡 Original pipeline timing not available.")
        print(f"   Run the main pipeline cell first for speed comparison.")
        print(f"   Current TFRecord loading times:")
        print(f"   • Vocab loading: {vocab_load_time:.2f}s")
        print(f"   • Triplet loading: {triplets_load_time:.2f}s")
    
    # Verify data integrity
    print(f"\n✅ Data verification:")
    print(f"   • Loaded vocab size: {loaded_vocab_table.size().numpy():,}")
    try:
        print(f"   • Original vocab size: {vocab_table.size().numpy():,}")
    except NameError:
        print(f"   • Original vocab size: (not available - run main pipeline first)")
    print(f"   • Sample triplets loaded: {len(sample_triplets)}")
    
    # Show sample loaded triplet
    if sample_triplets:
        triplet = sample_triplets[0]
        center, positive, negative = triplet
        print(f"   • First loaded triplet: ({center}, {positive}, {negative})")
        
else:
    print("❌ TFRecord files not found. Run the save demo first!")
    print(f"   Looking for: {vocab_file}")
    print(f"   Looking for: {triplets_file}")


[-- TFRecord Loading Speed Demo --]

TFRecord files found! Benchmarking load times...

Loading vocabulary from TFRecord...
Loading vocabulary TFRecord from: ./pipeline_tfrecords/vocab.tfrecord.gz
Vocabulary loaded. Size: 20,685 words
Load time: 2.70 sec
Loading triplets dataset from TFRecord...
Loading TFRecord from: ./pipeline_tfrecords/triplets.tfrecord.gz
TFRecord loaded and parsed
Load time (lazy initialization): 0.066 sec
Verifying loaded data...
Vocabulary loaded. Size: 20,685 words
Load time: 2.70 sec
Loading triplets dataset from TFRecord...
Loading TFRecord from: ./pipeline_tfrecords/triplets.tfrecord.gz
TFRecord loaded and parsed
Load time (lazy initialization): 0.066 sec
Verifying loaded data...

[--  Loading Benchmarks  --]

Operation                  Duration               Rate/Notes
------------------------------------------------------------
Vocabulary loading           2.70 s               ~instant
Triplets dataset loading     0.07 s       lazy - very fast
Sample verif

NameError: name 'duration_vocab' is not defined