# Word2GM Training Data Pipeline

**Pipeline: Corpus file → TFRecord training artifacts (triplets and vocabulary)**

Use this notebook to prepare Google Books 5-gram corpora for Word2GM skip-gram training.

## Pipeline Workflow

1. **Input**: Preprocessed corpus file (e.g., `2019.txt`) in `/vast` NVMe storage
2. **Processing**: TensorFlow-native filtering, vocabulary building, and triplet generation
3. **Output**: TFRecord artifacts in organized subdirectories (e.g., `2019_artifacts/`)

### **Artifact Storage**
The pipeline creates year-specific subdirectories alongside the original text corpora:
<pre>
/vast/edk202/NLP_corpora/.../data/
├── 2018.txt
├── 2019.txt
├── 2020.txt
├── 2018_artifacts/
│   ├── triplets.tfrecord.gz
│   └── vocab.tfrecord.gz
├── 2019_artifacts/
│   ├── triplets.tfrecord.gz
│   └── vocab.tfrecord.gz
└── 2020_artifacts/
    ├── triplets.tfrecord.gz
    └── vocab.tfrecord.gz
</pre>

## Setup

In [9]:
%load_ext autoreload
%autoreload 2

import os
import sys
import warnings
from pathlib import Path

# Work from the repository root and make its `src/` tree importable.
# This has to happen before any `word2gm_fast` import below.
project_root = Path('/scratch/edk202/word2gm-fast')
src_path = project_root / 'src'
os.chdir(project_root)
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))

# TensorFlow is obtained through the project's helper rather than a plain
# `import tensorflow` (presumably to suppress start-up logging — see
# word2gm_fast.utils.tf_silence).
from word2gm_fast.utils.tf_silence import import_tensorflow_silently
tf = import_tensorflow_silently()

print(f"TensorFlow version: {tf.__version__}")

# Import core dependencies
import numpy as np
import pandas as pd
import time
import psutil

# Import Word2GM modules
from word2gm_fast.dataprep.pipeline import batch_prepare_training_data

print("Setup complete; all modules loaded successfully")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
TensorFlow version: 2.19.0
Setup complete; all modules loaded successfully


In [8]:
# === CLUSTER RESOURCE MONITORING ===
import socket
import subprocess
import glob
import psutil
import os
import re

print("CLUSTER RESOURCE ALLOCATION SUMMARY")
print("=" * 50)

# Hostname
hostname = socket.gethostname()
print(f"Hostname: {hostname}")


def parse_cpu_list(cpu_list):
    """Expand a sysfs CPU list string into a list of CPU numbers.

    Accepts single ids ("3"), comma lists ("0,24"), ranges ("0-3") and any
    combination of them ("0-1,24-25").

    Args:
        cpu_list: Text read from a file such as ``thread_siblings_list``.

    Returns:
        list[int]: The CPU numbers named by ``cpu_list`` (empty for blank text).

    Raises:
        ValueError: If a piece is neither an integer nor an integer range.
    """
    cpus = []
    for piece in cpu_list.strip().split(','):
        piece = piece.strip()
        if not piece:
            continue
        if '-' in piece:
            first, last = piece.split('-', 1)
            cpus.extend(range(int(first), int(last) + 1))
        else:
            cpus.append(int(piece))
    return cpus


# CPU Information
logical_cores = psutil.cpu_count(logical=True)

# Detect physical cores via thread siblings: every logical CPU that shares a
# sibling list with an already-seen CPU belongs to the same physical core.
physical_cores = 0
seen_cores = set()

cpu_dirs = (
    re.fullmatch(r'cpu(\d+)', os.path.basename(path))
    for path in glob.glob('/sys/devices/system/cpu/cpu[0-9]*')
)
cpu_numbers = sorted(int(cpu_dir.group(1)) for cpu_dir in cpu_dirs if cpu_dir)

for cpu_num in cpu_numbers:
    if cpu_num in seen_cores:
        continue
    siblings_path = f'/sys/devices/system/cpu/cpu{cpu_num}/topology/thread_siblings_list'
    try:
        with open(siblings_path, 'r') as f:
            sibling_list = parse_cpu_list(f.read())
    except (OSError, ValueError):
        # Topology file missing or unparsable: count this CPU as its own core.
        sibling_list = []
    seen_cores.update(sibling_list)
    seen_cores.add(cpu_num)
    physical_cores += 1

# When no topology was found (physical_cores == 0) only the logical count is shown.
if physical_cores > 0 and physical_cores != logical_cores:
    print(f"CPU cores: {physical_cores} physical, {logical_cores} logical (hyperthreading)")
else:
    print(f"CPU cores: {logical_cores} logical")

# Memory Information
memory = psutil.virtual_memory()
total_memory_gb = memory.total / (1024**3)
available_memory_gb = memory.available / (1024**3)

# Check for SLURM memory limits.
# The value is converted as megabytes; anything that is not an integer is
# reported and ignored instead of being swallowed silently.
slurm_memory = None
slurm_mem_per_node = os.environ.get('SLURM_MEM_PER_NODE')
if slurm_mem_per_node:
    try:
        slurm_memory = int(slurm_mem_per_node) / 1024  # MB to GB
    except ValueError:
        print(f"Ignoring unparsable SLURM_MEM_PER_NODE value: {slurm_mem_per_node!r}")

# Prefer the job allocation when it is smaller than the node's physical memory.
if slurm_memory and slurm_memory < total_memory_gb:
    print(f"Job-allocated memory: {slurm_memory:.1f} GB")
else:
    print(f"Memory: {total_memory_gb:.1f} GB total")

# Verify GPU
# Discover every /dev/nvidia<N> device node instead of probing a fixed range of
# four ids, so nodes with more GPUs are reported too. Control nodes such as
# /dev/nvidiactl or /dev/nvidia-uvm do not match the numeric pattern.
accessible_gpus = []
for device_path in glob.glob('/dev/nvidia[0-9]*'):
    device_node = re.fullmatch(r'/dev/nvidia(\d+)', device_path)
    if device_node is None:
        continue
    try:
        # A device counts as accessible only if this process may open it.
        with open(device_path, 'rb'):
            pass
    except OSError:  # includes PermissionError
        continue
    accessible_gpus.append(int(device_node.group(1)))
accessible_gpus.sort()

if accessible_gpus:
    print(f"GPU: {len(accessible_gpus)} accessible")
    for gpu_id in accessible_gpus:
        print(f"  GPU {gpu_id}: /dev/nvidia{gpu_id}")
else:
    print("GPU: Not accessible")

print("\nSTORAGE QUOTAS AND USAGE")
print("=" * 50)


def parse_quota_line(line):
    """Parse one filesystem row of ``myquota`` output.

    Args:
        line: A single output line with ANSI colour codes already removed.

    Returns:
        dict | None: ``{'filesystem', 'allocation', 'used', 'percent'}`` for a
        row that starts with ``/`` and has at least five whitespace-separated
        columns; ``None`` for any other line (headers, separators, notes).
    """
    if not line.startswith('/'):
        return None
    parts = line.split()
    if len(parts) < 5:
        return None

    # Column 5 is read as "<space used>(<pct>%)/<files used>(<pct>%)"; only the
    # space half is reported.
    space_part = parts[4].split('/')[0]
    match = re.match(r'([0-9.]+[KMGT]?B)\(([0-9.]+)%\)', space_part)
    if match:
        used = match.group(1)
        percent = match.group(2) + '%'
    elif '(' in space_part and ')' in space_part:
        # Unit not recognised by the strict pattern: keep whatever precedes "(".
        used = space_part.split('(')[0]
        percent_match = re.search(r'\(([0-9.]+)%\)', space_part)
        percent = percent_match.group(1) + '%' if percent_match else "N/A"
    else:
        used = space_part
        percent = "N/A"

    return {
        'filesystem': parts[0],
        'allocation': parts[3],
        'used': used,
        'percent': percent
    }


# Get quota information
try:
    # BatchMode and the short timeouts keep the cell from hanging on a password
    # prompt or an unreachable login node.
    # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
    # confirm this is acceptable for the cluster's login node.
    cmd = ['ssh', '-o', 'ConnectTimeout=3', '-o', 'BatchMode=yes', 
           '-o', 'StrictHostKeyChecking=no', 'log-1.hpc.nyu.edu', 'myquota']
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=8)

    if result.returncode != 0:
        # Previously a failed ssh call produced no output at all.
        failure_detail = result.stderr.strip() or f"exit status {result.returncode}"
        print(f"Quota check failed: {failure_detail}")
    else:
        # Clean ANSI color codes from output
        ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

        quota_data = []
        for line in result.stdout.splitlines():
            entry = parse_quota_line(ansi_escape.sub('', line).strip())
            if entry is not None:
                quota_data.append(entry)

        if quota_data:
            # Print formatted table
            print(f"{'Filesystem':<12} {'Allocation':<15} {'Used':<12} {'Percent':<8}")
            print("-" * 50)
            for item in quota_data:
                print(f"{item['filesystem']:<12} {item['allocation']:<15} {item['used']:<12} {item['percent']:<8}")
        else:
            print("No quota information found")

except (OSError, subprocess.SubprocessError) as e:
    # OSError: ssh binary missing; SubprocessError: includes the 8 s timeout.
    print(f"Quota check failed: {str(e)}")

print("=" * 50)
print("\nResource summary complete")

CLUSTER RESOURCE ALLOCATION SUMMARY
Hostname: gr004.hpc.nyu.edu
CPU cores: 2 physical, 48 logical (hyperthreading)
Job-allocated CPUs: 14 (SLURM)
Job-allocated memory: 125.0 GB
GPU: 1 accessible
  GPU 0: /dev/nvidia0

STORAGE QUOTAS AND USAGE
Filesystem   Allocation      Used         Percent 
--------------------------------------------------
/home        50.0GB/30.7K    15.57GB      31.13%  
/scratch     5.0TB/1.0M      323.29GB     6.31%   
/archive     2.0TB/20.5K     1262.12GB    61.63%  
/vast        2TB/5.0M        1.37TB       69.0%   

Resource summary complete


CLUSTER RESOURCE ALLOCATION SUMMARY
Hostname: gr004.hpc.nyu.edu
CPU cores: 2 physical, 48 logical (hyperthreading)
Job-allocated CPUs: 14 (SLURM)
Job-allocated memory: 125.0 GB
GPU: 1 accessible
  GPU 0: /dev/nvidia0

STORAGE QUOTAS AND USAGE
Filesystem   Allocation      Used         Percent 
--------------------------------------------------
/home        50.0GB/30.7K    15.57GB      31.13%  
/scratch     5.0TB/1.0M      323.29GB     6.31%   
/archive     2.0TB/20.5K     1262.12GB    61.63%  
/vast        2TB/5.0M        1.37TB       69.0%   

Resource summary complete


NameError: name 'socket' is not defined

## Prepare one or more corpora in parallel 

In [None]:
# Configuration: directory holding the yearly corpus files
corpus_dir = "/vast/edk202/NLP_corpora/Google_Books/20200217/eng-fiction/5gram_files/6corpus/yearly_files/data"

# Keyword options handed unchanged to the batch driver
prep_options = dict(
    year_range="1951-1952",
    compress=False,
    show_progress=True,
    show_summary=True,
    use_multiprocessing=True,
)

# Process the selected years with optional parallel processing
results = batch_prepare_training_data(corpus_dir=corpus_dir, **prep_options)


PARALLEL BATCH PROCESSING
Processing 2 years
Using 2 parallel workers
Estimated speedup: 2.0x


KeyboardInterrupt: 

In [17]:
! /usr/bin/nvidia-smi

/bin/bash: line 1: /usr/bin/nvidia-smi: No such file or directory


In [22]:
ls -l /usr/bin/nvidia-smi

ls: cannot access '/usr/bin/nvidia-smi': No such file or directory


In [10]:
# GPU Diagnostics - Check TensorFlow GPU configuration
print("TensorFlow GPU Diagnostics")
print("=" * 40)

# Check environment variables
print(f"CUDA_VISIBLE_DEVICES: {os.environ.get('CUDA_VISIBLE_DEVICES', 'Not set')}")

# Check TensorFlow GPU detection
print("\nTensorFlow GPU devices:")
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for i, gpu in enumerate(gpus):
        print(f"  GPU {i}: {gpu}")
        # Device details are best-effort; report why they are missing instead
        # of hiding every error (including KeyboardInterrupt) behind a bare except.
        try:
            details = tf.config.experimental.get_device_details(gpu)
        except Exception as exc:
            print(f"    Details: Not available ({exc})")
        else:
            print(f"    Details: {details}")
else:
    print("  No GPU devices detected by TensorFlow")

# Enhanced GPU computation test - shows actual device placement
print("\nGPU computation test with device placement verification:")

def test_device_placement(device_name):
    """Test computation on specified device and show where it actually ran."""
    try:
        print(f"\n  Testing device: {device_name}")
        
        # Enable logging to see actual device placement
        tf.debugging.set_log_device_placement(True)
        
        with tf.device(device_name):
            a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
            b = tf.constant([[1.0, 1.0], [0.0, 1.0]])
            c = tf.matmul(a, b)
            result = c.numpy()
            
            # Check where the operation actually ran
            actual_device = c.device
            
        tf.debugging.set_log_device_placement(False)
        
        print(f"    Requested: {device_name}")
        print(f"    Actually used: {actual_device}")
        print(f"    Result: {result.flatten()}")
        
        # Check if requested device matches actual device
        if device_name.lower() in actual_device.lower():
            print(f"    ✅ Computation ran on requested device")
        else:
            print(f"    ⚠️  Device fallback occurred! Requested {device_name} but used {actual_device}")
            
        return True, actual_device
        
    except Exception as e:
        print(f"    ❌ Computation failed: {e}")
        return False, str(e)

# Device strings to try: one plausible GPU, two that may not exist, and the CPU
test_devices = ['/GPU:0', '/GPU:1', '/GPU:5', '/CPU:0']

for requested_device in test_devices:
    test_device_placement(requested_device)

# Explain TensorFlow's soft placement behavior
print(
    "\n📋 TensorFlow Soft Device Placement:",
    "  - TensorFlow uses 'soft placement' by default",
    "  - If requested device doesn't exist, it falls back to available device",
    "  - This is why /GPU:5 'succeeds' but actually runs on CPU or /GPU:0",
    "  - Use tf.debugging.set_log_device_placement(True) to see actual placement",
    sep="\n",
)

# Heuristic check: does the pipeline entry point mention GPUs or devices at all?
print("\nChecking Word2GM pipeline GPU usage...")
try:
    import inspect
    from word2gm_fast.dataprep import pipeline

    # Only the source text of batch_prepare_training_data is searched
    source = inspect.getsource(pipeline.batch_prepare_training_data)
    mentions_gpu = any(keyword in source for keyword in ('GPU', 'device'))
    if mentions_gpu:
        print("  ✅ Pipeline appears to have GPU support")
    else:
        print("  ⚠️  Pipeline may not be using GPU - check implementation")

except Exception as e:
    print(f"  ❌ Could not inspect pipeline: {e}")

print("=" * 40)

TensorFlow GPU Diagnostics
CUDA_VISIBLE_DEVICES: 0

TensorFlow GPU devices:
  No GPU devices detected by TensorFlow

GPU computation test with device placement verification:

  Testing device: /GPU:0
    Requested: /GPU:0
    Actually used: /job:localhost/replica:0/task:0/device:CPU:0
    Result: [1. 3. 3. 7.]
    ⚠️  Device fallback occurred! Requested /GPU:0 but used /job:localhost/replica:0/task:0/device:CPU:0

  Testing device: /GPU:1
    Requested: /GPU:1
    Actually used: /job:localhost/replica:0/task:0/device:CPU:0
    Result: [1. 3. 3. 7.]
    ⚠️  Device fallback occurred! Requested /GPU:1 but used /job:localhost/replica:0/task:0/device:CPU:0

  Testing device: /GPU:5
    Requested: /GPU:5
    Actually used: /job:localhost/replica:0/task:0/device:CPU:0
    Result: [1. 3. 3. 7.]
    ⚠️  Device fallback occurred! Requested /GPU:5 but used /job:localhost/replica:0/task:0/device:CPU:0

  Testing device: /CPU:0
    Requested: /CPU:0
    Actually used: /job:localhost/replica:0/task:

In [11]:
# Demonstration banner: how requests for missing GPU ids are handled
print("🔍 TensorFlow Soft Device Placement Demonstration", "=" * 55, sep="\n")

def quick_gpu_test(gpu_id):
    """Square a tiny tensor under ``/GPU:<gpu_id>`` and describe the outcome.

    Prints the requested and actual device, then returns a status string:
    success when the reported device name contains ``GPU:<gpu_id>``, a
    fallback notice otherwise, or a failure message if anything raised.
    """
    device_name = f'/GPU:{gpu_id}'

    try:
        with tf.device(device_name):
            squared = tf.square(tf.constant([1.0, 2.0]))
            squared.numpy()  # force the op to execute before reading placement
            actual_device = squared.device

        print(f"GPU {gpu_id}: Requested={device_name}, Actually used={actual_device}")

        ran_on_requested = f"GPU:{gpu_id}" in actual_device
        return "✅ Used requested GPU" if ran_on_requested else "⚠️  FALLBACK occurred"

    except Exception as e:
        return f"❌ Failed: {e}"

print("Testing various GPU IDs:", "-" * 30, sep="\n")

# Mix of a likely-valid id and ids that should not exist on any node
for gpu_id in (0, 1, 2, 5, 99):
    status = quick_gpu_test(gpu_id)
    print(f"  {status}")

# Closing notes, printed one per line
print(
    "\n💡 Key Points:",
    "• TensorFlow's default 'soft placement' allows fallback to available devices",
    "• Operations 'succeed' even if the requested GPU doesn't exist",
    "• Always check the actual device used (tensor.device) to verify placement",
    "• Use tf.debugging.set_log_device_placement(True) for detailed logging",
    sep="\n",
)

print(
    "\n🛠️  To force strict device placement (fail if device unavailable):",
    "   tf.config.set_soft_device_placement(False)",
    "   # This will make /GPU:5 actually fail if GPU 5 doesn't exist",
    sep="\n",
)

🔍 TensorFlow Soft Device Placement Demonstration
Testing various GPU IDs:
------------------------------
  ❌ Failed: Could not satisfy device specification '/job:localhost/replica:0/task:0/device:GPU:0'. enable_soft_placement=0. Supported device types [CPU]. All available devices [/job:localhost/replica:0/task:0/device:CPU:0].
  ❌ Failed: Could not satisfy device specification '/job:localhost/replica:0/task:0/device:GPU:1'. enable_soft_placement=0. Supported device types [CPU]. All available devices [/job:localhost/replica:0/task:0/device:CPU:0].
  ❌ Failed: Could not satisfy device specification '/job:localhost/replica:0/task:0/device:GPU:2'. enable_soft_placement=0. Supported device types [CPU]. All available devices [/job:localhost/replica:0/task:0/device:CPU:0].
  ❌ Failed: Could not satisfy device specification '/job:localhost/replica:0/task:0/device:GPU:5'. enable_soft_placement=0. Supported device types [CPU]. All available devices [/job:localhost/replica:0/task:0/device:CPU:0].