In [3]:
import sys
import os
import sys
import os
# Resolve the project root (the parent of the current working directory) once,
# then put it at the front of sys.path so the PyISV imports below resolve
# against the local checkout.
root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, root_dir)

import torch
import numpy as np
from PyISV.utils.training_utils import Dataset, PreloadedDataset
from PyISV.neural_network import NeuralNetwork
from PyISV.utils.validation_utils import Validator
from PyISV.utils.set_architecture import import_config
import torch.nn as nn


In [None]:
# ---- Config ----
# Paths of the input tensor and of the target tensor it is paired with
# (point both at the same file for an identity test).
input_file = f"{root_dir}/datasets/RDFs/nonMin_nCu_38.pt"  # e.g. "datasets/RDFs/min_nCu_38.pt"
target_file = f"{root_dir}/datasets/RDFs/min_nCu_38.pt"
batch_size = 16
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---- Load Data ----
inputs, targets = (torch.load(path) for path in (input_file, target_file))
print(f"Loaded inputs: {inputs.shape}, targets: {targets.shape}")

# ---- Dataset and DataLoader ----
# Min-max normalization is requested for both the inputs and the targets.
dataset = Dataset(
    inputs,
    targets,
    norm_inputs=True,
    norm_targets=True,
    norm_mode="minmax",
)
train_loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)

Loaded inputs: torch.Size([72000, 1, 340]), targets: torch.Size([72000, 1, 340])


In [None]:
#Configure training parameters
import math
import datetime
import torch.distributed as dist

# Clean up any existing process group so that re-running this cell does not
# leave a stale default group initialized in the kernel.
if dist.is_initialized():
    dist.destroy_process_group()

# Get input and target data paths.
# root_dir comes from os.path.abspath and therefore has no trailing separator,
# so the "/" has to be part of the suffix; without it the path collapses to
# "<root>datasets/..." and points at a non-existent location.
# NOTE(review): both entries point at min_nCu_38.pt, whereas the data-loading
# cell earlier in this notebook uses nonMin_nCu_38.pt as input - confirm that
# an identity (input == target) setup is what is intended here.
input_data = f"{root_dir}/datasets/RDFs/min_nCu_38.pt"
target_data = f"{root_dir}/datasets/RDFs/min_nCu_38.pt"

# Enable DDP
use_ddp = True

# Model architecture parameters
in_channels = 1
input_length = 340
embed_dim = 3
n_features = input_length * in_channels

# Encoder architecture: one entry per Conv1d stage
channels = [8, 16, 32, 64, 64, 128]  # out_channels for each Conv1d
kernel_sizes = [3] * len(channels)
paddings = [2] * len(channels)
strides = [1] * len(channels)
pool_kernel = pool_stride = 2

# Follow the sequence length through every Conv1d -> MaxPool1d stage with the
# usual output-size formula (no dilation term):
#   L_out = floor((L_in + 2*pad - (kernel - 1) - 1) / stride) + 1
# Integer floor division gives the same result as math.floor on the quotient.
length = input_length
for kernel, pad, stride in zip(kernel_sizes, paddings, strides):
    conv_length = (length + 2 * pad - (kernel - 1) - 1) // stride + 1
    length = (conv_length - pool_kernel) // pool_stride + 1

feature_map_length = length
# Despite its name, this holds the channel count of the last encoder stage.
last_layer_length = channels[-1]
# Size of the encoder output once flattened: channels * remaining length.
flat_dim = last_layer_length * feature_map_length
print(f"Final feature map length: {length}")
print(f"Calculated flattened dimension: {flat_dim}")

def _encoder_stage(in_ch, out_ch, kernel_size, padding):
    """Return the layer specs of one encoder stage.

    A stage is Conv1d -> MaxPool1d -> ReLU -> BatchNorm1d. The pooling
    kernel/stride are read from the notebook-level pool_kernel / pool_stride,
    i.e. the same values used to compute flat_dim.
    """
    return [
        {"type": "Conv1d", "in_channels": in_ch, "out_channels": out_ch, "kernel_size": kernel_size, "padding": padding},
        {"type": "MaxPool1d", "kernel_size": pool_kernel, "stride": pool_stride},
        {"type": "ReLU"},
        {"type": "BatchNorm1d", "num_features": out_ch},
    ]


def _decoder_stage(in_ch, out_ch, kernel_size, padding):
    """Return the layer specs of one decoder stage.

    A stage is Upsample -> ConvTranspose1d -> ReLU -> BatchNorm1d.
    """
    return [
        {"type": "Upsample", "scale_factor": 2},
        {"type": "ConvTranspose1d", "in_channels": in_ch, "out_channels": out_ch, "kernel_size": kernel_size, "padding": padding},
        {"type": "ReLU"},
        {"type": "BatchNorm1d", "num_features": out_ch},
    ]


def _build_encoder_layers():
    """Build the encoder spec from channels / kernel_sizes / paddings.

    Stage i maps the previous stage's out_channels (in_channels for the first
    stage) to channels[i]. No "stride" key is emitted, exactly as in the
    hand-written spec this replaces.
    NOTE(review): the `strides` list therefore only affects the flat_dim
    computation; keep it at 1 unless the layer specs are extended as well.
    """
    stage_inputs = [in_channels] + channels[:-1]
    return [
        _encoder_stage(c_in, c_out, k, p)
        for c_in, c_out, k, p in zip(stage_inputs, channels, kernel_sizes, paddings)
    ]


def _build_decoder_layers():
    """Build the decoder spec as the mirror image of the encoder.

    Channels (and their kernel sizes / paddings) are walked in reverse order;
    the final stage is Upsample -> Conv1d back to in_channels, without
    activation or batch norm.
    """
    rev_channels = channels[::-1]
    rev_kernels = kernel_sizes[::-1]
    rev_paddings = paddings[::-1]
    stages = [
        _decoder_stage(c_in, c_out, k, p)
        for c_in, c_out, k, p in zip(rev_channels[:-1], rev_channels[1:], rev_kernels, rev_paddings)
    ]
    stages.append([
        {"type": "Upsample", "scale_factor": 2},
        {"type": "Conv1d", "in_channels": rev_channels[-1], "out_channels": in_channels, "kernel_size": rev_kernels[-1], "padding": rev_paddings[-1]},
    ])
    return stages


# Full run configuration. The encoder/decoder specs are generated from the
# same lists that were used to compute flat_dim, so changing `channels`,
# `kernel_sizes`, `paddings` or the pooling settings can no longer leave the
# layer specs and the flattened dimension out of sync. For the current values
# the resulting dictionary is identical to the previous hand-written one.
params = {
    "GENERAL": {
        "device": "cuda",
        "seed": 42,
        "apply_jit_tracing": False,
        "use_ddp": use_ddp,
        "use_lr_finder": False,
        "use_tensorboard": False,
        "input_length": input_length,
        "input_channels": in_channels,
        "input_features": n_features,
        # NOTE(review): set to n_features (input size), not flat_dim - confirm
        # which of the two the consumer of this key expects.
        "flattened_features": n_features
    },
    "MODEL": {
        "type": "autoencoder",
        "input_shape": [in_channels, n_features],
        "embedding_dim": embed_dim,
        "flattened_dim": flat_dim,
        "feature_map_length": feature_map_length,
        "encoder_layers": _build_encoder_layers(),
        "bottleneck_layers": [
            [
                {"type": "Flatten"},
                {"type": "Linear", "in_features": flat_dim, "out_features": embed_dim},
            ],
            [
                {"type": "Linear", "in_features": embed_dim, "out_features": flat_dim},
                {"type": "Sigmoid"}
            ]
        ],
        "decoder_layers": _build_decoder_layers()
    },
    "TRAINING": {
        # Training parameters
        "batch_size": 128,
        "train_size": 0.8,
        "min_epochs": 100,
        "max_epochs": 200,
        "loss_function": "MSELoss",
        "learning_rate": 0.001,
        "normalization": "minmax",
        # Data-loading and optimization-loop parameters
        "num_workers": 16,
        "pin_memory": True,
        "gradient_clipping": False,
        "accumulation_steps": 1,
        "scheduled_lr": False,
        "scheduler_params": {
            "lr_warmup_epochs": 50,
            "milestones": [],
            "gamma": 0.5
        },
        "early_stopping": False,
        "early_stopping_params": {
            "patience": 30,
            "min_delta": 0.0001
        },
    },
    "INPUTS": {
        "dataset": input_data,
        "target": target_data
    }
}

# Save json configuration
import json
from datetime import datetime  # rebinds `datetime` (imported as a module earlier in this cell) to the class

run_id = "test_run"
model_id_dir = f"{root_dir}/models/{run_id}"
os.makedirs(model_id_dir, exist_ok=True)

# Write the run configuration into the run directory; it is read back from
# this location by the single-process training cell further down.
with open(f"{model_id_dir}/config.json", 'w') as config_file:
    config_file.write(json.dumps(params, indent=4))

# Set up environment variables
import random

# Query the GPU count once; it is used for WORLD_SIZE and the device list.
n_gpus = torch.cuda.device_count()
master_port = random.randint(29500, 30000)  # Random port for DDP

# Keep an existing CUDA_VISIBLE_DEVICES mask (e.g. one set by a scheduler):
# device_count() is already relative to that mask, so rewriting it as
# "0..n-1" would hand the child processes different physical GPUs.
visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") or ",".join(
    str(i) for i in range(n_gpus)
)

# Prepend root_dir to PYTHONPATH exactly once: drop empty entries (a trailing
# separator would add the working directory to sys.path) and any previous copy
# of root_dir so that re-running this cell is idempotent.
python_path = [root_dir] + [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != root_dir
]

os.environ.update({
    #"NCCL_DEBUG": "INFO",
    "NCCL_SOCKET_IFNAME": "^lo,docker",  # Skip loopback and docker interfaces
    "NCCL_IB_DISABLE": "0",              # Enable InfiniBand if available
    "NCCL_P2P_DISABLE": "0",             # Ensure P2P is enabled
    # NOTE(review): blocking wait mainly affects how NCCL timeouts/errors are
    # surfaced rather than throughput - confirm this is the intent.
    "TORCH_NCCL_BLOCKING_WAIT": "1",
    "NCCL_LL_THRESHOLD": "0",            # Disable low latency threshold
    "MASTER_PORT": str(master_port),
    "MASTER_ADDR": "localhost",
    "WORLD_SIZE": str(n_gpus),           # One process per visible GPU
    "OMP_NUM_THREADS": "16",  # Number of OpenMP threads (also fed to torch.set_num_threads below)
    "MKL_THREADING_LAYER": "INTEL",  # Set MKL threading layer to Intel
    "KMP_BLOCKTIME": "0",  # Set KMP block time to 0
    "KMP_AFFINITY": "granularity=fine,compact,1,0",  # Set KMP affinity
    "KMP_HW_SUBSET": "1t",  # One thread per core, i.e. no hyperthreading
    "I_MPI_PIN_DOMAIN": "auto",  # Automatically pin MPI processes to cores
    "I_MPI_PIN": "ON",  # Enable process pinning
    "I_MPI_PIN_CELL": "core",  # Pin MPI processes to cores
    "CUDA_VISIBLE_DEVICES": visible_devices,
    "PYTHONPATH": os.pathsep.join(python_path),
})

# Print configuration summary
training_cfg = params['TRAINING']
print(
    "Configuration Summary:",
    f"Run ID: {run_id}",
    f"DDP Enabled: {use_ddp}",
    f"Batch Size: {training_cfg['batch_size']}",
    f"Number of Workers: {training_cfg['num_workers']}",
    f"Learning Rate: {training_cfg['learning_rate']}",
    sep="\n",
)

# Size PyTorch's intra-op thread pool from OMP_NUM_THREADS (1 when unset).
torch.set_num_threads(int(os.environ.get('OMP_NUM_THREADS', 1)))

print("\n=== Available CPU Resources ===")
# NOTE(review): the value printed under "CPUs available" is the inter-op
# thread count, not a CPU count - consider relabeling.
print(f"CPUs available to PyTorch: {torch.get_num_interop_threads()}")
print(f"Num of OpenMP threads: {torch.get_num_threads()}")

print("\n=== Available GPU Resources ===")
gpu_count = torch.cuda.device_count()
gpu_names = [torch.cuda.get_device_name(idx) for idx in range(gpu_count)]
print(f"GPUs available to PyTorch: {gpu_count}")
print(f"Name of GPU devices: {gpu_names}")
print(f"Visible CUDA devices: {os.environ.get('CUDA_VISIBLE_DEVICES', 'None')}")

Final feature map length: 7
Calculated flattened dimension: 896
Configuration Summary:
Run ID: test_run
DDP Enabled: True
Batch Size: 128
Number of Workers: 16
Learning Rate: 0.001

=== Available CPU Resources ===
CPUs available to PyTorch: 48
Num of OpenMP threads: 16

=== Available GPU Resources ===
GPUs available to PyTorch: 3
Name of GPU devices: ['NVIDIA A30', 'NVIDIA A30', 'NVIDIA A30']
Visible CUDA devices: 0,1,2


In [15]:
import subprocess

# Launch the test script through torchrun on a single node, with one worker
# process per GPU reported by torch.cuda.device_count().
cmd = [
    "torchrun",
    f"--nproc_per_node={torch.cuda.device_count()}",
    "--nnodes=1",
    f"{root_dir}/tests/run_test.py",
]

print(f"Running command: {' '.join(cmd)}")

# Reset first so that a failed launch cannot leave a `result` from an earlier,
# successful execution of this cell in the namespace.
result = None
try:
    # Output is deliberately not captured: the child processes inherit
    # stdout/stderr, so their logs are streamed while the job runs.
    result = subprocess.run(cmd, check=True, text=True, capture_output=False)
except subprocess.CalledProcessError as e:
    print("Training failed with error code:", e.returncode)
    # e.output is only populated when output is captured; with
    # capture_output=False it is None, so point the reader at the streamed log
    # instead of printing "None".
    if e.output is not None:
        print("Output:\n", e.output)
    else:
        print("Output was streamed directly (not captured); see the log above.")

Running command: torchrun --nproc_per_node=3 --nnodes=1 /scratch/rasera/PyISV/tests/run_test.py
Loaded inputs: torch.Size([72000, 1, 340]), targets: torch.Size([72000, 1, 340])
Loaded inputs: torch.Size([72000, 1, 340]), targets: torch.Size([72000, 1, 340])
Loaded inputs: torch.Size([72000, 1, 340]), targets: torch.Size([72000, 1, 340])
Model created.
Model created.
Model created.
Testing with train_epoch_step + validation:
Testing with train_epoch_step + validation:
Testing with train_epoch_step + validation:
Epoch 0: train_loss=0.008665, val_loss=0.006012
Epoch 0: train_loss=0.007802, val_loss=0.005782
  Val output stats: mean=0.3291, std=0.2863, min=-0.0590, max=1.0468
  Val output stats: mean=0.3304, std=0.2774, min=-0.0329, max=1.0329
Epoch 0: train_loss=0.007452, val_loss=0.005805
  Val output stats: mean=0.3266, std=0.2920, min=-0.0455, max=1.0233
Epoch 1: train_loss=0.006077, val_loss=0.006041
  Val output stats: mean=0.3308, std=0.2868, min=-0.0344, max=1.0414
Epoch 1: train_l

In [13]:
from PyISV.utils.IO_utils import load_tensor
from PyISV.utils.set_architecture import import_config
from PyISV.utils.training_utils import Dataset, get_data_loader, PreloadedDataset, get_device

# Use set_architecture to normalize the config
config = import_config(json_file=f"{root_dir}/models/test_run/config.json")
model_config = config["MODEL"]
# Build the network from the MODEL section and move it to the configured device.
model = NeuralNetwork(model_config)
model = model.to(config["GENERAL"]["device"])
print("Model created.")

# ---- Loss and Optimizer ----
# NOTE: both objects are created again further down in this cell, before
# their first use; this first pair is never used.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

from PyISV.utils.training_utils import train_epoch_step
from PyISV.utils.validation_utils import Validator

# ---- Create validator ----
# The validator is configured from the same MODEL section as the network.
validator_config = model_config
validator = Validator(validator_config)

# ---- Prepare data ----
# NOTE: input_file / target_file / input_data / target_data / dataset /
# train_loader rebind names that earlier cells already defined.

input_file = config['INPUTS']['dataset']
target_file = config['INPUTS']['target']
input_data = load_tensor(input_file)
if target_file:
    target_data = load_tensor(target_file)
else:
    # No separate target configured: use a copy of the inputs instead.
    target_data = input_data.clone()

dataset = Dataset(
    input_data,
    target_data,
    norm_inputs=True,
    norm_targets=True,
    norm_mode=config["TRAINING"]["normalization"],
    device=config["GENERAL"]["device"],
)

# Split data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    dataset.inputs,
    dataset.targets,
    train_size=config["TRAINING"]["train_size"],
    random_state=config["GENERAL"]["seed"],
    shuffle=True,
    stratify=None,
)

# Wrap the splits taken from the Dataset object's tensors
train_dataset = PreloadedDataset(X_train, y_train)
valid_dataset = PreloadedDataset(X_valid, y_valid)

# Save the validation dataset
#torch.save(self.valid_dataset.inputs.detach().cpu(), 
#           f"{self.outputs_dir}/input_validation_data.pt")

# Settings shared by both loaders; only the shuffle flag differs
# (training data is shuffled, validation data is not).
loader_kwargs = dict(
    batch_size=config["TRAINING"]["batch_size"],
    num_workers=config["TRAINING"]["num_workers"],
    pin_memory=config["TRAINING"]["pin_memory"],
    use_ddp=False,
)
train_loader = get_data_loader(train_dataset, shuffle=True, **loader_kwargs)
valid_loader = get_data_loader(valid_dataset, shuffle=False, **loader_kwargs)

# ---- Loss and Optimizer ----
loss_fn = nn.MSELoss()
# Take the learning rate from the loaded configuration instead of repeating
# the literal, so that editing config.json actually changes this run.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=config["TRAINING"]["learning_rate"],
)
# Gradient scaler handed to train_epoch_step below.
scaler = torch.amp.GradScaler()
device = get_device(config["GENERAL"]["device"])

# ---- Test with train_epoch_step + validation ----
print("Testing with train_epoch_step + validation:")
for epoch in range(10):
    # Training step: one call to the library's epoch routine
    avg_train_loss = train_epoch_step(
        model=model, data_loader=train_loader,
        optimizer=optimizer, scaler=scaler, loss_function=loss_fn,
        device=device, use_ddp=False,
        gradient_clipping=None, accumulation_steps=1,
        epoch=epoch,
    )

    # Validation step through the Validator helper
    avg_val_loss = validator.validate_epoch(
        model=model, data_loader=valid_loader,
        loss_function=loss_fn, device=device,
    )

    print(f"Epoch {epoch}: train_loss={avg_train_loss:.6f}, val_loss={avg_val_loss:.6f}")

    # Check a sample batch to see output stats (eval mode, no gradients)
    model.eval()
    with torch.no_grad():
        sample_batch = next(iter(valid_loader))
        inp, targ = sample_batch
        inp = inp.to(device)
        targ = targ.to(device)
        out = model(inp)
        # The model may return a tuple; in that case the first element is used.
        out = out[0] if isinstance(out, tuple) else out
        out_mean, out_std = out.mean().item(), out.std().item()
        out_min, out_max = out.min().item(), out.max().item()
        print(f"  Val output stats: mean={out_mean:.4f}, std={out_std:.4f}, min={out_min:.4f}, max={out_max:.4f}")
    # Back to training mode for the next epoch
    model.train()


Model created.
Testing with train_epoch_step + validation:
Epoch 0: train_loss=0.040390, val_loss=0.002859
  Val output stats: mean=0.3283, std=0.2990, min=-0.1592, max=1.0668
Epoch 1: train_loss=0.002548, val_loss=0.002226
  Val output stats: mean=0.3309, std=0.3004, min=-0.1369, max=1.1014
Epoch 2: train_loss=0.002133, val_loss=0.001970
  Val output stats: mean=0.3297, std=0.2973, min=-0.1126, max=1.0258
Epoch 3: train_loss=0.001936, val_loss=0.001794
  Val output stats: mean=0.3287, std=0.2975, min=-0.0863, max=1.0333
Epoch 4: train_loss=0.001805, val_loss=0.001852
  Val output stats: mean=0.3265, std=0.2979, min=-0.0794, max=1.0619
Epoch 5: train_loss=0.001726, val_loss=0.001656
  Val output stats: mean=0.3264, std=0.2950, min=-0.0741, max=1.0228
Epoch 6: train_loss=0.001662, val_loss=0.001609
  Val output stats: mean=0.3295, std=0.3024, min=-0.0798, max=1.0324
Epoch 7: train_loss=0.001601, val_loss=0.001571
  Val output stats: mean=0.3283, std=0.3015, min=-0.0665, max=1.0376
Epoch

In [None]:
# # ---- Single-Batch Overfit Test ----
# single_batch = next(iter(train_loader))
# inp, targ = single_batch
# inp, targ = inp.to(device), targ.to(device)

# print(f"Input min/max: {inp.min().item():.4f}, {inp.max().item():.4f}")
# print(f"Target min/max: {targ.min().item():.4f}, {targ.max().item():.4f}")
# print(f"Number of model parameters: {sum(p.numel() for p in model.parameters())}")

# # Check if parameters update
# first_param_before = next(model.parameters()).clone()

# for step in range(100):
#     model.train()
#     optimizer.zero_grad()
#     out = model(inp)
#     if isinstance(out, tuple):
#         out = out[0]
#     loss = loss_fn(out, targ)
#     loss.backward()
#     optimizer.step()
#     print(f"Step {step}: loss={loss.item():.6f}")
#     print(f"  Output stats: mean={out.mean().item():.4f}, std={out.std().item():.4f}, min={out.min().item():.4f}, max={out.max().item():.4f}")

# first_param_after = next(model.parameters()).clone()
# print(f"Parameter changed: {not torch.equal(first_param_before, first_param_after)}")
# print(f"Parameter difference norm: {(first_param_after - first_param_before).norm().item():.8f}")



In [None]:
# ---- Compare with simple training loop + validation ----
# Trains a fresh model with a hand-written loop, reusing train_loader,
# valid_loader, loss_fn, device and the configs created by the
# train_epoch_step cell, so the two training paths can be compared.
print("\nTesting with simple training loop + validation:")
model_simple = NeuralNetwork(model_config).to(device)  # Fresh model
# Reuse the initial learning rate of the reference optimizer so that both
# training paths are compared under the same setting.
optimizer_simple = torch.optim.Adam(
    model_simple.parameters(),
    lr=optimizer.defaults["lr"],
)
validator_simple = Validator(validator_config)

for epoch in range(3):
    # Simple training: plain forward / backward / step over every batch
    total_loss = 0.0
    num_batches = 0
    for inp, targ in train_loader:
        inp, targ = inp.to(device), targ.to(device)
        optimizer_simple.zero_grad()
        out = model_simple(inp)
        # The model may return a tuple; in that case the first element is used.
        if isinstance(out, tuple):
            out = out[0]
        loss = loss_fn(out, targ)
        loss.backward()
        optimizer_simple.step()
        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader so the average never divides by zero.
    avg_train_loss = total_loss / max(num_batches, 1)

    # Validation with validator. The validation loader created earlier is
    # named `valid_loader`; the previous `val_loader` was never defined and
    # raised a NameError.
    avg_val_loss = validator_simple.validate_epoch(
        model=model_simple,
        data_loader=valid_loader,
        loss_function=loss_fn,
        device=device
    )

    print(f"Epoch {epoch}: train_loss={avg_train_loss:.6f}, val_loss={avg_val_loss:.6f}")

    # Check a sample batch to see output stats
    model_simple.eval()
    with torch.no_grad():
        sample_batch = next(iter(valid_loader))
        inp, targ = sample_batch
        inp, targ = inp.to(device), targ.to(device)
        out = model_simple(inp)
        if isinstance(out, tuple):
            out = out[0]
        print(f"  Val output stats: mean={out.mean().item():.4f}, std={out.std().item():.4f}, min={out.min().item():.4f}, max={out.max().item():.4f}")
    model_simple.train()