In [1]:
# Import necessary libraries and setup paths
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
import seaborn as sns

from PyISV.utils.IO_utils import find_project_root

# Use seaborn's "whitegrid" theme for every figure drawn in this notebook
sns.set_theme(style="whitegrid")

# Project directories, all resolved from the repository root.
# "models/" keeps its trailing slash because models_dir is later handed
# verbatim to the training script.
root_dir = find_project_root()
data_dir, models_dir = (
    os.path.join(root_dir, subdir) for subdir in ("datasets", "models/")
)

In [2]:
#Configure training parameters
import math
import random
import datetime

# Paths of the input and target tensors, both stored under <data_dir>/RDFs
rdf_dir = f"{data_dir}/RDFs"
input_data = f"{rdf_dir}/nonMin_nPt_85.pt"
target_data = f"{rdf_dir}/min_nPt_85.pt"

# Parallelism flags (copied into params["GENERAL"] further down)
use_ddp = True
use_data_parallel = False

# Model architecture parameters
embed_dim = 3
in_channels = 1
input_length = 340
# Derived from input_length so the two values cannot drift apart
n_features = input_length * in_channels

# Encoder architecture: one list entry per Conv1d -> MaxPool1d stage
channels = [16, 32, 64, 64]  # out_channels for each Conv1d
kernel_sizes = [20, 15, 10, 5]
paddings = ["same", "same", "same", "same"]  # Padding for each Conv1d
strides = [1, 1, 1, 1]
pool_kernel = 2
pool_stride = 2

# The four per-stage lists are walked in lockstep; fail early on a mismatch
# instead of silently ignoring entries or raising an opaque IndexError.
if len({len(channels), len(kernel_sizes), len(paddings), len(strides)}) != 1:
    raise ValueError(
        "channels, kernel_sizes, paddings and strides must all have one entry per encoder stage"
    )

# Propagate the sequence length through every conv + pool stage to obtain the
# size of the tensor that enters the bottleneck.
length = input_length
for stage, (kernel_size, padding, stride) in enumerate(zip(kernel_sizes, paddings, strides)):
    if padding == "same":
        # "same" padding keeps the length (up to the stride)
        conv_length = math.ceil(length / stride)
    else:
        # Integer padding is used as given; any other value counts as no padding
        pad = int(padding) if isinstance(padding, int) else 0
        conv_length = (length + 2 * pad - (kernel_size - 1) - 1) // stride + 1
    length = (conv_length - pool_kernel) // pool_stride + 1  # Apply pooling
    if length < 1:
        raise ValueError(
            f"Encoder stage {stage} reduces the feature map to length {length}; "
            "use fewer stages or smaller kernels/strides"
        )


# NOTE: despite its name this is the channel count of the last encoder stage
last_layer_length = channels[-1]
flat_dim = last_layer_length * length
feature_map_length = length
print(f"Final feature map length: {length}")
print(f"Calculated flattened dimension: {flat_dim}")

# Full training configuration; it is serialized to config.json and read by the
# training script. The encoder entries reference the same hyper-parameter
# lists (channels, kernel_sizes, strides, paddings, pool_*) that were used to
# compute flat_dim, so the layer specification and the bottleneck size always
# agree.
params = {
  "GENERAL": {
    "device": "cuda",
    # A fresh seed is drawn on every execution; it is recorded in the config
    "seed": random.randint(0, 100000),
    "apply_jit_tracing": False,
    "use_data_parallel": use_data_parallel,
    "use_ddp": use_ddp,
    "use_lr_finder": False,
    "use_tensorboard": False,
    "input_length": input_length,
    "input_channels": in_channels,
    "input_features": n_features,
    "flattened_features": n_features
  },
  "MODEL": {
    "type": "autoencoder",
    "input_shape": [in_channels, n_features],
    "embedding_dim": embed_dim,
    "flattened_dim": flat_dim,
    "feature_map_length": feature_map_length,
    "encoder_layers": [
      [
        {"type": "Conv1d", "in_channels": in_channels, "out_channels": channels[0], "kernel_size": kernel_sizes[0], "stride": strides[0], "padding": paddings[0]},
        {"type": "BatchNorm1d", "num_features": channels[0]},
        {"type": "Dropout", "p": 0.2},
        {"type": "ReLU"},
        {"type": "MaxPool1d", "kernel_size": pool_kernel, "stride": pool_stride}
      ],
      [
        {"type": "Conv1d", "in_channels": channels[0], "out_channels": channels[1], "kernel_size": kernel_sizes[1], "stride": strides[1], "padding": paddings[1]},
        {"type": "BatchNorm1d", "num_features": channels[1]},
        {"type": "Dropout", "p": 0.15},
        {"type": "ReLU"},
        {"type": "MaxPool1d", "kernel_size": pool_kernel, "stride": pool_stride}
      ],
      [
        {"type": "Conv1d", "in_channels": channels[1], "out_channels": channels[2], "kernel_size": kernel_sizes[2], "stride": strides[2], "padding": paddings[2]},
        {"type": "BatchNorm1d", "num_features": channels[2]},
        {"type": "ReLU"},
        {"type": "MaxPool1d", "kernel_size": pool_kernel, "stride": pool_stride}
      ],
      [
        {"type": "Conv1d", "in_channels": channels[2], "out_channels": last_layer_length, "kernel_size": kernel_sizes[3], "stride": strides[3], "padding": paddings[3]},
        {"type": "BatchNorm1d", "num_features": last_layer_length},
        {"type": "ReLU"},
        {"type": "MaxPool1d", "kernel_size": pool_kernel, "stride": pool_stride}
      ]
    ],
    "bottleneck_layers": [
      [
        {"type": "Flatten"},
        {"type": "Linear", "in_features": flat_dim, "out_features": embed_dim},  # Projection down to the embedding
      ],
      [
        {"type": "Linear", "in_features": embed_dim, "out_features": flat_dim}, # Expansion back to the flattened size
        {"type": "ReLU"},
      ]
    ],
    "decoder_layers": [
      [
        {"type": "Upsample", "scale_factor": 2},
        {"type": "ConvTranspose1d", "in_channels": last_layer_length, "out_channels": 64, "kernel_size": 5, "padding": 1},
        {"type": "ReLU"},
        {"type": "BatchNorm1d", "num_features": 64},
      ],
      [
        {"type": "Upsample", "scale_factor": 2},
        {"type": "ConvTranspose1d", "in_channels": 64, "out_channels": 32, "kernel_size": 10, "padding": 1},
        {"type": "ReLU"},
        {"type": "BatchNorm1d", "num_features": 32},
      ],
      [
        {"type": "Upsample", "scale_factor": 2},
        {"type": "ConvTranspose1d", "in_channels": 32, "out_channels": 16, "kernel_size": 15, "padding": 1},
        {"type": "ReLU"},
        {"type": "BatchNorm1d", "num_features": 16},
      ],
      [
        {"type": "Upsample", "scale_factor": 2},
        {"type": "Conv1d", "in_channels": 16, "out_channels": in_channels, "kernel_size": 20, "padding": 1},
      ]
    ]
  },
  "TRAINING": {
    # Training parameters
    "batch_size": 1024,
    "train_size": 0.8,
    "min_epochs": 150,
    "max_epochs": 500,
    "loss_function": "HuberLoss",
    "loss_params": {
      "reduction": "mean",
      "delta": 1.0
      },
    "learning_rate": 0.001,
    "normalization": "minmax",
    "weight_decay": 0.0005,
    
    # Data loading, learning-rate schedule and early stopping
    "num_workers": 0,
    "pin_memory": False,
    "scheduled_lr": True,
    "scheduler_params": {
      "lr_warmup_epochs": 10,
      "milestones": [100, 150, 200],
        "gamma": 0.5
    },
    "early_stopping": True,
    "early_stopping_params": {
      "patience": 30,
      "min_delta": 0.00005
    },
  },
  "INPUTS": {
    "dataset": input_data,
    "target": target_data
  }
}

Final feature map length: 21
Calculated flattened dimension: 1344


In [3]:
# Save json configuration
import json
import random
from datetime import datetime

# Identifier of this run; it names the output directory under models_dir.
# Timestamped alternative: run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
run_id = "nonMin_to_min_nPt_85"

# os.path.join avoids the doubled separator ("models//<run_id>") that plain
# string concatenation produces because models_dir already ends with "/".
model_id_dir = os.path.join(models_dir, run_id)
os.makedirs(model_id_dir, exist_ok=True)

# Persist the configuration next to the model outputs
with open(os.path.join(model_id_dir, "config.json"), 'w') as f:
    json.dump(params, f, indent=4)

# Set up environment variables (inherited by the training subprocess)
master_port = random.randint(29500, 30000)  # Random port for DDP

# Respect a GPU selection made outside the notebook: torch.cuda.device_count()
# already honours CUDA_VISIBLE_DEVICES, so re-numbering the devices as 0..n-1
# would silently point the workers at different physical GPUs.
visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES") or ",".join(
    str(i) for i in range(torch.cuda.device_count())
)

# Prepend the project root exactly once and drop empty entries, so re-running
# the cell does not grow PYTHONPATH and no trailing separator (which Python
# interprets as the current directory) is left behind.
extra_python_paths = [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != str(root_dir)
]
python_path = os.pathsep.join([str(root_dir), *extra_python_paths])

os.environ.update({
    #"NCCL_DEBUG": "INFO",                          # Enable NCCL debugging
    "NCCL_SOCKET_IFNAME": "^lo,docker",             # Skip loopback and docker interfaces
    "NCCL_IB_DISABLE": "0",                         # Enable InfiniBand if available
    "NCCL_P2P_DISABLE": "0",                        # Ensure P2P is enabled
    "TORCH_NCCL_BLOCKING_WAIT": "1",                # Block on collectives so timeouts surface as errors (not a speed-up)
    "NCCL_LL_THRESHOLD": "0",                       # Size threshold 0 for the NCCL low-latency protocol
    "MASTER_PORT": str(master_port),                # Random port for DDP
    "MASTER_ADDR": "localhost",                     # Master address for DDP
    "WORLD_SIZE": str(torch.cuda.device_count()),   # Total number of possible processes
    "OMP_NUM_THREADS": "16",                        # Set OpenMP threads to 16
    "MKL_THREADING_LAYER": "INTEL",                 # Set MKL threading layer to Intel
    "KMP_BLOCKTIME": "0",                           # Set KMP block time to 0
    "KMP_AFFINITY": "granularity=fine,compact,1,0", # Set KMP affinity
    "KMP_HW_SUBSET": "1t",                          # One thread per core (no hyperthreading)
    "I_MPI_PIN_DOMAIN": "auto",                     # Automatically pin MPI processes to cores
    "I_MPI_PIN": "ON",                              # Enable process pinning
    "I_MPI_PIN_CELL": "core",                       # Pin MPI processes to cores
    "CUDA_VISIBLE_DEVICES": visible_devices,        # Keep an existing selection, else expose all GPUs
    "PYTHONPATH": python_path                       # Make the project importable in the subprocess
})

# Report the key settings of this run
training_cfg = params['TRAINING']
for summary_line in (
    "\n=== Configuration Summary ===",
    f"Run ID: {run_id}",
    f"DDP Enabled: {use_ddp}",
    f"Batch Size: {training_cfg['batch_size']}",
    f"Number of Workers: {training_cfg['num_workers']}",
    f"Learning Rate: {training_cfg['learning_rate']}",
):
    print(summary_line)

# Align PyTorch's intra-op thread pool with OMP_NUM_THREADS (1 if unset)
omp_threads = int(os.environ.get('OMP_NUM_THREADS', 1))
torch.set_num_threads(omp_threads)

# NOTE: the first figure below is PyTorch's inter-op thread count, the second
# its intra-op thread count.
print("\n=== Available CPU Resources ===")
print(f"CPUs available to PyTorch: {torch.get_num_interop_threads()}")
print(f"Num of OpenMP threads: {torch.get_num_threads()}")

# List the GPUs this process can see
gpu_count = torch.cuda.device_count()
gpu_names = list(map(torch.cuda.get_device_name, range(gpu_count)))
print("\n=== Available GPU Resources ===")
print(f"GPUs available to PyTorch: {gpu_count}")
print(f"Available GPU devices: {gpu_names}")



=== Configuration Summary ===
Run ID: nonMin_to_min_nPt_85
DDP Enabled: True
Batch Size: 1024
Number of Workers: 0
Learning Rate: 0.001

=== Available CPU Resources ===
CPUs available to PyTorch: 64
Num of OpenMP threads: 16

=== Available GPU Resources ===
GPUs available to PyTorch: 2
Available GPU devices: ['NVIDIA GeForce RTX 2080 Ti', 'NVIDIA GeForce RTX 2080 Ti']


In [4]:
import torch.multiprocessing as mp
import subprocess
import torch.distributed as dist

# Release cached GPU memory and tear down any process group that is still
# initialized in this kernel before launching the workers.
torch.cuda.empty_cache()
if dist.is_initialized():
    dist.destroy_process_group()

# One worker per visible GPU instead of a hard-coded count (at least one so
# the launcher is always given a valid value).
n_procs = max(1, torch.cuda.device_count())

# torchrun picks its rendezvous port from --master_port (default 29500), not
# from a MASTER_PORT variable in the parent environment, so the port chosen
# earlier must be passed explicitly to take effect.
master_port_arg = os.environ.get("MASTER_PORT", "29500")

cmd = [
    "torchrun",
    f"--nproc_per_node={n_procs}",
    "--nnodes=1",
    f"--master_port={master_port_arg}",
    f"{root_dir}/PyISV/scripts/train_CNN.py",
    "--config", f"{model_id_dir}/config.json",
    "--models_dir", models_dir,
    "--run_id", run_id,
]

print(f"Running command: {' '.join(cmd)}")

try:
    # Output is streamed straight to the notebook (not captured)
    result = subprocess.run(cmd, check=True, text=True, capture_output=False)
except subprocess.CalledProcessError as e:
    print("Training failed with error code:", e.returncode)
    # With capture_output=False there is nothing to show in e.output; the
    # worker logs have already been streamed above.
    if e.output is not None:
        print("Output:\n", e.output)
    else:
        print("See the streamed output above for the worker logs.")

Running command: torchrun --nproc_per_node=2 --nnodes=1 /home/shared_folder/PyISV/PyISV/scripts/train_CNN.py --config /home/shared_folder/PyISV/models//nonMin_to_min_nPt_85/config.json --models_dir /home/shared_folder/PyISV/models/ --run_id nonMin_to_min_nPt_85


[DDP] Detected torchrun: LOCAL_RANK=1, RANK=1, WORLD_SIZE=2
[DDP] Detected torchrun: LOCAL_RANK=0, RANK=0, WORLD_SIZE=2
ℹ️ Using GPU backend: nccl


[rank1]:[W527 13:05:03.425448743 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 1]  using GPU 1 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device.
[rank0]:[W527 13:05:03.461684163 ProcessGroupNCCL.cpp:4715] [PG ID 0 PG GUID 0 Rank 0]  using GPU 0 as device used by this process is currently unknown. This can potentially cause a hang if this rank to GPU mapping is incorrect. You can pecify device_id in init_process_group() to force use of a particular device.


✅ DDP initialized successfully with 2 processes using nccl backend

Training on cuda with run ID: nonMin_to_min_nPt_85

ℹ️ Model type: <class 'torch.nn.parallel.distributed.DistributedDataParallel'>
ℹ️ Device: cuda:0
ℹ️ Use DDP: True
ℹ️ Loss function: HuberLoss()

▶️ Starting training from epoch 0 to 500

 --- 💾 Saving best model at epoch: 10 --- 
⏳ [Epoch 10]  - train loss: 0.0022  - validation loss: 0.0034  - lr: 0.00091  - (1.39s/epoch)
⏳ [Epoch 10]  - train loss: 0.0023  - validation loss: 0.0034  - lr: 0.00091  - (1.40s/epoch)
⏳ [Epoch 20]  - train loss: 0.0020  - validation loss: 0.0045  - lr: 0.00100  - (1.30s/epoch)
⏳ [Epoch 20]  - train loss: 0.0020  - validation loss: 0.0045  - lr: 0.00100  - (1.30s/epoch)
⏳ [Epoch 30]  - train loss: 0.0019  - validation loss: 0.0042  - lr: 0.00100  - (1.28s/epoch)
⏳ [Epoch 30]  - train loss: 0.0020  - validation loss: 0.0042  - lr: 0.00100  - (1.28s/epoch)
 --- 💾 Saving best model at epoch: 40 --- 
⏳ [Epoch 40]  - train loss: 0.0020  - valid