In [1]:
import subprocess
from pathlib import Path
import os
import time

In [None]:
job_templet = """#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --output=slurm_logs/%x_%A.out
#SBATCH --error=slurm_logs/%x_%A.err
#SBATCH --partition=C9654
#SBATCH --nodelist=c3
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=64
#SBATCH --mem-per-gpu=100G
#SBATCH --gres=gpu:1
export PATH=/opt/share/miniconda3/envs/mofmthnn/bin/:$PATH
export LD_LIBRARY_PATH=/opt/share/miniconda3/envs/mofmthnn/lib/:$LD_LIBRARY_PATH

srun python -u {py_executor} --progress_bar --task_cfg {task_config} --model_cfg {model_config}
""".strip()

def run_slurm_job(work_dir, executor="sbatch", script_name="run"):
    work_dir = Path(work_dir)
    # Create a script to run the job
    process = subprocess.Popen(
        f"{executor} {work_dir/script_name}",
        # [executor, str(work_dir/'run'), "&"],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        shell=True,                                                                                                                         
        env=os.environ.copy(),
        cwd=str(work_dir)
    )
    return process

## Training original CGCNN model

In [6]:
work_dir = Path("./CGCNN_MT").absolute()

task_configs = [
    # "tsd_ssd",
    "tsd_ssd_ws24",
    # "tsd_ssd_ws24_water",
    # "tsd_ssd_ws24_water_water4",
    # "tsd_ssd_ws24_water",
    # "ssd_ws24",
    # "ws24",
    "tsd",
    "ssd",
    "ws24_water",
    "ws24_water4",
    "ws24_acid",
    "ws24_base",
    "ws24_boiling"
                    ]
model_configs = [
    # "att_cgcnn",
    # "cgcnn",
    "cgcnn_raw",
    # "fcnn",
    # "att_fcnn",
    # "cgcnn_uni_atom"
]
script_name = "run_slurm.sh"
py_executor = "hyperopt.py"
# py_executor = "main.py"
model_conf = {
            'batch_size': 32,
            'max_epochs': 500, 
            'max_graph_len': 200,
            'atom_fea_len': 256,
            'extra_fea_len': 16,
            'h_fea_len': 128,
            'n_conv': 6,
            'n_h': 4,
            'dropout_prob': 0.5,
            'use_extra_fea': False,
            'use_cell_params': False,
            'atom_layer_norm': False,
            'loss_aggregation': "fixed_weight_sum",   # fixed_weight_sum, dwa, sum, sample_weight_sum, trainable_weight_sum
            'dl_sampler': 'random',
            'task_att_type': 'none',
            'augment': False,
            'lr': 0.001,
            'lr_mult': 10,
            'group_lr': False,
            'optim_config': "fine",  # fine or coarse
            'auto_lr_bs_find': False, 
            'patience': 50,
            'task_norm': False,
            'log_dir': "logs",
            'optuna_name': "optuna",
            }

for task_config in task_configs:
    for model_config in model_configs:
        job_name = f"{task_config.replace('_config', '')}_{model_config.replace('_config', '')}"
        if py_executor == "hyperopt.py":
            job_name = "opt_" + job_name
            # job_templet_ = job_templet + " --pruning"
            job_templet_ = job_templet
        else:
            job_templet_ = job_templet
        job_script = job_templet_.format(job_name=job_name, 
                                        task_config=task_config, 
                                        model_config=model_config,
                                        py_executor=py_executor
                                        )
        
        for key, value in model_conf.items():
            if isinstance(value, bool):
                if value:
                    job_script += f" --{key}"
                continue
            job_script += f" --{key} {value}"
        with open(work_dir/script_name, "w") as f:
            f.write(job_script)
        process = run_slurm_job(work_dir, executor="sbatch", script_name=script_name)
        ## get the output of the job
        while True:
            output = process.stdout.readline()
            if output == b'' and process.poll() is not None:
                break
            if output:
                print(output.decode().strip())
        print(f"Submitted job {job_name} with PID {process.pid}")
        time.sleep(1)

Submitted batch job 199555
Submitted job opt_tsd_ssd_ws24_cgcnn_raw with PID 1736896


## Training refined CGCNN model (MOFSNN)

In [2]:
import subprocess
from pathlib import Path
import os
import torch

def get_optimal_config():
    """Auto-detect GPU and set optimal batch size"""
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB
        
        # Optimal batch sizes based on GPU memory
        if gpu_mem >= 40:  # A100
            batch_size = 128
        elif gpu_mem >= 24:  # L4, A10
            batch_size = 64
        elif gpu_mem >= 16:  # T4, V100
            batch_size = 32
        else:
            batch_size = 16
            
        print(f"Detected GPU: {gpu_name} ({gpu_mem:.1f} GB)")
        print(f"Using batch_size: {batch_size}")
        return batch_size, gpu_mem
    else:
        print("No GPU detected, using CPU")
        return 16, 0

def get_num_workers():
    """Get optimal number of data loading workers"""
    cpu_count = os.cpu_count() or 4
    # Use 4 workers per GPU, max half of CPUs
    return min(cpu_count // 2, 8)

# Auto-detect optimal settings
batch_size, gpu_mem = get_optimal_config()
num_workers = get_num_workers()

work_dir = Path("./CGCNN_MT").absolute()

# MOFSNN configuration
task_config = "tsd_ssd_ws24"
model_config = "att_cgcnn"

model_conf = {
    'batch_size': batch_size,  # Auto-scaled
    'max_epochs': 500, 
    'max_graph_len': 200,
    'atom_fea_len': 256,
    'extra_fea_len': 16,
    'h_fea_len': 128,
    'n_conv': 6,
    'n_h': 4,
    'dropout_prob': 0.5,
    'use_extra_fea': False,
    'use_cell_params': True,
    'atom_layer_norm': True,
    'loss_aggregation': "fixed_weight_sum",
    'dl_sampler': 'random',
    'task_att_type': 'self',
    'augment': False,
    'lr': 0.001,
    'lr_mult': 10,
    'group_lr': True,
    'optim_config': "fine",
    'auto_lr_bs_find': False,
    'patience': 50,
    'task_norm': True,
    'log_dir': "logs",
    'optuna_name': "optuna",
    'num_workers': num_workers,  # Add this for data loading
}

# Build command
cmd = f"python -u hyperopt.py --progress_bar --task_cfg {task_config} --model_cfg {model_config}"

for key, value in model_conf.items():
    if isinstance(value, bool):
        if value:
            cmd += f" --{key}"
    else:
        cmd += f" --{key} {value}"

print(f"\n{'='*60}")
print(f"Running MOFSNN Training")
print(f"{'='*60}")
print(f"Task: {task_config}")
print(f"Model: {model_config}")
print(f"Batch Size: {batch_size}")
print(f"Num Workers: {num_workers}")
print(f"Command:\n{cmd}\n")

# Run training
process = subprocess.run(
    cmd,
    shell=True,
    cwd=str(work_dir),
    env=os.environ.copy()
)

print(f"\nTraining completed with return code: {process.returncode}")

Detected GPU: NVIDIA L4 (23.7 GB)
Using batch_size: 32

Running MOFSNN Training
Task: tsd_ssd_ws24
Model: att_cgcnn
Batch Size: 32
Num Workers: 8
Command:
python -u hyperopt.py --progress_bar --task_cfg tsd_ssd_ws24 --model_cfg att_cgcnn --batch_size 32 --max_epochs 500 --max_graph_len 200 --atom_fea_len 256 --extra_fea_len 16 --h_fea_len 128 --n_conv 6 --n_h 4 --dropout_prob 0.5 --use_cell_params --atom_layer_norm --loss_aggregation fixed_weight_sum --dl_sampler random --task_att_type self --lr 0.001 --lr_mult 10 --group_lr --optim_config fine --patience 50 --task_norm --log_dir logs --optuna_name optuna --num_workers 8



  __import__("pkg_resources").declare_namespace(__name__)
Traceback (most recent call last):
  File "/home/dharunkraja/Desktop/D/MOFSNN_D/CGCNN_MT/hyperopt.py", line 13, in <module>
    from CGCNN_MT.module.module import MInterface
  File "/home/dharunkraja/Desktop/D/MOFSNN_D/CGCNN_MT/module/module.py", line 26, in <module>
    from transformers import (
ModuleNotFoundError: No module named 'transformers'



Training completed with return code: 1
