In [1]:
# Fetch the PINN_FP64 repository into the current working directory.
# stderr is discarded and `|| true` forces a zero exit status, so a failed
# clone (for example because the target directory already exists) is silent.
!git clone https://github.com/miniHuiHui/PINN_FP64.git 2>/dev/null || true

In [2]:
# Switch the kernel's working directory to the cloned repository; the relative
# paths used by the following cells (scripts, metrics_logger.py, ./metrics_logs)
# are resolved from here.
%cd PINN_FP64

/home/ubuntu/PINN_FP64


In [14]:
# Install the packages used by the metrics logger (psutil, GPUtil, numpy) and
# tqdm. NOTE(review): versions are not pinned, so results depend on whatever
# pip resolves at run time.
%pip install tqdm psutil gputil numpy

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Use the %pip magic rather than `!pip` so the packages are installed into the
# environment of the running kernel (a bare `!pip` may resolve to a different
# Python on PATH).
%pip install -q einops GPUtil psutil

In [6]:
# CELL 4: Create Metrics Logger - builds the source of metrics_logger.py
# The PINNMetricsLogger class is assembled from two string parts and written to disk
# ============================================================================

# First half of the generated metrics_logger.py module: the imports, the
# optional GPUtil probe, and the PINNMetricsLogger constructor plus its
# model/FLOPs/gradient helpers. The text is not executed in this notebook;
# it is only written to disk.
metrics_logger_code_part1 = '''import torch
import time
import psutil
import json
import os
from datetime import datetime
import numpy as np

# GPUtil is optional: when it cannot be imported the logger skips GPU statistics.
# Only ImportError is caught so that unrelated failures (and Ctrl-C) still surface.
try:
    import GPUtil
    GPU_AVAILABLE = True
except ImportError:
    GPU_AVAILABLE = False

class PINNMetricsLogger:
    """
    Comprehensive metrics logger for PINN training.
    Tracks: FLOPs, runtime, GPU/CPU utilization, memory, loss, gradients
    """

    def __init__(self, experiment_name, precision, equation, model_name, save_dir="./metrics_logs"):
        """Store the experiment labels and create save_dir if it does not exist."""
        self.experiment_name = experiment_name
        self.precision = precision
        self.equation = equation
        self.model_name = model_name
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

        self.metrics_per_epoch = []
        self.total_flops = 0
        self.start_time = None
        self.end_time = None
        # GPU statistics need both a CUDA device and the GPUtil package
        self.gpu_available = torch.cuda.is_available() and GPU_AVAILABLE
        self.model_params = None
        self.model_size_mb = None

    def log_model_info(self, model):
        """Record the parameter count and the parameter+buffer size in MiB"""
        self.model_params = sum(p.numel() for p in model.parameters())
        param_size = sum(p.numel() * p.element_size() for p in model.parameters())
        buffer_size = sum(b.numel() * b.element_size() for b in model.buffers())
        self.model_size_mb = (param_size + buffer_size) / (1024 ** 2)

    def start_training(self):
        """Mark the start of training"""
        self.start_time = time.time()

    def end_training(self):
        """Mark the end of training"""
        self.end_time = time.time()

    def estimate_flops_per_iteration(self, model, input_shape):
        """
        Estimate FLOPs for one forward pass
        FLOPs = 2 * in_features * out_features * batch_size for each linear layer

        Only torch.nn.Linear modules are counted; input_shape[0] is used as
        the batch size.
        """
        flops = 0
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                flops += 2 * module.in_features * module.out_features * input_shape[0]
        return flops

    def compute_backward_flops(self, forward_flops):
        """Estimate backward pass FLOPs (approximately 2x forward pass)"""
        return forward_flops * 2

    def compute_gradient_norm(self, model):
        """Compute total gradient norm (L2) across all parameters that have a gradient"""
        total_norm = 0.0
        for p in model.parameters():
            if p.grad is not None:
                param_norm = p.grad.data.norm(2)
                total_norm += param_norm.item() ** 2
        return total_norm ** 0.5
'''

# Stage part 1 on disk; the end of this cell reads it back, appends part 2
# and deletes this temporary file again.
with open('metrics_logger_temp.py', 'w') as f:
    f.write(metrics_logger_code_part1)
# Second half of the generated metrics_logger.py module: the per-epoch logging
# and JSON export methods of PINNMetricsLogger. The text is indented as a class
# body because it is appended directly after part 1.
metrics_logger_code_part2 = '''
    def log_epoch_metrics(self, epoch, loss, grad_norm=None, learning_rate=None,
                         forward_flops=None, backward_flops=None):
        """Log metrics for a single epoch

        Appends one dict to self.metrics_per_epoch and adds the epoch's FLOPs
        to self.total_flops. A progress line is printed every 100 epochs.
        """
        # System metrics
        # NOTE: interval=0.1 makes psutil sample over 0.1 s, so every logged
        # epoch pays that delay.
        cpu_percent = psutil.cpu_percent(interval=0.1)
        memory = psutil.virtual_memory()

        # GPU metrics (first GPU only)
        gpu_metrics = {}
        if self.gpu_available:
            try:
                gpus = GPUtil.getGPUs()
                if gpus:
                    gpu = gpus[0]
                    gpu_metrics = {
                        'gpu_utilization': gpu.load * 100,
                        'gpu_memory_used_mb': gpu.memoryUsed,
                        'gpu_memory_percent': (gpu.memoryUsed / gpu.memoryTotal) * 100,
                        'gpu_temperature': gpu.temperature
                    }
            except Exception:
                # GPU statistics are best-effort: a failing query must not
                # abort training, but KeyboardInterrupt/SystemExit still propagate.
                gpu_metrics = {}

        # FLOPs calculation
        iteration_flops = (forward_flops or 0) + (backward_flops or 0)
        self.total_flops += iteration_flops

        # Compile all metrics
        metrics = {
            'epoch': epoch,
            'loss': float(loss),
            # "is not None" keeps a legitimate gradient norm of exactly 0.0
            'gradient_norm': float(grad_norm) if grad_norm is not None else None,
            'learning_rate': learning_rate,
            'forward_flops': forward_flops,
            'backward_flops': backward_flops,
            'cumulative_flops': self.total_flops,
            'cpu_percent': cpu_percent,
            'memory_used_gb': memory.used / (1024**3),
            'memory_percent': memory.percent,
            **gpu_metrics,
            'elapsed_time': time.time() - self.start_time if self.start_time else 0
        }

        self.metrics_per_epoch.append(metrics)

        # Print progress every 100 epochs
        if epoch % 100 == 0:
            print(f"Epoch {epoch:5d} | Loss: {loss:.6e} | "
                  f"CPU: {cpu_percent:5.1f}% | RAM: {memory.percent:5.1f}%", end="")
            if 'gpu_utilization' in gpu_metrics:
                print(f" | GPU: {gpu_metrics['gpu_utilization']:5.1f}%")
            else:
                print()

    def save_metrics(self):
        """Save all metrics to JSON file

        Returns the path of the file written inside self.save_dir.
        """
        if self.end_time is None:
            self.end_training()

        # start_training() may never have been called; report a zero runtime
        # instead of failing on "float - None".
        if self.start_time is None:
            total_runtime = 0.0
        else:
            total_runtime = self.end_time - self.start_time

        # Compute summary statistics
        summary = {
            'experiment_info': {
                'name': self.experiment_name,
                'precision': self.precision,
                'equation': self.equation,
                'model': self.model_name,
                'model_parameters': self.model_params,
                'model_size_mb': self.model_size_mb,
            },
            'training_summary': {
                'total_epochs': len(self.metrics_per_epoch),
                'total_runtime_seconds': total_runtime,
                'total_runtime_hours': total_runtime / 3600,
                'total_flops': self.total_flops,
                'avg_flops_per_epoch': self.total_flops / len(self.metrics_per_epoch) if self.metrics_per_epoch else 0,
            },
            'final_metrics': self.metrics_per_epoch[-1] if self.metrics_per_epoch else {},
            'per_epoch_metrics': self.metrics_per_epoch
        }

        # Calculate average metrics
        if self.metrics_per_epoch:
            avg_metrics = {}
            numeric_keys = ['loss', 'gradient_norm', 'cpu_percent', 'memory_percent',
                          'gpu_utilization', 'gpu_memory_percent']

            for key in numeric_keys:
                values = [m[key] for m in self.metrics_per_epoch
                         if key in m and m[key] is not None]
                if values:
                    # float() turns numpy scalars into plain Python floats for JSON
                    avg_metrics[f'avg_{key}'] = float(np.mean(values))
                    avg_metrics[f'std_{key}'] = float(np.std(values))
                    avg_metrics[f'min_{key}'] = float(np.min(values))
                    avg_metrics[f'max_{key}'] = float(np.max(values))

            summary['average_metrics'] = avg_metrics

        # Save to file
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f"{self.experiment_name}_{self.precision}_{self.equation}_{timestamp}.json"
        filepath = os.path.join(self.save_dir, filename)

        with open(filepath, 'w') as f:
            json.dump(summary, f, indent=2)

        print(f"\\nMetrics saved to: {filepath}")
        print(f"Total runtime: {total_runtime/3600:.2f} hours")
        print(f"Total FLOPs: {self.total_flops:.2e}")
        print(f"Final loss: {summary['final_metrics'].get('loss', 'N/A')}")

        return filepath
'''

import os

# Load the first half back from the staging file written earlier in this cell
with open('metrics_logger_temp.py', 'r') as f:
    part1 = f.read()

# Stitch both halves together and emit the importable module
complete_source = part1 + metrics_logger_code_part2
with open('metrics_logger.py', 'w') as f:
    f.write(complete_source)

# The staging file has served its purpose
os.remove('metrics_logger_temp.py')

In [8]:
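# CELL 6 (earlier draft): Define Patch Function
# Function to automatically add metrics logging to equation scripts.
# NOTE: a later cell defines patch_script() again under the same name; once both
# cells have run, the later definition is the one that is actually called.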
# ============================================================================

import re
import os  # Added missing import

def patch_script(script_path, equation_name):
    """
    Inject the metrics-logging header into a PINN script.

    The header is placed directly after the first line that matches
    ``import torch...``. A copy of the untouched script is written to
    ``<script_path>.original`` before the script itself is rewritten.

    Args:
        script_path: Path to the script (e.g., 'reaction_fp64.py')
        equation_name: Name of the equation (e.g., 'reaction')

    Returns:
        True if the header was inserted; False if the file is missing, is
        already patched, or contains no matching torch import.
    """
    # Guard: nothing to do for a missing file
    if not os.path.exists(script_path):
        return False

    with open(script_path, 'r') as src:
        original_text = src.read()

    # Guard: the marker comment doubles as an "already patched" flag
    if 'METRICS_AUTO_PATCHED' in original_text:
        return False

    # Keep a pristine copy next to the script (written even if no import is found)
    with open(script_path + '.original', 'w') as backup:
        backup.write(original_text)

    # Locate the first torch import; the header goes right after that line
    torch_import = re.search(r'(import\s+torch.*?\n)', original_text)
    if torch_import is None:
        return False

    # Header text; doubled braces are literal braces in the generated script
    header = f'''
# ==================== METRICS_AUTO_PATCHED ====================
# This code was automatically added to enable metrics logging
# To disable, restore from .original backup file
from metrics_logger import PINNMetricsLogger
import os

# Get precision from environment variable (set before running)
# Options: 'fp64', 'fp32', 'fp16', 'bf16'
_PRECISION = os.environ.get('PINN_PRECISION', 'fp64')
_EQUATION = '{equation_name}'

# Set PyTorch default dtype based on precision
_dtype_map = {{
    'fp64': torch.float64,
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16
}}

if _PRECISION in _dtype_map:
    torch.set_default_dtype(_dtype_map[_PRECISION])

# Global variables for metrics logging
_METRICS_LOGGER = None
_FORWARD_FLOPS = None
_BACKWARD_FLOPS = None
# ==================== END AUTO_PATCH ====================
'''

    split_at = torch_import.end()
    with open(script_path, 'w') as dst:
        dst.write(original_text[:split_at] + header + original_text[split_at:])

    return True

In [11]:
# CELL 6: Define Patch Function
# Function to automatically add metrics logging to equation scripts
# ============================================================================

import re
import os

def patch_script(script_path, equation_name):
    """
    Automatically patch a PINN script to include metrics logging.

    Three pieces of code are added to the script:

    1. a header (imports, precision selection, module-level placeholders)
       right after the first line matching ``import torch...``;
    2. logger initialisation right after the first ``model = Model(...)``
       statement, indented to match that statement;
    3. a finalizer at the end of the file that saves the metrics.

    The training loop itself is not instrumented: nothing injected here calls
    ``log_epoch_metrics``, so only model info and start/end time are recorded.

    Args:
        script_path: Path to the script (e.g., 'reaction_fp64.py')
        equation_name: Name of the equation (e.g., 'reaction')

    Returns:
        True if the script was rewritten; False if the file is missing, is
        already patched, or has no torch import to anchor the header on (in
        which case the file is left untouched).
    """
    # Function-scope imports: only this function needs them
    import textwrap
    import warnings

    # Check if script exists
    if not os.path.exists(script_path):
        return False

    # Read original script
    with open(script_path, 'r') as f:
        code = f.read()

    # Check if already patched
    if 'METRICS_AUTO_PATCHED' in code:
        return False

    # Find the first torch import; the header must come after it because the
    # header itself uses `torch`.
    import_match = re.search(r'(import\s+torch.*?\n)', code)
    if import_match is None:
        # Without the header the finalizer would reference an undefined
        # _METRICS_LOGGER and the file would carry no "already patched"
        # marker, so refuse to patch instead of writing a broken script.
        return False

    # Create backup of original script (only once we know we will modify it)
    with open(script_path + '.original', 'w') as f:
        f.write(code)

    # Code to inject at the beginning; doubled braces are literal braces
    metrics_import_code = f'''
# ==================== METRICS_AUTO_PATCHED ====================
# This code was automatically added to enable metrics logging
# To disable, restore from .original backup file
from metrics_logger import PINNMetricsLogger
import os

# Get precision from environment variable (set before running)
# Options: 'fp64', 'fp32', 'fp16', 'bf16'
_PRECISION = os.environ.get('PINN_PRECISION', 'fp64')
_EQUATION = '{equation_name}'

# Set PyTorch default dtype based on precision
_dtype_map = {{
    'fp64': torch.float64,
    'fp32': torch.float32,
    'fp16': torch.float16,
    'bf16': torch.bfloat16
}}

if _PRECISION in _dtype_map:
    torch.set_default_dtype(_dtype_map[_PRECISION])

# Global variables for metrics logging
_METRICS_LOGGER = None
_FORWARD_FLOPS = None
_BACKWARD_FLOPS = None
# ==================== END AUTO_PATCH ====================
'''

    # Insert the metrics code after imports
    insert_pos = import_match.end()
    code = code[:insert_pos] + metrics_import_code + code[insert_pos:]

    # Find model creation and add logger initialization. The lookbehind keeps
    # `self.model = ...` / `ema_model = ...` from matching, because the injected
    # snippet refers to a plain variable named `model`.
    # NOTE(review): `[^)]*` stops at the first ')', so a constructor call with
    # nested, multi-line parentheses may still be split in the wrong place.
    model_match = re.search(r'((?<![\w.])model\s*=\s*Model\([^)]*\).*?\n)', code)

    if model_match:
        # Snippet written at column 0 and re-indented below. The module-level
        # placeholders are rebound through globals() instead of a `global`
        # statement: `global` is a SyntaxError outside a function once the
        # names have been assigned at module level (as the header does).
        logger_init_template = '''
# Initialize metrics logger
globals()['_METRICS_LOGGER'] = PINNMetricsLogger(
    experiment_name=f"baseline_{_EQUATION}_{_PRECISION}",
    precision=_PRECISION,
    equation=_EQUATION,
    model_name='PINN'
)
_METRICS_LOGGER.log_model_info(model)
_METRICS_LOGGER.start_training()

# Estimate FLOPs per iteration
# Assumes batch size of 1024 and input dimension of 2
# Adjust if your problem uses different values
globals()['_FORWARD_FLOPS'] = _METRICS_LOGGER.estimate_flops_per_iteration(model, (1024, 2))
globals()['_BACKWARD_FLOPS'] = _METRICS_LOGGER.compute_backward_flops(_FORWARD_FLOPS)

'''
        # Re-use the indentation of the line holding `model = Model(...)` so
        # the snippet lands in the same block as the model variable.
        line_start = code.rfind('\n', 0, model_match.start()) + 1
        indent = re.compile(r'[ \t]*').match(code, line_start).group(0)
        logger_init_code = textwrap.indent(logger_init_template, indent)

        # Insert logger initialization after model creation
        insert_pos = model_match.end()
        code = code[:insert_pos] + logger_init_code + code[insert_pos:]
    else:
        # The header and finalizer are still added (as before), but without
        # this hook the logger stays None and no metrics file is produced.
        warnings.warn(
            f"patch_script: no 'model = Model(...)' statement found in "
            f"{script_path}; the metrics logger will not be initialized"
        )

    # Add finalization code at the end of the file
    finalize_code = '''

# Finalize and save metrics at the end of training
if _METRICS_LOGGER is not None:
    _METRICS_LOGGER.end_training()
    _METRICS_LOGGER.save_metrics()
'''
    code += finalize_code

    # Write the patched script
    with open(script_path, 'w') as f:
        f.write(code)

    return True

In [12]:

import os

# Each entry pairs a script file with the equation name handed to patch_script()
equations_to_patch = [
    ('reaction_fp64.py', 'reaction'),
    ('convection_fp64.py', 'convection'),
    ('wave_fp64.py', 'wave'),
    ('ac_fp64.py', 'ac'),
]

# Count the scripts for which patch_script() reported success
patched_count = sum(
    1 for script, equation in equations_to_patch if patch_script(script, equation)
)

# Usage summary, emitted as a single write
banner = "=" * 70
print("\n".join([
    banner,
    "SETUP COMPLETE",
    banner,
    f"Patched {patched_count} equation scripts",
    "\nTo run experiments with metrics:",
    "  1. Set precision: os.environ['PINN_PRECISION'] = 'fp32'",
    "  2. Run script: !python reaction_fp64.py --model PINN --device 'cuda:0'",
    "\nAvailable precisions: fp64, fp32, fp16, bf16",
    "Available equations: reaction, convection, wave, ac",
    "\nMetrics will be saved to: ./metrics_logs/",
    banner,
]))



SETUP COMPLETE
Patched 4 equation scripts

To run experiments with metrics:
  1. Set precision: os.environ['PINN_PRECISION'] = 'fp32'
  2. Run script: !python reaction_fp64.py --model PINN --device 'cuda:0'

Available precisions: fp64, fp32, fp16, bf16
Available equations: reaction, convection, wave, ac

Metrics will be saved to: ./metrics_logs/


In [15]:
import os
# Select the precision for the next run: the header injected by patch_script()
# reads PINN_PRECISION with os.environ.get(..., 'fp64').
os.environ['PINN_PRECISION'] = 'fp64'
# Run the wave script (patched by an earlier cell) with the PINN model on 'cuda:0'.
!python wave_fp64.py --model PINN --device 'cuda:0'


  torch.nn.init.xavier_uniform(m.weight)
Model(
  (linear): Sequential(
    (0): Linear(in_features=2, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): Tanh()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): Tanh()
    (6): Linear(in_features=512, out_features=1, bias=True)
  )
)
527361
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Loss Res: 0.000036, Loss_IC: 0.551949, Loss_BC: 0.008872
Loss Res: 0.000045, Loss_IC: 0.535447, Loss_BC: 0.012086
Loss Res: 0.000250, Loss_IC: 0.400598, Loss_BC: 0.064326
Loss Res: 0.000472, Loss_IC: 0.304585, Loss_BC: 0.111661
Loss Res: 0.000345, Loss_IC: 0.298666, Loss_BC: 0.101227
Loss Res: 0.000192, Loss_IC: 0.299022, Loss_BC: 0.096438
Loss Res: 0.000094, Loss_IC: 0.294738, Loss_BC: 0.098703
Loss Res: 0.000089, Loss_IC: 0.296155, Loss_BC: 0.097261
Loss Res: 0.000090, Loss_IC: 0.295609, Loss_BC: 0.097802
Loss R

In [16]:
import os
# Select the precision for the next run: the header injected by patch_script()
# reads PINN_PRECISION with os.environ.get(..., 'fp64').
os.environ['PINN_PRECISION'] = 'fp64'
# Run the reaction script (patched by an earlier cell) with the PINN model on 'cuda:0'.
!python reaction_fp64.py --model PINN --device 'cuda:0'

  torch.nn.init.xavier_uniform(m.weight)
Model(
  (linear): Sequential(
    (0): Linear(in_features=2, out_features=1024, bias=True)
    (1): Tanh()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
    (3): Tanh()
    (4): Linear(in_features=1024, out_features=1024, bias=True)
    (5): Tanh()
    (6): Linear(in_features=1024, out_features=1024, bias=True)
    (7): Tanh()
    (8): Linear(in_features=1024, out_features=1024, bias=True)
    (9): Tanh()
    (10): Linear(in_features=1024, out_features=1, bias=True)
  )
)
4202497
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
100%|███████████████████████████████████████| 2000/2000 [14:09<00:00,  2.36it/s]
Loss Res: 0.000003, Loss_BC: 0.000000, Loss_IC: 0.000002
Train Loss: 0.000006
relative L1 error: 0.022204
relative L2 error: 0.042169


In [18]:
import json
import os
from glob import glob

def show_metrics_summary(metrics_dir):
    """
    Print a short report for every ``*.json`` metrics file in ``metrics_dir``.

    Fields that a run did not record are shown as ``N/A`` instead of raising:
    ``final_metrics`` is an empty dict when no epoch was ever logged, and
    ``model_parameters`` can be null.

    Args:
        metrics_dir: Directory that holds the JSON files written by
            PINNMetricsLogger.save_metrics().

    Returns:
        None. Everything is written to stdout.
    """
    if not os.path.exists(metrics_dir):
        print("Metrics directory does not exist yet")
        print("Run experiments first to generate metrics")
        return

    files = sorted(glob(os.path.join(metrics_dir, '*.json')))
    if not files:
        print("No metrics files found yet")
        return

    print(f"Found {len(files)} metrics files\n")
    print("="*70)

    for filepath in files:
        with open(filepath, 'r') as f:
            data = json.load(f)

        exp_info = data['experiment_info']
        train_summary = data['training_summary']
        # `or {}` also covers a null/missing entry, not just an empty dict
        final = data.get('final_metrics') or {}

        print(f"Equation: {exp_info['equation']}")
        print(f"Precision: {exp_info['precision'].upper()}")

        params = exp_info.get('model_parameters')
        print(f"Parameters: {params:,}" if params is not None else "Parameters: N/A")

        print(f"Runtime: {train_summary['total_runtime_hours']:.2f} hours")
        print(f"Total FLOPs: {train_summary['total_flops']:.2e}")

        loss = final.get('loss')
        print(f"Final Loss: {loss:.6e}" if loss is not None else "Final Loss: N/A")

        # Compare against None so a gradient norm of exactly 0.0 is still shown
        grad_norm = final.get('gradient_norm')
        if grad_norm is not None:
            print(f"Final Grad Norm: {grad_norm:.6e}")
        print("-"*70)


metrics_dir = './metrics_logs'
show_metrics_summary(metrics_dir)


Metrics directory does not exist yet
Run experiments first to generate metrics
