In [None]:
# Import necessary libraries
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import pickle
import lz4.frame
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn

# Add the matal module to path
sys.path.append(str(Path.cwd() / 'matal'))

from matal.ann_model.config import ModelConfig
from matal.ann_model.builder import build_model
from matal.ann_model.settings import X_COLS, TARGET_Y_COLS, TARGET_Y_SCALES


## Load Model

In [None]:
# Load datasets and remove overlapping SampleIDs
grade1_df = pd.read_csv('data/grade 1 (2).csv')
ann_demo_df = pd.read_csv('data/ann_demo.csv')

print(f"Original: {grade1_df.shape}, ANN Demo: {ann_demo_df.shape}")

# Remove overlapping SampleIDs
overlapping = set(grade1_df['SampleID']).intersection(set(ann_demo_df['SampleID']))
grade1_df = grade1_df[~grade1_df['SampleID'].isin(ann_demo_df['SampleID'])]

print(f"Removed {len(overlapping)} rows. Final: {grade1_df.shape}")

Original: (3170, 31), ANN Demo: (342, 42)
Removed 342 rows. Final: (2828, 31)


In [None]:
# Split grade1_df into train and test based on SampleID prefix
test_mask = grade1_df['SampleID'].str.startswith('aTS-T')
test_df = grade1_df[test_mask].copy()
train_df = grade1_df[~test_mask].copy()

print(f"Train set: {train_df.shape[0]} samples")
print(f"Test set: {test_df.shape[0]} samples")
print(f"Total: {train_df.shape[0] + test_df.shape[0]} samples")


Train set: 2648 samples
Test set: 180 samples
Total: 2828 samples


In [None]:
# Set up paths
MODEL_STRUCTURE_DIR = Path('model/v3hp/')
RESULTS_DIR = Path('./evaluation_results')

# Create results directory if it doesn't exist
RESULTS_DIR.mkdir(exist_ok=True)

print(f"Model structure directory: {MODEL_STRUCTURE_DIR}")
print(f"Results directory: {RESULTS_DIR}")


Model structure directory: model/v3hp
Results directory: evaluation_results


In [None]:
# List available models in model_structure directory (search in subdirectories)
model_files = list(MODEL_STRUCTURE_DIR.glob('**/*.model.pt.lz4'))
print("Available model files:")
for model_file in model_files:
    print(f"  - {model_file}")

# Extract model tags
model_tags = []
for model_file in model_files:
    # Extract model tag from filename (e.g., pt_db.v3hp.2HNP.model.pt.lz4 -> pt_db.v3hp.2HNP)
    tag = model_file.name.replace('.model.pt.lz4', '')
    model_tags.append(tag)

print(f"\nModel tags: {model_tags}")


Available model files:
  - model/v3hp/pt_db.v3hp.RE5C/pt_db.v3hp.RE5C.model.pt.lz4
  - model/v3hp/pt_db.v3hp.2HNP/pt_db.v3hp.2HNP.model.pt.lz4
  - model/v3hp/pt_db.v3hp.CPUT/pt_db.v3hp.CPUT.model.pt.lz4
  - model/v3hp/pt_db.v3hp.VPLV/pt_db.v3hp.VPLV.model.pt.lz4
  - model/v3hp/pt_db.v3hp.PFFZ/pt_db.v3hp.PFFZ.model.pt.lz4

Model tags: ['pt_db.v3hp.RE5C', 'pt_db.v3hp.2HNP', 'pt_db.v3hp.CPUT', 'pt_db.v3hp.VPLV', 'pt_db.v3hp.PFFZ']


In [None]:

import re
from collections import defaultdict

def parse_model_txt_to_params(model_txt_path):
    """
    Parse a .model.txt file to extract model parameters.
    
    Args:
        model_txt_path: Path to the .model.txt file
    
    Returns:
        Dictionary containing model parameters in the expected format
    """
    with open(model_txt_path, 'r') as f:
        content = f.read()
    
    # Initialize parameters dictionary
    params = {
        'model_params': {},
        'data_config': {
            'X_COLS': X_COLS,
            'TARGET_Y_COLS': TARGET_Y_COLS,
            'TARGET_Y_SCALES': TARGET_Y_SCALES
        }
    }
    print("Here")
    # Parse encoder architecture
    encoder_match = re.search(r'\(encoder\): ANNModel\((.*?)\)', content, re.DOTALL)
    if encoder_match:
        encoder_content = encoder_match.group(1)
        encoder_layers = parse_ann_model_layers(encoder_content)
        
        # Extract encoder parameters
        params['model_params']['encoder__hidden_layers'] = len(encoder_layers) - 1  # -1 for input layer
        params['model_params']['encoder__n_output'] = encoder_layers[-1]['out_features']
        params['model_params']['encoder__hidden_base'] = encoder_layers[1]['out_features']  # First hidden layer size
        params['model_params']['encoder__hidden_scale'] = 'linear'  # Default assumption
        params['model_params']['encoder__act_hidden'] = 'elu'  # From the file
        params['model_params']['encoder__act_output'] = 'identity'  # From the file
        params['model_params']['encoder__weight_init'] = 'kaiming_uniform_'
        params['model_params']['encoder__input_dropout'] = 0.0
        params['model_params']['encoder__output_dropout'] = 0.0
        params['model_params']['encoder__hidden_dropout'] = 0.0
    
    # Parse task-specific models (grade, optical, tensile, fire)
    for task in ['grade', 'optical', 'tensile', 'fire']:
        task_match = re.search(rf'\({task}\): ANNModel\((.*?)\)', content, re.DOTALL)
        if task_match:
            task_content = task_match.group(1)
            task_layers = parse_ann_model_layers(task_content)
            
            # Extract task parameters
            params['model_params'][f'{task}__hidden_layers'] = len(task_layers) - 1
            params['model_params'][f'{task}__n_output'] = task_layers[-1]['out_features']
            params['model_params'][f'{task}__hidden_base'] = task_layers[1]['out_features'] if len(task_layers) > 1 else task_layers[0]['out_features']
            params['model_params'][f'{task}__hidden_scale'] = 'linear'
            params['model_params'][f'{task}__act_hidden'] = 'elu'
            
            # Determine output activation based on the last layer
            last_activation = task_layers[-1]['activation']
            if last_activation == 'Sigmoid':
                params['model_params'][f'{task}__act_output'] = 'sigmoid'
            elif last_activation == 'ReLU':
                params['model_params'][f'{task}__act_output'] = 'relu'
            else:
                params['model_params'][f'{task}__act_output'] = 'identity'
            
            params['model_params'][f'{task}__weight_init'] = 'kaiming_uniform_'
            params['model_params'][f'{task}__input_dropout'] = 0.0
            params['model_params'][f'{task}__output_dropout'] = 0.0
            params['model_params'][f'{task}__hidden_dropout'] = 0.0
    
    # Add general parameters
    params['model_params']['random_seed'] = 0
    
    return params

def parse_ann_model_layers(content):
    """
    Parse ANNModel layers from the model text content.
    
    Args:
        content: The layers section content
    
    Returns:
        List of dictionaries containing layer information
    """
    layers = []
    
    # Find all Linear layers
    linear_matches = re.finditer(r'\(fc(\d+)\): Linear\(in_features=(\d+), out_features=(\d+), bias=True\)', content)
    for match in linear_matches:
        layer_num = int(match.group(1))
        in_features = int(match.group(2))
        out_features = int(match.group(3))
        
        # Find corresponding activation
        activation_match = re.search(rf'\(fc{layer_num}_act\): (\w+)', content)
        activation = activation_match.group(1) if activation_match else 'Identity'
        
        layers.append({
            'layer_num': layer_num,
            'in_features': in_features,
            'out_features': out_features,
            'activation': activation
        })
    
    # Sort by layer number
    layers.sort(key=lambda x: x['layer_num'])
    
    return layers

def load_model_from_structure_with_txt(model_tag, model_dir=MODEL_STRUCTURE_DIR):
    """
    Load a model from the model_structure directory using .model.txt files.
    
    Args:
        model_tag: Model tag (e.g., 'pt_db.v3hp.2HNP')
        model_dir: Directory containing model files
    
    Returns:
        Loaded model and parameters
    """
    try:
        # Model files are in subdirectories named after the model tag
        # e.g., model/v3hp/pt_db.v3hp.2HNP/pt_db.v3hp.2HNP.param.pk
        model_subdir = model_dir / model_tag
        
        # Try to load parameters from .param.pk first
        param_file = model_subdir / f'{model_tag}.param.pk'
        if param_file.exists():
            with open(param_file, 'rb') as f:
                params = pickle.load(f)
        else:
            # Extract the base model name from the tag (e.g., 'pt_db.v3hp.2HNP' -> 'pt_db.2HNP')
            # Split by dots and reconstruct without the version part
            tag_parts = model_tag.split('.')
            if len(tag_parts) >= 3 and tag_parts[1].startswith('v'):
                # Remove version part (e.g., 'v3hp')
                base_tag = f"{tag_parts[0]}.{'.'.join(tag_parts[2:])}"
            else:
                base_tag = model_tag
            
            # Parse from .model.txt file
            model_txt_file = model_subdir / f'{base_tag}.model.txt'
            if not model_txt_file.exists():
                print(f"Neither parameter file nor model.txt file found for {model_tag}")
                print(f"Tried: {param_file} and {model_txt_file}")
                return None, None
            
            print(f"Parsing model parameters from {model_txt_file}")
            params = parse_model_txt_to_params(model_txt_file)
        
        print(params)
        # Build model
        model_config = ModelConfig(**params['model_params'])
        model = build_model(model_name=model_tag, **model_config.get_all())
        
        # Load model weights
        model_file = model_subdir / f'{model_tag}.model.pt.lz4'
        if not model_file.exists():
            print(f"Model file not found: {model_file}")
            return None, None
        
        with lz4.frame.open(model_file, 'rb') as f:
            model.load_state_dict(torch.load(f))
        
        model.eval()
        print(f"Successfully loaded model: {model_tag}")
        return model, params
        
    except Exception as e:
        print(f"Error loading model {model_tag}: {e}")
        return None, None
# Test loading one model
if model_tags:
    print("Available model tags:", model_tags)
    print("\nTesting model loading with .model.txt files...")
    
    # Test the naming conversion
    for tag in model_tags:
        tag_parts = tag.split('.')
        if len(tag_parts) >= 3 and tag_parts[1].startswith('v'):
            base_tag = f"{tag_parts[0]}.{'.'.join(tag_parts[2:])}"
            print(f"Model tag: {tag} -> Base tag: {base_tag}")
    
    # Load the first model
    test_model, test_params = load_model_from_structure_with_txt(model_tags[0])
    if test_model is not None:
        print(f"\nModel architecture:")
        print(test_model)
        print(f"\nModel parameters:")
        print(test_params)


## Evaluate -2k - Feasibility

In [52]:
# Fixed version of prepare_grade_datasets_for_evaluation function
def prepare_grade_datasets_for_evaluation_fixed(train_df, test_df):
    """
    Prepare train and test datasets for grade task evaluation.
    
    Args:
        train_df: Training dataframe
        test_df: Test dataframe
    
    Returns:
        Dictionary containing prepared grade datasets
    """
    # Extract input features (X_COLS) and ensure float32
    X_train = train_df[X_COLS].values.astype(np.float32)
    X_test = test_df[X_COLS].values.astype(np.float32)
    
    # Extract grade target values and ensure float32
    grade_cols = TARGET_Y_COLS['grade']
    y_train = train_df[grade_cols].values.astype(np.float32)
    y_test = test_df[grade_cols].values.astype(np.float32)
    
    dataset = {
        'train_X': torch.tensor(X_train, dtype=torch.float32),
        'train_y': torch.tensor(y_train, dtype=torch.float32),
        'test_X': torch.tensor(X_test, dtype=torch.float32),
        'test_y': torch.tensor(y_test, dtype=torch.float32)
    }
    
    print(f"Prepared grade dataset: train {y_train.shape}, test {y_test.shape}")
    print(f"Grade targets: {grade_cols}")
    
    return dataset

# Prepare grade datasets with fixed function
print("Preparing grade datasets for evaluation...")
grade_dataset = prepare_grade_datasets_for_evaluation_fixed(train_df, test_df)


Preparing grade datasets for evaluation...
Prepared grade dataset: train (2648, 3), test (180, 3)
Grade targets: ['Detachability', 'FlatnessUni', 'Feasibility']


In [53]:
# Fixed version of calculate_grade_model_losses function
def calculate_grade_model_losses_fixed(model, dataset, model_tag, device='cpu'):
    """
    Calculate grade task losses for a model on train and test datasets.
    
    Args:
        model: The trained model
        dataset: Dictionary containing train/test datasets for grade task
        model_tag: Model identifier
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing loss results
    """
    model.to(device)
    model.eval()
    
    # Ensure model parameters are float32
    for param in model.parameters():
        param.data = param.data.float()
    
    # Use BCEWithLogitsLoss for grade task (binary classification)
    criterion = nn.BCEWithLogitsLoss()
    
    results = {'model_tag': model_tag}
    
    with torch.no_grad():
        # Calculate train loss
        train_X = dataset['train_X'].to(device).float()
        train_y = dataset['train_y'].to(device).float()
        train_pred = model.forward(train_X, 'grade')
        train_loss = criterion(train_pred, train_y)
        results['grade_train_loss'] = train_loss.item()
        
        # Calculate test loss
        test_X = dataset['test_X'].to(device).float()
        test_y = dataset['test_y'].to(device).float()
        test_pred = model.forward(test_X, 'grade')
        test_loss = criterion(test_pred, test_y)
        results['grade_test_loss'] = test_loss.item()
        
        print(f"{model_tag} - Grade: Train Loss = {train_loss.item():.6f}, Test Loss = {test_loss.item():.6f}")
    
    return results


In [54]:
# Calculate grade losses for all loaded models using fixed functions
loss_results = []

print("Calculating grade losses for all models...")
print("=" * 60)

for model_tag in tqdm(model_tags):
    # Load model using the existing function
    model, params = load_model_from_structure_with_txt(model_tag)
    
    if model is not None:
        # Calculate grade losses using fixed function
        loss_result = calculate_grade_model_losses_fixed(model, grade_dataset, model_tag)
        loss_results.append(loss_result)
        print(f"Completed evaluation for {model_tag}")
    else:
        print(f"Failed to load model: {model_tag}")

print(f"\nSuccessfully evaluated {len(loss_results)} models")


Calculating grade losses for all models...


 20%|██        | 1/5 [00:00<00:00,  7.04it/s]

{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
pt_db.v3hp.RE5C - Grade: Train Loss = 0.556246, Test Loss = 0.659720
Completed evaluation for pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP


 60%|██████    | 3/5 [00:00<00:00,  9.58it/s]

pt_db.v3hp.2HNP - Grade: Train Loss = 0.571103, Test Loss = 0.658976
Completed evaluation for pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT
pt_db.v3hp.CPUT - Grade: Train Loss = 0.573922, Test Loss = 0.651741
Completed evaluation for pt_db.v3hp.CPUT
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV


100%|██████████| 5/5 [00:00<00:00,  9.31it/s]

pt_db.v3hp.VPLV - Grade: Train Loss = 0.558082, Test Loss = 0.655647
Completed evaluation for pt_db.v3hp.VPLV
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
pt_db.v3hp.PFFZ - Grade: Train Loss = 0.570911, Test Loss = 0.659414
Completed evaluation for pt_db.v3hp.PFFZ

Successfully evaluated 5 models





In [55]:
# Organize and display grade loss results
loss_df = pd.DataFrame(loss_results)

print("Grade Task Loss Results Summary:")
print("=" * 80)

# Display grade task results
train_col = 'grade_train_loss'
test_col = 'grade_test_loss'

if train_col in loss_df.columns and test_col in loss_df.columns:
    print(f"\nGRADE Task Losses:")
    print("-" * 40)
    
    grade_results = loss_df[['model_tag', train_col, test_col]].copy()
    grade_results['overfitting'] = grade_results[test_col] - grade_results[train_col]
    
    print(grade_results.round(6))
    
    # Summary statistics
    print(f"\nSummary for Grade Task:")
    print(f"  Average Train Loss: {grade_results[train_col].mean():.6f}")
    print(f"  Average Test Loss: {grade_results[test_col].mean():.6f}")
    print(f"  Average Overfitting: {grade_results['overfitting'].mean():.6f}")
    print(f"  Best Test Loss: {grade_results[test_col].min():.6f} ({grade_results.loc[grade_results[test_col].idxmin(), 'model_tag']})")
    print(f"  Worst Test Loss: {grade_results[test_col].max():.6f} ({grade_results.loc[grade_results[test_col].idxmax(), 'model_tag']})")

# Save results
loss_df.to_csv(RESULTS_DIR / 'grade_model_loss_results.csv', index=False)
print(f"\nGrade loss results saved to {RESULTS_DIR / 'grade_model_loss_results.csv'}")


Grade Task Loss Results Summary:

GRADE Task Losses:
----------------------------------------
         model_tag  grade_train_loss  grade_test_loss  overfitting
0  pt_db.v3hp.RE5C          0.556246         0.659720     0.103474
1  pt_db.v3hp.2HNP          0.571103         0.658976     0.087873
2  pt_db.v3hp.CPUT          0.573922         0.651741     0.077819
3  pt_db.v3hp.VPLV          0.558082         0.655647     0.097565
4  pt_db.v3hp.PFFZ          0.570911         0.659414     0.088503

Summary for Grade Task:
  Average Train Loss: 0.566053
  Average Test Loss: 0.657100
  Average Overfitting: 0.091047
  Best Test Loss: 0.651741 (pt_db.v3hp.CPUT)
  Worst Test Loss: 0.659720 (pt_db.v3hp.RE5C)

Grade loss results saved to evaluation_results/grade_model_loss_results.csv


In [57]:
# Calculate Classifier Accuracy for Ensemble
def calculate_classifier_accuracy(y_true, y_pred, threshold=0.5):
    """
    Calculate classification accuracy for binary classification
    
    Args:
        y_true: True binary labels
        y_pred: Predicted probabilities
        threshold: Classification threshold (default 0.5)
    
    Returns:
        Dictionary containing accuracy metrics
    """
    # Convert predictions to binary using threshold
    y_pred_binary = (y_pred >= threshold).astype(int)
    y_true_binary = y_true.astype(int)
    
    # Calculate accuracy
    accuracy = np.mean(y_true_binary == y_pred_binary)
    
    # Calculate per-class accuracy
    class_accuracies = []
    for i in range(y_true.shape[1]):
        class_acc = np.mean(y_true_binary[:, i] == y_pred_binary[:, i])
        class_accuracies.append(class_acc)
    
    return {
        'overall_accuracy': accuracy,
        'class_accuracies': class_accuracies,
        'y_pred_binary': y_pred_binary,
        'y_true_binary': y_true_binary
    }

def calculate_ensemble_classifier_accuracy(model_tags, grade_dataset, threshold=0.5, device='cpu'):
    """
    Calculate classifier accuracy for individual models and ensemble
    
    Args:
        model_tags: List of model tags
        grade_dataset: Dictionary containing train/test datasets
        threshold: Classification threshold
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing accuracy results
    """
    print("Calculating classifier accuracy for individual models and ensemble...")
    print("=" * 70)
    
    # Store individual model predictions
    individual_predictions = []
    model_results = []
    
    # Get true values
    y_test_true = grade_dataset['test_y'].numpy()
    y_train_true = grade_dataset['train_y'].numpy()
    grade_cols = TARGET_Y_COLS['grade']
    
    for model_tag in tqdm(model_tags, desc="Loading models"):
        # Load model
        model, params = load_model_from_structure_with_txt(model_tag)
        
        if model is not None:
            model.to(device)
            model.eval()
            
            # Ensure model parameters are float32
            for param in model.parameters():
                param.data = param.data.float()
            
            with torch.no_grad():
                # Test predictions
                test_X = grade_dataset['test_X'].to(device).float()
                test_pred = model.forward(test_X, 'grade').cpu().numpy()
                
                # Train predictions
                train_X = grade_dataset['train_X'].to(device).float()
                train_pred = model.forward(train_X, 'grade').cpu().numpy()
                
                # Store predictions for ensemble
                individual_predictions.append(test_pred)
                
                # Calculate accuracy for this model
                model_result = {'model_tag': model_tag}
                
                # Test accuracy
                test_acc = calculate_classifier_accuracy(y_test_true, test_pred, threshold)
                model_result['test_overall_accuracy'] = test_acc['overall_accuracy']
                for i, col in enumerate(grade_cols):
                    model_result[f'test_accuracy_{col}'] = test_acc['class_accuracies'][i]
                
                # Train accuracy
                train_acc = calculate_classifier_accuracy(y_train_true, train_pred, threshold)
                model_result['train_overall_accuracy'] = train_acc['overall_accuracy']
                for i, col in enumerate(grade_cols):
                    model_result[f'train_accuracy_{col}'] = train_acc['class_accuracies'][i]
                
                model_results.append(model_result)
                
                print(f"{model_tag}: Test Acc = {test_acc['overall_accuracy']:.3f}, Train Acc = {train_acc['overall_accuracy']:.3f}")
    
    # Calculate ensemble predictions (average of all models)
    if individual_predictions:
        ensemble_test_pred = np.mean(individual_predictions, axis=0)
        
        # Calculate ensemble accuracy
        ensemble_result = {'model_tag': 'ensemble'}
        ensemble_acc = calculate_classifier_accuracy(y_test_true, ensemble_test_pred, threshold)
        
        ensemble_result['test_overall_accuracy'] = ensemble_acc['overall_accuracy']
        for i, col in enumerate(grade_cols):
            ensemble_result[f'test_accuracy_{col}'] = ensemble_acc['class_accuracies'][i]
        
        model_results.append(ensemble_result)
        
        print(f"\nENSEMBLE: Test Accuracy = {ensemble_acc['overall_accuracy']:.3f}")
        print("=" * 70)
    
    return model_results

# Calculate classifier accuracy for all models and ensemble
accuracy_results = calculate_ensemble_classifier_accuracy(model_tags, grade_dataset)


Calculating classifier accuracy for individual models and ensemble...


Loading models:  20%|██        | 1/5 [00:00<00:00,  9.87it/s]

{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
pt_db.v3hp.RE5C: Test Acc = 0.769, Train Acc = 0.979
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
pt_db.v3hp.2HNP: Test Acc = 0.770, Train Acc = 0.946
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers':

Loading models:  60%|██████    | 3/5 [00:00<00:00, 10.13it/s]

Successfully loaded model: pt_db.v3hp.CPUT
pt_db.v3hp.CPUT: Test Acc = 0.793, Train Acc = 0.945
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV
pt_db.v3hp.VPLV: Test Acc = 0.787, Train Acc = 0.978
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}


Loading models: 100%|██████████| 5/5 [00:00<00:00,  9.90it/s]

Successfully loaded model: pt_db.v3hp.PFFZ
pt_db.v3hp.PFFZ: Test Acc = 0.783, Train Acc = 0.949

ENSEMBLE: Test Accuracy = 0.815





In [58]:
# Display and analyze classifier accuracy results
accuracy_df = pd.DataFrame(accuracy_results)

print("CLASSIFIER ACCURACY ANALYSIS RESULTS")
print("=" * 80)

# Display individual model accuracy results
individual_models = accuracy_df[accuracy_df['model_tag'] != 'ensemble']
ensemble_result = accuracy_df[accuracy_df['model_tag'] == 'ensemble']

if not individual_models.empty:
    print("\nINDIVIDUAL MODEL ACCURACY RESULTS:")
    print("-" * 60)
    
    # Create summary table
    summary_cols = ['model_tag', 'test_overall_accuracy', 'train_overall_accuracy']
    summary_df = individual_models[summary_cols].copy()
    summary_df['overfitting'] = summary_df['train_overall_accuracy'] - summary_df['test_overall_accuracy']
    
    # Sort by test accuracy (descending)
    summary_df = summary_df.sort_values('test_overall_accuracy', ascending=False)
    
    print(summary_df.round(3))
    
    # Best and worst individual models
    best_model = summary_df.iloc[0]
    worst_model = summary_df.iloc[-1]
    
    print(f"\nBest Individual Model: {best_model['model_tag']}")
    print(f"  Test Accuracy: {best_model['test_overall_accuracy']:.3f}")
    print(f"  Train Accuracy: {best_model['train_overall_accuracy']:.3f}")
    print(f"  Overfitting: {best_model['overfitting']:.3f}")
    
    print(f"\nWorst Individual Model: {worst_model['model_tag']}")
    print(f"  Test Accuracy: {worst_model['test_overall_accuracy']:.3f}")
    print(f"  Train Accuracy: {worst_model['train_overall_accuracy']:.3f}")
    print(f"  Overfitting: {worst_model['overfitting']:.3f}")

# Display ensemble results
if not ensemble_result.empty:
    ensemble_acc = ensemble_result.iloc[0]['test_overall_accuracy']
    print(f"\nENSEMBLE RESULTS:")
    print("-" * 40)
    print(f"Ensemble Test Accuracy: {ensemble_acc:.3f}")
    
    # Compare ensemble with individual models
    if not individual_models.empty:
        avg_individual_acc = individual_models['test_overall_accuracy'].mean()
        best_individual_acc = individual_models['test_overall_accuracy'].max()
        
        print(f"\nCOMPARISON:")
        print(f"  Average Individual Accuracy: {avg_individual_acc:.3f}")
        print(f"  Best Individual Accuracy: {best_individual_acc:.3f}")
        print(f"  Ensemble Accuracy: {ensemble_acc:.3f}")
        print(f"  Ensemble vs Average: {ensemble_acc - avg_individual_acc:+.3f}")
        print(f"  Ensemble vs Best: {ensemble_acc - best_individual_acc:+.3f}")
        
        if ensemble_acc > best_individual_acc:
            print("  ✅ Ensemble outperforms best individual model!")
        else:
            print("  ❌ Ensemble does not outperform best individual model")

# Per-class accuracy analysis
print(f"\nPER-CLASS ACCURACY ANALYSIS:")
print("-" * 40)
grade_cols = TARGET_Y_COLS['grade']

if not individual_models.empty:
    print("Individual Models Average:")
    for i, col in enumerate(grade_cols):
        col_acc = individual_models[f'test_accuracy_{col}'].mean()
        print(f"  {col}: {col_acc:.3f}")

if not ensemble_result.empty:
    print("\nEnsemble:")
    for i, col in enumerate(grade_cols):
        col_acc = ensemble_result.iloc[0][f'test_accuracy_{col}']
        print(f"  {col}: {col_acc:.3f}")

# Save accuracy results
accuracy_df.to_csv(RESULTS_DIR / 'classifier_accuracy_results.csv', index=False)
print(f"\nAccuracy results saved to {RESULTS_DIR / 'classifier_accuracy_results.csv'}")


CLASSIFIER ACCURACY ANALYSIS RESULTS

INDIVIDUAL MODEL ACCURACY RESULTS:
------------------------------------------------------------
         model_tag  test_overall_accuracy  train_overall_accuracy  overfitting
2  pt_db.v3hp.CPUT                  0.793                   0.945        0.152
3  pt_db.v3hp.VPLV                  0.787                   0.978        0.191
4  pt_db.v3hp.PFFZ                  0.783                   0.949        0.166
1  pt_db.v3hp.2HNP                  0.770                   0.946        0.176
0  pt_db.v3hp.RE5C                  0.769                   0.979        0.211

Best Individual Model: pt_db.v3hp.CPUT
  Test Accuracy: 0.793
  Train Accuracy: 0.945
  Overfitting: 0.152

Worst Individual Model: pt_db.v3hp.RE5C
  Test Accuracy: 0.769
  Train Accuracy: 0.979
  Overfitting: 0.211

ENSEMBLE RESULTS:
----------------------------------------
Ensemble Test Accuracy: 0.815

COMPARISON:
  Average Individual Accuracy: 0.780
  Best Individual Accuracy: 0.793
 

In [59]:
# 5-Fold Cross-Validation Ensemble Evaluation
from sklearn.model_selection import KFold
import torch.nn as nn

def create_kfold_splits(data_df, n_splits=5, random_state=42):
    """
    Create K-fold splits for cross-validation
    
    Args:
        data_df: DataFrame containing the data
        n_splits: Number of folds
        random_state: Random seed for reproducibility
    
    Returns:
        List of (train_idx, val_idx) tuples
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    splits = []
    
    for train_idx, val_idx in kf.split(data_df):
        splits.append((train_idx, val_idx))
    
    return splits

def evaluate_ensemble_on_fold(model_tags, train_df, val_df, test_df, device='cpu'):
    """
    Evaluate ensemble on a single fold
    
    Args:
        model_tags: List of model tags
        train_df: Training data for this fold
        val_df: Validation data for this fold
        test_df: Test data (same for all folds)
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing fold results
    """
    # Prepare datasets for this fold
    def prepare_fold_datasets(train_df, val_df, test_df):
        # Extract input features and targets
        X_train = train_df[X_COLS].values.astype(np.float32)
        X_val = val_df[X_COLS].values.astype(np.float32)
        X_test = test_df[X_COLS].values.astype(np.float32)
        
        grade_cols = TARGET_Y_COLS['grade']
        y_train = train_df[grade_cols].values.astype(np.float32)
        y_val = val_df[grade_cols].values.astype(np.float32)
        y_test = test_df[grade_cols].values.astype(np.float32)
        
        return {
            'train_X': torch.tensor(X_train, dtype=torch.float32),
            'train_y': torch.tensor(y_train, dtype=torch.float32),
            'val_X': torch.tensor(X_val, dtype=torch.float32),
            'val_y': torch.tensor(y_val, dtype=torch.float32),
            'test_X': torch.tensor(X_test, dtype=torch.float32),
            'test_y': torch.tensor(y_test, dtype=torch.float32)
        }
    
    fold_dataset = prepare_fold_datasets(train_df, val_df, test_df)
    
    # Store predictions from all models
    train_predictions = []
    val_predictions = []
    test_predictions = []
    
    # Load and evaluate each model
    for model_tag in model_tags:
        model, params = load_model_from_structure_with_txt(model_tag)
        
        if model is not None:
            model.to(device)
            model.eval()
            
            # Ensure model parameters are float32
            for param in model.parameters():
                param.data = param.data.float()
            
            with torch.no_grad():
                # Get predictions for all sets
                train_pred = model.forward(fold_dataset['train_X'].to(device).float(), 'grade').cpu().numpy()
                val_pred = model.forward(fold_dataset['val_X'].to(device).float(), 'grade').cpu().numpy()
                test_pred = model.forward(fold_dataset['test_X'].to(device).float(), 'grade').cpu().numpy()
                
                train_predictions.append(train_pred)
                val_predictions.append(val_pred)
                test_predictions.append(test_pred)
    
    # Calculate ensemble predictions (average of all models)
    if train_predictions:
        ensemble_train_pred = np.mean(train_predictions, axis=0)
        ensemble_val_pred = np.mean(val_predictions, axis=0)
        ensemble_test_pred = np.mean(test_predictions, axis=0)
        
        # Calculate losses
        criterion = nn.BCEWithLogitsLoss()
        
        train_loss = criterion(torch.tensor(ensemble_train_pred), fold_dataset['train_y']).item()
        val_loss = criterion(torch.tensor(ensemble_val_pred), fold_dataset['val_y']).item()
        test_loss = criterion(torch.tensor(ensemble_test_pred), fold_dataset['test_y']).item()
        
        # Calculate accuracies
        train_acc = calculate_classifier_accuracy(fold_dataset['train_y'].numpy(), ensemble_train_pred)
        val_acc = calculate_classifier_accuracy(fold_dataset['val_y'].numpy(), ensemble_val_pred)
        test_acc = calculate_classifier_accuracy(fold_dataset['test_y'].numpy(), ensemble_test_pred)
        
        return {
            'train_loss': train_loss,
            'val_loss': val_loss,
            'test_loss': test_loss,
            'train_accuracy': train_acc['overall_accuracy'],
            'val_accuracy': val_acc['overall_accuracy'],
            'test_accuracy': test_acc['overall_accuracy'],
            'train_class_accuracies': train_acc['class_accuracies'],
            'val_class_accuracies': val_acc['class_accuracies'],
            'test_class_accuracies': test_acc['class_accuracies']
        }
    
    return None

def run_5fold_ensemble_evaluation(model_tags, data_df, test_df, n_splits=5, random_state=42, device='cpu'):
    """
    Run 5-fold cross-validation ensemble evaluation
    
    Args:
        model_tags: List of model tags
        data_df: Training data for cross-validation
        test_df: Test data (same for all folds)
        n_splits: Number of folds
        random_state: Random seed
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing comprehensive results
    """
    print("Running 5-Fold Cross-Validation Ensemble Evaluation")
    print("=" * 70)
    
    # Create K-fold splits
    splits = create_kfold_splits(data_df, n_splits, random_state)
    fold_results = []
    
    for fold_idx, (train_idx, val_idx) in enumerate(tqdm(splits, desc="Processing folds")):
        print(f"\nFold {fold_idx + 1}/{n_splits}")
        print("-" * 30)
        
        # Split data
        train_df = data_df.iloc[train_idx].copy()
        val_df = data_df.iloc[val_idx].copy()
        
        print(f"Train: {len(train_df)} samples, Val: {len(val_df)} samples, Test: {len(test_df)} samples")
        
        # Evaluate ensemble on this fold
        fold_result = evaluate_ensemble_on_fold(model_tags, train_df, val_df, test_df, device)
        
        if fold_result is not None:
            fold_result['fold'] = fold_idx + 1
            fold_results.append(fold_result)
            
            print(f"  Train Loss: {fold_result['train_loss']:.6f}, Accuracy: {fold_result['train_accuracy']:.3f}")
            print(f"  Val Loss: {fold_result['val_loss']:.6f}, Accuracy: {fold_result['val_accuracy']:.3f}")
            print(f"  Test Loss: {fold_result['test_loss']:.6f}, Accuracy: {fold_result['test_accuracy']:.3f}")
        else:
            print(f"  ❌ Fold {fold_idx + 1} failed")
    
    if not fold_results:
        print("❌ All folds failed!")
        return None
    
    # Calculate average results across folds
    avg_results = {
        'train_loss_mean': np.mean([r['train_loss'] for r in fold_results]),
        'train_loss_std': np.std([r['train_loss'] for r in fold_results]),
        'val_loss_mean': np.mean([r['val_loss'] for r in fold_results]),
        'val_loss_std': np.std([r['val_loss'] for r in fold_results]),
        'test_loss_mean': np.mean([r['test_loss'] for r in fold_results]),
        'test_loss_std': np.std([r['test_loss'] for r in fold_results]),
        'train_accuracy_mean': np.mean([r['train_accuracy'] for r in fold_results]),
        'train_accuracy_std': np.std([r['train_accuracy'] for r in fold_results]),
        'val_accuracy_mean': np.mean([r['val_accuracy'] for r in fold_results]),
        'val_accuracy_std': np.std([r['val_accuracy'] for r in fold_results]),
        'test_accuracy_mean': np.mean([r['test_accuracy'] for r in fold_results]),
        'test_accuracy_std': np.std([r['test_accuracy'] for r in fold_results])
    }
    
    # Calculate per-class accuracy averages
    grade_cols = TARGET_Y_COLS['grade']
    for i, col in enumerate(grade_cols):
        avg_results[f'train_accuracy_{col}_mean'] = np.mean([r['train_class_accuracies'][i] for r in fold_results])
        avg_results[f'train_accuracy_{col}_std'] = np.std([r['train_class_accuracies'][i] for r in fold_results])
        avg_results[f'val_accuracy_{col}_mean'] = np.mean([r['val_class_accuracies'][i] for r in fold_results])
        avg_results[f'val_accuracy_{col}_std'] = np.std([r['val_class_accuracies'][i] for r in fold_results])
        avg_results[f'test_accuracy_{col}_mean'] = np.mean([r['test_class_accuracies'][i] for r in fold_results])
        avg_results[f'test_accuracy_{col}_std'] = np.std([r['test_class_accuracies'][i] for r in fold_results])
    
    print(f"\n5-FOLD CROSS-VALIDATION RESULTS")
    print("=" * 50)
    print(f"Train Loss: {avg_results['train_loss_mean']:.6f} ± {avg_results['train_loss_std']:.6f}")
    print(f"Val Loss: {avg_results['val_loss_mean']:.6f} ± {avg_results['val_loss_std']:.6f}")
    print(f"Test Loss: {avg_results['test_loss_mean']:.6f} ± {avg_results['test_loss_std']:.6f}")
    print(f"Train Accuracy: {avg_results['train_accuracy_mean']:.3f} ± {avg_results['train_accuracy_std']:.3f}")
    print(f"Val Accuracy: {avg_results['val_accuracy_mean']:.3f} ± {avg_results['val_accuracy_std']:.3f}")
    print(f"Test Accuracy: {avg_results['test_accuracy_mean']:.3f} ± {avg_results['test_accuracy_std']:.3f}")
    
    return {
        'fold_results': fold_results,
        'average_results': avg_results,
        'n_folds': len(fold_results)
    }


In [60]:
# Run 5-fold cross-validation ensemble evaluation
print("Starting 5-Fold Cross-Validation Ensemble Evaluation...")
print(f"Using {len(model_tags)} models: {[tag.split('.')[-1] for tag in model_tags]}")

# Run the evaluation
cv_results = run_5fold_ensemble_evaluation(
    model_tags=model_tags,
    data_df=train_df,  # Use train_df for cross-validation
    test_df=test_df,   # Use test_df as held-out test set
    n_splits=5,
    random_state=42,
    device='cpu'
)


Starting 5-Fold Cross-Validation Ensemble Evaluation...
Using 5 models: ['RE5C', '2HNP', 'CPUT', 'VPLV', 'PFFZ']
Running 5-Fold Cross-Validation Ensemble Evaluation


Processing folds:   0%|          | 0/5 [00:00<?, ?it/s]


Fold 1/5
------------------------------
Train: 2118 samples, Val: 530 samples, Test: 180 samples
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dty

Processing folds:  20%|██        | 1/5 [00:00<00:01,  2.08it/s]

Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.564545, Accuracy: 0.958
  Val Loss: 0.562088, Accuracy: 0.953
  Test Loss: 0.649876, Accuracy: 0.815

Fold 2/5
------------------------------
Train: 2118 samples, Val: 530 samples, Test: 180 samples
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_sc

Processing folds:  40%|████      | 2/5 [00:00<00:01,  2.17it/s]

Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.564654, Accuracy: 0.959
  Val Loss: 0.561653, Accuracy: 0.948
  Test Loss: 0.649876, Accuracy: 0.815

Fold 3/5
------------------------------
Train: 2118 samples, Val: 530 samples, Test: 180 samples
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_sc

Processing folds:  60%|██████    | 3/5 [00:01<00:00,  2.18it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.563259, Accuracy: 0.954
  Val Loss: 0.567229, Accuracy: 0.970
  Test Loss: 0.649876, Accuracy: 0.815

Fold 4/5
------------------------------
Train: 2119 samples, Val: 529 samples, Test: 180 samples
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128

Processing folds:  80%|████████  | 4/5 [00:01<00:00,  2.17it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.562384, Accuracy: 0.957
  Val Loss: 0.570737, Accuracy: 0.957
  Test Loss: 0.649876, Accuracy: 0.815

Fold 5/5
------------------------------
Train: 2119 samples, Val: 529 samples, Test: 180 samples
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128

Processing folds: 100%|██████████| 5/5 [00:02<00:00,  2.18it/s]

Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.565424, Accuracy: 0.957
  Val Loss: 0.558561, Accuracy: 0.956
  Test Loss: 0.649876, Accuracy: 0.815

5-FOLD CROSS-VALIDATION RESULTS
Train Loss: 0.564053 ± 0.001086
Val Loss: 0.564054 ± 0.004348
Test Loss: 0.649876 ± 0.000000
Train Accuracy: 0.957 ± 0.002
Val Accuracy: 0.957 ± 0.007
Test Accuracy: 0.815 ± 0.000





In [61]:
# Comprehensive 5-Fold Cross-Validation Results Analysis
if cv_results is not None:
    print("\n" + "="*80)
    print("COMPREHENSIVE 5-FOLD CROSS-VALIDATION ENSEMBLE RESULTS")
    print("="*80)
    
    avg_results = cv_results['average_results']
    fold_results = cv_results['fold_results']
    
    # Create detailed results table
    print("\nDETAILED RESULTS SUMMARY:")
    print("-" * 60)
    
    # Loss results
    print("LOSS RESULTS:")
    print(f"  Train Loss: {avg_results['train_loss_mean']:.6f} ± {avg_results['train_loss_std']:.6f}")
    print(f"  Val Loss:   {avg_results['val_loss_mean']:.6f} ± {avg_results['val_loss_std']:.6f}")
    print(f"  Test Loss:  {avg_results['test_loss_mean']:.6f} ± {avg_results['test_loss_std']:.6f}")
    
    # Accuracy results
    print("\nACCURACY RESULTS:")
    print(f"  Train Accuracy: {avg_results['train_accuracy_mean']:.3f} ± {avg_results['train_accuracy_std']:.3f}")
    print(f"  Val Accuracy:   {avg_results['val_accuracy_mean']:.3f} ± {avg_results['val_accuracy_std']:.3f}")
    print(f"  Test Accuracy:  {avg_results['test_accuracy_mean']:.3f} ± {avg_results['test_accuracy_std']:.3f}")
    
    # Per-class accuracy results
    print("\nPER-CLASS ACCURACY RESULTS:")
    grade_cols = TARGET_Y_COLS['grade']
    for col in grade_cols:
        train_mean = avg_results[f'train_accuracy_{col}_mean']
        train_std = avg_results[f'train_accuracy_{col}_std']
        val_mean = avg_results[f'val_accuracy_{col}_mean']
        val_std = avg_results[f'val_accuracy_{col}_std']
        test_mean = avg_results[f'test_accuracy_{col}_mean']
        test_std = avg_results[f'test_accuracy_{col}_std']
        
        print(f"  {col}:")
        print(f"    Train: {train_mean:.3f} ± {train_std:.3f}")
        print(f"    Val:   {val_mean:.3f} ± {val_std:.3f}")
        print(f"    Test:  {test_mean:.3f} ± {test_std:.3f}")
    
    # Fold-by-fold breakdown
    print(f"\nFOLD-BY-FOLD BREAKDOWN:")
    print("-" * 40)
    fold_df = pd.DataFrame(fold_results)
    
    for i, row in fold_df.iterrows():
        print(f"Fold {int(row['fold'])}:")
        print(f"  Train: Loss={row['train_loss']:.6f}, Acc={row['train_accuracy']:.3f}")
        print(f"  Val:   Loss={row['val_loss']:.6f}, Acc={row['val_accuracy']:.3f}")
        print(f"  Test:  Loss={row['test_loss']:.6f}, Acc={row['test_accuracy']:.3f}")
    
    # Performance analysis
    print(f"\nPERFORMANCE ANALYSIS:")
    print("-" * 30)
    
    # Overfitting analysis
    train_val_loss_diff = avg_results['val_loss_mean'] - avg_results['train_loss_mean']
    train_val_acc_diff = avg_results['train_accuracy_mean'] - avg_results['val_accuracy_mean']
    
    print(f"Overfitting Analysis:")
    print(f"  Loss Overfitting: {train_val_loss_diff:.6f} (Val - Train)")
    print(f"  Accuracy Overfitting: {train_val_acc_diff:.3f} (Train - Val)")
    
    # Generalization analysis
    val_test_loss_diff = avg_results['test_loss_mean'] - avg_results['val_loss_mean']
    val_test_acc_diff = avg_results['val_accuracy_mean'] - avg_results['test_accuracy_mean']
    
    print(f"\nGeneralization Analysis:")
    print(f"  Loss Generalization: {val_test_loss_diff:.6f} (Test - Val)")
    print(f"  Accuracy Generalization: {val_test_acc_diff:.3f} (Val - Test)")
    
    # Save comprehensive results
    results_summary = {
        'evaluation_type': '5_fold_cross_validation_ensemble',
        'n_models': len(model_tags),
        'model_tags': model_tags,
        'n_folds': cv_results['n_folds'],
        'average_results': avg_results,
        'fold_results': fold_results
    }
    
    # Save to files
    pd.DataFrame(fold_results).to_csv(RESULTS_DIR / '5fold_cv_fold_results.csv', index=False)
    
    # Save summary results
    summary_df = pd.DataFrame([avg_results])
    summary_df.to_csv(RESULTS_DIR / '5fold_cv_summary_results.csv', index=False)
    
    print(f"\nResults saved to:")
    print(f"  - {RESULTS_DIR / '5fold_cv_fold_results.csv'}")
    print(f"  - {RESULTS_DIR / '5fold_cv_summary_results.csv'}")
    
else:
    print("❌ Cross-validation evaluation failed!")



COMPREHENSIVE 5-FOLD CROSS-VALIDATION ENSEMBLE RESULTS

DETAILED RESULTS SUMMARY:
------------------------------------------------------------
LOSS RESULTS:
  Train Loss: 0.564053 ± 0.001086
  Val Loss:   0.564054 ± 0.004348
  Test Loss:  0.649876 ± 0.000000

ACCURACY RESULTS:
  Train Accuracy: 0.957 ± 0.002
  Val Accuracy:   0.957 ± 0.007
  Test Accuracy:  0.815 ± 0.000

PER-CLASS ACCURACY RESULTS:
  Detachability:
    Train: 0.996 ± 0.001
    Val:   0.996 ± 0.003
    Test:  0.867 ± 0.000
  FlatnessUni:
    Train: 0.893 ± 0.005
    Val:   0.893 ± 0.020
    Test:  0.733 ± 0.000
  Feasibility:
    Train: 0.981 ± 0.001
    Val:   0.981 ± 0.004
    Test:  0.844 ± 0.000

FOLD-BY-FOLD BREAKDOWN:
----------------------------------------
Fold 1:
  Train: Loss=0.564545, Acc=0.958
  Val:   Loss=0.562088, Acc=0.953
  Test:  Loss=0.649876, Acc=0.815
Fold 2:
  Train: Loss=0.564654, Acc=0.959
  Val:   Loss=0.561653, Acc=0.948
  Test:  Loss=0.649876, Acc=0.815
Fold 3:
  Train: Loss=0.563259, Acc=0.

In [62]:
# Individual Model Performance Evaluation
def evaluate_individual_model_on_fold(model_tag, train_df, val_df, test_df, device='cpu'):
    """
    Evaluate a single model on a fold
    
    Args:
        model_tag: Model tag to evaluate
        train_df: Training data for this fold
        val_df: Validation data for this fold
        test_df: Test data (same for all folds)
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing individual model results
    """
    # Prepare datasets for this fold
    def prepare_fold_datasets(train_df, val_df, test_df):
        # Extract input features and targets
        X_train = train_df[X_COLS].values.astype(np.float32)
        X_val = val_df[X_COLS].values.astype(np.float32)
        X_test = test_df[X_COLS].values.astype(np.float32)
        
        grade_cols = TARGET_Y_COLS['grade']
        y_train = train_df[grade_cols].values.astype(np.float32)
        y_val = val_df[grade_cols].values.astype(np.float32)
        y_test = test_df[grade_cols].values.astype(np.float32)
        
        return {
            'train_X': torch.tensor(X_train, dtype=torch.float32),
            'train_y': torch.tensor(y_train, dtype=torch.float32),
            'val_X': torch.tensor(X_val, dtype=torch.float32),
            'val_y': torch.tensor(y_val, dtype=torch.float32),
            'test_X': torch.tensor(X_test, dtype=torch.float32),
            'test_y': torch.tensor(y_test, dtype=torch.float32)
        }
    
    fold_dataset = prepare_fold_datasets(train_df, val_df, test_df)
    
    # Load and evaluate the model
    model, params = load_model_from_structure_with_txt(model_tag)
    
    if model is not None:
        model.to(device)
        model.eval()
        
        # Ensure model parameters are float32
        for param in model.parameters():
            param.data = param.data.float()
        
        with torch.no_grad():
            # Get predictions for all sets
            train_pred = model.forward(fold_dataset['train_X'].to(device).float(), 'grade').cpu().numpy()
            val_pred = model.forward(fold_dataset['val_X'].to(device).float(), 'grade').cpu().numpy()
            test_pred = model.forward(fold_dataset['test_X'].to(device).float(), 'grade').cpu().numpy()
            
            # Calculate losses
            criterion = nn.BCEWithLogitsLoss()
            
            train_loss = criterion(torch.tensor(train_pred), fold_dataset['train_y']).item()
            val_loss = criterion(torch.tensor(val_pred), fold_dataset['val_y']).item()
            test_loss = criterion(torch.tensor(test_pred), fold_dataset['test_y']).item()
            
            # Calculate accuracies
            train_acc = calculate_classifier_accuracy(fold_dataset['train_y'].numpy(), train_pred)
            val_acc = calculate_classifier_accuracy(fold_dataset['val_y'].numpy(), val_pred)
            test_acc = calculate_classifier_accuracy(fold_dataset['test_y'].numpy(), test_pred)
            
            return {
                'model_tag': model_tag,
                'train_loss': train_loss,
                'val_loss': val_loss,
                'test_loss': test_loss,
                'train_accuracy': train_acc['overall_accuracy'],
                'val_accuracy': val_acc['overall_accuracy'],
                'test_accuracy': test_acc['overall_accuracy'],
                'train_class_accuracies': train_acc['class_accuracies'],
                'val_class_accuracies': val_acc['class_accuracies'],
                'test_class_accuracies': test_acc['class_accuracies']
            }
    
    return None

def run_5fold_individual_model_evaluation(model_tags, data_df, test_df, n_splits=5, random_state=42, device='cpu'):
    """
    Run 5-fold cross-validation for individual models
    
    Args:
        model_tags: List of model tags
        data_df: Training data for cross-validation
        test_df: Test data (same for all folds)
        n_splits: Number of folds
        random_state: Random seed
        device: Device to run evaluation on
    
    Returns:
        Dictionary containing individual model results
    """
    print("Running 5-Fold Cross-Validation Individual Model Evaluation")
    print("=" * 70)
    
    # Create K-fold splits
    splits = create_kfold_splits(data_df, n_splits, random_state)
    all_model_results = []
    
    for model_tag in model_tags:
        print(f"\nEvaluating Model: {model_tag}")
        print("-" * 50)
        
        model_fold_results = []
        
        for fold_idx, (train_idx, val_idx) in enumerate(tqdm(splits, desc=f"Processing {model_tag.split('.')[-1]}")):
            # Split data
            train_df = data_df.iloc[train_idx].copy()
            val_df = data_df.iloc[val_idx].copy()
            
            # Evaluate model on this fold
            fold_result = evaluate_individual_model_on_fold(model_tag, train_df, val_df, test_df, device)
            
            if fold_result is not None:
                fold_result['fold'] = fold_idx + 1
                model_fold_results.append(fold_result)
        
        if model_fold_results:
            # Calculate average results for this model
            avg_results = {
                'model_tag': model_tag,
                'train_loss_mean': np.mean([r['train_loss'] for r in model_fold_results]),
                'train_loss_std': np.std([r['train_loss'] for r in model_fold_results]),
                'val_loss_mean': np.mean([r['val_loss'] for r in model_fold_results]),
                'val_loss_std': np.std([r['val_loss'] for r in model_fold_results]),
                'test_loss_mean': np.mean([r['test_loss'] for r in model_fold_results]),
                'test_loss_std': np.std([r['test_loss'] for r in model_fold_results]),
                'train_accuracy_mean': np.mean([r['train_accuracy'] for r in model_fold_results]),
                'train_accuracy_std': np.std([r['train_accuracy'] for r in model_fold_results]),
                'val_accuracy_mean': np.mean([r['val_accuracy'] for r in model_fold_results]),
                'val_accuracy_std': np.std([r['val_accuracy'] for r in model_fold_results]),
                'test_accuracy_mean': np.mean([r['test_accuracy'] for r in model_fold_results]),
                'test_accuracy_std': np.std([r['test_accuracy'] for r in model_fold_results])
            }
            
            # Calculate per-class accuracy averages
            grade_cols = TARGET_Y_COLS['grade']
            for i, col in enumerate(grade_cols):
                avg_results[f'train_accuracy_{col}_mean'] = np.mean([r['train_class_accuracies'][i] for r in model_fold_results])
                avg_results[f'train_accuracy_{col}_std'] = np.std([r['train_class_accuracies'][i] for r in model_fold_results])
                avg_results[f'val_accuracy_{col}_mean'] = np.mean([r['val_class_accuracies'][i] for r in model_fold_results])
                avg_results[f'val_accuracy_{col}_std'] = np.std([r['val_class_accuracies'][i] for r in model_fold_results])
                avg_results[f'test_accuracy_{col}_mean'] = np.mean([r['test_class_accuracies'][i] for r in model_fold_results])
                avg_results[f'test_accuracy_{col}_std'] = np.std([r['test_class_accuracies'][i] for r in model_fold_results])
            
            all_model_results.append({
                'model_tag': model_tag,
                'fold_results': model_fold_results,
                'average_results': avg_results,
                'n_folds': len(model_fold_results)
            })
            
            print(f"  Train Loss: {avg_results['train_loss_mean']:.6f} ± {avg_results['train_loss_std']:.6f}")
            print(f"  Val Loss:   {avg_results['val_loss_mean']:.6f} ± {avg_results['val_loss_std']:.6f}")
            print(f"  Test Loss:  {avg_results['test_loss_mean']:.6f} ± {avg_results['test_loss_std']:.6f}")
            print(f"  Train Acc:  {avg_results['train_accuracy_mean']:.3f} ± {avg_results['train_accuracy_std']:.3f}")
            print(f"  Val Acc:    {avg_results['val_accuracy_mean']:.3f} ± {avg_results['val_accuracy_std']:.3f}")
            print(f"  Test Acc:   {avg_results['test_accuracy_mean']:.3f} ± {avg_results['test_accuracy_std']:.3f}")
        else:
            print(f"  ❌ Model {model_tag} failed on all folds")
    
    return all_model_results

# Run individual model evaluation
print("Starting 5-Fold Cross-Validation Individual Model Evaluation...")
print(f"Evaluating {len(model_tags)} models: {[tag.split('.')[-1] for tag in model_tags]}")

individual_model_results = run_5fold_individual_model_evaluation(
    model_tags=model_tags,
    data_df=train_df,
    test_df=test_df,
    n_splits=5,
    random_state=42,
    device='cpu'
)


Starting 5-Fold Cross-Validation Individual Model Evaluation...
Evaluating 5 models: ['RE5C', '2HNP', 'CPUT', 'VPLV', 'PFFZ']
Running 5-Fold Cross-Validation Individual Model Evaluation

Evaluating Model: pt_db.v3hp.RE5C
--------------------------------------------------


Processing RE5C:  20%|██        | 1/5 [00:00<00:00,  8.09it/s]

{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}


Processing RE5C: 100%|██████████| 5/5 [00:00<00:00, 12.31it/s]


Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
  Train Loss: 0.556246 ± 0.001224
  Val Loss:   0.556248 ± 0.004898
  Test Loss:  0.659720 ± 0.000000
  Train Acc:  0.979 ± 0.001
  Val Acc:    0.979 ± 0.004
  Test Acc:   0.769 ± 0.000

Evaluating Model: pt_db.v3hp.2HNP
--------------------------------------------------


Processing 2HNP:   0%|          | 0/5 [00:00<?, ?it/s]

{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}


Processing 2HNP:  40%|████      | 2/5 [00:00<00:00, 12.09it/s]

Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP


Processing 2HNP: 100%|██████████| 5/5 [00:00<00:00, 13.59it/s]


{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
  Train Loss: 0.571103 ± 0.000925
  Val Loss:   0.571103 ± 0.003705
  Test Loss:  0.658976 ± 0.000000
  Train Acc:  0.946 ± 0.001
  Val Acc:    0.946 ± 0.004
  Test Acc:   0.770 ± 0.000

Evaluating Model: pt_db.v3hp.CPUT
--------------------------------------------------


Processing CPUT:   0%|          | 0/5 [00:00<?, ?it/s]

{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Processing CPUT:  20%|██        | 1/5 [00:00<00:00,  9.08it/s]

{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Processing CPUT:  40%|████      | 2/5 [00:00<00:00,  9.54it/s]

{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Processing CPUT:  60%|██████    | 3/5 [00:00<00:00,  9.28it/s]

{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Processing CPUT:  80%|████████  | 4/5 [00:00<00:00,  9.07it/s]

{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Processing CPUT: 100%|██████████| 5/5 [00:00<00:00,  9.26it/s]


  Train Loss: 0.573922 ± 0.001043
  Val Loss:   0.573922 ± 0.004175
  Test Loss:  0.651741 ± 0.000000
  Train Acc:  0.945 ± 0.001
  Val Acc:    0.945 ± 0.006
  Test Acc:   0.793 ± 0.000

Evaluating Model: pt_db.v3hp.VPLV
--------------------------------------------------


Processing VPLV:   0%|          | 0/5 [00:00<?, ?it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV


Processing VPLV:  40%|████      | 2/5 [00:00<00:00, 10.37it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}


Processing VPLV:  80%|████████  | 4/5 [00:00<00:00, 10.54it/s]

Successfully loaded model: pt_db.v3hp.VPLV
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV


Processing VPLV: 100%|██████████| 5/5 [00:00<00:00,  9.47it/s]


  Train Loss: 0.558083 ± 0.001188
  Val Loss:   0.558083 ± 0.004753
  Test Loss:  0.655647 ± 0.000000
  Train Acc:  0.978 ± 0.001
  Val Acc:    0.978 ± 0.004
  Test Acc:   0.787 ± 0.000

Evaluating Model: pt_db.v3hp.PFFZ
--------------------------------------------------


Processing PFFZ:  20%|██        | 1/5 [00:00<00:00,  7.80it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ


Processing PFFZ:  60%|██████    | 3/5 [00:00<00:00,  9.08it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ


Processing PFFZ: 100%|██████████| 5/5 [00:00<00:00,  9.17it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ
  Train Loss: 0.570911 ± 0.001263
  Val Loss:   0.570911 ± 0.005057
  Test Loss:  0.659414 ± 0.000000
  Train Acc:  0.949 ± 0.002
  Val Acc:    0.949 ± 0.008
  Test Acc:   0.783 ± 0.000





In [63]:
# Individual Model Results Analysis and Comparison
if individual_model_results:
    print("\n" + "="*80)
    print("INDIVIDUAL MODEL 5-FOLD CROSS-VALIDATION RESULTS")
    print("="*80)
    
    # Create summary table for all models
    summary_data = []
    for model_result in individual_model_results:
        avg_results = model_result['average_results']
        model_name = model_result['model_tag'].split('.')[-1]
        
        summary_data.append({
            'Model': model_name,
            'Train_Loss': f"{avg_results['train_loss_mean']:.6f} ± {avg_results['train_loss_std']:.6f}",
            'Val_Loss': f"{avg_results['val_loss_mean']:.6f} ± {avg_results['val_loss_std']:.6f}",
            'Test_Loss': f"{avg_results['test_loss_mean']:.6f} ± {avg_results['test_loss_std']:.6f}",
            'Train_Acc': f"{avg_results['train_accuracy_mean']:.3f} ± {avg_results['train_accuracy_std']:.3f}",
            'Val_Acc': f"{avg_results['val_accuracy_mean']:.3f} ± {avg_results['val_accuracy_std']:.3f}",
            'Test_Acc': f"{avg_results['test_accuracy_mean']:.3f} ± {avg_results['test_accuracy_std']:.3f}",
            'Overfitting_Loss': f"{avg_results['val_loss_mean'] - avg_results['train_loss_mean']:.6f}",
            'Overfitting_Acc': f"{avg_results['train_accuracy_mean'] - avg_results['val_accuracy_mean']:.3f}"
        })
    
    summary_df = pd.DataFrame(summary_data)
    print("\nCOMPREHENSIVE MODEL COMPARISON:")
    print("-" * 80)
    print(summary_df.to_string(index=False))
    
    # Find best and worst models
    print(f"\nMODEL RANKING:")
    print("-" * 40)
    
    # Sort by test accuracy (descending)
    sorted_models = sorted(individual_model_results, 
                          key=lambda x: x['average_results']['test_accuracy_mean'], 
                          reverse=True)
    
    print("By Test Accuracy:")
    for i, model_result in enumerate(sorted_models):
        model_name = model_result['model_tag'].split('.')[-1]
        test_acc = model_result['average_results']['test_accuracy_mean']
        print(f"  {i+1}. {model_name}: {test_acc:.3f}")
    
    # Sort by test loss (ascending)
    sorted_models_loss = sorted(individual_model_results, 
                               key=lambda x: x['average_results']['test_loss_mean'])
    
    print("\nBy Test Loss (lower is better):")
    for i, model_result in enumerate(sorted_models_loss):
        model_name = model_result['model_tag'].split('.')[-1]
        test_loss = model_result['average_results']['test_loss_mean']
        print(f"  {i+1}. {model_name}: {test_loss:.6f}")
    
    # Best and worst model analysis
    best_model = sorted_models[0]
    worst_model = sorted_models[-1]
    
    print(f"\nBEST MODEL: {best_model['model_tag'].split('.')[-1]}")
    print("-" * 30)
    best_avg = best_model['average_results']
    print(f"  Test Accuracy: {best_avg['test_accuracy_mean']:.3f} ± {best_avg['test_accuracy_std']:.3f}")
    print(f"  Test Loss: {best_avg['test_loss_mean']:.6f} ± {best_avg['test_loss_std']:.6f}")
    print(f"  Val Accuracy: {best_avg['val_accuracy_mean']:.3f} ± {best_avg['val_accuracy_std']:.3f}")
    print(f"  Overfitting: {best_avg['train_accuracy_mean'] - best_avg['val_accuracy_mean']:.3f}")
    
    print(f"\nWORST MODEL: {worst_model['model_tag'].split('.')[-1]}")
    print("-" * 30)
    worst_avg = worst_model['average_results']
    print(f"  Test Accuracy: {worst_avg['test_accuracy_mean']:.3f} ± {worst_avg['test_accuracy_std']:.3f}")
    print(f"  Test Loss: {worst_avg['test_loss_mean']:.6f} ± {worst_avg['test_loss_std']:.6f}")
    print(f"  Val Accuracy: {worst_avg['val_accuracy_mean']:.3f} ± {worst_avg['val_accuracy_std']:.3f}")
    print(f"  Overfitting: {worst_avg['train_accuracy_mean'] - worst_avg['val_accuracy_mean']:.3f}")
    
    # Per-class accuracy analysis
    print(f"\nPER-CLASS ACCURACY ANALYSIS:")
    print("-" * 40)
    grade_cols = TARGET_Y_COLS['grade']
    
    for col in grade_cols:
        print(f"\n{col}:")
        class_results = []
        for model_result in individual_model_results:
            model_name = model_result['model_tag'].split('.')[-1]
            test_acc = model_result['average_results'][f'test_accuracy_{col}_mean']
            class_results.append((model_name, test_acc))
        
        # Sort by accuracy
        class_results.sort(key=lambda x: x[1], reverse=True)
        for i, (model_name, acc) in enumerate(class_results):
            print(f"  {i+1}. {model_name}: {acc:.3f}")
    
    # Compare with ensemble results
    if cv_results is not None:
        ensemble_avg = cv_results['average_results']
        print(f"\nENSEMBLE vs INDIVIDUAL MODELS COMPARISON:")
        print("-" * 50)
        
        print(f"Ensemble Test Accuracy: {ensemble_avg['test_accuracy_mean']:.3f} ± {ensemble_avg['test_accuracy_std']:.3f}")
        print(f"Best Individual Test Accuracy: {best_avg['test_accuracy_mean']:.3f} ± {best_avg['test_accuracy_std']:.3f}")
        print(f"Worst Individual Test Accuracy: {worst_avg['test_accuracy_mean']:.3f} ± {worst_avg['test_accuracy_std']:.3f}")
        
        ensemble_vs_best = ensemble_avg['test_accuracy_mean'] - best_avg['test_accuracy_mean']
        ensemble_vs_worst = ensemble_avg['test_accuracy_mean'] - worst_avg['test_accuracy_mean']
        
        print(f"\nEnsemble vs Best Individual: {ensemble_vs_best:+.3f}")
        print(f"Ensemble vs Worst Individual: {ensemble_vs_worst:+.3f}")
        
        if ensemble_vs_best > 0:
            print("✅ Ensemble outperforms best individual model!")
        else:
            print("❌ Ensemble does not outperform best individual model")
    
    # Save individual model results
    all_fold_results = []
    for model_result in individual_model_results:
        for fold_result in model_result['fold_results']:
            all_fold_results.append(fold_result)
    
    pd.DataFrame(all_fold_results).to_csv(RESULTS_DIR / 'individual_models_5fold_results.csv', index=False)
    
    # Save summary results
    individual_summary = []
    for model_result in individual_model_results:
        individual_summary.append(model_result['average_results'])
    
    pd.DataFrame(individual_summary).to_csv(RESULTS_DIR / 'individual_models_summary.csv', index=False)
    
    print(f"\nIndividual model results saved to:")
    print(f"  - {RESULTS_DIR / 'individual_models_5fold_results.csv'}")
    print(f"  - {RESULTS_DIR / 'individual_models_summary.csv'}")
    
else:
    print("❌ No individual model results to analyze!")



INDIVIDUAL MODEL 5-FOLD CROSS-VALIDATION RESULTS

COMPREHENSIVE MODEL COMPARISON:
--------------------------------------------------------------------------------
Model          Train_Loss            Val_Loss           Test_Loss     Train_Acc       Val_Acc      Test_Acc Overfitting_Loss Overfitting_Acc
 RE5C 0.556246 ± 0.001224 0.556248 ± 0.004898 0.659720 ± 0.000000 0.979 ± 0.001 0.979 ± 0.004 0.769 ± 0.000         0.000001           0.000
 2HNP 0.571103 ± 0.000925 0.571103 ± 0.003705 0.658976 ± 0.000000 0.946 ± 0.001 0.946 ± 0.004 0.770 ± 0.000         0.000000          -0.000
 CPUT 0.573922 ± 0.001043 0.573922 ± 0.004175 0.651741 ± 0.000000 0.945 ± 0.001 0.945 ± 0.006 0.793 ± 0.000        -0.000000          -0.000
 VPLV 0.558083 ± 0.001188 0.558083 ± 0.004753 0.655647 ± 0.000000 0.978 ± 0.001 0.978 ± 0.004 0.787 ± 0.000         0.000001           0.000
 PFFZ 0.570911 ± 0.001263 0.570911 ± 0.005057 0.659414 ± 0.000000 0.949 ± 0.002 0.949 ± 0.008 0.783 ± 0.000        -0.000001       

## ANN DEMO DATASET (343 datapoints)

In [94]:
# ANN Demo Model Evaluation with All Heads
import pandas as pd
from sklearn.model_selection import train_test_split

# ANN Demo specific columns
ANN_DEMO_X_COLS = ['LAP', 'MMT', 'CMC', 'CNF', 'SLK', 'AGR',
                   'ALG', 'CAR', 'CHS', 'PEC', 'PUL', 'STA', 'GEL', 'GLU', 'ZIN', 'GLY',
                   'FFA', 'LAC', 'LEV', 'PHA', 'SRB', 'SUA', 'XYL']

ANN_DEMO_Y_COLS = ['TensileStress', 'TensileStrain','TensileModulusLog10', 'TensileToughnessMean100n90',
                   'TransVis', 'TransIR', 'TransUV']

# Load ANN demo data
ann_demo_df = pd.read_csv('data/ann_demo.csv')
print(f"ANN Demo dataset shape: {ann_demo_df.shape}")

# Create train/test split using the same method as train_test_split.py
X_train_ann, X_test_ann, y_train_ann, y_test_ann = train_test_split(
    ann_demo_df[ANN_DEMO_X_COLS], 
    ann_demo_df[ANN_DEMO_Y_COLS], 
    test_size=0.2, 
    random_state=0
)

print(f"Train set: {X_train_ann.shape[0]} samples")
print(f"Test set: {X_test_ann.shape[0]} samples")
print(f"Target columns: {ANN_DEMO_Y_COLS}")

ANN Demo dataset shape: (342, 42)
Train set: 273 samples
Test set: 69 samples
Target columns: ['TensileStress', 'TensileStrain', 'TensileModulusLog10', 'TensileToughnessMean100n90', 'TransVis', 'TransIR', 'TransUV']


In [95]:
# Diagnose optical data values to understand MRE issue
print("Optical Data Statistics (columns 4:7 are TransVis, TransIR, TransUV):")
print("=" * 70)

# Training data optical values
train_optical = y_train_ann.iloc[:, 3:].values  # Last 3 columns
print("\nTrain Optical Data:")
print(f"  Min values: {train_optical.min(axis=0)}")
print(f"  Max values: {train_optical.max(axis=0)}")
print(f"  Mean values: {train_optical.mean(axis=0)}")
print(f"  Std values: {train_optical.std(axis=0)}")
print(f"  Values <= 0.01: {(train_optical <= 0.01).sum(axis=0)}")
print(f"  Values == 0: {(train_optical == 0).sum(axis=0)}")

# Test data optical values
test_optical = y_test_ann.iloc[:, 3:].values
print("\nTest Optical Data:")
print(f"  Min values: {test_optical.min(axis=0)}")
print(f"  Max values: {test_optical.max(axis=0)}")
print(f"  Mean values: {test_optical.mean(axis=0)}")
print(f"  Std values: {test_optical.std(axis=0)}")
print(f"  Values <= 0.01: {(test_optical <= 0.01).sum(axis=0)}")
print(f"  Values == 0: {(test_optical == 0).sum(axis=0)}")


Optical Data Statistics (columns 4:7 are TransVis, TransIR, TransUV):

Train Optical Data:
  Min values: [1.29561446e-03 2.36666667e+00 3.10000000e+00 0.00000000e+00]
  Max values: [  9.98133898 275.1         85.43333333 100.        ]
  Mean values: [ 0.88493767 70.12092796 69.22490842 45.13296703]
  Std values: [ 1.41265209 17.46409791 11.69908822 14.12777834]
  Values <= 0.01: [12  0  0  1]
  Values == 0: [0 0 0 1]

Test Optical Data:
  Min values: [2.88704121e-03 4.65666667e+01 4.26000000e+01 1.90333333e+01]
  Max values: [ 7.94972092 81.26666667 86.26666667 59.93333333]
  Mean values: [ 0.77583781 70.3615942  69.14299517 44.57463768]
  Std values: [ 1.26619188  8.27573491  9.32651252 11.16123294]
  Values <= 0.01: [3 0 0 0]
  Values == 0: [0 0 0 0]


In [96]:
# ANN Demo Model Evaluation Functions
def prepare_ann_demo_datasets(X_train, X_test, y_train, y_test):
    """
    Prepare ANN demo datasets for evaluation
    """
    # Convert to float32
    X_train_tensor = torch.tensor(X_train.values.astype(np.float32), dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test.values.astype(np.float32), dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values.astype(np.float32), dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values.astype(np.float32), dtype=torch.float32)
    
    return {
        'train_X': X_train_tensor,
        'train_y': y_train_tensor,
        'test_X': X_test_tensor,
        'test_y': y_test_tensor
    }

def evaluate_ann_demo_model(model_tag, dataset, device='cpu'):
    """
    Evaluate ANN demo model on all heads
    """
    # Load model
    model, params = load_model_from_structure_with_txt(model_tag)
    
    if model is None:
        return None
    
    model.to(device)
    model.eval()
    
    # Ensure model parameters are float32
    for param in model.parameters():
        param.data = param.data.float()
    
    results = {'model_tag': model_tag}
    
    with torch.no_grad():
        # Get predictions for all tasks
        train_X = dataset['train_X'].to(device).float()
        test_X = dataset['test_X'].to(device).float()
        
        # Evaluate each task head
        for task in ['tensile', 'optical']:  # Based on the target columns
            try:
                # Get predictions
                train_pred = model.forward(train_X, task).cpu().numpy()
                test_pred = model.forward(test_X, task).cpu().numpy()
                
                # Get true values
                if task == 'tensile':
                    train_y = dataset['train_y'][:, :4].numpy()  # First 4 columns (tensile)
                    test_y = dataset['test_y'][:, :4].numpy()
                elif task == 'optical':
                    train_y = dataset['train_y'][:, 4:7].numpy()  # Last 3 columns (optical)
                    test_y = dataset['test_y'][:, 4:7].numpy()
                
                # Calculate MSE loss
                train_mse = np.mean((train_y - train_pred) ** 2)
                test_mse = np.mean((test_y - test_pred) ** 2)
                
                # Calculate MAE
                train_mae = np.mean(np.abs(train_y - train_pred))
                test_mae = np.mean(np.abs(test_y - test_pred))
                
                # Calculate R²
                from sklearn.metrics import r2_score
                train_r2 = r2_score(train_y, train_pred)
                test_r2 = r2_score(test_y, test_pred)
                
                results[f'{task}_train_mse'] = train_mse
                results[f'{task}_test_mse'] = test_mse
                results[f'{task}_train_mae'] = train_mae
                results[f'{task}_test_mae'] = test_mae
                results[f'{task}_train_r2'] = train_r2
                results[f'{task}_test_r2'] = test_r2
                
                # Calculate MRE (Mean Relative Error)
                # Use threshold-based approach to avoid division by very small values
                # Only calculate relative error for values above threshold
                threshold = 0.1  # Ignore values below 0.1 for relative error calculation
                
                # Train MRE - only for values above threshold
                train_mask = np.abs(train_y) > threshold
                if train_mask.any():
                    train_mre = np.mean(np.abs((train_y[train_mask] - train_pred[train_mask]) / train_y[train_mask]))
                else:
                    train_mre = np.nan
                
                # Test MRE - only for values above threshold
                test_mask = np.abs(test_y) > threshold
                if test_mask.any():
                    test_mre = np.mean(np.abs((test_y[test_mask] - test_pred[test_mask]) / test_y[test_mask]))
                else:
                    test_mre = np.nan
                
                results[f'{task}_train_mre'] = train_mre
                results[f'{task}_test_mre'] = test_mre
                
            except Exception as e:
                print(f"Error evaluating {task} for {model_tag}: {e}")
                results[f'{task}_train_mse'] = np.nan
                results[f'{task}_test_mse'] = np.nan
                results[f'{task}_train_mae'] = np.nan
                results[f'{task}_test_mae'] = np.nan
                results[f'{task}_train_r2'] = np.nan
                results[f'{task}_test_r2'] = np.nan
    
    return results

# Prepare ANN demo datasets
ann_demo_dataset = prepare_ann_demo_datasets(X_train_ann, X_test_ann, y_train_ann, y_test_ann)

In [97]:
# Evaluate ANN Demo Model on All Models
print("Evaluating ANN Demo Model on All Available Models")
print("=" * 60)

ann_demo_results = []

for model_tag in tqdm(model_tags, desc="Evaluating models"):
    print(f"\nEvaluating {model_tag} on ANN demo data...")
    
    result = evaluate_ann_demo_model(model_tag, ann_demo_dataset)
    
    if result is not None:
        ann_demo_results.append(result)
        
        # Print results
        print(f"  Tensile - Train MSE: {result['tensile_train_mse']:.6f}, Test MSE: {result['tensile_test_mse']:.6f}")
        print(f"  Tensile - Train MRE: {result['tensile_train_mre']:.3f}, Test MRE: {result['tensile_test_mre']:.3f}")
        print(f"  Tensile - Train R²: {result['tensile_train_r2']:.3f}, Test R²: {result['tensile_test_r2']:.3f}")
        print(f"  Optical - Train MSE: {result['optical_train_mse']:.6f}, Test MSE: {result['optical_test_mse']:.6f}")
        print(f"  Optical - Train MRE: {result['optical_train_mre']:.3f}, Test MRE: {result['optical_test_mre']:.3f}")
        print(f"  Optical - Train R²: {result['optical_train_r2']:.3f}, Test R²: {result['optical_test_r2']:.3f}")
    else:
        print(f"  ❌ Failed to evaluate {model_tag}")

print(f"\nSuccessfully evaluated {len(ann_demo_results)} models on ANN demo data")

Evaluating ANN Demo Model on All Available Models


Evaluating models:  40%|████      | 2/5 [00:00<00:00, 14.35it/s]


Evaluating pt_db.v3hp.RE5C on ANN demo data...
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
  Tensile - Train MSE: 682.837402, Test MSE: 776.439636
  Tensile - Train MRE: 0.896, Test MRE: 0.892
  Tensile - Train R²: -3.535, Test R²: -2.350
  Optical - Train MSE: 4047.579102, Test MSE: 3919.073486
  Optical - Train MRE: 0.989, Test MRE: 0.990
  Optical - Train R²: -20.021, Test R²: -46.753

Evaluating pt_db.v3hp.2HNP on ANN demo data...
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Suc

Evaluating models: 100%|██████████| 5/5 [00:00<00:00, 14.48it/s]

  Tensile - Train MSE: 682.539978, Test MSE: 776.287231
  Tensile - Train MRE: 0.888, Test MRE: 0.888
  Tensile - Train R²: -3.482, Test R²: -2.350
  Optical - Train MSE: 4048.046143, Test MSE: 3919.197510
  Optical - Train MRE: 0.989, Test MRE: 0.990
  Optical - Train R²: -20.023, Test R²: -46.754

Evaluating pt_db.v3hp.VPLV on ANN demo data...
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV
  Tensile - Train MSE: 682.491211, Test MSE: 776.080933
  Tensile - Train MRE: 0.890, Test MRE: 0.891
  Tensile - Train R²: -3.454, Test R²: -2.336
  Optical - Train MSE: 4048.774170, Test MSE: 3919.395508
  Optical - Train MRE: 0.989, Test MRE: 0.990
  Optical - Train R²: -20.026, Test R²: -46.757

Evaluating pt_db.v3hp.PFFZ on




In [100]:
# ANN Demo Results Analysis
if ann_demo_results:
    ann_demo_df = pd.DataFrame(ann_demo_results)
    
    print("ANN DEMO MODEL EVALUATION RESULTS")
    print("=" * 80)
    
    # Create summary table
    summary_data = []
    for _, row in ann_demo_df.iterrows():
        model_name = row['model_tag'].split('.')[-1]
        summary_data.append({
            'Model': model_name,
            'Tensile_Train_MSE': f"{row['tensile_train_mse']:.6f}",
            'Tensile_Test_MSE': f"{row['tensile_test_mse']:.6f}",
            'Tensile_Train_R2': f"{row['tensile_train_r2']:.3f}",
            'Tensile_Test_R2': f"{row['tensile_test_r2']:.3f}",
            'Optical_Train_MSE': f"{row['optical_train_mse']:.6f}",
            'Optical_Test_MSE': f"{row['optical_test_mse']:.6f}",
            'Optical_Train_R2': f"{row['optical_train_r2']:.3f}",
            'Optical_Test_R2': f"{row['optical_test_r2']:.3f}",
            'Tensile_Train_MRE': f"{row['tensile_train_mre']:.3f}",
            'Tensile_Test_MRE': f"{row['tensile_test_mre']:.3f}",
            'Optical_Train_MRE': f"{row['optical_train_mre']:.3f}",
            'Optical_Test_MRE': f"{row['optical_test_mre']:.3f}"
        })
    
    summary_df = pd.DataFrame(summary_data)
    print("\nCOMPREHENSIVE RESULTS:")
    print(summary_df.to_string(index=False))
    
    # Find best models
    print(f"\nBEST MODELS:")
    print("-" * 30)
    
    # Best tensile model (by test R²)
    best_tensile_idx = ann_demo_df['tensile_test_r2'].idxmax()
    best_tensile = ann_demo_df.iloc[best_tensile_idx]
    print(f"Best Tensile Model: {best_tensile['model_tag'].split('.')[-1]}")
    print(f"  Test R²: {best_tensile['tensile_test_r2']:.3f}")
    print(f"  Test MSE: {best_tensile['tensile_test_mse']:.6f}")
    print(f"  Test MRE: {best_tensile['tensile_test_mre']:.3f}")
    
    # Best optical model (by test R²)
    best_optical_idx = ann_demo_df['optical_test_r2'].idxmax()
    best_optical = ann_demo_df.iloc[best_optical_idx]
    print(f"Best Optical Model: {best_optical['model_tag'].split('.')[-1]}")
    print(f"  Test R²: {best_optical['optical_test_r2']:.3f}")
    print(f"  Test MSE: {best_optical['optical_test_mse']:.6f}")
    print(f"  Test MRE: {best_optical['optical_test_mre']:.3f}")

    
else:
    print("❌ No ANN demo results to analyze!")

ANN DEMO MODEL EVALUATION RESULTS

COMPREHENSIVE RESULTS:
Model Tensile_Train_MSE Tensile_Test_MSE Tensile_Train_R2 Tensile_Test_R2 Optical_Train_MSE Optical_Test_MSE Optical_Train_R2 Optical_Test_R2 Tensile_Train_MRE Tensile_Test_MRE Optical_Train_MRE Optical_Test_MRE
 RE5C        682.837402       776.439636           -3.535          -2.350       4047.579102      3919.073486          -20.021         -46.753             0.896            0.892             0.989            0.990
 2HNP        682.354370       775.905701           -3.477          -2.321       4048.314453      3919.327393          -20.024         -46.755             0.895            0.893             0.989            0.990
 CPUT        682.539978       776.287231           -3.482          -2.350       4048.046143      3919.197510          -20.023         -46.754             0.888            0.888             0.989            0.990
 VPLV        682.491211       776.080933           -3.454          -2.336       4048.774170   

## 343 datapoints + UIP Augmentation

In [101]:
for sec in [
    '01_imports_and_configuration',
    '02_data_loading_and_preparation',
    '03_neural_network_architecture',
    '04_data_augmentation_methods',
    '05_training_and_hyperparameter_tuning',
    '06_model_architectures_and_configs',
    '07_parallel_training_utilities',
    '08_experiment_runners'
]:
    exec(open(f'core_sections/{sec}.py').read())
print("✅ Core sections loaded.")


✅ [01] Imports and Configuration Module Loaded
📊 Dataset Configuration (aligned with ann_model):
   📋 Input features: 23
   🎯 Prediction tasks: ['optical', 'tensile']
      optical: 3 targets - ['TransVis', 'TransIR', 'TransUV']
      tensile: 4 targets - ['TensileStress', 'TensileStrain', 'TensileModulusLog10', 'TensileToughnessMean100n90']
🔧 Environment Information:
   🔮 PyTorch version: 2.8.0
   💻 Device: CPU
   🎲 Random seed: 0
   🚀 Parallel processing: 10 cores available
📁 Loaded dataset shape: (342, 42)
🧹 After removing missing targets: (342, 42)
📈 Data retention: 100.0%

📋 Dataset Summary:
   Total samples: 342
   Input features: 23
   Target variables: 7
   Missing values in targets: 0

📊 Target Variable Ranges:

OPTICAL Targets:
  TransVis: [2.37, 275.10] (mean: 70.17)
  TransIR: [3.10, 86.27] (mean: 69.21)
  TransUV: [0.00, 100.00] (mean: 45.02)

TENSILE Targets:
  TensileStress: [0.36, 161.67] (mean: 39.95)
  TensileStrain: [0.07, 75.03] (mean: 5.84)
  TensileModulusLog10: [

## UIP on Train + Evaluate Loaded Models (tensile, optical)


In [104]:
# Apply UIP on ANN demo train split and evaluate loaded models
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Ensure core sections (including apply_uip_augmentation) are loaded
try:
    _ = apply_uip_augmentation  # noqa: F401
except NameError:
    for sec in [
        '01_imports_and_configuration',
        '02_data_loading_and_preparation',
        '03_neural_network_architecture',
        '04_data_augmentation_methods',
        '05_training_and_hyperparameter_tuning',
        '06_model_architectures_and_configs',
        '07_parallel_training_utilities',
        '08_experiment_runners'
    ]:
        exec(open(f'core_sections/{sec}.py').read())

# Prepare ANN demo train/test arrays
X_train = X_train_ann[ANN_DEMO_X_COLS].values.astype(np.float32)
X_test = X_test_ann[ANN_DEMO_X_COLS].values.astype(np.float32)

y_train_full = y_train_ann[ANN_DEMO_Y_COLS].values.astype(np.float32)
y_test_full = y_test_ann[ANN_DEMO_Y_COLS].values.astype(np.float32)

# Split targets into tasks
optical_cols = TARGET_Y_COLS['optical']
tensile_cols = TARGET_Y_COLS['tensile']

Y_train_opt = y_train_ann[optical_cols].values.astype(np.float32)
Y_test_opt = y_test_ann[optical_cols].values.astype(np.float32)

Y_train_ten = y_train_ann[tensile_cols].values.astype(np.float32)
Y_test_ten = y_test_ann[tensile_cols].values.astype(np.float32)

# Optional: experimental stds not available here
Y_train_opt_std = None
Y_train_ten_std = None

# UIP augmentation on raw train for both tasks
ratio = 100
X_aug_opt, Y_aug_opt = apply_uip_augmentation(X_train, Y_train_opt, ratio, 'optical', y_train_std=Y_train_opt_std)
X_aug_ten, Y_aug_ten = apply_uip_augmentation(X_train, Y_train_ten, ratio, 'tensile', y_train_std=Y_train_ten_std)

# For evaluation of loaded models, we only need feature scalers (models expect raw inputs); keep X as raw tensors later
# Build scalers for potential normalization use (not applied to model inputs here)
feature_scaler = StandardScaler().fit(np.vstack([X_aug_opt, X_aug_ten]))

# Target scaling per core sections
optical_scales = np.array(TARGET_Y_SCALES['optical'], dtype=np.float32)
tensile_scales = np.array(TARGET_Y_SCALES['tensile'], dtype=np.float32)

# Function to evaluate one model on both heads
def eval_model_on_heads(model_tag: str):
    model, params = load_model_from_structure_with_txt(model_tag)
    if model is None:
        return None
    model.eval()
    # Ensure float32
    for p in model.parameters():
        p.data = p.data.float()
    with torch.no_grad():
        Xtr_tensor = torch.tensor(X_train, dtype=torch.float32)
        Xte_tensor = torch.tensor(X_test, dtype=torch.float32)
        # Tensile
        ten_train_pred = model.forward(Xtr_tensor, 'tensile').cpu().numpy()
        ten_test_pred = model.forward(Xte_tensor, 'tensile').cpu().numpy()
        # Inverse scale predictions and truths to original scale
        ten_train_pred_orig = ten_train_pred * tensile_scales
        ten_test_pred_orig = ten_test_pred * tensile_scales
        ten_train_true_orig = Y_train_ten
        ten_test_true_orig = Y_test_ten
        # Metrics (original scale)
        ten_train_mse = mean_squared_error(ten_train_true_orig, ten_train_pred_orig)
        ten_test_mse = mean_squared_error(ten_test_true_orig, ten_test_pred_orig)
        ten_train_mae = mean_absolute_error(ten_train_true_orig, ten_train_pred_orig)
        ten_test_mae = mean_absolute_error(ten_test_true_orig, ten_test_pred_orig)
        ten_train_r2 = r2_score(ten_train_true_orig, ten_train_pred_orig)
        ten_test_r2 = r2_score(ten_test_true_orig, ten_test_pred_orig)
        # MRE with threshold
        threshold = 0.1
        mask_tr = np.abs(ten_train_true_orig) > threshold
        mask_te = np.abs(ten_test_true_orig) > threshold
        ten_train_mre = np.mean(np.abs((ten_train_true_orig[mask_tr] - ten_train_pred_orig[mask_tr]) / ten_train_true_orig[mask_tr])) if mask_tr.any() else np.nan
        ten_test_mre = np.mean(np.abs((ten_test_true_orig[mask_te] - ten_test_pred_orig[mask_te]) / ten_test_true_orig[mask_te])) if mask_te.any() else np.nan
        # Optical
        opt_train_pred = model.forward(Xtr_tensor, 'optical').cpu().numpy()
        opt_test_pred = model.forward(Xte_tensor, 'optical').cpu().numpy()
        opt_train_pred_orig = opt_train_pred * optical_scales
        opt_test_pred_orig = opt_test_pred * optical_scales
        opt_train_true_orig = Y_train_opt
        opt_test_true_orig = Y_test_opt
        opt_train_mse = mean_squared_error(opt_train_true_orig, opt_train_pred_orig)
        opt_test_mse = mean_squared_error(opt_test_true_orig, opt_test_pred_orig)
        opt_train_mae = mean_absolute_error(opt_train_true_orig, opt_train_pred_orig)
        opt_test_mae = mean_absolute_error(opt_test_true_orig, opt_test_pred_orig)
        opt_train_r2 = r2_score(opt_train_true_orig, opt_train_pred_orig)
        opt_test_r2 = r2_score(opt_test_true_orig, opt_test_pred_orig)
        mask_tr_o = np.abs(opt_train_true_orig) > threshold
        mask_te_o = np.abs(opt_test_true_orig) > threshold
        opt_train_mre = np.mean(np.abs((opt_train_true_orig[mask_tr_o] - opt_train_pred_orig[mask_tr_o]) / opt_train_true_orig[mask_tr_o])) if mask_tr_o.any() else np.nan
        opt_test_mre = np.mean(np.abs((opt_test_true_orig[mask_te_o] - opt_test_pred_orig[mask_te_o]) / opt_test_true_orig[mask_te_o])) if mask_te_o.any() else np.nan
    return {
        'model_tag': model_tag,
        'tensile_train_mse': ten_train_mse,
        'tensile_test_mse': ten_test_mse,
        'tensile_train_mae': ten_train_mae,
        'tensile_test_mae': ten_test_mae,
        'tensile_train_r2': ten_train_r2,
        'tensile_test_r2': ten_test_r2,
        'tensile_train_mre': ten_train_mre,
        'tensile_test_mre': ten_test_mre,
        'optical_train_mse': opt_train_mse,
        'optical_test_mse': opt_test_mse,
        'optical_train_mae': opt_train_mae,
        'optical_test_mae': opt_test_mae,
        'optical_train_r2': opt_train_r2,
        'optical_test_r2': opt_test_r2,
        'optical_train_mre': opt_train_mre,
        'optical_test_mre': opt_test_mre,
        'uip_ratio': ratio,
        'uip_train_samples': int(X_aug_opt.shape[0])  # same count for ten/opt start size
    }

print(f"Applying UIP with ratio 1:{ratio} on train (optical & tensile). Aug samples: {X_aug_opt.shape[0]} / {len(X_train)}x")

# Evaluate all loaded models
uip_eval_results = []
for tag in tqdm(model_tags, desc="Evaluating models on test set (tensile+optical)"):
    res = eval_model_on_heads(tag)
    if res is not None:
        uip_eval_results.append(res)

uip_eval_df = pd.DataFrame(uip_eval_results)
print("\nUIP-based evaluation (train augmented, test original) summary (per model):")
if not uip_eval_df.empty:
    display_cols = [
        'model_tag', 'uip_ratio',
        'tensile_test_r2', 'tensile_test_mre', 'tensile_test_mse',
        'optical_test_r2', 'optical_test_mre', 'optical_test_mse'
    ]
    print(uip_eval_df[display_cols].to_string(index=False))

else:
    print("No results to display.")


Applying UIP with ratio 1:100 on train (optical & tensile). Aug samples: 27573 / 273x


Evaluating models on test set (tensile+optical):  20%|██        | 1/5 [00:00<00:00,  8.32it/s]

{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.RE5C
{'model_params': {'encoder__hidden_layers': 30, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 1, 'tensile__hidden_layers': 1, 'optical__hidden_layers': 1, 'fire__hidden_layers': 1, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.2HNP
{'model_params': {'encoder__hidden_layers': 50, 'encoder__hidden_base': 320.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.CPUT


Evaluating models on test set (tensile+optical): 100%|██████████| 5/5 [00:00<00:00, 14.16it/s]

{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 360.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.VPLV
{'model_params': {'encoder__hidden_layers': 40, 'encoder__hidden_base': 400.0, 'encoder__hidden_scale': 'log_1.025', 'encoder__n_output': 128, 'grade__hidden_layers': 2, 'tensile__hidden_layers': 2, 'optical__hidden_layers': 2, 'fire__hidden_layers': 2, 'dtype_str': 'float32'}}
Successfully loaded model: pt_db.v3hp.PFFZ

UIP-based evaluation (train augmented, test original) summary (per model):
      model_tag  uip_ratio  tensile_test_r2  tensile_test_mre  tensile_test_mse  optical_test_r2  optical_test_mre  optical_test_mse
pt_db.v3hp.RE5C        100         0.197747          0.926468        107.968506      -101.433983          0.728919       7021.876953
pt_db.v3hp.2HN


