In [1]:
import wandb
import pandas as pd
import numpy as np
import json
from itertools import product

# CONFIGURATION
EXPERIMENTS_TO_LOAD = ['avg_degree_range']#, 'homophily_range']  # Multiple experiments
METRIC = 'val/accuracy'  # Options: 'test/accuracy', 'test/auroc', 'test/precision', 'test/recall', etc.

run_to_api_dict = {'avg_degree_range': 'graphuniverse/final_degree_experiments',}
                   #'homophily_range': 'graphuniverse/final_homophily_experiments'}

# Initialize wandb API
api = wandb.Api(timeout=100)

# Function to safely extract nested values
def safe_get_nested(obj, keys, default=None):
    """Safely get nested dictionary values"""
    try:
        for key in keys:
            obj = obj[key]
        return obj
    except (KeyError, TypeError, IndexError):
        return default

def dict_to_sorted_string(d):
    """Convert nested dict to consistent sorted string"""
    if d is None:
        return "None"
    return json.dumps(d, sort_keys=True, separators=(',', ':'))

def dict_to_sorted_string_no_seed(d, seed_keys=['seed', 'random_seed', 'random_state']):
    """Convert nested dict to consistent sorted string, excluding seed-related keys"""
    if d is None:
        return "None"
    
    return json.dumps(d, sort_keys=True, separators=(',', ':'))

# Collect data from all experiments
all_data = []
for experiment_name in EXPERIMENTS_TO_LOAD:
    print(f"Loading data from experiment: {experiment_name}")
    
    # Get all runs from this experiment
    runs = api.runs(run_to_api_dict[experiment_name])
    
    for run in runs:
        # Get summary metrics
        summary = run.summary._json_dict
        summary['run_name'] = run.name
        summary['run_id'] = run.id
        summary['experiment_type'] = experiment_name  # Add experiment identifier
        
        # Add config parameters with 'config_' prefix
        config = run.config
        for key, value in config.items():
            summary[f'config_{key}'] = value
        
        all_data.append(summary)

# Convert to DataFrame
df = pd.DataFrame(all_data)



Loading data from experiment: avg_degree_range


In [2]:
# Extract parameters for each experiment type
def extract_params_multi(gen_params_str, experiment_type):
    try:
        parsed = json.loads(gen_params_str)
        n_graphs = parsed['family_parameters']['n_graphs']
        
        # Get the appropriate parameter based on experiment type
        if experiment_type == 'avg_degree_range':
            varied_param_value = tuple(parsed['family_parameters']['avg_degree_range'])
        elif experiment_type == 'homophily_range':
            varied_param_value = tuple(parsed['family_parameters']['homophily_range'])
        else:
            return None, None, None
            
        return n_graphs, varied_param_value, experiment_type
    except:
        return None, None, None

def extract_transform_info(config_transforms, model_name):
    """Extract relevant transform information based on model type"""
    if pd.isna(config_transforms) or config_transforms is None:
        return "no_transform"
    
    if model_name in ['GPS', 'nsd']:
        # Extract encodings for GPS and NSD
        if 'CombinedPSEs' in config_transforms:
            encodings = config_transforms['CombinedPSEs'].get('encodings', [])
            if encodings:
                return '_'.join(sorted(encodings))  # Sort for consistency
        return "no_encoding"
    
    elif model_name == 'topotune':
        return "cell_lifting"
    
    else:
        return "no_transform"


# Create string representations for sorting
df['generation_params_str'] = df['config_dataset'].apply(
    lambda x: dict_to_sorted_string(safe_get_nested(x, ['loader', 'parameters', 'generation_parameters']))
)

df['model_name'] = df['config_model'].apply(
    lambda x: safe_get_nested(x, ['model_name'])
)

# Sort by generation parameters first, then model name second
df_sorted = df.sort_values(['generation_params_str', 'model_name'])

# Extract transform info separately
df_sorted['transform_info'] = df_sorted.apply(
    lambda row: extract_transform_info(row.get('config_transforms'), row['model_name']), 
    axis=1
)

# Create enhanced model config string that includes PE info for GPS/NSD
df_sorted['model_config_str'] = df_sorted.apply(
    lambda row: (dict_to_sorted_string_no_seed(row['config_model']) + f"_PE_{row['transform_info']}") 
                if row['model_name'] in ['GPS', 'nsd'] 
                else dict_to_sorted_string_no_seed(row['config_model']),
    axis=1
)

# Filter out rows with missing essential data
df_clean = df_sorted.dropna(subset=['generation_params_str', 'model_config_str', METRIC, 'checkpoint'])
df_clean = df_clean[df_clean['generation_params_str'] != "None"]

df_clean[['n_graphs', 'varied_param_value', 'experiment_type_check']] = df_clean.apply(
    lambda row: pd.Series(extract_params_multi(row['generation_params_str'], row['experiment_type'])), axis=1
)

df_clean = df_clean.dropna(subset=['n_graphs', 'varied_param_value'])

# Get unique values
unique_models = sorted(df_clean['model_name'].unique())

# Dictionary to store the best models for ALL experiments
best_models_dict = {}

# Process each experiment type separately
for experiment_type in EXPERIMENTS_TO_LOAD:
    print(f"\nProcessing experiment: {experiment_type}")
    
    # Filter data for this experiment
    experiment_data = df_clean[df_clean['experiment_type'] == experiment_type]
    
    if len(experiment_data) == 0:
        print(f"No data found for experiment: {experiment_type}")
        continue
    
    unique_varied_param_values = sorted(experiment_data['varied_param_value'].unique())
    
    # Process each varied parameter value
    for varied_param_value in unique_varied_param_values:
        subset = experiment_data[experiment_data['varied_param_value'] == varied_param_value]
        
        if len(subset) > 0:
            # Create data config key that includes experiment type
            data_config_key = f"{experiment_type}_{varied_param_value}"
            best_models_dict[data_config_key] = {}
            
            # For each model architecture, find the best configuration
            for model_name in unique_models:
                model_subset = subset[subset['model_name'] == model_name]
                
                if len(model_subset) > 0:
                    # Calculate mean performance for each unique model configuration
                    model_config_performance = model_subset.groupby('model_config_str').agg({
                        METRIC: ['mean', 'std', 'count'],
                        'checkpoint': lambda x: list(x)  # Collect all checkpoint paths
                    }).reset_index()
                    
                    # Flatten column names
                    model_config_performance.columns = [
                        'model_config_str', 
                        f'{METRIC}_mean', 
                        f'{METRIC}_std', 
                        f'{METRIC}_count',
                        'checkpoints'
                    ]
                    
                    # Find the configuration with the best mean performance
                    if len(model_config_performance) > 0:
                        best_config_idx = model_config_performance[f'{METRIC}_mean'].idxmax()
                        best_checkpoints = model_config_performance.loc[best_config_idx, 'checkpoints']
                        
                        # Store the checkpoint locations for this model architecture
                        best_models_dict[data_config_key][model_name] = best_checkpoints

# Clean up empty entries
best_models_dict = {k: v for k, v in best_models_dict.items() if v}

# Print summary
print(f"\nSelected best models based on metric: {METRIC}")
print(f"Processed experiments: {EXPERIMENTS_TO_LOAD}")
print(f"\nFound {len(best_models_dict)} data configurations with best models")

for data_config, models in best_models_dict.items():
    print(f"\n{data_config}:")
    for model_name, checkpoints in models.items():
        print(f"  {model_name}: {len(checkpoints)} checkpoint(s)")
        for checkpoint in checkpoints:
            print(f"    - {checkpoint}")


Processing experiment: avg_degree_range

Selected best models based on metric: val/accuracy
Processed experiments: ['avg_degree_range']

Found 2 data configurations with best models

avg_degree_range_(5, 10):
  DeepSet: 3 checkpoint(s)
    - /data/gbg141/TB/outputs/checkpoints/epoch_386.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_422.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_373-v1.ckpt
  GPS: 3 checkpoint(s)
    - /data/gbg141/TB/outputs/checkpoints/epoch_150-v5.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_105-v5.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_146-v5.ckpt
  GraphMLP: 3 checkpoint(s)
    - /data/gbg141/TB/outputs/checkpoints/epoch_329-v7.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_463.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_376-v1.ckpt
  GraphSAGE: 3 checkpoint(s)
    - /data/gbg141/TB/outputs/checkpoints/epoch_178-v3.ckpt
    - /data/gbg141/TB/outputs/checkpoints/epoch_202-v1.ckpt
    - /data/gbg141/TB/outputs/ch

In [4]:
import os
import shutil

# Set the output directory where you want to organize the checkpoints
OUTPUT_DIR = f"./{EXPERIMENTS_TO_LOAD[0]}_checkpoints"

# Make sure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Loop through the best_models_dict and copy files
for data_config, models in best_models_dict.items():
    for model_name, checkpoints in models.items():
        # Create a subfolder for each data_config and model_name
        subfolder = os.path.join(OUTPUT_DIR, str(data_config), str(model_name))
        os.makedirs(subfolder, exist_ok=True)
        for checkpoint_path in checkpoints:
            if os.path.isfile(checkpoint_path):
                # Copy the checkpoint file into the subfolder
                shutil.copy2(checkpoint_path, subfolder)
                print(f"Copied {checkpoint_path} to {subfolder}")
            else:
                print(f"WARNING: Checkpoint file not found: {checkpoint_path}")

Copied /data/gbg141/TB/outputs/checkpoints/epoch_386.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/DeepSet
Copied /data/gbg141/TB/outputs/checkpoints/epoch_422.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/DeepSet
Copied /data/gbg141/TB/outputs/checkpoints/epoch_373-v1.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/DeepSet
Copied /data/gbg141/TB/outputs/checkpoints/epoch_150-v5.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/GPS
Copied /data/gbg141/TB/outputs/checkpoints/epoch_105-v5.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/GPS
Copied /data/gbg141/TB/outputs/checkpoints/epoch_146-v5.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/GPS
Copied /data/gbg141/TB/outputs/checkpoints/epoch_329-v7.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/GraphMLP
Copied /data/gbg141/TB/outputs/checkpoints/epoch_463.ckpt to ./avg_degree_range_checkpoints/avg_degree_range_(5, 10)/Grap

In [5]:
import json

# Save the best_models_dict as a JSON file in the output directory
with open(os.path.join(OUTPUT_DIR, "best_models_dict.json"), "w") as f:
    json.dump(best_models_dict, f, indent=2)
print(f"Saved best_models_dict to {os.path.join(OUTPUT_DIR, 'best_models_dict.json')}")

# Compress the organized checkpoints folder after copying
shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)
print(f"Compressed folder created: {OUTPUT_DIR}.zip")

Saved best_models_dict to ./avg_degree_range_checkpoints/best_models_dict.json
Compressed folder created: ./avg_degree_range_checkpoints.zip


In [7]:
print(f"scp -R gbg141@bobby.ece.ucsb.edu:/home/gbg141/TopoBench-1/tutorials/{OUTPUT_DIR}.zip .")
print(f"scp -R gbg141@daisy.ece.ucsb.edu:/home/gbg141/TopoBench/tutorials/{OUTPUT_DIR}.zip .")
# Run scp -R gbg141@daisy.ece.ucsb.edu:/home/gbg141/TopoBench/tutorials/selected_checkpoints.zip .

scp -R gbg141@bobby.ece.ucsb.edu:/home/gbg141/TopoBench-1/tutorials/./avg_degree_range_checkpoints.zip .
scp -R gbg141@daisy.ece.ucsb.edu:/home/gbg141/TopoBench/tutorials/./avg_degree_range_checkpoints.zip .
