# Example: Checkpointing and Resume Functionality

## 1. Notebook setup

### 1.1. Imports

In [1]:
import pickle
import sys
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy import stats

sys.path.append('..')

from hill_climber import HillClimber

### 1.2. Setup data and objective function

In [2]:
# Build the starting dataset: two columns drawn from the same normal
# distribution (mean 10, std 2), seeded so the draw is repeatable.
n = 5000
np.random.seed(42)

# Column 'x' is drawn before 'y', so the seeded stream is consumed in order.
data = pd.DataFrame({
    column: np.random.normal(loc=10.0, scale=2.0, size=n)
    for column in ('x', 'y')
})

# Floor every value at 0.1 so both columns are strictly positive
data = data.clip(lower=0.1)

# Report the starting moments (standard deviation uses ddof=1)
print(f"Initial mean X: {np.mean(data['x']):.4f}, mean Y: {np.mean(data['y']):.4f}")
print(f"Initial std X: {np.std(data['x'], ddof=1):.4f}, std Y: {np.std(data['y'], ddof=1):.4f}")

Initial mean X: 10.0112, mean Y: 9.9803
Initial std X: 1.9930, std Y: 2.0209


In [3]:
def objective_same_mean_std_different_distributions(x, y):
    '''Score how different two samples are while sharing mean and std.

    The score is the two-sample KS statistic minus a weighted penalty for
    any mismatch between the samples' means and standard deviations
    (computed with ddof=1).

    Args:
        x: First sample; passed to np.mean, np.std and stats.ks_2samp.
        y: Second sample; used the same way as x.

    Returns:
        A (metrics, objective) tuple. metrics is a dict of the intermediate
        quantities keyed by display name; objective is the scalar score.
    '''
    # First two moments of each sample
    mean_x, mean_y = np.mean(x), np.mean(y)
    std_x, std_y = np.std(x, ddof=1), np.std(y, ddof=1)

    # Distribution difference: element 0 of the KS result is the statistic
    ks_statistic = stats.ks_2samp(x, y)[0]

    # Relative mismatches. Each gap is divided by the average magnitude of
    # the pair plus 0.1, which keeps the denominator strictly positive.
    mean_penalty = abs(mean_x - mean_y) / ((abs(mean_x) + abs(mean_y)) / 2 + 0.1)
    std_penalty = abs(std_x - std_y) / ((std_x + std_y) / 2 + 0.1)
    total_penalty = mean_penalty + std_penalty

    # Reward the KS statistic, charge half of the combined penalty
    penalty_weight = 0.5
    objective = ks_statistic - (penalty_weight * total_penalty)

    return {
        'Mean X': mean_x,
        'Mean Y': mean_y,
        'Std X': std_x,
        'Std Y': std_y,
        'KS Statistic': ks_statistic,
        'Mean Penalty': mean_penalty,
        'Std Penalty': std_penalty,
        'Total Penalty': total_penalty,
    }, objective

## 2. Example 1: Single run with checkpointing

In [4]:
# Where this run's periodic checkpoints are written
checkpoint_file = '../data/single_run_checkpoint.pkl'

# Checkpointing is switched on by handing the climber a file path together
# with a save interval.
climber = HillClimber(
    # Problem definition
    data=data,
    objective_func=objective_same_mean_std_different_distributions,
    mode='maximize',
    # Search hyperparameters
    perturb_fraction=0.3,
    step_spread=2.0,
    temperature=1000.0,
    cooling_rate=0.001,
    # Run length and checkpoint cadence
    max_time=3,        # 3 minutes
    checkpoint_file=checkpoint_file,
    save_interval=30,  # Save every 30 seconds
)

print("Starting optimization with periodic checkpointing...")
best_data, steps_df = climber.climb()

# "Final objective" is the value in the last row of the returned history
single_run_last_objective = steps_df['Objective value'].iloc[-1]
single_run_step_count = len(steps_df)

print("\nOptimization completed!")
print(f"Final objective: {single_run_last_objective:.4f}")
print(f"Total steps: {single_run_step_count}")

Starting optimization with periodic checkpointing...
Starting replica exchange with 4 replicas (parallel (4 workers))...
Temperature ladder: [ 1000.          2154.43469003  4641.58883361 10000.        ]
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl
Checkpoint saved: ../data/single_run_checkpoint.pkl

Best result from replica 0 (T=0.1)
Exchange acceptance rate: 100.00%

Optimization completed!
Final objective: 0.0254
Total steps: 9001
Checkpoint saved: ../data/single_run_checkpoint.pkl

Best result from replica 0 (T=0.1)
Exchange acceptance rate: 100.00%

Optimization completed!
Final objective: 0.0254
Total steps: 9001


## 3. Example 2: Resume from checkpoint

In [5]:
# Pick the earlier run back up from its checkpoint and give it more time
print("Resuming optimization from checkpoint...")

# Rebuild the climber from the saved file; the objective function is supplied
# again alongside the checkpoint path.
resumed_climber = HillClimber.load_checkpoint(
    objective_func=objective_same_mean_std_different_distributions,
    filepath=checkpoint_file,
)

# Adjust the time budget before continuing.
# NOTE(review): intended as 2 additional minutes (5 in total) - confirm that
# max_time is measured from the resume point rather than from the start.
resumed_climber.max_time = 2

# Continue optimization
resumed_best_data, resumed_steps_df = resumed_climber.climb()

# "Final objective" is the value in the last row of the returned history
resumed_last_objective = resumed_steps_df['Objective value'].iloc[-1]
resumed_step_count = len(resumed_steps_df)

print("\nResumed optimization completed!")
print(f"Final objective: {resumed_last_objective:.4f}")
print(f"Total steps after resume: {resumed_step_count}")

Resuming optimization from checkpoint...
Resumed from checkpoint: 3.2 minutes elapsed
Starting replica exchange with 4 replicas (parallel (4 workers))...
Temperature ladder: [ 1000.          2154.43469003  4641.58883361 10000.        ]
Resumed from checkpoint: 3.2 minutes elapsed
Starting replica exchange with 4 replicas (parallel (4 workers))...
Temperature ladder: [ 1000.          2154.43469003  4641.58883361 10000.        ]

Best result from replica 1 (T=0.0)
Exchange acceptance rate: 94.12%

Resumed optimization completed!
Final objective: 0.0301
Total steps after resume: 11001

Best result from replica 1 (T=0.0)
Exchange acceptance rate: 94.12%

Resumed optimization completed!
Final objective: 0.0301
Total steps after resume: 11001


## 4. Example 3: Replica exchange with checkpointing

In [6]:
# Where this run's periodic checkpoints are written
checkpoint_file_replica = '../data/replica_exchange_checkpoint.pkl'

# Same problem as before, now with the replica count and both ends of the
# temperature range spelled out explicitly.
replica_climber = HillClimber(
    # Problem definition
    data=data,
    objective_func=objective_same_mean_std_different_distributions,
    mode='maximize',
    # Search hyperparameters
    perturb_fraction=0.3,
    step_spread=2.0,
    cooling_rate=0.001,
    # Replica exchange settings
    n_replicas=4,
    temperature=100.0,  # T_min
    T_max=1000.0,       # T_max
    # Run length and checkpoint cadence
    max_time=3,         # 3 minutes
    checkpoint_file=checkpoint_file_replica,
    save_interval=30,   # Save every 30 seconds
)

print("Starting replica exchange optimization with checkpointing...")
best_data_replica, steps_df_replica = replica_climber.climb()

# "Final objective" is the value in the last row of the returned history
replica_last_objective = steps_df_replica['Objective value'].iloc[-1]
replica_step_count = len(steps_df_replica)

print("\nReplica exchange optimization completed!")
print(f"Final objective: {replica_last_objective:.4f}")
print(f"Total steps: {replica_step_count}")

Starting replica exchange optimization with checkpointing...
Starting replica exchange with 4 replicas (parallel (4 workers))...
Temperature ladder: [ 100.          215.443469    464.15888336 1000.        ]
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl

Best result from replica 2 (T=0.1)
Exchange acceptance rate: 100.00%

Replica exchange optimization completed!
Final objective: 0.0312
Total steps: 9001
Checkpoint saved: ../data/replica_exchange_checkpoint.pkl

Best result from replica 2 (T=0.1)
Exchange acceptance rate: 100.00%

Replica exchange optimization completed!
Final objective: 0.0312
Total steps: 9001


## 5. Check checkpoint files

In [7]:
# Report whether each checkpoint file configured in this notebook exists.
# Only paths that a run was actually given are checked. The resume example
# loads `checkpoint_file` and is never assigned a path of its own, so there
# is no separate "resumed" checkpoint file to look for.
checkpoint_paths = {
    'Single run': checkpoint_file,
    'Replica exchange': checkpoint_file_replica,
}

print("Checkpoint files created:")
for label, path in checkpoint_paths.items():
    print(f"{label}: {os.path.exists(path)}")

Checkpoint files created:
Single run: True
Resumed run: False
Replica exchange: True


## 6. Load and inspect a checkpoint

In [8]:
# Open the single-run checkpoint and summarize what it stores
if not os.path.exists(checkpoint_file):
    print("Checkpoint file not found")
else:
    # The checkpoint is a pickle: only unpickle files you produced yourself
    with open(checkpoint_file, 'rb') as fh:
        checkpoint_data = pickle.load(fh)

    # Top-level contents: the per-replica states and the elapsed run time
    print("Checkpoint contents:")
    replicas = checkpoint_data['replicas']
    print(f"- Number of replicas: {len(replicas)}")
    print(f"- Elapsed time: {checkpoint_data['elapsed_time']:.1f} seconds")

    # Detail for the first stored replica
    first_replica = replicas[0]
    hyperparameter_names = list(first_replica['hyperparameters'].keys())
    print("\nFirst replica state:")
    print(f"  - Step: {first_replica['step']}")
    print(f"  - Best objective: {first_replica['best_objective']:.4f}")
    print(f"  - Current temperature: {first_replica['temperature']:.2f}")
    print(f"  - Hyperparameters: {hyperparameter_names}")

Checkpoint contents:
- Number of replicas: 4
- Elapsed time: 192.2 seconds

First replica state:
  - Step: 9001
  - Best objective: 0.0460
  - Current temperature: 0.12
  - Hyperparameters: ['max_time', 'perturb_fraction', 'cooling_rate', 'mode', 'target_value', 'step_spread']
