# Example climb: Matching means with maximum structural diversity

## 1. Notebook setup

### 1.1. Imports


In [None]:
# Standard library
import sys
from itertools import combinations

# Third-party numerical and plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats

# Extend the import path with the parent directory before importing
# hill_climber (presumably the package lives one level above this
# notebook -- verify against the repository layout).
sys.path.append('..')

from hill_climber import HillClimber, plot_optimization_results

# Seed NumPy's global RNG so the np.random draws below are reproducible.
np.random.seed(315)

### 1.2. Input distributions

In [None]:
# Build the starting dataset: four columns (w, x, y, z), each drawn from the
# same normal distribution, so they begin with near-identical shapes.

n = 100

data = pd.DataFrame({
    name: np.random.normal(loc=10.0, scale=2.0, size=n)
    for name in ('w', 'x', 'y', 'z')
})

# Floor every value at 0.1 so that all columns are strictly positive
data = data.clip(lower=0.1)

print("Initial statistics:")

for name in data.columns:
    column_mean = np.mean(data[name])
    print(f"Mean {name}: {column_mean:.4f}")

print("\nInitial pairwise KS statistics (similarity):")

for left, right in combinations(data.columns, 2):
    ks_result = stats.ks_2samp(data[left], data[right])
    print(f"{left}-{right}: {ks_result.statistic:.4f}")

# Overlay one KDE curve per column to show the initial distribution shapes
fig, ax = plt.subplots()

for name in data.columns:
    data[name].plot.kde(label=name, ax=ax)

ax.set_title('Initial distributions (all similar)')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.legend()
plt.show()

## 2. Maximize structural diversity while preserving mean similarity

### 2.1. Objective function


In [None]:
def objective_similar_means_diverse_structures(w, x, y, z, *, penalty_weight=2):
    """Score four samples: reward pairwise diversity, penalize mean spread.

    Objective: mean_wasserstein_distance - penalty_weight * mean_penalty

    This encourages:
        - Similar means across all 4 distributions (w, x, y, z)
        - Large pairwise Wasserstein (Earth Mover's) distances between them

    The mean penalty is the average absolute deviation of the four sample
    means from their own average, so it is zero when all means coincide.

    Args:
        w, x, y, z: Four variables (array-like).
        penalty_weight: Keyword-only multiplier applied to the mean penalty
            before it is subtracted from the mean Wasserstein distance.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        Tuple of (metrics_dict, objective_value) where:
            - metrics_dict: Dict with per-variable means and standard
              deviations, the target mean, the mean penalty, the mean/min/max
              Wasserstein distance and each pairwise Wasserstein distance
            - objective_value: mean_wasserstein - penalty_weight * mean_penalty
    """

    samples = {'w': w, 'x': x, 'y': y, 'z': z}

    # First moment of each sample (constrained) and spread (reported only)
    means = {name: np.mean(values) for name, values in samples.items()}
    stds = {name: np.std(values) for name, values in samples.items()}

    # Target mean is the average of the four sample means
    target_mean = np.mean(list(means.values()))

    # Mean penalty - how far each mean is from the target, on average
    mean_penalty = np.mean(
        [abs(mean_val - target_mean) for mean_val in means.values()]
    )

    # Pairwise Wasserstein distances, in combinations() order:
    # w-x, w-y, w-z, x-y, x-z, y-z
    wasserstein_distances = {
        f'Wasserstein_{name1}_{name2}': stats.wasserstein_distance(
            samples[name1], samples[name2]
        )
        for name1, name2 in combinations(samples, 2)
    }
    wasserstein_values = list(wasserstein_distances.values())

    # Higher mean Wasserstein distance means more structural diversity
    mean_wasserstein = np.mean(wasserstein_values)
    min_wasserstein = np.min(wasserstein_values)
    max_wasserstein = np.max(wasserstein_values)

    # Objective: maximize structural diversity while maintaining similar means
    objective = mean_wasserstein - (penalty_weight * mean_penalty)

    # Compile metrics (key order: means, stds, summaries, pairwise distances)
    metrics = {
        **{f'Mean {name.upper()}': value for name, value in means.items()},
        **{f'Std {name.upper()}': value for name, value in stds.items()},
        'Target Mean': target_mean,
        'Mean Penalty': mean_penalty,
        'Mean Wasserstein': mean_wasserstein,
        'Min Wasserstein': min_wasserstein,
        'Max Wasserstein': max_wasserstein,
        **wasserstein_distances,
    }

    return metrics, objective

### 2.2. Hill climbing run

In [None]:
# Collect the run settings in one place, then build the HillClimber
# (replica-exchange configuration: see n_replicas / exchange_interval).
climber_settings = {
    'data': data,
    'objective_func': objective_similar_means_diverse_structures,
    'max_time': 3,
    'n_replicas': 8,
    'T_min': 0.1,
    'T_max': 10,
    'cooling_rate': 1e-8,
    'exchange_interval': 100,
    'db_enabled': True,
    'db_path': '../data/hill_climber_progress.db',
    'checkpoint_file': '../data/hill_climber_progress.pkl',
}

climber = HillClimber(**climber_settings)

In [None]:
# Run optimization with replica exchange.
# climb() returns a pair, unpacked here as best_data and history_df
# (presumably the best dataset found and a table of the optimization
# history -- verify against the hill_climber documentation).
best_data, history_df = climber.climb()