# Example climb: Matching mean & standard deviation with maximum difference

This notebook demonstrates using the hill climber to generate pairs of distributions that have identical means and standard deviations, but are as different as possible in their overall shape and structure.

## Goal

Find distributions where:
- **x** and **y** have the same mean
- **x** and **y** have the same standard deviation
- The distributions are as different as possible (maximizing some measure of dissimilarity)

We'll use the two-sample Kolmogorov-Smirnov (KS) statistic as our measure of distribution difference, and subtract a penalty (weighted by a factor of 2) for any difference between the two means or the two standard deviations.

In [None]:
import sys

import numpy as np
import pandas as pd
from scipy import stats

sys.path.append('..')

from hill_climber import HillClimber

## 1. Objective function

In [None]:
def objective_same_mean_std_different_distributions(x, y, penalty_weight=2.0):
    '''Maximize distribution difference while maintaining same mean and std.
    
    Objective function: KS_statistic - penalty_weight * penalty
    
    Where penalty = |mean_x - mean_y| + |std_x - std_y|
    
    This encourages:
        - Identical means between x and y
        - Identical standard deviations between x and y
        - Maximum Kolmogorov-Smirnov statistic (most different distributions)
    
    The two-sample KS statistic measures the maximum difference between the
    empirical cumulative distribution functions of x and y, providing a
    measure of how different the two distributions are overall.
    
    Standard deviations are sample standard deviations (ddof=1).
    
    Args:
        x: First variable (array-like)
        y: Second variable (array-like)
        penalty_weight: Weighting factor for the penalty term. Defaults to
            2.0, the value that was previously hard-coded, so calling the
            function with only (x, y) gives the same result as before.
    
    Returns:
        Tuple of (metrics_dict, objective_value) where:
            - metrics_dict: Dict with means, stds, KS statistic, and the
              unweighted mean / std / total penalties
            - objective_value: KS_statistic - penalty_weight * total penalty
    '''
    # Summary statistics that the two variables are supposed to share
    mean_x = np.mean(x)
    mean_y = np.mean(y)
    std_x = np.std(x, ddof=1)
    std_y = np.std(y, ddof=1)
    
    # Calculate KS statistic (measure of distribution difference);
    # the p-value is not used by the objective
    ks_statistic, _ = stats.ks_2samp(x, y)
    
    # Calculate penalty for not matching mean and std
    mean_penalty = abs(mean_x - mean_y)
    std_penalty = abs(std_x - std_y)
    total_penalty = mean_penalty + std_penalty
    
    # Objective: maximize KS statistic while minimizing the weighted penalty.
    # The KS statistic is bounded by 1, so a weight above 1 lets constraint
    # violations dominate the reward for being different.
    objective = ks_statistic - (penalty_weight * total_penalty)
    
    # Penalties are reported unweighted so they read directly as the
    # absolute differences in mean and std
    metrics = {
        'Mean X': mean_x,
        'Mean Y': mean_y,
        'Std X': std_x,
        'Std Y': std_y,
        'KS Statistic': ks_statistic,
        'Mean Penalty': mean_penalty,
        'Std Penalty': std_penalty,
        'Total Penalty': total_penalty
    }
    
    return metrics, objective

## 2. Input distributions

In [None]:
# Create input distributions with the same mean and std
n = 1000

# Seeded, local random generator: the starting data (and the statistics
# printed below) are identical on every Restart & Run All, and NumPy's
# global random state is left untouched.
seed = 42
rng = np.random.default_rng(seed)

# Start with two similar distributions: independent draws from the same
# normal distribution (mean 5, std 2)
x_init = rng.normal(loc=5.0, scale=2.0, size=n)
y_init = rng.normal(loc=5.0, scale=2.0, size=n)

data = pd.DataFrame({
    'x': x_init,
    'y': y_init
})

# Baseline values for the quantities the objective function works with
print(f"Initial mean X: {np.mean(data['x']):.4f}, mean Y: {np.mean(data['y']):.4f}")
print(f"Initial std X: {np.std(data['x'], ddof=1):.4f}, std Y: {np.std(data['y'], ddof=1):.4f}")
print(f"Initial KS statistic: {stats.ks_2samp(data['x'], data['y'])[0]:.4f}")

## 3. Run parameters

In [None]:
# Hyperparameters for the hill-climbing run, grouped by purpose.
# All of them are consumed by the HillClimber cell further down.

# Function the climber optimizes (defined in the objective-function cell)
objective_func = objective_same_mean_std_different_distributions

# Run budget
max_time = 6 * 60  # NOTE(review): time unit is defined by HillClimber — confirm
replicates = 10

# Perturbation settings
step_size = 1.0
initial_noise = 0.5

# Temperature settings
temperature = 100.0
cooling_rate = 0.99999

## 4. Hill climbing run

In [None]:
# Collect the constructor settings in one place, then build the climber
climber_settings = {
    'data': data,
    'objective_func': objective_func,
    'mode': 'maximize',
    'max_time': max_time,
    'step_size': step_size,
    'temperature': temperature,
    'cooling_rate': cooling_rate,
}
climber = HillClimber(**climber_settings)

# Show the starting distributions (KDE view) before any optimization
climber.plot_input(plot_type='kde')

# Launch the replicate runs and keep their results for plotting
results = climber.climb_parallel(
    initial_noise=initial_noise,
    replicates=replicates,
)

## 5. Results

In [None]:
# Visualize results using the histogram plot type, displaying only the
# 'KS Statistic' metric returned by the objective function
climber.plot_results(results, plot_type='histogram', metrics=['KS Statistic'])