In [26]:
from scipy.optimize import minimize
import numpy as np
import pandas as pd
from sklearn.metrics import mutual_info_score

def generate_variables_for_mi(target_mi, n_samples=1000):
    """Generates two lists of variables (X, Y) with approximately the given mutual information."""
    
    # Generate base variable X
    X = np.random.randn(n_samples)
    
    # Define a function that adjusts noise to achieve target MI
    def mi_objective(noise_std):
        """Objective function to minimize the difference between actual and target MI."""
        Y = 3 * X + np.random.normal(scale=noise_std, size=n_samples)  # Add controlled noise
        X_discrete = pd.qcut(X, q=10, labels=False, duplicates="drop")
        Y_discrete = pd.qcut(Y, q=10, labels=False, duplicates="drop")
        mi = mutual_info_score(X_discrete, Y_discrete)
        return abs(mi - target_mi)  # We want MI close to target
    
    # Find optimal noise level to match target MI
    result = minimize(mi_objective, x0=1.0, bounds=[(0.01, 10.0)])
    optimal_noise = result.x[0]

    # Generate final dataset with optimized noise
    Y_final = 3 * X + np.random.normal(scale=optimal_noise, size=n_samples)

    return X, Y_final, optimal_noise

# Example: Generate two lists with MI ≈ 1.5
target_mi = 4
X_generated, Y_generated, final_noise = generate_variables_for_mi(target_mi)

# Compute achieved MI for verification
X_discrete_final = pd.qcut(X_generated, q=10, labels=False, duplicates="drop")
Y_discrete_final = pd.qcut(Y_generated, q=10, labels=False, duplicates="drop")
achieved_mi = mutual_info_score(X_discrete_final, Y_discrete_final)

final_noise, achieved_mi

(1.000007581115343, 0.9828249867374883)