In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import differential_evolution
from scipy.fft import fft, fftfreq
from sklearn.metrics import r2_score

# --- Step 1: Initial-Guess Estimation ---
# Estimates amplitude, decay, dominant frequency, phase and offset for a
# single sinusoidal component directly from the data. The fitting routine
# in Step 2 derives its parameter bounds from these estimates.

def estimate_initial_params(data, value_column="value"):
    """
    Estimate initial parameters for a single sinusoidal component from data.

    The sampling rate is taken from the median positive spacing between
    consecutive timestamps, so one atypical gap (or a duplicated first
    timestamp) does not distort the FFT frequency axis.

    Args:
        data (pd.DataFrame): The input DataFrame with a datetime index.
        value_column (str): The name of the column containing the signal values.

    Returns:
        list: A list of estimated parameters [Amplitude, Decay, Frequency, Phase, Offset].
            Frequency is in cycles per year and Decay is in years; Phase is
            always 0.0.
    """
    seconds_per_year = 3600 * 24 * 365.25

    values = data[value_column].values
    dates = data.index
    elapsed_seconds = np.asarray((dates - dates[0]).total_seconds(), dtype=float)
    t_years = elapsed_seconds / seconds_per_year

    # Basic estimates
    amplitude = np.std(values) * 2
    offset = np.mean(values)
    decay = np.ptp(t_years) / 2  # Guess decay as half the signal duration

    # Auto-detect the sampling rate from the typical (median) positive step.
    # Zero or negative steps (duplicates, unsorted rows) carry no rate
    # information and would otherwise cause a division by zero.
    steps = np.diff(elapsed_seconds)
    steps = steps[steps > 0]
    if steps.size:
        samples_per_year = seconds_per_year / np.median(steps)
    else:
        samples_per_year = 12  # Default to monthly if no usable spacing exists

    # Use FFT to find the single strongest frequency
    fft_vals = fft(values - offset)  # Remove offset for clearer FFT
    freqs = fftfreq(len(values), 1 / samples_per_year)

    # Find peak frequency in the positive spectrum (ignoring the DC component at index 0)
    positive_freq_indices = np.where(freqs > 0)[0]
    if len(positive_freq_indices) > 0:
        magnitudes = np.abs(fft_vals[positive_freq_indices])
        peak_idx = positive_freq_indices[np.argmax(magnitudes)]
        frequency = freqs[peak_idx]
    else:
        frequency = 1.0  # Fallback frequency

    # Return the list of initial guesses [A, tau, f, phi, c]
    # Phase (phi) is initially set to 0.0
    return [amplitude, decay, frequency, 0.0, offset]


# --- Step 2: Core Model and Fitting Functions (Updated) ---

def multi_damped_sine_model(t, params, num_components):
    """
    Evaluate a sum of exponentially damped sine waves plus a constant offset.

    Args:
        t: Time values at which the model is evaluated.
        params: Flat sequence ``[A1, tau1, f1, phi1, ..., AN, tauN, fN, phiN, offset]``.
        num_components (int): Number of (A, tau, f, phi) groups in ``params``.

    Returns:
        The model evaluated at ``t``: the sum over components of
        ``A * exp(-t / tau) * sin(2*pi*f*t + phi)``, plus the offset.
    """
    # One row per component: amplitude, decay time, frequency, phase.
    waves = np.array(params[:-1]).reshape(num_components, 4)
    baseline = params[-1]

    # sum() starts from 0, so with zero components only the offset remains.
    oscillation = sum(
        amp * np.exp(-t / decay_time) * np.sin(2 * np.pi * freq * t + phase)
        for amp, decay_time, freq, phase in waves
    )
    return oscillation + baseline

def fit_multi_damped_sine_wave(
    time_series_data,
    num_components=1,
    value_column="value"
):
    """
    Fit a sum of N damped sine waves plus a shared offset to a time series.

    Starting values come from ``estimate_initial_params`` (first component)
    and from the largest FFT magnitude bins (all components when N > 1).
    The search bounds are derived from those guesses, and the guesses are
    also handed to the optimizer as its starting member (``x0``).

    Args:
        time_series_data (pd.DataFrame): Input data with a datetime index.
        num_components (int): Number of damped sine components to fit.
        value_column (str): Name of the column holding the signal values.

    Returns:
        tuple: ``(fitted_params, fitted_signal, fit_metrics, t_years)`` where
            ``fitted_params`` is the flat vector
            ``[A1, tau1, f1, phi1, ..., AN, tauN, fN, phiN, offset]``,
            ``fitted_signal`` is the model evaluated at ``t_years``,
            ``fit_metrics`` is a dict with keys ``'R²'`` and ``'RMSE'``, and
            ``t_years`` is the elapsed time in years since the first sample.

    Raises:
        ValueError: If ``num_components`` is less than 1, if the time index
            has no increasing timestamps or does not span a positive
            duration, or if the series is too short to supply one FFT bin
            per requested component.
    """
    if num_components < 1:
        raise ValueError("num_components must be at least 1")

    seconds_per_year = 3600 * 24 * 365.25
    dates = time_series_data.index
    t_years = (dates - dates[0]).total_seconds() / seconds_per_year
    observed_values = time_series_data[value_column].values
    value_std = np.std(observed_values)

    # 1. First component (and the shared offset) from the estimator.
    first_guess = estimate_initial_params(time_series_data, value_column)
    initial_guess = list(first_guess[:4])  # [A, tau, f, phi]
    offset_guess = first_guess[4]

    # 2. For additional components, take the strongest FFT bins.
    if num_components > 1:
        # Median positive spacing: robust to one odd gap and to duplicated
        # timestamps, which would otherwise divide by zero.
        elapsed_seconds = np.asarray(
            (dates - dates[0]).total_seconds(), dtype=float
        )
        steps = np.diff(elapsed_seconds)
        steps = steps[steps > 0]
        if not steps.size:
            raise ValueError(
                "time index must contain increasing timestamps to estimate "
                "the sampling rate"
            )
        samples_per_year = seconds_per_year / np.median(steps)

        yf = fft(observed_values - offset_guess)
        xf = fftfreq(len(observed_values), d=1 / samples_per_year)

        # Positive-frequency bins only, skipping the DC bin at index 0.
        magnitudes = np.abs(yf[1:len(yf) // 2])
        if len(magnitudes) < num_components:
            raise ValueError(
                f"series of length {len(observed_values)} is too short to "
                f"estimate {num_components} frequency components"
            )
        freq_indices = np.argsort(magnitudes)[::-1] + 1
        freq_guesses = np.abs(xf[freq_indices[:num_components]])

        # Overwrite the first frequency guess for consistency
        initial_guess[2] = freq_guesses[0]

        for i in range(1, num_components):
            initial_guess.extend([
                value_std / 2,    # Guess smaller amplitude for secondary peaks
                t_years[-1],      # Guess a different decay
                freq_guesses[i],  # Use the i-th frequency peak
                0.0,
            ])

    # 3. Bounds for every component, derived from its own guess.
    bounds = []
    for i in range(num_components):
        amp_g, decay_g, freq_g = initial_guess[4 * i:4 * i + 3]
        amp_scale = 2 if i == 0 else 3  # wider amplitude box for secondary peaks

        decay_upper = decay_g * 5
        if not decay_upper > 0:
            raise ValueError("time index must span a positive duration")
        # Keep the 0.1-year floor whenever it is a valid lower bound; for
        # very short series shrink it so that lower < upper still holds.
        decay_lower = 0.1 if decay_upper > 0.1 else decay_upper * 0.1

        bounds.extend([
            (0, amp_g * amp_scale),
            (decay_lower, decay_upper),
            (freq_g * 0.5, freq_g * 1.5),
            (-np.pi, np.pi),
        ])

    # Add the single offset parameter at the end
    initial_guess.append(offset_guess)
    bounds.append((offset_guess - value_std, offset_guess + value_std))

    # The optimizer rejects an x0 outside the bounds, so clip the guesses.
    lower, upper = np.array(bounds, dtype=float).T
    x0 = np.clip(np.asarray(initial_guess, dtype=float), lower, upper)

    # --- Optimization ---
    # Evaluate the objective on a plain ndarray: same numbers as the pandas
    # index, without pandas overhead on every function evaluation.
    t_values = np.asarray(t_years, dtype=float)

    def objective(params):
        predicted = multi_damped_sine_model(t_values, params, num_components)
        return np.sum((observed_values - predicted) ** 2)

    result = differential_evolution(
        objective, bounds, seed=42, maxiter=2500, tol=1e-7, x0=x0
    )

    # --- Package and Return Results ---
    fitted_params = result.x
    fitted_signal = multi_damped_sine_model(t_years, fitted_params, num_components)
    r2 = r2_score(observed_values, fitted_signal)
    rmse = np.sqrt(np.mean((observed_values - fitted_signal) ** 2))
    fit_metrics = {'R²': r2, 'RMSE': rmse}

    return fitted_params, fitted_signal, fit_metrics, t_years


# --- Step 3: Execution with Real Data ---
try:
    print("--- Analysis of test_data_2.csv ---")

    # Load the CSV and promote the parsed 'time' column to a datetime index.
    df = pd.read_csv('test_data_2.csv')
    df = df.assign(time=pd.to_datetime(df['time'])).set_index('time')

    # Model order and the signal column handed to the fitting routine.
    NUM_COMPONENTS = 2
    VALUE_COLUMN_NAME = 'residual'

    print(f"Fitting model with {NUM_COMPONENTS} components on column: '{VALUE_COLUMN_NAME}'")

    fitted_params, fitted_signal, metrics, t_years = fit_multi_damped_sine_wave(
        df, num_components=NUM_COMPONENTS, value_column=VALUE_COLUMN_NAME
    )

    # --- Report: goodness of fit ---
    print("\n--- Fit Quality ---")
    print(f"R²: {metrics['R²']:.4f}")
    print(f"RMSE: {metrics['RMSE']:.4f}")

    # --- Report: one row of (A, tau, f, phi) per component, lowest frequency first ---
    print("\n--- Fitted Parameters ---")
    offset_val = fitted_params[-1]
    component_params = np.array(fitted_params[:-1]).reshape(NUM_COMPONENTS, 4)
    frequency_order = np.argsort(component_params[:, 2])
    component_params = component_params[frequency_order]

    for i, (A, tau, f, phi) in enumerate(component_params):
        # A non-positive frequency has no finite period.
        if f > 0:
            period_years = 1/f
        else:
            period_years = float('inf')
        print(f"\n-- Component {i+1} --")
        print(f"  Amplitude (A): {A:.4f}")
        print(f"  Decay (τ):     {tau:.4f} years")
        print(f"  Frequency (f): {f:.4f} cycles/year (Period: {period_years:.2f} years)")
        print(f"  Phase (φ):     {phi:.4f} radians")
    print(f"\nShared Offset (c): {fitted_params[-1]:.4f}")

    # --- Plot: data, total fit, and each component shifted by the shared offset ---
    plt.figure(figsize=(14, 7))
    plt.plot(df.index, df[VALUE_COLUMN_NAME], 'k-', label='Original Data', alpha=0.5)
    plt.plot(df.index, fitted_signal, 'r-', label='Total Fitted Model', linewidth=2)
    for i, single_component in enumerate(component_params):
        # Evaluate the component on its own (zero offset), then lift it onto
        # the fitted baseline so it overlays the data.
        comp_signal = multi_damped_sine_model(t_years, [*single_component, 0], 1)
        plt.plot(df.index, comp_signal + offset_val, '--', label=f'Component {i+1}')
    plt.title('Multi-Component Fit Using New Initial Guess Function')
    plt.legend()
    plt.grid(True)
    plt.show()

except FileNotFoundError:
    print("Error: 'test_data_2.csv' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

--- Analysis of test_data_2.csv ---
Fitting model with 2 components on column: 'residual'
