## Imports and data loading

In [78]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from astropy.timeseries import LombScargle
from scipy.stats import median_abs_deviation as MAD
from tqdm import tqdm

In [70]:
# Read the cleaned mid-time table (it already carries the O-C columns) and
# peek at its last three rows as a quick sanity check.
FILENAME = r'data_cleaned/all_mid_time_data_with_OC.csv'
df = pd.read_csv(filepath_or_buffer=FILENAME)
df.iloc[-3:]

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
30341,piMenc,2460149.0,0.00084,space,6.267821,1e-06,122,0.000897,0.001149
30342,piMenc,2460156.0,0.00095,space,6.267821,1e-06,123,0.000166,0.00126
30343,piMenc,2460175.0,0.0011,space,6.267821,1e-06,126,4e-06,0.001414


## Outlier rejection (5-sigma clipping)

In [73]:
# 5 sigma clipping
def sigma_clip_OC(df, sigma=5):
    """Single-pass sigma clipping of the O-C values, done planet by planet.

    For every ``Planet`` group the mean and sample standard deviation of
    ``O-C_(days)`` are computed; a row is kept when its O-C value lies within
    ``sigma`` standard deviations of its group's mean.  Rows for which that
    comparison is undefined (NaN O-C, or a group whose standard deviation is
    NaN, e.g. a single-row group) are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``Planet`` and ``O-C_(days)``.
    sigma : float, optional
        Clipping threshold in units of the per-planet standard deviation.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, in their original order, with a fresh 0..n-1 index.
    """
    oc_by_planet = df.groupby('Planet')['O-C_(days)']
    centre = oc_by_planet.transform('mean')
    spread = oc_by_planet.transform('std')

    # Absolute distance of each measurement from its own planet's mean
    deviation = (df['O-C_(days)'] - centre).abs()
    keep = deviation.le(sigma * spread)
    return df[keep].reset_index(drop=True)

df_clip = sigma_clip_OC(df)

# Keep only planets that still have at least 10 rows after clipping
rows_per_planet = df_clip.groupby('Planet')['Planet'].transform('size')
df_clip = df_clip.loc[rows_per_planet >= 10].reset_index(drop=True)

df_clip.tail(3)

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
29568,piMenc,2460149.0,0.00084,space,6.267821,1e-06,122,0.000897,0.001149
29569,piMenc,2460156.0,0.00095,space,6.267821,1e-06,123,0.000166,0.00126
29570,piMenc,2460175.0,0.0011,space,6.267821,1e-06,126,4e-06,0.001414


## Periodicity search (GLS)

In [93]:
def lomb_scargle_analysis(df_full, planet_name, N_freq=10000, out_dir='fit_pics'):
    """Search one planet's O-C series for a periodic signal and save a figure.

    A single-term Lomb-Scargle periodogram (``normalization="psd"``, weighted
    by the O-C uncertainties) is evaluated on a linear frequency grid running
    from ``1 / (10 * baseline)`` to ``1 / (2 * P_orb)``, where ``baseline`` is
    the span of the mid-times and ``P_orb`` is the most frequent value of
    ``P_(days)`` for the planet.  The highest peak defines the best period; the
    sinusoid fitted at that frequency is drawn over the phase-folded data.
    The two-panel figure (periodogram, folded fit) is written to
    ``<out_dir>/<planet_name>_GLSfit_N=1.jpg``.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Table with the columns ``Planet``, ``source``, ``Tmid_(BJD_TDB)``,
        ``P_(days)``, ``O-C_(days)`` and ``O-C_unc._(days)``.
    planet_name : str
        Value of ``Planet`` selecting the rows to analyse.
    N_freq : int, optional
        Number of points in the frequency grid.
    out_dir : str or path-like, optional
        Folder that receives the figure; it is created when missing.

    Returns
    -------
    dict
        ``planet``, ``period`` (1 / best frequency, in the time units of the
        mid-time column), ``amplitude_1harm`` (semi-amplitude of the fitted
        sinusoid), ``power`` (height of the highest peak), ``FAP``
        (false-alarm probability of that peak) and ``figure`` (path of the
        saved image, as a string).

    Raises
    ------
    ValueError
        If the planet has no usable rows, or all usable mid-times coincide so
        that no frequency grid can be built.
    """
    # Take only the requested planet
    df = df_full[df_full.Planet == planet_name]

    Tmid = df['Tmid_(BJD_TDB)'].to_numpy(dtype=float)
    OC = df['O-C_(days)'].to_numpy(dtype=float)
    OCerr = df['O-C_unc._(days)'].to_numpy(dtype=float)

    # A single NaN, or a zero uncertainty (infinite weight), would turn the
    # whole weighted periodogram into NaN, so such rows are left out.
    usable = np.isfinite(Tmid) & np.isfinite(OC) & np.isfinite(OCerr) & (OCerr > 0)
    df = df[usable]
    Tmid, OC, OCerr = Tmid[usable], OC[usable], OCerr[usable]
    if Tmid.size == 0:
        raise ValueError(f"No usable O-C measurements for planet {planet_name!r}")

    source_labels = df['source'].to_numpy()
    Porb = df['P_(days)'].mode().iloc[0]

    # Frequency grid: from 1/(10 x baseline) up to 1/(2 x orbital period)
    t_first, t_last = Tmid.min(), Tmid.max()
    baseline = t_last - t_first
    if not baseline > 0:
        raise ValueError(
            f"Planet {planet_name!r}: all mid-times coincide, cannot build a frequency grid"
        )
    fmin, fmax = 1 / (10 * baseline), 1 / (2 * Porb)
    freq = np.linspace(fmin, fmax, N_freq)

    # Lomb-Scargle (1-harmonic model)
    ls1 = LombScargle(Tmid, OC, OCerr, nterms=1, normalization="psd")
    power = ls1.power(freq, normalization="psd")

    # Best frequency -> period -> amplitude
    peak_power = power.max()
    f_best = freq[np.argmax(power)]
    period = 1 / f_best

    # model_parameters returns [offset, sin coefficient, cos coefficient] for
    # nterms=1; their quadrature sum is the true semi-amplitude, independent
    # of which phases happen to be sampled by the observations.
    theta = ls1.model_parameters(f_best)
    amp1 = float(np.hypot(theta[1], theta[2]))

    # False-alarm probability of the peak, and the power levels that
    # correspond to FAP = 5 %, 1 % and 0.1 %
    fap = ls1.false_alarm_probability(peak_power, minimum_frequency=fmin, maximum_frequency=fmax)
    fap_levels = [
        ls1.false_alarm_level(p, minimum_frequency=fmin, maximum_frequency=fmax)
        for p in (0.05, 0.01, 0.001)
    ]

    # Phase-fold data and a dense model curve; phase runs over [-0.5, 0.5)
    t_fit = np.linspace(t_first, t_last, 2000)
    phase = ((Tmid - t_first) / period) % 1 - 0.5
    phase_fit = ((t_fit - t_first) / period) % 1 - 0.5

    # Sorting by phase yields a monotonically increasing x axis, so the model
    # can be drawn as one continuous line.
    order = np.argsort(phase_fit)
    phase_s = phase_fit[order]
    model1_s = ls1.model(t_fit, f_best)[order]

    # Plot
    fig, ax = plt.subplots(1, 2, figsize=(12, 3.5))
    P = 1 / freq

    # Left panel: periodogram with peak marker and FAP levels
    ax[0].plot(P, power, lw=0.5, color='black')
    ax[0].axvline(period, lw=1, color='red', ls='-.',
                  label=f'{planet_name} peak at {period:.2f} d\nFAP={fap*100:.3g}%')
    for i, level in enumerate(fap_levels):
        ax[0].axhline(level, ls='--', lw=0.7, color=f'C{i}')
    ax[0].axhline(peak_power, lw=0.7, color='red', ls='-.')

    ax[0].set_xlabel("Period (days)")
    ax[0].set_ylabel("GLS Power")
    ax[0].set_xscale("log")
    ax[0].set_xlim(P.min() / 1.05, P.max() * 1.05)
    ax[0].set_ylim(bottom=0)
    ax[0].legend(loc='upper right')

    # Right panel: folded data, colour-coded by source, with the fitted sinusoid.
    # Rows whose source is not one of these three labels enter the fit but are not drawn.
    for i, source in enumerate(['literature', 'space', 'exoclock']):
        msk = source_labels == source
        ax[1].errorbar(phase[msk], OC[msk], OCerr[msk], fmt='.', color=f'C{i}', zorder=1, label=source)
    ax[1].plot(phase_s, model1_s, label='Fit (N=1)', color='red', lw=1, zorder=2)
    ax[1].set_xlabel("Phase")
    ax[1].set_ylabel("O − C (days)")
    ax[1].set_xlim(-0.51, 0.51)
    ax[1].legend(loc='lower right')

    fig.tight_layout()

    # Make sure the target folder exists before writing the image
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    export_path = out_dir / f'{planet_name}_GLSfit_N=1.jpg'
    fig.savefig(export_path, dpi=300, bbox_inches="tight")
    # Close this specific figure so a loop over many planets does not pile up open figures
    plt.close(fig)

    return {
        "planet": planet_name,
        "period": float(period),
        "amplitude_1harm": amp1,
        "power": float(peak_power),
        "FAP": float(fap),
        "figure": str(export_path),
    }
    
# Produce one GLS figure per planet left in the clipped table
# (the recorded run took just under five minutes for 502 planets).
planet_labels = df_clip['Planet'].dropna().astype(str)
all_planets = planet_labels.unique()
for planet in tqdm(all_planets):
    lomb_scargle_analysis(planet_name=planet, df_full=df_clip)

100%|████████████████████████████████████████████████████████████████████████████████| 502/502 [04:47<00:00,  1.75it/s]
