## Import

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from astropy.timeseries import LombScargle
from scipy.stats import median_abs_deviation as MAD
from tqdm import tqdm

In [26]:
# Mid-time table that already carries the O-C residual columns used below
FILENAME = r'data_cleaned/all_mid_time_data_with_OC.csv'
df = pd.read_csv(filepath_or_buffer=FILENAME)
df.tail(n=3)

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
30341,piMenc,2460149.0,0.00084,space,6.267821,1e-06,122,0.000897,0.001149
30342,piMenc,2460156.0,0.00095,space,6.267821,1e-06,123,0.000166,0.00126
30343,piMenc,2460175.0,0.0011,space,6.267821,1e-06,126,4e-06,0.001414


## Outlier rejection (5-sigma clipping)

In [27]:
# 5 sigma clipping
def sigma_clip_OC(df, sigma=5):
    """Drop rows whose O-C value lies too far from its planet's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Planet' and 'O-C_(days)'.
    sigma : float, default 5
        Clipping threshold in units of the per-planet sample standard deviation.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index. Rows whose per-planet
        standard deviation is NaN (e.g. a planet with a single row) fail the
        comparison and are therefore dropped as well.
    """
    oc_by_planet = df.groupby('Planet')['O-C_(days)']
    centre = oc_by_planet.transform('mean')
    spread = oc_by_planet.transform('std')

    # Single pass: mean and std are computed once, on the unclipped data
    deviation = (df['O-C_(days)'] - centre).abs()
    keep = deviation.le(sigma * spread)
    return df[keep].reset_index(drop=True)
df_clip = sigma_clip_OC(df)

# Keep only planets that still have at least 5 observations after clipping
obs_per_planet = df_clip.groupby('Planet')['Planet'].transform('size')
df_clip = df_clip[obs_per_planet >= 5].reset_index(drop=True)

df_clip.tail(3)

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
30172,piMenc,2460149.0,0.00084,space,6.267821,1e-06,122,0.000897,0.001149
30173,piMenc,2460156.0,0.00095,space,6.267821,1e-06,123,0.000166,0.00126
30174,piMenc,2460175.0,0.0011,space,6.267821,1e-06,126,4e-06,0.001414


In [28]:
# Load the large merged catalogue; the final fit parameters are inserted into it later
FILENAME = r'data_cleaned/merged_exoclock_exoplanet.eu_nasa.csv'
df_big = pd.read_csv(filepath_or_buffer=FILENAME)
df_big.head(n=3)

Unnamed: 0,name_exoclock,name_exoplanet.eu,star_name,pl_letter,T0_(BJD_TDB),T0_unc.,P_(days),P_unc.,mass,mass_error,...,omega,omega_error,inclination,inclination_error,star_mass,star_mass_error,star_radius,star_radius_error,midtime_counts,planet_count
0,,55 Cnc Ab,55 Cnc A,b,,,,,0.84,0.23,...,110.0,54.0,89.73,24.54,0.9,0.115,0.963,0.0654,,5.0
1,,55 Cnc Ac,55 Cnc A,c,,,,,0.1714,0.0055,...,356.0,22.0,89.73,24.54,0.9,0.115,0.963,0.0654,,5.0
2,,55 Cnc Ad,55 Cnc A,d,,,,,3.878,0.068,...,254.0,32.0,89.73,24.54,0.9,0.115,0.963,0.0654,,5.0


## Periodicity search (GLS)

In [29]:
def lomb_scargle_analysis(df_full, planet_name):
    """Search one planet's O-C residuals for a periodic signal with Lomb-Scargle.

    Fits a 1-harmonic and a 2-harmonic Lomb-Scargle model on a common frequency
    grid, keeps the one with the lower BIC, classifies the TTV strength, and
    saves a two-panel figure (periodogram + phase-folded fit) to
    ``fit_pics/{planet_name}_GLSfit_N={Nbest}.jpg``.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Table with the columns 'Planet', 'Tmid_(BJD_TDB)', 'P_(days)',
        'O-C_(days)', 'O-C_unc._(days)' and 'source'.
    planet_name : str
        Value of the 'Planet' column selecting the rows to analyse.

    Returns
    -------
    tuple
        ``(period, period_err, amp1, amp2, Nbest, TTV_strength, scatter,
        bic0, bic1, bic2, delta_BIC, fap, Nfreq)`` where

        * ``period`` is 1 / (frequency of the highest peak) and ``period_err``
          is the larger distance to the periods of the two neighbouring grid
          points (NaN when the peak sits on either edge of the grid);
        * ``amp1`` / ``amp2`` are the amplitudes of the first / second harmonic
          (``amp2`` is NaN for the 1-harmonic model);
        * ``Nbest`` is the number of harmonics of the preferred model (1 or 2);
        * ``TTV_strength`` is 'strong', 'weak' or 'no_TTV';
        * ``scatter`` is MAD(O-C) divided by the median O-C uncertainty;
        * ``bic0``, ``bic1``, ``bic2`` are the BICs of the constant, 1-harmonic
          and 2-harmonic models and ``delta_BIC = bic0 - BIC(preferred model)``;
        * ``fap`` is the false-alarm probability of the peak (NaN for the
          2-harmonic model, for which it is not available);
        * ``Nfreq`` is the number of sampled frequencies.
    """
    import os  # local import: only needed to make sure the figure folder exists

    # Take only specific planet
    df = df_full[df_full.Planet == planet_name]

    # Frequency grid and setup
    Tmid = df['Tmid_(BJD_TDB)']
    Porb = df['P_(days)'].mode().iloc[0]  # most frequent period value listed for this planet
    OC = df['O-C_(days)']
    OCerr = df['O-C_unc._(days)']

    baseline = Tmid.max() - Tmid.min()
    # Shortest searched period is 2 orbital periods, longest is 10 baselines
    fmin, fmax = 1 / (10*baseline), 1 / (2*Porb)
    Nfreq = np.int64(10 * baseline * fmax) # VanderPlas 2017
    freq = np.linspace(fmin, fmax, Nfreq)

    # Lomb–Scargle (1-harmonic model)
    ls1 = LombScargle(Tmid, OC, OCerr, nterms=1, normalization="psd")
    power1 = ls1.power(freq, normalization="psd")
    f_best1 = freq[np.argmax(power1)]
    model1 = ls1.model(Tmid, f_best1)

    # Lomb–Scargle (2-harmonic model)
    ls2 = LombScargle(Tmid, OC, OCerr, nterms=2, normalization="psd")
    power2 = ls2.power(freq, normalization="psd")
    f_best2 = freq[np.argmax(power2)]
    model2 = ls2.model(Tmid, f_best2)

    # BICs for best model determination: k*ln(n) + n*ln(residual variance)
    res1 = OC - model1
    res2 = OC - model2
    n = len(OC)
    k0, k1, k2 = 1, 3, 5  # Parameters estimated by model; offset + sin/cos terms

    bic0 = k0 * np.log(n) + n * np.log(np.var(OC))
    bic1 = k1 * np.log(n) + n * np.log(np.var(res1))
    bic2 = k2 * np.log(n) + n * np.log(np.var(res2))

    # Preferred N (1 or 2). The 2-harmonic model is chosen only when its BIC is
    # strictly lower; every other case (ties, NaN BICs) falls back to the
    # simpler 1-harmonic model so that the variables below are always defined.
    if bic2 < bic1:
        Nbest = 2
        ls = ls2
        model = model2
        power = power2
        f_best = f_best2
        bic_model = bic2
    else:
        Nbest = 1
        ls = ls1
        model = model1
        power = power1
        f_best = f_best1
        bic_model = bic1

    # Best frequency -> period and amplitude
    period = 1/f_best
    # Period uncertainty from the spacing of the frequency grid around the peak.
    # The bounds are checked explicitly: index -1 would silently wrap around to
    # the last grid point instead of raising IndexError.
    i_peak = int(np.argmax(power))
    if 0 < i_peak < len(freq) - 1:
        f_best_under, f_best_over = freq[i_peak - 1], freq[i_peak + 1]
        period_over, period_under = 1/f_best_under, 1/f_best_over
        period_err = max(period_over-period, period-period_under)
    else:
        # Peak on the edge of the grid: no neighbour on one side
        period_err = np.nan

    # model_parameters returns the offset first, then (sin, cos) per harmonic
    if Nbest == 1:
        A1, A2 = ls.model_parameters(f_best)[1:]
        amp1 = np.sqrt(A1**2 + A2**2)
        amp2 = np.nan
    else:
        A1, A2, B1, B2 = ls.model_parameters(f_best)[1:]
        amp1 = np.sqrt(A1**2 + A2**2)
        amp2 = np.sqrt(B1**2 + B2**2)

    # TTV strength
    scatter = MAD(OC, nan_policy='omit') / OCerr.median()
    delta_BIC = bic0 - bic_model

    # A NaN in delta_BIC or scatter fails both positive tests and is therefore
    # classified as 'no_TTV' rather than leaving TTV_strength undefined.
    if (delta_BIC >= 10) and (scatter >= 3):
        TTV_strength = 'strong'
    elif (delta_BIC >= 10) and (scatter >= 2):
        TTV_strength = 'weak'
    else:
        TTV_strength = 'no_TTV'

    # False Alarm Probability / FAP at 5%, 1%, 0.1% and at best frequency
    # FAP is not implemented for multiterm periodograms
    if Nbest == 1:
        fap = ls.false_alarm_probability(power.max(), minimum_frequency=fmin, maximum_frequency=fmax)
        fap5 = ls.false_alarm_level(0.05, minimum_frequency=fmin, maximum_frequency=fmax)
        fap1 = ls.false_alarm_level(0.01, minimum_frequency=fmin, maximum_frequency=fmax)
        fap01 = ls.false_alarm_level(0.001, minimum_frequency=fmin, maximum_frequency=fmax)
    else:
        fap = np.nan

    # Phase (Plot setup): fold on the best period, phases run from -0.5 to 0.5
    t_fit = np.linspace(Tmid.min(), Tmid.max(), 2000)
    phase = ((Tmid - Tmid.min()) / period) % 1 - 0.5
    phase_fit = ((t_fit - Tmid.min()) / period) % 1 - 0.5

    # Sort model by phase so it is drawn as one left-to-right curve
    idx = np.argsort(phase_fit)
    phase_s = phase_fit[idx]
    model_s = ls.model(t_fit, f_best)[idx]

    # Plot
    fig, ax = plt.subplots(1, 2, figsize=(12, 3.5))
    P = 1/freq
    Pmin, Pmax = P.min(), P.max()

    # Plot GLS power spectrum
    ax[0].plot(P, power, lw=0.5, color='black')
    if Nbest == 1: # Plot FAP
        ax[0].axvline(period, lw=1, color='red', ls='-.',
                      label=f'{planet_name} peak at {1/f_best:.2f} d\nFAP={fap*100:.3g}%')
        for i, fap_ in enumerate([fap5, fap1, fap01]):
            ax[0].axhline(fap_, ls='--', lw=0.7, color=f'C{i}')
        ax[0].legend(loc='upper right')

    ax[0].axhline(power.max(), lw=0.7, color='red', ls='-.')

    ax[0].set_xlabel("Period (days)"); ax[0].set_ylabel("GLS Power")
    ax[0].set_xscale("log")
    ax[0].set_xlim(Pmin / 1.05, Pmax * 1.05); ax[0].set_ylim(bottom=0)

    # Plot sinusoidal fit
    day2min = 1440
    for i, source in enumerate(['literature', 'space', 'exoclock']):
        msk = (df.source == source)
        ax[1].errorbar(phase[msk], OC[msk] * day2min, OCerr[msk] * day2min, fmt='.', color=f'C{i}', zorder=1, label=source)
    ax[1].plot(phase_s, model_s * day2min, label=f'Fit (N={Nbest})', color='red', lw=1, zorder=2)
    ax[1].set_xlabel("TTV Phase"); ax[1].set_ylabel("O-C (minutes)")
    ax[1].set_xlim(-0.51, 0.51)
    ax[1].legend(loc='lower right')

    plt.tight_layout()
    EXPORT_FILENAME = rf'fit_pics/{planet_name}_GLSfit_N={Nbest}.jpg'
    # Create the output folder on first use so savefig does not fail on a fresh checkout
    os.makedirs(os.path.dirname(EXPORT_FILENAME), exist_ok=True)
    fig.savefig(EXPORT_FILENAME, dpi=300, bbox_inches="tight")
    plt.close(fig)  # close this figure explicitly; the function is called once per planet

    return period, period_err, amp1, amp2, Nbest, TTV_strength, scatter, bic0, bic1, bic2, delta_BIC, fap, Nfreq
    
# Initialise the output table: a copy of the big catalogue plus one new column per fit result
df_final = df_big.copy()
df_final_newcols = ['Pttv', 'Pttv_err', 'Attv1', 'Attv2', 'Nbest', 'TTV_strength', 'scatter', 'bic0', 'bic1', 'bic2', 'delta_bic', 'fap', 'sampled_f']
float_cols = [c for c in df_final_newcols if c != 'TTV_strength']

# Numeric results start as NaN, the text label as None
df_final[float_cols] = np.nan
df_final['TTV_strength'] = None

# Fit every planet that survived clipping and write its results into the catalogue
# rows whose 'name_exoclock' equals the planet name (no matching row -> nothing is written)
all_planets = df_clip['Planet'].dropna().astype(str).unique()
for planet in tqdm(all_planets):
    newcol_values = lomb_scargle_analysis(df_clip, planet_name=planet)
    mask = (df_final["name_exoclock"] == planet)
    df_final.loc[mask, df_final_newcols] = newcol_values

# Forward slash like the other paths in this notebook: a backslash is only a path
# separator on Windows and would end up inside the file name elsewhere
EXPORT_FILENAME = r'data_cleaned/merged_GLSfitted.csv'
df_final.to_csv(EXPORT_FILENAME, index=False)
df_final.head(3)

100%|████████████████████████████████████████████████████████████████████████████████| 591/591 [07:51<00:00,  1.25it/s]


Unnamed: 0,name_exoclock,name_exoplanet.eu,star_name,pl_letter,T0_(BJD_TDB),T0_unc.,P_(days),P_unc.,mass,mass_error,...,Attv2,Nbest,scatter,bic0,bic1,bic2,delta_bic,fap,sampled_f,TTV_strength
0,,55 Cnc Ab,55 Cnc A,b,,,,,0.84,0.23,...,,,,,,,,,,
1,,55 Cnc Ac,55 Cnc A,c,,,,,0.1714,0.0055,...,,,,,,,,,,
2,,55 Cnc Ad,55 Cnc A,d,,,,,3.878,0.068,...,,,,,,,,,,
