## Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
from astropy.timeseries import LombScargle
from scipy.stats import median_abs_deviation as MAD
from tqdm import tqdm

In [2]:
# Load the cleaned mid-transit-time table (one row per observed transit,
# including the O-C residual columns used by the cells below).
FILENAME = r'data_cleaned/all_mid_time_data_with_OC.csv'
df = pd.read_csv(filepath_or_buffer=FILENAME)

# Show the last few rows as a quick sanity check of the load.
df.tail(n=3)

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
275545,Kepler-1639b,2455985.0,,Holczer+16,9.878925,3e-05,24,0.002083,0.006944
275546,Kepler-1639b,2456005.0,,Holczer+16,9.878925,3e-05,26,-0.005556,0.013889
275547,Kepler-1639b,2456015.0,,Holczer+16,9.878925,3e-05,27,0.007639,0.013889


## Outlier rejection (5-sigma clipping)

In [3]:
# Two-stage cleaning: drop rows with excessively large TTV error, then sigma-clip
def outlier_reject(df, errthres=3, sigma=5):
    """Reject outlying O-C measurements planet by planet.

    Stage 1 keeps rows whose 'O-C_unc._(days)' is at most ``errthres`` times
    the mean uncertainty of the same planet. Stage 2 keeps, among the
    survivors, rows whose 'O-C_(days)' lies within ``sigma`` standard
    deviations of that planet's mean O-C (statistics recomputed after stage 1).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Planet', 'O-C_(days)' and 'O-C_unc._(days)'.
    errthres : float, default 3
        Maximum allowed ratio of a row's uncertainty to its planet's mean.
    sigma : float, default 5
        Clipping half-width in units of the per-planet standard deviation.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index. Rows for which either
        comparison evaluates to NaN (e.g. missing uncertainty, or a planet with
        a single remaining row and hence undefined std) are dropped as well.
    """
    unc_col, oc_col = 'O-C_unc._(days)', 'O-C_(days)'

    # Stage 1: relative-uncertainty cut against the per-planet mean uncertainty
    mean_unc = df.groupby('Planet')[unc_col].transform('mean')
    kept = df.loc[df[unc_col] / mean_unc <= errthres].reset_index(drop=True)

    # Stage 2: sigma clipping around the per-planet mean O-C
    oc_by_planet = kept.groupby('Planet')[oc_col]
    deviation = (kept[oc_col] - oc_by_planet.transform('mean')).abs()
    within_clip = deviation <= sigma * oc_by_planet.transform('std')

    return kept.loc[within_clip].reset_index(drop=True)
# Clip outliers, then keep only planets that still have at least 5 observations
df_clip = (
    outlier_reject(df)
    .loc[lambda d: d.groupby('Planet')['Planet'].transform('size') >= 5]
    .reset_index(drop=True)
)

df_clip.tail(3)

Unnamed: 0,Planet,Tmid_(BJD_TDB),Tmid_unc.,source,P_(days),P_unc.,epoch,O-C_(days),O-C_unc._(days)
272666,Kepler-1639b,2455985.0,,Holczer+16,9.878925,3e-05,24,0.002083,0.006944
272667,Kepler-1639b,2456005.0,,Holczer+16,9.878925,3e-05,26,-0.005556,0.013889
272668,Kepler-1639b,2456015.0,,Holczer+16,9.878925,3e-05,27,0.007639,0.013889


In [5]:
# Import big dataset for later insertion of final parameters.
# The filter is applied to the frame that was just read: the previous one-liner
# indexed with `df_big.potential_sinTTV_flag` before `df_big` was assigned, which
# fails with NameError on a fresh kernel (Restart & Run All) and silently reuses
# a stale frame's mask otherwise.
FILENAME = r'data_cleaned/merged_potential_TTV_flagged.csv'
df_big = pd.read_csv(FILENAME).loc[lambda d: d.potential_sinTTV_flag == 1]
df_big.head(3)

Unnamed: 0,name_exoplanet.eu,star_name,name_exoclock+holczer,T0_(BJD_TDB),T0_unc.,P_(days),P_unc.,mass,mass_error,radius,...,planet_pos,period_ratio,mass_ratio,potential_sinTTV_flag,TTV_pos,MMR,j,N,Delta,expected_Pttv
51,HAT-P-26 b,HAT-P-26,HAT-P-26b,2457197.0,7e-05,4.234501,3.2e-07,0.0585,0.00717,0.57,...,1,1.0,2.844043,1.0,inner,3:2,3.0,1.0,0.038139,57.631893
52,HAT-P-26 d,HAT-P-26,,,,,,0.020569,0.003319,0.1758,...,2,1.557208,1.0,1.0,outer,3:2,3.0,1.0,0.038139,57.631893
53,HAT-P-27 Ac,HAT-P-27 A,,,,,,0.066608,0.016082,0.3863,...,1,1.0,1.0,1.0,inner,5:2,5.0,3.0,0.0137,44.372517


## Periodicity search (GLS)

In [15]:
def lomb_scargle_analysis(df_full, planet_name, expected_Pttv=None):
    """Fit 1- and 2-harmonic Lomb-Scargle models to one planet's O-C residuals.

    The rows of ``df_full`` with ``Planet == planet_name`` are searched on a
    linear frequency grid, the harmonic order is chosen by BIC, and the TTV is
    classified from the BIC gain over a constant model and from the O-C scatter.
    When the BIC gain is at least 10, a two-panel figure (periodogram and
    phase-folded fit) is written to ``fit_pics/{planet_name}_GLSfit_N={Nbest}.jpg``.

    Parameters
    ----------
    df_full : pandas.DataFrame
        Needs the columns 'Planet', 'Tmid_(BJD_TDB)', 'P_(days)', 'O-C_(days)',
        'O-C_unc._(days)' and 'source'.
    planet_name : str
        Value of the 'Planet' column to analyse.
    expected_Pttv : float or None, optional
        Expected TTV period. If it is finite and lies inside the default
        searchable period range, the grid is narrowed to half a decade on either
        side of it; ``None``, NaN, or an out-of-range value means the full
        default grid is used.

    Returns
    -------
    tuple
        ``(period, period_err, amp1, amp2, Nbest, TTV_strength, scatter,
        bic0, bic1, bic2, delta_BIC, fap, Nfreq)``. ``period_err`` is NaN when
        the peak sits on either edge of the frequency grid; ``amp2`` and ``fap``
        are NaN when they do not apply to the selected model.
    """
    # Take only specific planet
    df_planet = df_full[df_full.Planet == planet_name]

    # Frequency grid and setup
    Tmid = df_planet['Tmid_(BJD_TDB)']
    Porb = df_planet['P_(days)'].mode().iloc[0]
    OC = df_planet['O-C_(days)']
    OCerr = df_planet['O-C_unc._(days)']

    baseline = Tmid.max() - Tmid.min()
    fmin0, fmax0 = 1 / (10*baseline), 1 / (2*Porb)

    # Identity check for None plus an explicit NaN/inf check: a missing table
    # entry arrives as NaN and must fall back to the full grid on purpose
    # rather than through the NaN ordering of max()/min().
    has_prior = (expected_Pttv is not None) and bool(np.isfinite(expected_Pttv))
    if has_prior and (1/fmax0 <= expected_Pttv <= 1/fmin0):
        fmin = max(fmin0, 1 / (expected_Pttv * 10**0.5))
        fmax = min(fmax0, 1 / (expected_Pttv * 10**-0.5))
    else:
        fmin, fmax = fmin0, fmax0
    Nfreq = max(np.int64(10 * baseline * fmax), 5000)  # VanderPlas 2017 for max
    freq = np.linspace(fmin, fmax, Nfreq)

    # Lomb–Scargle (1-harmonic model)
    ls1 = LombScargle(Tmid, OC, OCerr, nterms=1, normalization="psd")
    power1 = ls1.power(freq, normalization="psd")
    f_best1 = freq[np.argmax(power1)]
    model1 = ls1.model(Tmid, f_best1)

    # Lomb–Scargle (2-harmonic model)
    ls2 = LombScargle(Tmid, OC, OCerr, nterms=2, normalization="psd")
    power2 = ls2.power(freq, normalization="psd")
    f_best2 = freq[np.argmax(power2)]
    model2 = ls2.model(Tmid, f_best2)

    # BICs for best model determination
    chi2_0 = np.sum(((OC - np.average(OC, weights=1/OCerr**2)) / OCerr)**2)
    chi2_1 = np.sum(((OC - model1) / OCerr)**2)
    chi2_2 = np.sum(((OC - model2) / OCerr)**2)
    n = len(OC)
    k0, k1, k2 = 1, 3, 5  # Parameters estimated by model; offset + sin/cos terms

    bic0 = k0 * np.log(n) + chi2_0
    bic1 = k1 * np.log(n) + chi2_1
    bic2 = k2 * np.log(n) + chi2_2

    # Preferred N (1 or 2): the 2-harmonic model must beat the 1-harmonic one
    # by more than 10 in BIC. Plain `else` so that a NaN BIC selects the simpler
    # model instead of leaving these names unbound.
    if bic1 - bic2 > 10:
        Nbest, ls, model, power, f_best, bic_model = 2, ls2, model2, power2, f_best2, bic2
    else:
        Nbest, ls, model, power, f_best, bic_model = 1, ls1, model1, power1, f_best1, bic1

    # Best frequency -> period; error from the spacing to the neighbouring grid points
    period = 1/f_best
    i_peak = int(np.argmax(power))
    if 0 < i_peak < len(freq) - 1:
        period_over, period_under = 1/freq[i_peak - 1], 1/freq[i_peak + 1]
        period_err = max(period_over-period, period-period_under)
    else:
        # Peak on a grid edge: one neighbour does not exist. (Indexing with
        # i_peak - 1 == -1 would silently wrap to the far end of the grid.)
        period_err = np.nan

    # Amplitude of each harmonic from its sin/cos coefficients (entry 0 is skipped)
    if Nbest == 1:
        A1, A2 = ls.model_parameters(f_best)[1:]
        amp1 = np.sqrt(A1**2 + A2**2)
        amp2 = np.nan
    else:
        A1, A2, B1, B2 = ls.model_parameters(f_best)[1:]
        amp1 = np.sqrt(A1**2 + A2**2)
        amp2 = np.sqrt(B1**2 + B2**2)

    # TTV strength
    scatter = MAD(OC, nan_policy='omit') / OCerr.median()
    delta_BIC = bic0 - bic_model

    if (delta_BIC >= 10) and (scatter >= 3):
        TTV_strength = 'strong'
    elif (delta_BIC >= 10) and (scatter >= 2):
        TTV_strength = 'weak'
    else:
        # delta_BIC < 10 or scatter < 2; also reached when either value is NaN
        TTV_strength = 'no_TTV'

    # False Alarm Probability / FAP at 5%, 1%, 0.1% and at best frequency
    # FAP is not implemented for multiterm periodograms
    if Nbest == 1:
        fap = ls.false_alarm_probability(power.max(), minimum_frequency=fmin, maximum_frequency=fmax)
        fap_levels = [
            ls.false_alarm_level(prob, minimum_frequency=fmin, maximum_frequency=fmax)
            for prob in (0.05, 0.01, 0.001)
        ]
    else:
        fap = np.nan
        fap_levels = []

    # Plot only if delta_BIC >= 10
    if delta_BIC >= 10:
        # Phase (Plot setup)
        t_fit = np.linspace(Tmid.min(), Tmid.max(), 2000)
        phase = ((Tmid - Tmid.min()) / period) % 1 - 0.5
        phase_fit = ((t_fit - Tmid.min()) / period) % 1 - 0.5

        # Sort model by phase. After sorting, the phase is monotonic, so no
        # wrap-around breaks are needed to draw it as a single line.
        idx = np.argsort(phase_fit)
        phase_s = phase_fit[idx]
        model_s = ls.model(t_fit, f_best)[idx]

        # Plot
        fig, ax = plt.subplots(1, 2, figsize=(12, 3.5))
        P = 1/freq
        Pmin, Pmax = P.min(), P.max()

        # Plot GLS power spectrum
        ax[0].plot(P, power, lw=0.5, color='black')
        if Nbest == 1: # Plot FAP
            ax[0].axvline(period, lw=1, color='red', ls='-.',
                          label=f'{planet_name} peak at {1/f_best:.2f} d\nFAP={fap*100:.2e}%')
            for i, fap_ in enumerate(fap_levels):
                ax[0].axhline(fap_, ls='--', lw=0.7, color=f'C{i}')
            ax[0].legend(loc='upper right')

        ax[0].axhline(power.max(), lw=0.7, color='red', ls='-.')

        ax[0].set_xlabel("Period (days)")
        ax[0].set_ylabel("GLS Power")
        ax[0].set_xscale("log")
        ax[0].set_xlim(Pmin / 1.05, Pmax * 1.05)
        ax[0].set_ylim(bottom=0)

        # Plot sinusoidal fit
        day2min = 1440
        uses_Holczer = df_planet.source.eq('Holczer+16').any()
        source_list = [f'Kokkori+25 ({s})' for s in ('literature', 'space', 'exoclock')] if not uses_Holczer else ['Holczer+16']
        for i, source in enumerate(source_list):
            msk = (df_planet.source == source)
            ax[1].errorbar(phase[msk], OC[msk] * day2min, OCerr[msk] * day2min, fmt='.', color=f'C{i}', zorder=1, label=source)
        ax[1].plot(phase_s, model_s * day2min, label=f'Fit (N={Nbest})', color='red', lw=1, zorder=2)
        ax[1].set_xlabel("TTV Phase")
        ax[1].set_ylabel("O-C (minutes)")
        ax[1].set_xlim(-0.51, 0.51)
        ax[1].legend(loc='lower right')

        plt.tight_layout()
        EXPORT_FILENAME = rf'fit_pics/{planet_name}_GLSfit_N={Nbest}.jpg'
        plt.savefig(EXPORT_FILENAME, dpi=300, bbox_inches="tight")
        # Close explicitly: this function is called once per planet in a loop
        plt.close(fig)

    return period, period_err, amp1, amp2, Nbest, TTV_strength, scatter, bic0, bic1, bic2, delta_BIC, fap, Nfreq

def expected_Pttv(planet):
    """Look up the expected TTV period of ``planet`` in the module-level ``df_final``.

    Returns the 'expected_Pttv' value of the first row whose
    'name_exoclock+holczer' equals ``planet``, or ``None`` when no row matches.
    """
    matches = df_final.loc[df_final['name_exoclock+holczer'] == planet, 'expected_Pttv']
    if len(matches) == 0:
        # No entry for this planet in the flagged table
        return None
    return matches.iloc[0]
    
# Initiate the result columns on a copy of the flagged-planet table
df_final = df_big.copy()
df_final_newcols = ['Pttv', 'Pttv_err', 'Attv1', 'Attv2', 'Nbest', 'TTV_strength', 'scatter', 'bic0', 'bic1', 'bic2', 'delta_bic', 'fap', 'sampled_f']
float_cols = [c for c in df_final_newcols if c != 'TTV_strength']

df_final[float_cols] = np.nan
df_final['TTV_strength'] = None

# Run the periodicity search for every planet that survived the clipping and
# write the results into the matching row(s) of df_final (if any).
all_planets = df_clip['Planet'].dropna().astype(str).unique()
for planet in tqdm(all_planets):
    # Keep the looked-up value under its own name: assigning it to
    # `expected_Pttv` would rebind the helper function to a float and make the
    # second iteration fail with "'float' object is not callable".
    planet_expected_Pttv = expected_Pttv(planet)
    newcol_values = lomb_scargle_analysis(df_clip, planet_name=planet,
                                          expected_Pttv=planet_expected_Pttv)
    mask = (df_final["name_exoclock+holczer"] == planet)
    df_final.loc[mask, df_final_newcols] = newcol_values

# Forward slash like every other path in this notebook; a backslash only acts
# as a directory separator on Windows.
EXPORT_FILENAME = r'data_cleaned/merged_GLSfitted.csv'
df_final.to_csv(EXPORT_FILENAME, index=False)
df_final.head(3)

57.6318926302492
