# Find TOIs with Missing Orbital Period, Depth (planet-to-star ratio), Transift Duration
* Orbital period → pl_orbper
* Planet-to-star radius ratio (depth) → pl_ratror
* Transit duration → pl_trandur

   
-----
* finds TOIs with missing parameters (above)
* gets FFIs for each TIC
* clean, mask, remove outlier, flatten light curve
* run bls to esimate period, depth, duration, t0
* also estimates stellar radius/mass from MAST (a/R*)
* creates graphs and saves to csv file

In [7]:
# Updated end-to-end pipeline (notebook-ready)
# Paste into a Jupyter cell and run.
# Requirements:
#   pip install lightkurve astroquery astropy pandas matplotlib scipy batman-package

import os
os.environ['LIGHTKURVE_CACHE'] = "C:/lkcache"   # your cache path

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

from astroquery.nasa_exoplanet_archive import NasaExoplanetArchive
from astroquery.mast import Catalogs
import lightkurve as lk
from astropy.timeseries import BoxLeastSquares
from astropy import units as u
from astropy.constants import G, M_sun, R_sun

# ---------- Missing Orbital Parameters ----------
OUTPUT_DIR = Path("TOI_missing_pipeline_results")
PLOTS_DIR = OUTPUT_DIR / "plots"
LC_DIR = OUTPUT_DIR / "lightcurves"
RESULTS_CSV = OUTPUT_DIR / "missing_toi_bls_results.csv"

# BLS search settings
PERIOD_MIN = 0.5    # days
PERIOD_MAX = 200    # days (set higher if needed)
N_PERIODS = 20000
DURATIONS = np.linspace(0.01, 0.2, 10)  # fraction of period (typical transit durations)

# Fields (actual ExoFOP TOI CSV names)
TOI_PERIOD_COL = "Period (days)"
TOI_DURATION_COL = "Duration (hours)"
TOI_DEPTH_PPM_COL = "Depth (ppm)"
TOI_TIC_COL = "TIC ID"
TOI_COL = "TOI"

# Create directories
OUTPUT_DIR.mkdir(exist_ok=True)
PLOTS_DIR.mkdir(exist_ok=True)
LC_DIR.mkdir(exist_ok=True)

# ---------- Helper functions ----------

def find_tois_with_missing_parameters(required_cols=None, verbose=True):
    """
    Download TOI table from ExoFOP and return rows where ANY of required_cols is missing.
    By default required_cols = [Period (days), Duration (hours), Depth (ppm)].
    Returns a DataFrame with a numeric 'tic' column.
    """
    if required_cols is None:
        required_cols = [TOI_PERIOD_COL, TOI_DURATION_COL, TOI_DEPTH_PPM_COL]

    url = "https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv"
    print("Downloading TOI table from ExoFOP (this may take a few seconds)...")
    df = pd.read_csv(url, dtype=str)   # read as strings to handle odd entries

    # Check for expected columns
    available = list(df.columns)
    missing_expected = [c for c in required_cols + [TOI_TIC_COL, TOI_COL] if c not in df.columns]
    if missing_expected:
        raise KeyError(f"Expected columns missing from TOI table: {missing_expected}\nAvailable columns: {available}")

    # Coerce target numeric columns to numeric (invalid -> NaN)
    for col in required_cols:
        # Some values may contain commas or text; use to_numeric with errors='coerce'
        df[col + "_num"] = pd.to_numeric(df[col].str.replace(",","").str.strip(), errors='coerce')

    # Create a clean TIC numeric column
    # Some "TIC ID" values contain extra characters; try to coerce
    df["tic"] = pd.to_numeric(df[TOI_TIC_COL].astype(str).str.extract(r'(\d+)', expand=False), errors='coerce').astype('Int64')

    # Mask: any of the required numeric columns is NaN
    mask = np.zeros(len(df), dtype=bool)
    for col in required_cols:
        mask |= df[col + "_num"].isna()

    missing_df = df[mask].copy().reset_index(drop=True)

    if verbose:
        print(f"Total TOIs downloaded: {len(df)}")
        print(f"TOIs missing at least one of {required_cols}: {len(missing_df)}")
        # show useful columns (rename numeric columns for readability)
        display_cols = [TOI_COL, "tic"] + required_cols
        # attempt to show a few rows safely
        print(missing_df[display_cols].head(10).to_string(index=False))

    return missing_df

def fetch_stellar_params_from_mast(tic_id):
    """Attempt to fetch stellar radius [Rsun] and mass [Msun] from MAST/TIC.
       Returns (radius_Rsun, mass_Msun) or (None, None) if not available."""
    try:
        r = Catalogs.query_object(f"TIC {int(tic_id)}", catalog="TIC")
        if len(r) == 0:
            return None, None
        row = r[0]
        rad = row.get("st_rad")  # in solar radii
        mass = row.get("st_mass")
        # Some entries can be NaN or None
        rad = float(rad) if rad not in (None, np.nan) else None
        mass = float(mass) if mass not in (None, np.nan) else None
        return rad, mass
    except Exception:
        return None, None

def estimate_a_over_Rstar(period_days, mass_Msun, radius_Rsun):
    """
    Estimate a/R* from Kepler's third law:
      a = (G M P^2 / 4π^2)^(1/3)
    convert a in meters, divide by Rstar (in meters)
    Returns a/R* (unitless) or None if missing
    """
    if period_days is None or mass_Msun is None or radius_Rsun is None:
        return None
    try:
        P = (period_days * u.day).to(u.s).value
        M = (mass_Msun * M_sun).to(u.kg).value
        a_m = (G.value * M * P**2 / (4 * np.pi**2))**(1/3)
        Rstar_m = (radius_Rsun * R_sun).to(u.m).value
        return a_m / Rstar_m
    except Exception:
        return None

def download_and_clean_lc(tic_id, cutout_size=7, min_points=200):
    """Download TESS cutout(s) for a TIC and return a combined, cleaned LightCurve
       Returns None if no valid data.
    """
    if pd.isna(tic_id):
        return None
    target = f"TIC {int(tic_id)}"
    try:
        search = lk.search_tesscut(target)
        if len(search) == 0:
            # fallback: search for lightcurves (2min)
            lc_search = lk.search_lightcurve(target, author="SPOC")
            if len(lc_search) == 0:
                print(f"  No TESS data found for {target}")
                return None
            # download 2-min LCs
            lcs = lc_search.download_all()
            lc_list = []
            for lc in lcs:
                lc = lc.remove_nans().remove_outliers(sigma=5)
                if len(lc.time) < min_points:
                    continue
                lc = lc.flatten(window_length=301).remove_outliers(sigma=4)
                lc_list.append(lc)
            if len(lc_list) == 0:
                return None
            full = lc_list[0]
            for extra in lc_list[1:]:
                full = full.append(extra)
            return full.normalize()
        # if TESSCut works, download small cutouts and extract LCs
        tpfs = search.download_all(cutout_size=cutout_size)
        lc_list = []
        for tpf in tpfs:
            if getattr(tpf, "flux", None) is None or tpf.flux.size == 0:
                continue
            mask = tpf.create_threshold_mask(threshold=3)
            if mask.sum() == 0:
                mask = tpf.create_threshold_mask(threshold=1)
            if mask.sum() == 0:
                mask = tpf.pipeline_mask
            try:
                lc = tpf.to_lightcurve(aperture_mask=mask)
            except Exception:
                continue
            lc = lc[lc.quality == 0]  # quality filter
            lc = lc.remove_nans()
            lc = lc.remove_outliers(sigma=5)
            if len(lc.time) < min_points:
                continue
            lc = lc.flatten(window_length=301)
            lc = lc.remove_outliers(sigma=4)
            # remove extreme flux points
            med = np.median(lc.flux.value)
            std = np.std(lc.flux.value)
            lc = lc[np.abs(lc.flux.value - med) < 5 * std]
            lc = lc[lc.flux.value > 0]
            if len(lc.time) < min_points:
                continue
            lc_list.append(lc)
        if len(lc_list) == 0:
            return None
        all_lc = lc_list[0]
        for extra in lc_list[1:]:
            all_lc = all_lc.append(extra)
        return all_lc.normalize()
    except Exception as e:
        print(f"  Error downloading/processing TIC {tic_id}: {e}")
        return None

def run_bls_and_estimate(lc, period_min=PERIOD_MIN, period_max=PERIOD_MAX, n_periods=N_PERIODS, durations=DURATIONS):
    """Run Box Least Squares on a light curve and return dictionary of best-fit values and the bls result object."""
    time = lc.time.value
    flux = lc.flux.value
    # simple error proxy
    flux_err = np.ones_like(flux) * np.nanstd(flux)

    model = BoxLeastSquares(time, flux, flux_err)
    periods = np.linspace(period_min, period_max, n_periods)
    bls_result = model.power(periods, durations)
    best_idx = np.argmax(bls_result.power)
    result = {
        "period": float(bls_result.period[best_idx]),
        "duration_days": float(bls_result.duration[best_idx]),
        "t0": float(bls_result.transit_time[best_idx]),
        "depth": float(bls_result.depth[best_idx]),
        "depth_ppm": float(bls_result.depth[best_idx] * 1e6),
        "power": float(bls_result.power[best_idx]),
        "bls": bls_result
    }
    return result

def plot_diagnostics(tic_id, lc, bls_result_dict, outpath=None):
    """Create diagnostic plots: full LC, BLS periodogram, folded (scatter + binned), predicted transit times."""
    res = bls_result_dict
    bls = res["bls"]
    per = res["period"]
    t0 = res["t0"]
    dur = res["duration_days"]
    time = lc.time.value
    flux = lc.flux.value

    fig = plt.figure(figsize=(14,10))

    ax1 = plt.subplot(3,2,1)
    lc.plot(ax=ax1)
    ax1.set_title(f"TIC {tic_id} - Full LC")

    ax2 = plt.subplot(3,2,2)
    ax2.plot(bls.period, bls.power, lw=0.7)
    ax2.axvline(per, color="red", linestyle="--", label=f"P={per:.4f} d")
    ax2.set_xlabel("Period (days)")
    ax2.set_ylabel("BLS Power")
    ax2.legend()
    ax2.grid(alpha=0.3)
    ax2.set_title("BLS periodogram")

    ax3 = plt.subplot(3,2,3)
    lc_fold = lc.fold(period=per, epoch_time=t0)
    lc_fold.scatter(ax=ax3, s=2, alpha=0.4)
    ax3.set_title("Folded (scatter)")

    ax4 = plt.subplot(3,2,4)
    lc_fold_binned = lc_fold.bin(time_bin_size=dur/10.0 if dur>0 else 0.01)
    lc_fold_binned.scatter(ax=ax4, s=20)
    ax4.set_title("Folded (binned)")

    ax5 = plt.subplot(3,2,(5,6))
    ax5.plot(time, flux, '.', ms=2, alpha=0.5)
    time_baseline = time[-1] - time[0]
    ntrans = int(time_baseline / per) if per>0 else 0
    for i in range(ntrans+1):
        tt = t0 + per * i
        if time[0] <= tt <= time[-1]:
            ax5.axvline(tt, color='red', alpha=0.3)
    ax5.set_title(f"Predicted transit times (~{ntrans} events)")
    ax5.set_xlabel("Time (BTJD)")
    ax5.set_ylabel("Normalized flux")

    plt.tight_layout()
    if outpath:
        fig.savefig(outpath, dpi=150)
        plt.close(fig)
    else:
        plt.show()

# ---------- Main processing function (using functions from above) ----------
def process_toi_list(missing_df, limit=None, period_min=PERIOD_MIN, period_max=PERIOD_MAX):
    """
    For each row in missing_df, attempts LC extraction, BLS, plotting, and saves results.
    limit: optional int to process only first N targets (for testing).
    """
    results = []
    total = len(missing_df) if limit is None else min(limit, len(missing_df))
    print(f"Processing up to {total} targets...")

    for idx in range(total):
        row = missing_df.iloc[idx]
        tic = row["tic"]
        toi = row.get(TOI_COL, "")
        print(f"\n[{idx+1}/{total}] TIC {tic} (TOI {toi})")

        lc = download_and_clean_lc(tic)
        if lc is None:
            print("  No usable light curve — skipping.")
            continue

        # run BLS on cleaned LC
        try:
            bls_res = run_bls_and_estimate(lc, period_min=period_min, period_max=period_max)
        except Exception as e:
            print(f"  BLS failed: {e}")
            continue

        # attempt to fetch stellar params and compute a/R*
        rad_Rsun, mass_Msun = fetch_stellar_params_from_mast(tic)
        a_over_R = None
        if bls_res["period"] is not None and mass_Msun and rad_Rsun:
            a_over_R = estimate_a_over_Rstar(bls_res["period"], mass_Msun, rad_Rsun)

        # save LC to a CSV for later fitting if desired
        lc_csv_path = LC_DIR / f"TIC{tic}_lc.csv"
        pd.DataFrame({"time": lc.time.value, "flux": lc.flux.value}).to_csv(lc_csv_path, index=False)

        # save plot
        plot_path = PLOTS_DIR / f"TIC{tic}_diagnostic.png"
        plot_diagnostics(tic, lc, bls_res, outpath=plot_path)

        # compile results
        entry = {
            "tic": int(tic) if not pd.isna(tic) else None,
            "toi": toi,
            "period_d": bls_res["period"],
            "t0": bls_res["t0"],
            "duration_d": bls_res["duration_days"],
            "depth_frac": bls_res["depth"],
            "depth_ppm": bls_res["depth_ppm"],
            "bls_power": bls_res["power"],
            "lc_csv": str(lc_csv_path),
            "plot_png": str(plot_path),
            "stellar_radius_Rsun": rad_Rsun,
            "stellar_mass_Msun": mass_Msun,
            "a_over_Rstar": a_over_R
        }
        results.append(entry)
        print(f"  Saved results for TIC {tic}: P={entry['period_d']:.4f} d, depth={entry['depth_ppm']:.1f} ppm")

    # save summary CSV
    if results:
        df_res = pd.DataFrame(results)
        df_res.to_csv(RESULTS_CSV, index=False)
        print(f"\nAll results saved to: {RESULTS_CSV}")
        return df_res
    else:
        print("No successful analyses")
        return None

# ------------------------------
# Run: find missing TOIs and process them (call these lines in a notebook cell)
# Example: do a quick test by setting limit=5
# ------------------------------

# Find TOIs missing Period/Duration/Depth
missing_df = find_tois_with_missing_parameters(verbose=True)

# Optional: inspect the first few rows
print("\nFirst few missing rows (cleaned):")
display(missing_df.head())

# Process a small number of targets for testing
# Set limit to None to process all (can be slow). Start small to test.
df_results = process_toi_list(missing_df, limit=5, period_min=1.0, period_max=150.0)

# show results
if df_results is not None:
    display(df_results.head())


Downloading TOI table from ExoFOP (this may take a few seconds)...
Total TOIs downloaded: 7821
TOIs missing at least one of ['Period (days)', 'Duration (hours)', 'Depth (ppm)']: 1
    TOI       tic     Period (days) Duration (hours)       Depth (ppm)
7523.01 238959233 4.661281745462188              NaN 14867.18694688341

First few missing rows (cleaned):


Unnamed: 0,TIC ID,TOI,Previous CTOI,Master,SG1A,SG1B,SG2,SG3,SG4,SG5,...,Stellar Mass (M_Sun) err,Sectors,Date TOI Alerted (UTC),Date TOI Updated (UTC),Date Modified,Comments,Period (days)_num,Duration (hours)_num,Depth (ppm)_num,tic
0,238959233,7523.01,,3,4,3,3,1,4,4,...,,19597386,2025-10-30,2025-10-30,2025-10-31 12:04:13,found in faint-star QLP search; no stellar radius,4.661282,,14867.186947,238959233


Processing up to 1 targets...

[1/1] TIC 238959233 (TOI 7523.01)
  Saved results for TIC 238959233: P=64.7901 d, depth=14949.8 ppm

All results saved to: TOI_missing_pipeline_results\missing_toi_bls_results.csv


Unnamed: 0,tic,toi,period_d,t0,duration_d,depth_frac,depth_ppm,bls_power,lc_csv,plot_png,stellar_radius_Rsun,stellar_mass_Msun,a_over_Rstar
0,238959233,7523.01,64.79009,1839.364426,0.2,0.01495,14949.81445,472.607264,TOI_missing_pipeline_results\lightcurves\TIC23...,TOI_missing_pipeline_results\plots\TIC23895923...,,,


In [None]:
missing_df = find_tois_with_missing_parameters()
df_results = process_toi_list(missing_df, limit=20, period_min=1.0, period_max=150.0)