
# Filtering and Preprocessing (GNSS‑IR)

This notebook prepares SNR data for spectral analysis. We will:
- Select elevation/azimuth windows.
- Split data into clean **rising/setting arcs**.
- Remove outliers and **detrend/normalize** SNR.
- Convert to the natural independent variable \(x = \sin E\).
- Export cleaned arcs for the next spectral notebooks.


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

# reuse helpers from previous notebook if present; otherwise redefine minimal versions
def _normalize(name: str) -> str:
    """Return *name* lower-cased with every non-alphanumeric character removed."""
    return ''.join(ch for ch in name.lower() if ch.isalnum())


# Normalized header spelling (see _normalize) -> canonical column name.
KEYMAP = {
    'time': 'time', 'epoch': 'time', 'datetime': 'time',
    'prn': 'prn', 'sat': 'prn', 'satellite': 'prn',
    'elev': 'elev_deg', 'elevation': 'elev_deg', 'elevdeg': 'elev_deg',
    'az': 'az_deg', 'azimuth': 'az_deg', 'azdeg': 'az_deg',
    'snr': 'snr', 'snrl1': 'snr_l1', 'snrl2': 'snr_l2', 'snrl5': 'snr_l5',
    'cn0': 'snr', 'cn0l1': 'snr_l1', 'cn0l2': 'snr_l2', 'cn0l5': 'snr_l5',
}

# Canonical columns that are coerced to numbers, and the full set that is kept.
_NUMERIC_COLUMNS = ('elev_deg', 'az_deg', 'snr', 'snr_l1', 'snr_l2', 'snr_l5')
_KEPT_COLUMNS = frozenset(('time', 'prn') + _NUMERIC_COLUMNS)


def _detect_separator(path: Path) -> str:
    """Choose the read_csv separator from the first non-blank, non-comment line."""
    with path.open('r', encoding='utf-8', errors='ignore') as fh:
        for line in fh:
            text = line.strip()
            # Comment lines are ignored by the reader, so they must not drive
            # the delimiter choice either.
            if not text or text.startswith('#'):
                continue
            for candidate in (',', ';', '\t'):
                if candidate in text:
                    return candidate
            break
    # Anything else is treated as whitespace-delimited; the regex collapses
    # runs of blanks so space-aligned columns parse correctly.
    return r'\s+'


def _canonical_column(norm: str):
    """Map a normalized header to its canonical name, or None if unrecognized."""
    if norm in KEYMAP:
        return KEYMAP[norm]
    if norm.startswith(('snr', 'cn0')):
        for tag in ('l1', 'l2', 'l5'):
            if tag in norm:
                return f'snr_{tag}'
        return 'snr'
    return None


def read_snr_text(path: str) -> pd.DataFrame:
    """Read a delimited SNR text file into a frame with canonical column names.

    Lines (or line tails) starting with ``#`` are treated as comments. The
    column separator is taken from the first data line: comma, semicolon or
    tab when present, otherwise any run of whitespace.

    Header names are normalized with ``_normalize`` and translated through
    ``KEYMAP``; unknown headers beginning with ``snr``/``cn0`` are mapped by
    the band tag they contain. Only the columns ``time``, ``prn``,
    ``elev_deg``, ``az_deg``, ``snr``, ``snr_l1``, ``snr_l2`` and ``snr_l5``
    are kept. When two headers map to the same canonical name, the later one
    receives a numeric suffix and is therefore not kept.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        Rows with a parseable elevation, index reset. ``time`` is parsed with
        ``pd.to_datetime`` and numeric columns with ``pd.to_numeric``
        (unparseable values become NaT/NaN); ``prn`` is a stripped string.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)

    df = pd.read_csv(path, sep=_detect_separator(path), comment='#', engine='python')

    colmap = {}
    for col in df.columns:
        out = _canonical_column(_normalize(str(col)))
        if not out:
            continue
        if out in colmap.values():
            # Disambiguate repeated canonical names: snr_l1, snr_l1_2, ...
            k = 2
            while f'{out}_{k}' in colmap.values():
                k += 1
            out = f'{out}_{k}'
        colmap[col] = out

    df = df.rename(columns=colmap)
    keep = [c for c in df.columns if c in _KEPT_COLUMNS]
    # Copy so the column assignments below act on an independent frame
    # rather than on a selection of the parsed one.
    df = df[keep].copy()

    if 'time' in df.columns:
        df['time'] = pd.to_datetime(df['time'], errors='coerce')
    for name in _NUMERIC_COLUMNS:
        if name in df.columns:
            df[name] = pd.to_numeric(df[name], errors='coerce')
    if 'elev_deg' in df.columns:
        df = df.dropna(subset=['elev_deg'])
    if 'prn' in df.columns:
        df['prn'] = df['prn'].astype(str).str.strip()
    return df.reset_index(drop=True)

def tag_rising_setting(df: pd.DataFrame) -> pd.DataFrame:
    """Label every sample as part of a rising or setting arc.

    For each PRN the samples are ordered by time and the elevation rate
    ``dE_dt`` is estimated with ``np.gradient``. A non-negative rate is
    labelled ``'rising'``, a negative rate ``'setting'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prn`` and ``elev_deg``; ``time`` is optional.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* sorted by ``prn`` and ``time`` with the added columns
        ``dE_dt`` and ``arc_type``. Without a ``time`` column the copy is
        returned unsorted with ``arc_type`` set to ``'unknown'`` and no
        ``dE_dt`` column. Samples whose rate cannot be computed (a PRN with
        fewer than two samples, or missing time/elevation) keep a NaN rate
        and are labelled ``'unknown'``.
    """
    out = df.copy()
    if 'time' not in out.columns:
        out['arc_type'] = 'unknown'
        return out

    out = out.sort_values(['prn', 'time'])
    out['dE_dt'] = np.nan
    for _, grp in out.groupby('prn'):
        # np.gradient needs at least two samples; leave the rate undefined.
        if len(grp) < 2:
            continue
        times = grp['time']
        if pd.api.types.is_datetime64_any_dtype(times):
            # Elapsed seconds are independent of the datetime resolution
            # (ns/us/ms/s), unlike a raw integer view of the timestamps.
            t = (times - times.iloc[0]).dt.total_seconds().to_numpy(dtype=float)
        else:
            # Non-datetime values keep the historical interpretation
            # (integer nanoseconds).
            t = times.astype('int64').to_numpy() / 1e9
        e = grp['elev_deg'].to_numpy(dtype=float)
        dt = np.gradient(t)
        de = np.gradient(e)
        # Repeated timestamps give dt == 0; report a zero rate there
        # instead of dividing by zero.
        rate = np.divide(de, dt, out=np.zeros_like(de), where=dt != 0)
        out.loc[grp.index, 'dE_dt'] = rate

    rates = out['dE_dt'].to_numpy(dtype=float)
    out['arc_type'] = np.select(
        [rates >= 0, rates < 0], ['rising', 'setting'], default='unknown'
    )
    return out



## Arc Segmentation

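We segment continuous arcs by PRN and arc type using **time gaps** and **monotonic elevation**: samples are first restricted to one PRN, one arc type (rising or setting) and the elevation window, and the remainder is split wherever the gap between consecutive samples exceeds `max_gap_s`.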


In [None]:

def segment_arcs(df: pd.DataFrame, prn: str, band: str, arc_type='rising',
                 elev_min=5.0, elev_max=30.0, max_gap_s=120.0, min_points=30):
    """Split one satellite's samples into continuous arcs.

    Samples are restricted to *prn*, to *arc_type* (when the frame has an
    ``arc_type`` column) and to ``elev_min <= elev_deg <= elev_max``. The
    remainder is ordered by time and cut wherever two consecutive samples
    are more than *max_gap_s* seconds apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prn``, ``elev_deg`` and the *band* column.
    prn : str
        Satellite identifier to select.
    band : str
        Name of the SNR column; rows where it is NaN are dropped per segment.
    arc_type : str
        Value of ``arc_type`` to keep.
    elev_min, elev_max : float
        Inclusive elevation window in degrees.
    max_gap_s : float
        Largest time step, in seconds, allowed inside one segment.
    min_points : int
        Segments with fewer valid samples are discarded.

    Returns
    -------
    list of pandas.DataFrame
        One frame per retained segment, each with an added ``seg_id``
        column. Empty when nothing matches the selection.
    """
    dff = df
    if 'arc_type' in dff.columns:
        dff = dff[dff['arc_type'] == arc_type]
    in_window = (
        (dff['prn'] == prn)
        & (dff['elev_deg'] >= elev_min)
        & (dff['elev_deg'] <= elev_max)
    )
    # Copy once so the columns added below never touch the caller's frame.
    dff = dff[in_window].copy()

    if 'time' not in dff.columns:
        # No timestamps: order by elevation and use the sample index as time.
        dff = dff.sort_values(['elev_deg'])
        dff['time'] = np.arange(len(dff))
    else:
        # A gap cannot be computed for a sample without a timestamp.
        dff = dff.dropna(subset=['time'])

    # Nothing selected (unknown PRN, empty window, ...): no segments.
    if dff.empty:
        return []

    dff = dff.sort_values('time')
    times = dff['time']
    if pd.api.types.is_datetime64_any_dtype(times):
        # Elapsed seconds, independent of the datetime resolution.
        tsec = (times - times.iloc[0]).dt.total_seconds().to_numpy(dtype=float)
    else:
        # Non-datetime values keep the historical interpretation
        # (integer nanoseconds).
        tsec = times.astype('int64').to_numpy() / 1e9

    # identify gaps: every step larger than max_gap_s starts a new segment
    gaps = np.diff(tsec, prepend=tsec[0])
    dff['seg_id'] = (gaps > max_gap_s).cumsum()

    # collect segments
    segments = []
    for _, grp in dff.groupby('seg_id'):
        grp = grp.dropna(subset=[band])
        if len(grp) >= min_points:
            segments.append(grp.copy())
    return segments



## Outlier Removal

We use a robust median‑absolute‑deviation (MAD) filter per segment on the chosen band.


In [None]:

def mad_based_filter(y, k=4.0):
    """Return a boolean mask that is True for samples within k scaled MADs.

    The centre is the NaN-ignoring median of *y* and the scale is the median
    absolute deviation multiplied by 1.4826. When the scale is zero or NaN
    every sample is accepted.
    """
    values = np.asarray(y)
    centre = np.nanmedian(values)
    deviation = np.abs(values - centre)
    scale = np.nanmedian(deviation) * 1.4826
    # A degenerate scale cannot separate outliers, so keep everything.
    if np.isnan(scale) or scale == 0:
        return np.ones_like(values, dtype=bool)
    return deviation <= k * scale



## Detrending & Normalization

We remove slow trends using either a low‑order polynomial fit versus elevation or a moving average along the arc.  
Then we **normalize** the residual to zero mean and unit variance to stabilize the spectral analysis.


In [None]:

def poly_detrend(elev_deg, y, order=2):
    """Remove a polynomial trend in elevation from *y*.

    Parameters
    ----------
    elev_deg : array-like
        Elevation angles used as the independent variable of the fit.
    y : array-like
        Signal to detrend; same length as *elev_deg*.
    order : int
        Polynomial order.

    Returns
    -------
    (numpy.ndarray, numpy.poly1d)
        The residual ``y - trend`` and the fitted trend model. The fit uses
        only pairs where both values are finite. With fewer than
        ``order + 1`` such pairs, the trend is the constant NaN-ignoring mean
        of *y* (zero when *y* has no non-NaN value), so that
        ``model(elev_deg) + residual`` always reproduces *y*.
    """
    x = np.asarray(elev_deg, dtype=float)
    y = np.asarray(y, dtype=float)
    finite = np.isfinite(x) & np.isfinite(y)
    if finite.sum() < order + 1:
        # Too few points for the requested order: fall back to removing the
        # mean, and return that same constant as the trend model.
        offset = np.nanmean(y) if np.any(~np.isnan(y)) else 0.0
        return y - offset, np.poly1d([offset])
    model = np.poly1d(np.polyfit(x[finite], y[finite], order))
    return y - model(x), model

def moving_average(y, win=21):
    """Smooth *y* with a centred boxcar of odd width, same length as input.

    A window below 3 returns the input as a float array. An even window is
    widened by one sample. The ends are padded by repeating the edge values.
    """
    samples = np.asarray(y, float)
    if win < 3:
        return samples
    # Setting the lowest bit turns an even width into the next odd one.
    width = int(win) | 1
    half = width // 2
    padded = np.pad(samples, (half, half), mode='edge')
    kernel = np.ones(width) / width
    return np.convolve(padded, kernel, mode='valid')

def ma_detrend(y, win=21):
    """Subtract a moving-average trend from *y*.

    Returns the pair ``(residual, trend)`` where ``trend`` is
    ``moving_average(y, win=win)``.
    """
    smooth = moving_average(y, win=win)
    residual = y - smooth
    return residual, smooth



## Convert to \(x = \sin E\) and Normalize

GNSS‑IR spectra are computed in \(\sin E\)‑space. We also z‑score the residual.


In [None]:

def to_sinE(elev_deg):
    """Return the sine of elevation angle(s) given in degrees."""
    elev_rad = np.deg2rad(elev_deg)
    return np.sin(elev_rad)

def zscore(y):
    """Standardize *y* to zero mean and unit standard deviation.

    Mean and standard deviation ignore NaNs. When the standard deviation is
    zero or not finite, the input multiplied by zero is returned instead.
    """
    arr = np.asarray(y, float)
    centre, spread = np.nanmean(arr), np.nanstd(arr)
    if np.isfinite(spread) and spread != 0:
        return (arr - centre) / spread
    return arr * 0.0



## End‑to‑End Preprocessing Workflow (per PRN)

Edit the parameters and run. If the file at `path` cannot be loaded, we fall back to a synthetic demo built on a two‑ray model.


In [None]:

# --- User parameters ---
# Input file; change it, or leave it pointing at a missing file to use the demo.
path = '/mnt/data/your_snr_file.snr'

# Arc selection.
PRN = 'G12'
BAND = 'snr_l1'   # one of 'snr_l1', 'snr_l2', 'snr_l5', 'snr'
ARC = 'rising'    # 'rising' or 'setting'

# Elevation window (degrees) and segmentation limits.
E_MIN = 5.0
E_MAX = 30.0
MAX_GAP_S = 120.0
MIN_POINTS = 30

# Detrending: 'poly' uses POLY_ORDER, 'ma' uses MA_WIN.
DETREND_METHOD = 'poly'
POLY_ORDER = 2
MA_WIN = 21

# --- Load or synthesize ---
def synth_snr_dataset():
    """Build a small synthetic SNR data set for PRNs G12 and G19.

    Each satellite gets 240 samples at 30 s spacing whose elevation climbs
    from 5 to 30 degrees and descends again. ``snr_l1`` follows the
    interference formula coded below; the frame is returned after passing
    through ``tag_rising_setting``.
    """
    epochs = pd.date_range('2024-01-01 12:00:00', periods=240, freq='30s')
    lam = 0.1903      # ~L1
    h = 2.0           # presumably the reflector height -- confirm units
    rho = 0.5         # weight of the interference term
    phi_r = np.pi     # constant phase offset added to dphi

    azimuth_by_prn = {'G12': 60.0, 'G19': 140.0}
    elev = np.concatenate([np.linspace(5, 30, 120), np.linspace(30, 5, 120)])

    records = []
    for prn, az in azimuth_by_prn.items():
        for t, E in zip(epochs, elev):
            dphi = (4*np.pi*h/lam)*np.sin(np.deg2rad(E))
            snr = 45 + 10*np.log10(1 + rho**2 + 2*rho*np.cos(dphi+phi_r))
            records.append(
                {'time': t, 'prn': prn, 'elev_deg': E, 'az_deg': az, 'snr_l1': snr}
            )
    return tag_rising_setting(pd.DataFrame(records))

try:
    df = read_snr_text(path)
    df = tag_rising_setting(df)
except Exception as e:
    # Deliberate best-effort fallback, but report why the real file was not
    # used so a malformed input is not mistaken for a successful load.
    print(f'Could not load {path!r} ({type(e).__name__}: {e}).')
    print('Using synthetic demo (set path to your file to load real data).')
    df = synth_snr_dataset()

# --- Segment arcs ---
segments = segment_arcs(df, PRN, BAND, arc_type=ARC, elev_min=E_MIN, elev_max=E_MAX,
                        max_gap_s=MAX_GAP_S, min_points=MIN_POINTS)

print(f'Found {len(segments)} usable segments for {PRN} ({ARC}).')

# --- Process each segment ---
processed = []
for i, g in enumerate(segments, start=1):
    # Outlier rejection on the raw band values of this segment.
    y_raw = g[BAND].values
    ok = mad_based_filter(y_raw, k=4.0)
    g1 = g.iloc[ok].copy()
    elev = g1['elev_deg'].values
    y = g1[BAND].values

    if DETREND_METHOD == 'poly':
        resid, trend_model = poly_detrend(elev, y, order=POLY_ORDER)
        trend = trend_model(elev)
    else:
        resid, trend = ma_detrend(y, win=MA_WIN)

    xsin = to_sinE(elev)
    resid_z = zscore(resid)

    # Carry over only the metadata columns the input actually has
    # (azimuth, for instance, is optional in the reader).
    meta_cols = [c for c in ('time', 'prn', 'elev_deg', 'az_deg') if c in g1.columns]
    out = g1[meta_cols].copy()
    out['sinE'] = xsin
    out['snr_raw'] = y
    out['trend'] = trend
    out['snr_detrended'] = resid
    out['snr_detrended_z'] = resid_z
    out['segment_id'] = i
    processed.append(out)

clean = pd.concat(processed, ignore_index=True) if processed else pd.DataFrame()
clean.head()



## Visual Checks (first segment)


In [None]:

if clean.empty:
    print("No segments to display.")
else:
    # Plot only the segment that appears first in the cleaned table.
    first_id = int(clean['segment_id'].iloc[0])
    g = clean[clean['segment_id'] == first_id].copy()

    fig, ax = plt.subplots(1, 2, figsize=(12, 4.5))
    raw_ax, resid_ax = ax

    # Left panel: raw samples with the fitted trend on top.
    raw_ax.plot(g['elev_deg'], g['snr_raw'], '.', ms=4)
    raw_ax.plot(g['elev_deg'], g['trend'], '-', lw=2)
    raw_ax.set_xlabel('Elevation (deg)')
    raw_ax.set_ylabel(f'{BAND}')
    raw_ax.set_title('Raw SNR with Trend')
    raw_ax.grid(True)

    # Right panel: residual after trend removal.
    resid_ax.plot(g['elev_deg'], g['snr_detrended'], '.', ms=4)
    resid_ax.set_xlabel('Elevation (deg)')
    resid_ax.set_ylabel('Detrended')
    resid_ax.set_title('Detrended SNR')
    resid_ax.grid(True)
    plt.show()

    # Second figure: standardized residual against sin(E).
    plt.figure(figsize=(7, 4.5))
    plt.plot(g['sinE'], g['snr_detrended_z'], '.', ms=4)
    plt.xlabel('sin(E)')
    plt.ylabel('Detrended Z-score')
    plt.title('Ready for Spectral Analysis in sin(E)')
    plt.grid(True)
    plt.show()



## Export Cleaned Arcs

We export a tidy CSV (one row per sample) ready for FFT/Lomb‑Scargle in \(\sin E\)-space.


In [None]:

# Destination of the tidy per-sample table.
out_path = Path('/mnt/data/cleaned_arcs.csv')
if not clean.empty:
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    clean.to_csv(out_path, index=False)
    print('Wrote:', out_path)
else:
    print('Nothing to export.')



## Exercises
1. Compare **poly** detrending (orders 1–3) vs **moving‑average** detrending; which yields the cleanest oscillations in your site data?
2. Test **elevation windows** (e.g., 7°–23° vs 5°–30°). How does the trade‑off between usable arc length and coherence change?
3. Vary the **MAD threshold k** from 3 to 6 (the workflow above uses k = 4). How sensitive are your spectra to outlier removal?
4. Export two versions of the same arc (different detrending) and compare their dominant frequencies in the next notebook.



## Next Notebook
**`06_FFT_Basics_on_SNR.ipynb`** — We’ll compute FFT in \(\sin E\)-space to identify the dominant frequency and convert it to reflector height.
