# Robust HMF on *SDSS-V BOSS* spectra of hot stars...
...to find evidence of H-alpha emission.

## Authors:
- **David W. Hogg** (NYU) (MPIA) (Flatiron)

## Contributors and acknowledgements:
- **Kareem El-Badry** (Caltech)
- **Johanna Müller-Horn** (MPIA)
- **Hans-Walter Rix** (MPIA)
- **Jaime Villaseñor** (MPIA)
- **Eleonora Zari** (Firenze)

## Comments:
- The `rank` and `nsigma` inputs are currently set by *vibes*.
- The iteration to remove H-alpha-weird stars is not necessarily stable or correct. 

## To-do items:
- Maybe switch to a mode where we train on *all* stars of a certain color, and test on YSO candidates.
- Output a full table of EWs for all stars.
- Make some method (perhaps in `rhmf.py`) to save and restore a Robust HMF model.
- Fit Voigt profiles to emission lines maybe?
- Add more emission lines; see emails from various collaborators.

## Bugs:
- The test-time and EW-measurement parallelizations are not fast, possibly because the worker threads contend for shared state.
- Identify and eliminate wrong Doppler shifts and bad wavelength solutions from the training sets?
- Plots show Bohr wavelengths, not correct NIST wavelengths, for the Hydrogen lines.
- This needs a method to save a model and pick up where it left off.
- Maybe RHMF is the wrong tool for this job?
- Inconsistent variable names; inconsistent uses of underscores, directory names, file names, etc.
- At test time and plotting, produces way too much logging output on the terminal (stdout).
- Maybe should think about `zorder` inputs to plotting calls?

In [None]:
# Cell 1: Import required libraries
import numpy as np
import pandas as pd
from astropy.io import fits
import requests
from requests.auth import HTTPBasicAuth
import os
import concurrent.futures
import matplotlib.pyplot as plt
import rhmf

In [None]:
# global variables

# Lines to measure, as (label, wavelength in Angstrom) pairs.
# this MUST have the strongest emission lines first.
# Entries are also addressed by position elsewhere in this file (for example,
# `LINELIST[0:2]` is taken to be H-alpha and H-beta), so commenting a line in
# or out shifts those indices.
# NOTE(review): 6564.6 matches the vacuum wavelength of H-alpha and the He II
# values look like vacuum wavelengths too, but 4861.36 (and the commented-out
# 4340.46) look like air wavelengths (vacuum H-beta is near 4862.7) -- confirm
# which convention is intended.
LINELIST = [("H-alpha", 6564.6),
            ("H-beta", 4861.36),
            # ("H-gamma", 4340.46),
            ("He II 4201", 4201.015),
            ("He II 4543", 4542.864),
            ("He II 4687", 4687.02)]

In [None]:
# data choices
# `bosstag` names the BOSS reduction; it appears in the cache directory name
# and in the URLs that the download cells build.
bosstag = 'v6_2_1'
# local cache for the summary file, the spectra, and the plots
cachedir = f'./boss_{bosstag}_star_cache'
os.makedirs(cachedir, exist_ok=True)

# Create subdirectory for plots
plot_folder = cachedir + '/plots'
os.makedirs(plot_folder, exist_ok=True)

# Tiny weird github / overleaf interaction hack
# (marks ./make_pdf.py as executable; the shell return code is discarded)
_ = os.system("chmod a+x ./make_pdf.py")

In [None]:
# model choices
# Both values are passed to `rhmf.RHMF(rank, nsigma)` when the models are built.
# NOTE(review): presumably `rank` is the number of basis components and `nsigma`
# the robustness (outlier) scale -- confirm against `rhmf.py`.
rank, nsigma = 24, 3.5

In [None]:
# Define download functions

# SDSS credentials are read from the environment (SDSS_USER, SDSS_PASSWORD) so
# that they never have to be typed into -- and accidentally committed with --
# this file. Both are None when the variables are unset.
user, password = os.environ.get("SDSS_USER"), os.environ.get("SDSS_PASSWORD")

def download_one_file_from_df(args):
    """Download a single file from SDSS into the local cache.

    Parameters
    ----------
    args : tuple
        ``(url, filename, user, password, cachedir)``, packed into a single
        argument so that the function can be handed to ``executor.map``.

    Returns
    -------
    str
        The path of the cached file, or ``""`` if the download failed.
    """
    url, filename, user, password, cachedir = args
    # the second dash-separated token of the file name is the cache subdirectory
    subdir = filename.split("-")[1]
    old_filepath = cachedir + "/" + filename
    filepath = cachedir + "/" + subdir + "/" + filename
    os.makedirs(cachedir + "/" + subdir, exist_ok=True)

    # Migrate a file cached under the old flat layout into its subdirectory.
    # os.replace (rather than a shell `mv`) is safe for paths that contain
    # spaces or shell metacharacters.
    if os.path.exists(old_filepath):
        os.replace(old_filepath, filepath)

    # Skip if already downloaded
    if os.path.exists(filepath):
        return filepath

    try:
        with requests.Session() as session:
            response = session.get(url, auth=HTTPBasicAuth(user, password), timeout=30)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
        if np.random.uniform() < 0.02: # one in fifty
            print(f"Random example: File downloaded: {filepath}")
        return filepath
    except Exception as e:
        # best effort: report the failure and let the caller filter out ""
        print(f"Failed to download {filepath}: {e}")
        return ""

def download_files_from_df(df, user, password, dest_folder, boss_tag='v6_2_1', coadd_version='daily', max_workers=16):
    """Download multiple files from SDSS based on dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have the columns ``SPEC_FILE``, ``FIELD`` (integer) and ``MJD``.
    user, password :
        Credentials used for HTTP basic authentication.
    dest_folder : str
        Local cache directory; created if it does not exist.
    boss_tag, coadd_version : str
        Path components of the download URL.
    max_workers : int
        Number of download threads.

    Returns
    -------
    list of str
        One entry per row of `df`, in row order: the local path of the file,
        or ``""`` where the download failed.
    """
    os.makedirs(dest_folder, exist_ok=True)
    args_list = []

    for spec_file, field, mjd in zip(df['SPEC_FILE'], df['FIELD'], df['MJD']):
        fieldid = f"{field:06d}"
        mjd = str(mjd)
        # fields are grouped on the server in directories named like 015XXX
        fieldidXXX = fieldid[:-3] + 'XXX'
        url = (
            f"https://data.sdss5.org/sas/sdsswork/bhm/boss/spectro/redux/"
            f"{boss_tag}/spectra/{coadd_version}/lite/{fieldidXXX}/{fieldid}/{mjd}/{spec_file}"
        )
        args_list.append((url, spec_file, user, password, dest_folder))

    print(f"Starting attempts to download {len(args_list)} files")
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(download_one_file_from_df, args_list))
    # failed downloads come back as "", so count the non-empty paths
    print(f"Number successful: {sum(bool(r) for r in results)} files")
    return results

In [None]:
# Download and examine the spAll file
spallname = f'spAll-lite-{bosstag}.fits'
summaryurl = f'https://data.sdss5.org/sas/sdsswork/bhm/boss/spectro/redux/{bosstag}/summary/daily/{spallname}.gz'
summaryfile = cachedir + '/' + spallname + '.gz'
summaryfile_uncompressed = cachedir + '/' + spallname

if not os.path.exists(summaryfile_uncompressed):
    if not os.path.exists(summaryfile):
        print(f"Downloading summary file from {summaryurl}")
        response = requests.get(summaryurl, auth=HTTPBasicAuth(user, password))
        # fail loudly on an HTTP error rather than saving the error page as
        # if it were the gzipped summary file
        response.raise_for_status()
        with open(summaryfile, 'wb') as f:
            f.write(response.content)
        print(f"Summary file {summaryfile} downloaded")

    # Decompress (gunzip replaces the .gz file with the uncompressed file)
    os.system(f'gunzip -v {summaryfile}')
    print(f"Summary file {summaryfile} decompressed")
else:
    print(f"Summary file {summaryfile_uncompressed} already exists")

In [None]:
# Load spAll data
# The summary table is taken from HDU 1 of the uncompressed summary file.
with fits.open(summaryfile_uncompressed) as hdul:
    data = hdul[1].data
# Debugging aid: change `False` to `True` to list every column in the table.
if False:
    print("="*70)
    print("ALL AVAILABLE COLUMNS IN SUMMARY FILE")
    print("="*70)
    columns = data.columns.names
    for i, col in enumerate(columns):
        print(f"{i+1:3d}. {col}")
print(f"rows: {len(data)}; columns: {len(data.columns.names)}")

In [None]:
# Select a sample of spectra to download
snrcut, mwm_ob, mwm_yso = 29., True, False
sample_columns = ['SPEC_FILE', 'FIELD', 'MJD', 'SN_MEDIAN_ALL',
                  'BP_MAG', 'RP_MAG', 'FIRSTCARTON', 'PROGRAMNAME']


def _swap_byte_order(column):
    """Byte-swap `column` and relabel its dtype, so that values are unchanged but the order flips.

    This is the NumPy >= 2.0 spelling of ``column.byteswap().newbyteorder()``;
    the ``ndarray.newbyteorder`` method was removed in NumPy 2.0.
    """
    return column.byteswap().view(column.dtype.newbyteorder())


# pandas wants native-byte-order arrays; the FITS table columns are swapped here.
df = pd.DataFrame({col: _swap_byte_order(data[col]) for col in sample_columns})

if not (mwm_ob or mwm_yso):
    raise ValueError("no sample selected: set mwm_ob or mwm_yso to True")
high_snr = df['SN_MEDIAN_ALL'] > snrcut
if mwm_ob:
    high_snr_df = df[high_snr & (df['PROGRAMNAME'] == 'mwm_ob')]
if mwm_yso:
    # as before, the YSO selection takes precedence when both flags are set
    yso_cartons = ['mwm_yso_cluster', 'mwm_yso_s1', 'mwm_yso_s2', 'mwm_yso_s3']
    high_snr_df = df[high_snr & df['FIRSTCARTON'].isin(yso_cartons)]
print(f"Selected {len(high_snr_df)} spectra")

In [None]:
# Download the sample spectra
# `pathnames` gets one entry per selected row: the path of the cached file, or
# "" where the download failed.
pathnames = download_files_from_df(high_snr_df, user, password, cachedir, boss_tag=bosstag, coadd_version='daily', max_workers=32)

In [None]:
# make lists of strings
# keep only the successful downloads (failures are recorded as "")
pathnames = np.array([p for p in pathnames if p], dtype=str)
filenames = np.array([p.split("/")[-1] for p in pathnames], dtype=str)
# strip the first five and last five characters of each file name
# (presumably the "spec-" prefix and the ".fits" suffix)
specnames = np.array([f[5:-5] for f in filenames], dtype=str)
print(pathnames.shape, filenames.shape, specnames.shape)
# show one example, without assuming that at least 14 files exist
if len(pathnames) > 0:
    example = min(13, len(pathnames) - 1)
    print(pathnames[example], filenames[example], specnames[example])

In [None]:
# make rectangular data, plus wavelength grid
# Rows for spectra that cannot be used keep their initial values
# (flux of one, inverse variance of zero).
wavelength = None
N = len(filenames)
print(f"reading {N} files...")
for i, filepath in enumerate(pathnames):

    try:
        with fits.open(filepath) as hdul:
            # files without a usable HDU 1 are skipped without a message
            if not (len(hdul) > 1 and hasattr(hdul[1], 'data')):
                continue
            spec_data = hdul[1].data
            loglam = spec_data['LOGLAM']
            fl = spec_data['FLUX']
            iv = spec_data['IVAR']
            wa = 10**loglam

            # the first readable spectrum fixes the grid and allocates the blocks
            if wavelength is None:
                wavelength = wa
                M = len(wavelength)
                flux = np.ones((N, M))
                ivar = np.zeros_like(flux)

            if not np.allclose(wa, wavelength):
                print(f"  Dropped {filepath}: bad wavelength grid")
                continue

            # normalize each spectrum by its median flux
            scale = np.median(fl)
            flux[i] = fl / scale
            ivar[i] = iv * scale ** 2

    except Exception as e:
        print(f"  Dropped {filepath}: {e}")

print("data blocks:", flux.shape, ivar.shape, np.prod(flux.shape))
print("bad pixels:", np.sum(~ np.isfinite(flux)), np.sum(~ np.isfinite(ivar)),
      np.sum(ivar <= 0.) / np.prod(flux.shape))

In [None]:
# trim data: keep only the pixels strictly inside the chosen wavelength window
good = np.logical_and(wavelength > 3700, wavelength < 10300) # magic
wavelength, flux, ivar = wavelength[good], flux[:, good], ivar[:, good]
print(flux.shape, ivar.shape, wavelength.shape)

In [None]:
# floor and ceil the ivars ## HACK
# Step 1: per pixel, remove negative ivars and cap the ivar at 1e4 / flux^2.
maxivar = 1.e4 / flux ** 2 # magic -- nothing is known to better than 1 percent
ivar = np.clip(ivar, 0., maxivar)
# Step 2: per spectrum, bound the ivars using the median flux of that spectrum.
# Note that `maxivar` is re-used here with a new meaning (one value per row),
# and that the floor is positive, so zero-weight pixels become tiny-weight pixels.
maxivar = 1.e5 / np.median(flux, axis=1) ** 2 # magic -- nothing is known to better than 0.3 percent on average
minivar = 1.e-5 / np.median(flux, axis=1) ** 2 # magic -- there is trivial information even at useless pixels
ivar = np.clip(ivar, minivar[:, None], maxivar[:, None])
print(np.min(ivar), np.max(ivar))

In [None]:
# make two disjoint subsets, A and B
N, M = flux.shape
rng = np.random.default_rng(17) # the most random number
# draw from the seeded generator (not the global numpy state) so that the
# A/B split is the same on every run
foo = rng.uniform(size=N)
A = foo < np.median(foo)
B = np.logical_not(A)
print(np.sum(A), np.sum(B), ~np.any(np.logical_and(A, B)))

In [None]:
# everything goes into two disjoint test sets
# Aidx / Bidx are the integer row indices at which the boolean masks A / B are True.
Aidx = np.arange(N)[A]
Bidx = np.arange(N)[B]
# sanity check: the counts, and that every index points back into its own mask
print(len(Aidx), len(Bidx), np.all(A[Aidx]), np.all(B[Bidx]))

In [None]:
# plotting utility: Hydrogen recombination lines

def hydrogen_line(n_upper, n_lower):
    """Return the vacuum wavelength (Angstrom) of a hydrogen transition from the Rydberg formula.

    Parameters
    ----------
    n_upper, n_lower : int
        Principal quantum numbers of the upper and lower levels.

    Notes
    -----
    The Rydberg constant is corrected for the finite proton mass (reduced
    mass of the electron). Using the infinite-mass constant instead places
    every line about 0.05 percent too blue (H-alpha at 6561.1 rather than
    6564.7 Angstrom). Fine structure is still ignored.
    """
    R_inf = 10973731.568157 # per meter; Rydberg constant for infinite nuclear mass
    proton_electron_mass_ratio = 1836.15267343 # CODATA
    R_H = R_inf / (1. + 1. / proton_electron_mass_ratio) # per meter; hydrogen
    wave_number = R_H * (1/n_lower**2 - 1/n_upper**2) # per meter
    return (1. / np.abs(wave_number)) * 1.e10 # Angstrom

def plot_hydrogen_lines(ax):
    """Draw faint blue vertical lines on `ax` at hydrogen-line wavelengths.

    For each of the lower levels 2 and 3, the 17 transitions from the
    next-higher levels are marked, at the wavelengths given by `hydrogen_line`.
    """
    # plt.axvline(6564.6, color="g", lw=0.5, alpha=0.23) # true H-alpha rather than computed
    n_lines = 17 # magic
    transitions = [(lower, upper)
                   for lower in (2, 3)
                   for upper in range(lower + 1, lower + 1 + n_lines)]
    for lower, upper in transitions:
        ax.axvline(hydrogen_line(upper, lower), color="b", lw=0.5, alpha=0.23, zorder=-1)

In [None]:
# plotting utility: Hogg cares about wavelength axes.

def hogg_wavelength_axis(ax, ws):
    """Format the x-axis of `ax` as a logarithmic wavelength axis and mark the He lines.

    Parameters
    ----------
    ax : matplotlib axes
        The axes to decorate. Everything is drawn on this object, not on
        whatever happens to be the current pyplot axes.
    ws : array-like
        Wavelengths; their minimum and maximum set the x-limits.

    Returns
    -------
    The same `ax`, for convenience.
    """
    # plot_hydrogen_lines(ax)
    for label, line in LINELIST:
        if label.startswith("He"):
            ax.axvline(line, color="r", lw=0.5, alpha=0.23, zorder=-1)
    ax.semilogx()
    # explicit ticks with plain-number labels instead of powers of ten
    ticks = [3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]
    ticklabels = [str(tick) for tick in ticks]
    ax.set_xticks(ticks, ticklabels)
    ax.set_xlim(np.min(ws), np.max(ws))
    # ax.set_xlim(6500, 6600) # zoom in
    ax.set_xlabel('wavelength')
    return ax

In [None]:
# plot the eigenvectors of a model

def plot_G(model, waves, title):
    """Plot every row of `model.G` against `waves` and save the figure as a PNG.

    Each row is scaled by ten and shifted up by its row number so that the
    curves stack. The file goes into `plot_folder`, named after `title` with
    spaces turned into underscores.
    """
    fig = plt.figure(figsize=(12,8))
    for offset, component in enumerate(model.G):
        plt.step(waves, offset + 10. * component,
                 where='mid', lw=0.5, alpha=0.90)
    plt.ylim(-1., model.K)
    hogg_wavelength_axis(plt.gca(), waves)
    plt.title(title)

    fn = plot_folder + "/" + title.replace(" ", "_") + ".png"
    print(f"writing file {fn}")
    plt.savefig(fn)
    plt.close(fig)

In [None]:
# plot a spectrum and a synthetic spectrum and residuals

def plot_one_spectrum(waves, flux, ivar, name, prefix, synth=None, verbose=False, legend=None):
    """Plot one spectrum with its uncertainty band and save the figure as a PNG.

    If `synth` is given, the synthetic spectrum (red), the residual
    ``flux - synth`` (black) and a red zero line are drawn as well. The title
    is `name`, followed by `legend` when that is given. The file is written
    to `plot_folder` as ``prefix + name + '.png'``.
    """
    fig = plt.figure(figsize=(12, 4))
    step_style = dict(where='mid', lw=0.5, alpha=0.90)

    # the data, with a one-sigma band around them
    plt.axhline(0., lw=0.5, color='k', alpha=0.45)
    plt.step(waves, flux, color='k', **step_style)
    # `tiny` keeps the band finite where the inverse variance is zero
    tiny = 0.01 / np.median(flux) ** 2
    sigma = 1. / np.sqrt(ivar + tiny)
    plt.fill_between(waves, flux - sigma, flux + sigma,
                     step='mid', color='k', alpha=0.23)

    # residual, model, and a zero line for the residual
    if synth is not None:
        for curve, color in ((flux - synth, 'k'),
                             (synth, 'r'),
                             (np.zeros_like(flux), 'r')):
            plt.step(waves, curve, color=color, **step_style)

    # adjust axes
    scale = np.nanpercentile(flux, 90)
    plt.ylim(-0.15 * scale, 1.5 * scale)
    plt.ylabel('flux')
    plt.title(name if legend is None else name + " " + legend)
    hogg_wavelength_axis(plt.gca(), waves)

    # Save plot
    plot_filename = plot_folder + '/' + prefix + name + '.png'
    plt.savefig(plot_filename)
    plt.close(fig)
    if verbose:
        print(f"  Plot saved: {plot_filename}")

In [None]:
# make test step but with a list of lines held out (like, say, H-alpha, H-beta, H-delta)

def censored_cross_test(Y, W, waves, models, lines, delta):
    """Run `cross_test` with zero weight on every pixel within `delta` of any of `lines`.

    `W` itself is not modified; the censoring is applied to a copy.
    """
    # one mask for all the held-out windows
    censored = np.zeros_like(waves, dtype=bool)
    for center in lines:
        censored |= (waves > (center - delta)) & (waves < (center + delta))

    W_line = 1. * W # copy
    W_line[:, censored] = 0.
    return cross_test(Y, W_line, models)

In [None]:
# cross test: Test A objects with model B, and B objects with model A.

def cross_test(Y, W, models, max_workers=16):
    """Synthesize each test row of `Y` with the model assigned to it.

    `models` maps a name to ``(model, test_indices)``. For every index in
    `test_indices`, ``rhmf.test`` is called with that row of `Y` and `W` and
    with the model's `G` and `Q2`. Rows that belong to no test set stay NaN
    in the returned array.
    """
    def report(block):
        print("cross_test(): synthesizing with", np.sum(np.isnan(block)), "pixels to go")

    synth = np.zeros_like(Y) + np.nan
    report(synth)
    for model, testidx in models.values():
        G = 1. * model.G
        Q2 = model.Q2
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            rows = pool.map(lambda i: (i, rhmf.test(Y[i], W[i], G, Q2)), testidx)
            for i, row in rows:
                synth[i] = row
        report(synth)
    return synth

In [None]:
# punk the rhmf model to do robust polynomial fitting
# the requirement to do this shows that I am living my life wrong

def robust_polyfit(ys, ws, xs, degree, nsigma=5.0):
    """Fit a polynomial in `xs` to `ys` with weights `ws`, by way of an `rhmf.RHMF` object.

    The monomials ``xs ** 0`` through ``xs ** degree`` are stacked as rows and
    handed to `rhmf.RHMF` as its `G`. The object's `M` and `trained`
    attributes are then set by hand so that `test` can be called without any
    training.

    Returns whatever ``poly.test(ys, ws)`` returns.
    NOTE(review): the caller indexes the result like an array aligned with
    `xs`, so this is presumably the fitted polynomial evaluated at `xs` --
    confirm against `rhmf.py`.
    """
    # design matrix: one row per power of xs
    design = np.vstack([xs ** d for d in range(degree + 1)])
    poly = rhmf.RHMF(degree + 1, nsigma, G=design)
    # pretend that the model has been trained on data of this length
    poly.M = len(xs)
    poly.trained = True
    return poly.test(ys, ws)

def continuum_at_line(inys, inws, inxs, line):
    """Estimate the continuum at `line` from a quadratic fit to the surrounding pixels.

    Pixels within 300 of `line` are used, with the pixels within 10 of `line`
    given zero weight. A zero-weight point is appended exactly at `line`, and
    the fit result at that last point is returned.
    """
    window = np.abs(inxs - line) < 300 # magic
    # hackitey hack: the extra point at the line center does not influence the fit
    xs = np.append(inxs[window], line)
    ys = np.append(inys[window], 0.)
    ws = np.append(inws[window], 0.)
    ws[np.abs(xs - line) < 10] = 0. # magic
    fit = robust_polyfit(ys, ws, xs, 2)
    return fit[-1]

def integrate_line(dys, ws, xs, line):
    """
    Integrate `dys` over the pixels within 4 of `line`; return (integral, uncertainty).

    The uncertainty uses a single empirical variance: one quarter of the 95th
    percentile of ``dys ** 2`` over the pixels within 400 of `line`.
    `ws` is accepted but not used.

    ## bugs:
    - assumes terrible things about `xs`.
    - has a terrible (but empirical) uncertainty analysis
    """
    # pixel widths: half the distance to each neighbor; the end pixels get zero
    to_previous = np.abs(xs - np.roll(xs, 1))
    to_next = np.abs(np.roll(xs, -1) - xs)
    widths = 0.5 * to_previous + 0.5 * to_next
    widths[0] = 0.
    widths[-1] = 0.

    offset = np.abs(xs - line)
    wide = offset < 400.0 # too far?
    core = offset < 4.0 # too close?
    variance = 0.25 * np.percentile(dys[wide] ** 2, 95) # woah cray

    core_widths = widths[core]
    integral = np.sum(dys[core] * core_widths)
    uncertainty = np.sqrt(np.sum(variance * core_widths * core_widths))
    return integral, uncertainty

def line_ew(ys, ws, xs, ss, line):
    """Return ``(ew, ew_error)`` for `line`: the integrated residual ``ys - ss`` over the continuum.

    The integral and its uncertainty come from `integrate_line`, and the
    continuum level from `continuum_at_line`.
    """
    residual = ys - ss
    line_flux, line_flux_err = integrate_line(residual, ws, xs, line)
    continuum = continuum_at_line(ys, ws, xs, line)
    return line_flux / continuum, line_flux_err / continuum

def measure_all_line_ews(Y, W, waves, S, max_workers=100):
    """Measure the EW of every `LINELIST` line in every spectrum, using a thread pool.

    Returns two arrays of shape ``(len(Y), len(LINELIST))``: the EWs and
    their uncertainties, as computed by `line_ew`.
    """
    n_spectra, n_lines = len(Y), len(LINELIST)
    ews = np.full((n_spectra, n_lines), np.nan)
    ewerrs = np.full((n_spectra, n_lines), np.nan)

    def measure_one_index(i):
        # occasional progress report
        if i % 10000 == 0:
            print("measure_all_line_ews:", i)
        measurements = []
        for _, line in LINELIST:
            measurements.append(line_ew(Y[i], W[i], waves, S[i], line))
        return i, measurements

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i, measurements in executor.map(measure_one_index, range(n_spectra)):
            ews[i] = [ew for ew, _ in measurements]
            ewerrs[i] = [err for _, err in measurements]
    return ews, ewerrs

In [None]:
# the full train and test pipeline

def _plot_ew_outliers(Y, W, waves, names, synth, ews, ewerrs, p, sign, prefix, pdfname,
                      threshold=5., maxplots=100):
    """Plot the spectra with a significant line-`p` EW of the given `sign`, and bundle the plots in a PDF.

    `sign` is +1 for emission-like objects and -1 for absorption-like objects.
    At most `maxplots` objects are plotted, in row order. Plots left over from
    a previous call with the same `prefix` are removed first.
    """
    significance = sign * ews[:, p] / ewerrs[:, p]
    idx = np.flatnonzero(significance > threshold)[:maxplots]

    pattern = f"{plot_folder}/{prefix}*.png"
    os.system(f"rm -f {pattern}")
    signtext = "-" if sign < 0 else ""
    for i in idx:
        # the file name and the legend carry the absolute size of the EW
        magnitude = sign * ews[i, p]
        thisprefix = prefix + f"{magnitude:4.2f}_"
        legend = f"{LINELIST[p][0]} EW = {signtext}{magnitude:4.2f}+/-{ewerrs[i, p]:4.2f} Ang"
        plot_one_spectrum(waves, Y[i], W[i], names[i], thisprefix, synth=synth[i],
                          legend=legend)
    os.system(f"./make_pdf.py {pattern} {pdfname}")


def train_and_test(Y, W, waves, names, models, maxiter=10):
    """Train every model, cross-test all spectra, measure line EWs, and plot the outliers.

    Parameters
    ----------
    Y, W : arrays with one row per spectrum
        Fluxes and inverse variances.
    waves : array
        Wavelength grid shared by all spectra.
    names : sequence of str
        One name per spectrum, used in plot titles and file names.
    models : dict
        Maps a label to ``(model, test_indices)``; each model must already
        have its training data set.
    maxiter : int
        Passed to ``model.train``.

    Returns
    -------
    synth_ex_lines, ews, ewerrs
        The synthetic spectra (made with the pixels around the first two
        `LINELIST` lines given zero weight), and the EWs and EW uncertainties
        for every spectrum and every line of `LINELIST`.
    """
    # train step
    for label, (model, _) in models.items():
        print(label, model.Y.shape)
        model.train(maxiter=maxiter)
        plot_G(model, waves, "model " + label)
        plt.show()

    # test step
    censor_lines = [foo[1] for foo in LINELIST[0:2]] # H-alpha, H-beta
    delta = 4. # Angstroms, magic
    synth_ex_lines = censored_cross_test(Y, W, waves, models, censor_lines, delta)

    # measure EWs -- on the data that were passed in, not on the notebook globals
    ews, ewerrs = measure_all_line_ews(Y, W, waves, synth_ex_lines)

    # plot interesting emission and absorption objects
    p = -1 # He II
    _plot_ew_outliers(Y, W, waves, names, synth_ex_lines, ews, ewerrs, p,
                      +1, "emitter_", "foo.pdf")
    _plot_ew_outliers(Y, W, waves, names, synth_ex_lines, ews, ewerrs, p,
                      -1, "absorber_", "bar.pdf")

    return synth_ex_lines, ews, ewerrs

In [None]:
def make_training_set(idx, considerations=None, target_size=1000):
    """Choose a regularly strided subset of `idx` of roughly `target_size` elements.

    Parameters
    ----------
    idx : array-like of int
        Candidate indices.
    considerations : array-like of bool, optional
        Same length as `idx`; only entries that are True are eligible.
        Defaults to all True.
    target_size : int
        Desired number of elements. The stride is the integer ratio of
        the number of eligible entries to `target_size`, so between
        `target_size` and just under twice that many are returned when
        enough are eligible, and all eligible entries otherwise.

    Returns
    -------
    numpy.ndarray
        The chosen elements of `idx`.

    Raises
    ------
    ValueError
        If `idx` and `considerations` differ in length.
    """
    idx = np.asarray(idx)
    if considerations is None:
        considerations = np.ones(len(idx), dtype=bool)
    else:
        considerations = np.asarray(considerations, dtype=bool)
    if len(idx) != len(considerations):
        raise ValueError("idx and considerations must have the same length")

    factor = max(1, np.count_nonzero(considerations) // target_size)
    train = (idx[considerations])[::factor]
    print("make_training_set(): chose", len(train), "from", len(idx))
    return train

In [None]:
# start models
# format of each entry of `models` is `name: (model, test_indices)`
models = {"A": (rhmf.RHMF(rank, nsigma), Bidx),  # model A used for B test set and vice versa
          "B": (rhmf.RHMF(rank, nsigma), Aidx)}
# each model trains on a strided subsample of its own half of the data
Atrain = make_training_set(Aidx, target_size=500)
Btrain = make_training_set(Bidx, target_size=500)
models["A"][0].set_training_data(flux[Atrain], ivar[Atrain])
models["B"][0].set_training_data(flux[Btrain], ivar[Btrain])
# report the shapes of the training data held by each model
for key in models.keys():
    print(key, models[key][0].Y.shape, models[key][0].input_W.shape)

In [None]:
# start training
# first pass: train both models, cross-test every spectrum, and measure the EWs
synth, ews, ewerrs = train_and_test(flux, ivar, wavelength, specnames, models, maxiter=10)
print(synth.shape, ews.shape)

In [None]:
# EW-vs-EW scatter plots of several lines against He II 4687.
# The columns of `ews` follow `LINELIST`, so look them up by label; fixed
# integer indices go stale (or out of range) when a line is commented in or out.
# NOTE(review): the labels are inferred from the original indices 5 and
# [0, 3, 4], which match a six-entry list that includes H-gamma -- confirm.
line_index = {label: k for k, (label, _) in enumerate(LINELIST)}
q = line_index["He II 4687"]
for p in [line_index[label] for label in ("H-alpha", "He II 4201", "He II 4543")]:
    good = (ewerrs[:, p] < 0.2) & (ewerrs[:, q] < 0.2) # magic numbers
    if not np.any(good):
        print(f"no stars pass the EW-uncertainty cut for {LINELIST[p][0]} vs {LINELIST[q][0]}; skipping plot")
        continue
    plt.axvline(0., color="k", lw=1, alpha=0.5)
    plt.axhline(0., color="k", lw=1, alpha=0.5)
    plt.scatter(ews[good, p], ews[good, q], color="k", s=2, alpha=0.45)
    # symmetric axis ranges, set by the 99th percentile of each EW
    foo = np.percentile(ews[good, p], 99)
    bar = np.percentile(ews[good, q], 99)
    plt.xlim(-4 * foo, 4. * foo)
    plt.ylim(-4 * bar, 4. * bar)
    plt.xlabel(LINELIST[p][0] + " EW (Ang)")
    plt.ylabel(LINELIST[q][0] + " EW (Ang)")
    plt.show()

In [None]:
# train even more, but always removing stars with H-alpha emission-line issues
# remove symmetrically because otherwise we may be doomed.
# Each pass re-selects the training sets from the stars whose H-alpha EW
# (column 0 of `ews`) is within 2 sigma of zero in the previous pass, then
# retrains and re-measures. The target training-set size and the iteration
# limit both grow with the pass number.
for t in range(4):
    print("iteration", t + 1)
    noHa = (np.abs(ews[:, 0] / ewerrs[:, 0]) < 2.)
    print("fraction with no detectable narrow H-alpha deviation:", np.sum(noHa) / len(ews))
    Atrain = make_training_set(Aidx, considerations = noHa[Aidx], target_size=1000 + 200 * t)
    Btrain = make_training_set(Bidx, considerations = noHa[Bidx], target_size=1000 + 200 * t)
    models["A"][0].set_training_data(flux[Atrain], ivar[Atrain])
    models["B"][0].set_training_data(flux[Btrain], ivar[Btrain])
    synth, ews, ewerrs = train_and_test(flux, ivar, wavelength, specnames, models, maxiter=50 + 10 * t)

In [None]:
# one more pass with the current models and training sets, with few training iterations
synth, ews, ewerrs = train_and_test(flux, ivar, wavelength, specnames, models, maxiter=5)