# Identify LFCs Giving Outrageous Eigen-Coefficients

In [None]:
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
from astropy.time import Time
from scipy import interpolate
from scipy.optimize import minimize, least_squares, curve_fit
from mpfit import mpfit

from tqdm import tqdm
import seaborn as sns

In [None]:
# Gather files
lfc_files = glob('/mnt/home/lzhao/ceph/lfc5a/LFC_*.fits')
num_files = len(lfc_files)
print(f'Number of files: {num_files}')

# Set some useful, general values (taken from the first file found).
# The context manager closes the FITS file even if one of the columns is
# missing; copies are taken so the arrays do not depend on the open file.
with fits.open(lfc_files[0]) as hdus:
    t_spec = hdus[1].data['spectrum'].copy()
    t_errs = hdus[1].data['uncertainty'].copy()
    t_mask = hdus[1].data['pixel_mask'].copy()
nord, npix = t_spec.shape

# Order indices 41..75 — presumably the orders containing LFC lines (confirm)
lfc_orders = range(41,76)

In [None]:
# Let's take a look at all the LFCs we've gathered
# Each exposure is drawn with a vertical offset and colour set by its integer
# MJD relative to mjd_zero, so spectra from the same night overlap.
mjd_zero = 58696
px_lo, px_hi = 3000, 3050
plt.figure(figsize=(6.4*2,4.8))
plt.title('Epoch 5 LFC Spectra: Order 60')
plt.xlabel('Pixel')
plt.ylabel('Extracted Value + "Time" Offset')
colors = sns.color_palette('plasma',95)
for file_name in tqdm(lfc_files):
    # `with` guarantees the file is closed even if a header key or column
    # lookup fails part-way through the loop.
    with fits.open(file_name) as hdus:
        mjd = Time(hdus[0].header['MIDPOINT'],format='isot').mjd
        night = int(mjd) - mjd_zero
        plt.plot(range(px_lo,px_hi),
                 hdus[1].data['spectrum'][60][px_lo:px_hi] + night/10.,
                 color=colors[night],alpha=0.1)
plt.xlim(px_lo,px_hi)
plt.ylim(0,10)
# Green lines mark the offsets corresponding to 2019-09-22 and 2019-10-06
plt.axhline((int(Time('2019-09-22',format='isot').mjd)-mjd_zero)/10,color='g')
plt.axhline((int(Time('2019-10-06',format='isot').mjd)-mjd_zero)/10,color='g')
plt.tight_layout()
plt.savefig('./Figures/191113_lfcShifts.png')

In [None]:
# Identify unique nights of LFC data
# The night label is the text between the last underscore of the file name
# and the first dot that follows it.
night_labels = [os.path.basename(path).split('_')[-1].split('.')[0]
                for path in lfc_files]
lfc_dates = np.empty_like(lfc_files)
lfc_dates[:] = night_labels
chronological = np.argsort(lfc_dates.astype(int))
np.unique(lfc_dates[chronological])

## Let's Find Those Bad Exposures

In [None]:
# NOTE(review): this star import is presumably what brings readParams,
# readThid, interp_train_and_predict and TruncatedSVD into scope — none of
# them is imported explicitly in this notebook; confirm against waveCal.
from waveCal import *
# Checkpoint (.npy) files matching LFC_19* in the ckpt5a directory
ckpt_files  = glob('/mnt/home/lzhao/ceph/ckpt5a/LFC_19*.npy')

In [None]:
# Sort files by date:
# The sort key is the text after the last underscore of the file name, with
# the extension stripped ('.npy' -> 4 chars, '.fits' -> 5 chars), as a float.
file_times = np.array(
    [float(os.path.basename(path).split('_')[-1][:-4]) for path in ckpt_files])
ckpt_files = np.array(ckpt_files)[np.argsort(file_times)]

file_times = np.array(
    [float(os.path.basename(path).split('_')[-1][:-5]) for path in lfc_files])
lfc_files = np.array(lfc_files)[np.argsort(file_times)]

In [None]:
# Read in data, construct wavelength solution, make pretty for PCA
def pcaSetup(file_list, x_range=(500,7000), m_range=(45,75),
             allow_file_error=True, vet_pxls=True, vet_exps=True):
    """Build the exposure-by-grid-point matrix of wavelength solutions for PCA.

    Every file is read and its solution is evaluated on a common flattened
    (pixel, order) grid with ``interp_train_and_predict``.  Exposures and grid
    points with too few finite values can be dropped, and any remaining
    non-finite entry is replaced by the per-grid-point mean (a crude
    imputation, kept from the original implementation).

    Parameters
    ----------
    file_list : sequence of str
        Files to read.  If the first name ends in ``.thid`` every file is
        read with ``readThid`` (no per-line error array); otherwise
        ``readParams`` is used.
    x_range, m_range : tuple
        Arguments passed to ``np.arange`` to build the pixel and order axes.
    allow_file_error : bool
        If True, a ``ValueError`` raised while reading or interpolating a
        file leaves that exposure all-NaN; if False the error is re-raised.
    vet_pxls : bool
        Drop grid points that have 3 or fewer finite values across exposures.
    vet_exps : bool
        Drop exposures that have 3 or fewer finite grid points.

    Returns
    -------
    w_fit_array : ndarray, shape (n_kept_exposures, n_kept_grid_points)
    mean_w_fit : ndarray, per-grid-point nan-mean of ``w_fit_array``
    used_files : ndarray of str
        The entries of ``file_list`` matching the rows of ``w_fit_array``
        one-to-one.  (Previously, with ``vet_exps=False``, this was a list of
        basenames of the successfully read files only, which did not line up
        with the rows and could not be indexed with a boolean mask.)
    """
    # Construct wavelength "grids"
    x_grid, m_grid = np.meshgrid(np.arange(*x_range).astype(float),
                                 np.arange(*m_range).astype(float))
    x_grid = x_grid.flatten()
    m_grid = m_grid.flatten()

    # Pick the reader from the extension of the first file
    if file_list[0].split('.')[-1] == 'thid':
        def readFunc(file_name):
            x, m, w = readThid(file_name)
            return x, m, None, w
    else:
        readFunc = readParams

    # Load in all wavelength solutions; rows start as NaN so that a failed
    # exposure is simply left "all bad".
    w_fit_array = np.full((len(file_list), len(x_grid)), np.nan)
    print('Reading in files')
    for i, file_name in enumerate(tqdm(file_list)):
        try:
            x, m, e, w = readFunc(file_name)
            w_fit_array[i] = interp_train_and_predict(x_grid, m_grid, x, m, w, e)
        except ValueError:
            if not allow_file_error:
                raise
            w_fit_array[i, :] = np.nan

    # Bad lines/exposure
    used_files = np.array(file_list)
    good = np.isfinite(w_fit_array)
    if vet_exps:
        exp_okay = np.sum(good, axis=1) > 3
        w_fit_array = w_fit_array[exp_okay, :]
        print(f"Not okay Exposures: {np.sum(~exp_okay)}")
        print(used_files[~exp_okay])
        used_files = used_files[exp_okay]
    if vet_pxls:
        # Counted over all exposures read, including any dropped just above
        # (unchanged from the original behaviour).
        pxl_okay = np.sum(good, axis=0) > 3
        w_fit_array = w_fit_array[:, pxl_okay]
        print(f"Not okay Pixels: {np.sum(~pxl_okay)}")
        # NOTE(review): the surviving grid is not returned, so callers that
        # rebuild the full grid will be out of step if any point was dropped.

    # Find mean wavelength pixel by pixel, then replace bad pixels with it.
    # THIS IS TERRIBLE (original author's caveat): the imputed values carry
    # no information about the exposure they are inserted into.
    bad = ~np.isfinite(w_fit_array)
    mean_w_fit = np.nanmean(w_fit_array, axis=0)
    w_fit_array[bad] = np.broadcast_to(mean_w_fit, w_fit_array.shape)[bad]

    return w_fit_array, mean_w_fit, used_files

In [None]:
# Build the PCA input matrix from the ckpt5a files (default grid and vetting)
w_fit_array, mean_w_fit, used_files = pcaSetup(ckpt_files)

In [None]:
# Find eigenvectors
# NOTE(review): if TruncatedSVD is scikit-learn's, fit_transform already
# returns U*Sigma, so `ec` below is scaled by the singular values a second
# time.  Left unchanged because absolute thresholds elsewhere in this
# notebook (e.g. 5e9) depend on this scaling — confirm before changing.
svd = TruncatedSVD(n_components=5,n_iter=7,random_state=42)
residuals = w_fit_array - mean_w_fit
uu = svd.fit_transform(residuals)
ss, vv = svd.singular_values_, svd.components_
# Column-wise scaling; same result as uu.dot(np.diag(ss))
ec = uu * ss

In [None]:
# Identify 3 sigma outliers
# One flag per exposure (row of ec).  The original initialised the mask from
# ec.shape[1], which produced a 0-d array that only worked by broadcasting.
# Flags accumulate over the first three coefficients.
mask = np.zeros(ec.shape[0],dtype=bool)
for i in range(3):
    coeff = ec[:,i]
    plt.plot(coeff)
    mask = np.logical_or(mask,abs(coeff) > (3*np.std(coeff)))
    # Red dots: every exposure flagged so far, drawn on the current coefficient
    plt.plot(np.flatnonzero(mask),coeff[mask],'r.')
plt.tight_layout()
plt.savefig('./Figures/191113_outliers1.png')

In [None]:
# Which were the bad files?
# `mask` is a per-exposure boolean, so it indexes the file array directly.
used_files[mask]

In [None]:
# How do the eigenvectors look?
# Identify areas giving trouble
# The flattened (pixel, order) grid is rebuilt here with the same ranges as
# the pcaSetup defaults.
# NOTE(review): vv[k] only matches this grid if no grid point was dropped.
x_range = np.arange(500,7000).astype(float)
m_range = np.arange(45,75).astype(float)
x_grid, m_grid = (grid.flatten() for grid in np.meshgrid(x_range,m_range))
for k in range(3):
    component = vv[k]
    plt.figure()
    plt.scatter(x_grid, m_grid, c=component)
    plt.title("eigenvector {:d}".format(k))
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f'./Figures/191113_eigenvector{k}.png')

In [None]:
# Plot troublesome areas and identify differences in outlier spectra
# One panel per (order index, first pixel, last pixel); the five copy-pasted
# plotting statements of the original are driven from this table instead.
panels = [(45,500,1000),(46,6500,7000),(47,6500,7000),(48,500,1000),(49,500,1000)]
plt.figure(figsize=(6.4*3,4.8*5))
axes = []
for row, (order, lo, hi) in enumerate(panels, start=1):
    ax = plt.subplot(len(panels),1,row)
    ax.set_title(f'Order {order}')
    axes.append(ax)
ax1, ax2, ax3, ax4, ax5 = axes

# Colour by observation number, counted from 1062
colors = sns.color_palette('plasma',1150-1062)
for file_name in tqdm(lfc_files):
    date,obsn,_ = os.path.basename(file_name).split('_')[-1].split('.')
    if date != '190818':
        continue

    # `with` closes the file even if the read fails
    with fits.open(file_name) as hdus:
        spec = hdus[1].data['spectrum'].copy()

    for ax, (order, lo, hi) in zip(axes, panels):
        ax.plot(range(lo,hi),spec[order,lo:hi],color=colors[int(obsn)-1062],alpha=0.1)

# Overplot the flagged exposures in black; the spectrum file shares the
# checkpoint file's base name with a .fits extension.
for file_name in used_files[mask]:
    spec_name = '/mnt/home/lzhao/ceph/lfc5a/'+os.path.basename(file_name)[:-4]+'.fits'
    with fits.open(spec_name) as hdus:
        spec = hdus[1].data['spectrum'].copy()

    for ax, (order, lo, hi) in zip(axes, panels):
        ax.plot(range(lo,hi),spec[order,lo:hi],color='k')

for ax, (order, lo, hi) in zip(axes, panels):
    ax.set_xlim(lo,hi)

plt.tight_layout()
plt.savefig('./Figures/191113_problems1.png')

After a first round of outlier cuts, the clear issue is low signal.  Let's now test some cuts on signal and then iterate again without these outliers.

In [None]:
# Plot errors of the line fits (can we make a cut from that?)
# One panel per order; every exposure is drawn faintly, flagged ones in black.
panel_titles = {41: 'Order 41', 42: 'Order 42', 43: 'Order 43',
                44: 'Order 44', 45: 'Order 45'}
plt.figure(figsize=(6.4*3,4.8*5))
axes = [plt.subplot(5,1,row) for row in range(1,6)]
for ax, title in zip(axes, panel_titles.values()):
    ax.set_title(title)
ax1, ax2, ax3, ax4, ax5 = axes
colors = sns.color_palette('plasma',1150-1062)
num_lines = np.zeros_like(used_files,dtype=float)
for idx, file_name in enumerate(tqdm(used_files)):
    x,m,e,w = readParams(file_name)
    # Length of the error array is taken as the number of fitted lines
    num_lines[idx] = len(e)

    for ax, order in zip(axes, panel_titles):
        in_order = (m == order)
        ax.plot(x[in_order],e[in_order],alpha=0.1)

for file_name in used_files[mask]:
    x,m,e,w = readParams(file_name)

    for ax, order in zip(axes, panel_titles):
        in_order = (m == order)
        ax.plot(x[in_order],e[in_order],color='k')

plt.tight_layout()
#plt.savefig(f'./Figures/191113_problems1.png')

In [None]:
# Outliers just have less lines in general, let's cut for that
# Count the fitted lines in every exposure (length of the error array).
num_lines = np.zeros_like(used_files,dtype=float)
for i, file_name in enumerate(tqdm(used_files)):
    x,m,e,w = readParams(file_name)
    num_lines[i] = len(e)

# Mark the flagged exposures in red.  Their counts are already in num_lines,
# so the files are not read from disk a second time.
for flagged_count in num_lines[mask]:
    plt.axvline(flagged_count,color='r')
plt.hist(num_lines,50);

## Iteration Two
Cut out exposures with fewer than 15,000 lines.  Fewer lines tend to correspond to exposures with lower signal, and therefore to either orders without any lines or teeny-tiny lines that are hard to find.

In [None]:
def pcaSetup(file_list, x_range=(500,7000), m_range=(45,75),
             allow_file_error=True, vet_pxls=True, vet_exps=True,
             verbose=False, min_lines=None):
    """Build the exposure-by-grid-point matrix of wavelength solutions for PCA.

    Every file is read and, if it has enough lines, its solution is evaluated
    on a common flattened (pixel, order) grid with
    ``interp_train_and_predict``.  Exposures and grid points with too few
    finite values can be dropped, and any remaining non-finite entry is
    replaced by the per-grid-point mean (a crude imputation, kept from the
    original implementation).

    Parameters
    ----------
    file_list : sequence of str
        Files to read.  If the first name ends in ``.thid`` every file is
        read with ``readThid`` (no per-line error array); otherwise
        ``readParams`` is used.
    x_range, m_range : tuple
        Arguments passed to ``np.arange`` to build the pixel and order axes.
    allow_file_error : bool
        If True, a ``ValueError`` raised while reading or interpolating a
        file leaves that exposure all-NaN; if False the error is re-raised.
    vet_pxls : bool
        Drop grid points that have 3 or fewer finite values across exposures.
    vet_exps : bool
        Drop exposures that have 3 or fewer finite grid points.
    verbose : bool
        Print which exposures were rejected and how many grid points dropped.
    min_lines : int or None
        Minimum number of lines an exposure must have to be used.  ``None``
        keeps the historical defaults: 0 for ``.thid`` files, 15000
        otherwise.

    Returns
    -------
    w_fit_array : ndarray, shape (n_kept_exposures, n_kept_grid_points)
    mean_w_fit : ndarray, per-grid-point nan-mean of ``w_fit_array``
    used_files : ndarray of str
        The entries of ``file_list`` matching the rows of ``w_fit_array``
        one-to-one.  (Previously, with ``vet_exps=False``, this was a list of
        basenames of the accepted files only, which did not line up with the
        rows and could not be indexed with a boolean mask.)
    """
    # Construct wavelength "grids"
    x_grid, m_grid = np.meshgrid(np.arange(*x_range).astype(float),
                                 np.arange(*m_range).astype(float))
    x_grid = x_grid.flatten()
    m_grid = m_grid.flatten()

    # Pick the reader (and the legacy line-count limit) from the extension
    # of the first file
    if file_list[0].split('.')[-1] == 'thid':
        default_min_lines = 0
        def readFunc(file_name):
            x, m, w = readThid(file_name)
            return x, m, None, w
    else:
        default_min_lines = 15000
        readFunc = readParams
    line_requirement = default_min_lines if min_lines is None else min_lines

    # Load in all wavelength solutions; rows start as NaN so that a rejected
    # or failed exposure is simply left "all bad".
    w_fit_array = np.full((len(file_list), len(x_grid)), np.nan)
    print('Reading in files')
    for i, file_name in enumerate(tqdm(file_list)):
        try:
            x, m, e, w = readFunc(file_name)
            # The .thid reader has no error array (e is None); the original
            # len(e) raised TypeError there, so count positions instead.
            n_lines = len(x) if e is None else len(e)
            if n_lines < line_requirement:
                # A total-line cut is coarse; a per-order requirement would
                # be more meaningful (original author's note).
                if verbose:
                    print(f'File {file_name} has too few lines')
                continue
            w_fit_array[i] = interp_train_and_predict(x_grid, m_grid, x, m, w, e)
        except ValueError:
            if not allow_file_error:
                raise
            w_fit_array[i, :] = np.nan

    # Bad lines/exposure
    used_files = np.array(file_list)
    good = np.isfinite(w_fit_array)
    if vet_exps:
        exp_okay = np.sum(good, axis=1) > 3
        w_fit_array = w_fit_array[exp_okay, :]
        if verbose:
            print(f"Not okay Exposures: {np.sum(~exp_okay)}")
            print(used_files[~exp_okay])
        used_files = used_files[exp_okay]
    if vet_pxls:
        # Counted over all exposures read, including any dropped just above
        # (unchanged from the original behaviour).
        pxl_okay = np.sum(good, axis=0) > 3
        w_fit_array = w_fit_array[:, pxl_okay]
        if verbose:
            print(f"Not okay Pixels: {np.sum(~pxl_okay)}")
        # NOTE(review): the surviving grid is not returned, so callers that
        # rebuild the full grid will be out of step if any point was dropped.

    # Find mean wavelength pixel by pixel, then replace bad pixels with it.
    # THIS IS TERRIBLE (original author's caveat): the imputed values carry
    # no information about the exposure they are inserted into.
    bad = ~np.isfinite(w_fit_array)
    mean_w_fit = np.nanmean(w_fit_array, axis=0)
    w_fit_array[bad] = np.broadcast_to(mean_w_fit, w_fit_array.shape)[bad]

    return w_fit_array, mean_w_fit, used_files

In [None]:
# Re-run the setup with the redefined pcaSetup, which also rejects exposures
# with too few lines; verbose=True prints what was rejected.
w_fit_array, mean_w_fit, used_files = pcaSetup(ckpt_files,verbose=True)

In [None]:
# Find eigenvectors
# NOTE(review): if TruncatedSVD is scikit-learn's, fit_transform already
# returns U*Sigma, so `ec` below is scaled by the singular values a second
# time.  Left unchanged because absolute thresholds elsewhere in this
# notebook (e.g. 5e9) depend on this scaling — confirm before changing.
svd = TruncatedSVD(n_components=5,n_iter=7,random_state=42)
residuals = w_fit_array - mean_w_fit
uu = svd.fit_transform(residuals)
ss, vv = svd.singular_values_, svd.components_
# Column-wise scaling; same result as uu.dot(np.diag(ss))
ec = uu * ss

In [None]:
# Find time of each exposure
# The MJD is read from the MIDPOINT header of the spectrum file that shares
# the checkpoint file's base name (extension swapped to .fits).
time = np.zeros_like(used_files,dtype=float)
for i, file_name in enumerate(tqdm(used_files)):
    spec_name = '/mnt/home/lzhao/ceph/lfc5a/'+os.path.basename(file_name)[:-4]+'.fits'
    # `with` closes the file even if the header key is missing
    with fits.open(spec_name) as hdus:
        time[i] = Time(hdus[0].header['MIDPOINT'],format='isot').mjd

In [None]:
# Coefficients as a function of time: coefficient 0 on the left axis,
# coefficients 1-4 on a twin right axis.
fig = plt.figure(figsize=(6.4*3,4.8))
ax1 = plt.gca()
ax1.set_title('Coefficients Over Time')
ax1.set_ylabel('Coefficient 0',color=sns.color_palette()[0])
ax1.tick_params(axis='y', labelcolor=sns.color_palette()[0])
ax1.plot(time,ec[:,0],'o-')
mask = (abs(ec[:,0]) > (5e9))
# Outlier markers share the time axis with the curve.  The original plotted
# them against exposure index, which put them far outside the MJD x-range.
ax1.plot(time[mask],ec[mask,0],'o',color=sns.color_palette()[0],mec='r')
ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
for i in range(1,5):
    if i<3:
        # NOTE(review): one-sided cut — only coefficients more than 3 sigma
        # ABOVE their mean are flagged; confirm whether abs() was intended.
        mask = np.logical_or(mask,(ec[:,i]-np.mean(ec[:,i])) > (3 * np.std(ec[:,i])))
    ax2.plot(time,ec[:,i],'o-',color=sns.color_palette()[i])
ax2.set_ylabel('All Other Coefficients')
ax1.set_xlabel('Time [mjd]')
# Faint vertical line at every integer MJD spanned by the data
for i in range((min(time.astype(int))),(max(time.astype(int)))+2):
    plt.axvline(i,color='k',alpha=0.2)
plt.tight_layout()
plt.xlim(58709,58737)
plt.savefig('./Figures/191113_ecVtime.png')

In [None]:
# Files flagged by the cuts above: |coefficient 0| > 5e9, or coefficient 1 or
# 2 more than 3 sigma above its mean.
used_files[mask]

In [None]:
# Map each of the five components over the flattened (pixel, order) grid,
# rebuilt here with the same ranges as the pcaSetup defaults.
# NOTE(review): vv[k] only matches this grid if no grid point was dropped.
x_range = np.arange(500,7000).astype(float)
m_range = np.arange(45,75).astype(float)
x_grid, m_grid = (grid.flatten() for grid in np.meshgrid(x_range,m_range))
for k in range(5):
    component = vv[k]
    plt.figure()
    plt.scatter(x_grid, m_grid, c=component)
    plt.title("eigenvector {:d}".format(k))
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f'./Figures/191113_eigen2vector{k}.png')

### A quick check on other (sub-)epochs.

In [None]:
ckpt5b_files  = glob('/mnt/home/lzhao/ceph/ckpt5b/LFC_19*.npy')

# Sort by the numeric text after the last underscore ('.npy' stripped)
file_times = np.array(
    [float(os.path.basename(path).split('_')[-1][:-4]) for path in ckpt5b_files])
ckpt5b_files = np.array(ckpt5b_files)[np.argsort(file_times)]

w_fit_array, mean_w_fit, used_files = pcaSetup(ckpt5b_files,verbose=True)
# Find eigenvectors
# NOTE(review): if TruncatedSVD is scikit-learn's, fit_transform already
# returns U*Sigma, so `ec` is scaled by the singular values a second time;
# left unchanged because the 5e9 threshold used with it depends on that.
svd = TruncatedSVD(n_components=5,n_iter=7,random_state=42)
residuals = w_fit_array - mean_w_fit
uu = svd.fit_transform(residuals)
ss, vv = svd.singular_values_, svd.components_
# Column-wise scaling; same result as uu.dot(np.diag(ss))
ec = uu * ss

In [None]:
# Coefficient 0 on the left axis, coefficients 1-4 on a twin right axis,
# both against exposure index; flagged exposures get red-edged markers.
palette = sns.color_palette()
exposure_idx = np.arange(len(ec[:,0]))
fig, ax1 = plt.subplots()
ax1.set_title('Coefficients Over Time')
ax1.set_ylabel('Coefficient 0',color=palette[0])
ax1.tick_params(axis='y', labelcolor=palette[0])
ax1.plot(ec[:,0])
mask = abs(ec[:,0]) > 5e9
ax1.plot(exposure_idx[mask],ec[mask,0],'o',color=palette[0],mec='r')
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
for i in range(1,5):
    coeff = ec[:,i]
    if i<3:
        # Only coefficients 1 and 2 add to the (cumulative) mask; the cut is
        # one-sided: more than 3 sigma above the mean.
        above = (coeff - np.mean(coeff)) > (3 * np.std(coeff))
        mask = mask | above
    ax2.plot(coeff,color=palette[i])
    ax2.plot(exposure_idx[mask],coeff[mask],'o',mec='r')
ax2.set_ylabel('All Other Coefficients')
ax1.set_xlabel('Exposure Number, but Kind of Time')
plt.tight_layout()
plt.savefig('./Figures/191113_outliers5b.png')

In [None]:
# Files flagged by the cuts above: |coefficient 0| > 5e9, or coefficient 1 or
# 2 more than 3 sigma above its mean.
used_files[mask]

In [None]:
# Map each of the five components over the flattened (pixel, order) grid,
# rebuilt here with the same ranges as the pcaSetup defaults.
# NOTE(review): vv[k] only matches this grid if no grid point was dropped.
x_range = np.arange(500,7000).astype(float)
m_range = np.arange(45,75).astype(float)
x_grid, m_grid = (grid.flatten() for grid in np.meshgrid(x_range,m_range))
for k in range(5):
    component = vv[k]
    plt.figure()
    plt.scatter(x_grid, m_grid, c=component)
    plt.title("eigenvector {:d}".format(k))
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f'./Figures/191113_eigen5bvector{k}.png')

In [None]:
ckpt5c_files  = glob('/mnt/home/lzhao/ceph/ckpt5c/LFC_19*.npy')

# Sort by the numeric text after the last underscore ('.npy' stripped)
file_times = np.array(
    [float(os.path.basename(path).split('_')[-1][:-4]) for path in ckpt5c_files])
ckpt5c_files = np.array(ckpt5c_files)[np.argsort(file_times)]

w_fit_array, mean_w_fit, used_files = pcaSetup(ckpt5c_files,verbose=True)
# Find eigenvectors
# NOTE(review): if TruncatedSVD is scikit-learn's, fit_transform already
# returns U*Sigma, so `ec` is scaled by the singular values a second time;
# left unchanged because the 5e9 threshold used with it depends on that.
svd = TruncatedSVD(n_components=5,n_iter=7,random_state=42)
residuals = w_fit_array - mean_w_fit
uu = svd.fit_transform(residuals)
ss, vv = svd.singular_values_, svd.components_
# Column-wise scaling; same result as uu.dot(np.diag(ss))
ec = uu * ss

In [None]:
# Coefficient 0 on the left axis, coefficients 1-4 on a twin right axis,
# both against exposure index; flagged exposures get red-edged markers.
palette = sns.color_palette()
exposure_idx = np.arange(len(ec[:,0]))
fig, ax1 = plt.subplots()
ax1.set_title('Coefficients Over Time')
ax1.set_ylabel('Coefficient 0',color=palette[0])
ax1.tick_params(axis='y', labelcolor=palette[0])
ax1.plot(ec[:,0])
mask = abs(ec[:,0]) > 5e9
ax1.plot(exposure_idx[mask],ec[mask,0],'o',color=palette[0],mec='r')
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis
for i in range(1,5):
    coeff = ec[:,i]
    if i<3:
        # Only coefficients 1 and 2 add to the (cumulative) mask; the cut is
        # one-sided: more than 3 sigma above the mean.
        above = (coeff - np.mean(coeff)) > (3 * np.std(coeff))
        mask = mask | above
    ax2.plot(coeff,color=palette[i])
    ax2.plot(exposure_idx[mask],coeff[mask],'o',mec='r')
ax2.set_ylabel('All Other Coefficients')
ax1.set_xlabel('Exposure Number, but Kind of Time')
plt.tight_layout()
plt.savefig('./Figures/191113_outliers5c.png')

In [None]:
# Files flagged by the cuts above: |coefficient 0| > 5e9, or coefficient 1 or
# 2 more than 3 sigma above its mean.
used_files[mask]

In [None]:
# Map the first three components over the flattened (pixel, order) grid,
# rebuilt here with the same ranges as the pcaSetup defaults.
# NOTE(review): vv[k] only matches this grid if no grid point was dropped.
x_range = np.arange(500,7000).astype(float)
m_range = np.arange(45,75).astype(float)
x_grid, m_grid = (grid.flatten() for grid in np.meshgrid(x_range,m_range))
for k in range(3):
    component = vv[k]
    plt.figure()
    plt.scatter(x_grid, m_grid, c=component)
    plt.title("eigenvector {:d}".format(k))
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(f'./Figures/191113_eigen5cvector{k}.png')