In [24]:
import os 
import re
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [72]:
import numpy as np
import warnings
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.decomposition import PCA
from scipy.signal import find_peaks, peak_widths

import astropy
from astropy.table import Table
from astropy.io import fits
from astropy import units as u

from specutils import SpectrumCollection

from src.model.mask import get_bal_mask

warnings.filterwarnings('ignore')

In [102]:
X_nonbal_path = '../data/X_full.npy'
X_nonbal_full = np.load(X_nonbal_path)
X_nonbal = X_nonbal_full[~np.isnan(X_nonbal_full).any(axis=1)]

wavelengths_path = '../data/wavelengths.npy'
wavelengths = np.load(wavelengths_path)


In [78]:
X_nonbal.shape

(6547, 2796)

In [70]:
dr16q_filename = '../data/sdss_quasar_catalogs/DR16Q_v4.fits'

hdul = fits.open(dr16q_filename)

In [73]:
hdr = hdul[0].header
dr16_data = hdul[1].data

bal_qso_mask = (dr16_data['BAL_PROB'] == 1) & (dr16_data['zWarning'] == 0)
bal_qso_hdul = dr16_data[bal_qso_mask]

col_l = ['SDSS_NAME', 'RA', 'DEC', 'PLATE', 'MJD', 'FIBERID', 'AUTOCLASS_PQN', 'Z', 'BAL_PROB', 'BI_CIV', 'AI_CIV', ]
bal_qso_df = Table(bal_qso_hdul)[col_l].to_pandas()

test_df = bal_qso_df.iloc[0:20]

from src.wrangle.sdss.qso_spectra import QsoSpectraDataset

qso_spectra = QsoSpectraDataset()
spec_l = qso_spectra.get_shifted_and_rebinned_spec_l(test_df, cut_head_tail=False, clean_mask=True)

bal_spec_l = spec_l.copy()


In [79]:
bal_coll = SpectrumCollection.from_spectra(bal_spec_l)

X_bal = np.array(bal_coll.flux)
bal_wavelength = np.array(bal_coll.spectral_axis)[0]

In [103]:
X = np.concatenate((X_nonbal, X_bal))

In [104]:
training_idx = np.arange(0, len(X_nonbal))

In [105]:
class CustomL2Normalizer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self     

    def transform(self, X, y=None):
        L2_norm = np.sqrt(np.nansum(X**2, axis=1))
        return  X / L2_norm[:,np.newaxis]


In [140]:
class CustomPCA(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=5, svd_solver='full', random_state=12345):
        self.n_components = n_components
        self.svd_solver = svd_solver
        self.random_state = random_state
        self.pca = PCA(n_components=n_components, svd_solver=svd_solver, random_state=random_state)

    def fit(self, X, y=None):
        self.pca.fit(X)
        return self
    
    @staticmethod
    def mask_bal_features(X, X_pca):
        bal_idx_mask = get_bal_mask(X)
        X[bal_idx_mask] = X_pca[bal_idx_mask]
        return X
    
    def transform(self, X, y=None):
        X_pca_init = self.pca.transform(X)
        X_projected = self.pca.inverse_transform(X_pca_init)
        X = self.mask_bal_features(X, X_projected)
        return self.pca.transform(X)


In [141]:
prep_pipe = Pipeline(steps = [
    ('normalizer', CustomL2Normalizer()),
    ('pca', CustomPCA(n_components=5, svd_solver='full', random_state=12345))
])


In [138]:
# prep_pipe = Pipeline(steps = [
#     ('normalizer', CustomL2Normalizer()),
# ])
# #

In [142]:
X_train = X_nonbal.copy()

prep_pipe.fit(X_train)
X_trf = prep_pipe.transform(X_train)

AttributeError: 'numpy.ndarray' object has no attribute 'spectral_axis'

In [None]:
X[-3]

In [130]:
X_norm[training_idx]

array([[0.03345497, 0.02904569, 0.0351776 , ..., 0.0053321 , 0.0133157 ,
        0.01203784],
       [0.02817721, 0.02644958, 0.02462755, ..., 0.01156833, 0.0116174 ,
        0.01250922],
       [0.03388304, 0.03287509, 0.03101457, ..., 0.00719751, 0.00694688,
        0.01091403],
       ...,
       [0.03349692, 0.03621573, 0.03457812, ..., 0.01028033, 0.01113919,
        0.00947593],
       [0.03084018, 0.02877475, 0.03486669, ..., 0.01077652, 0.0136249 ,
        0.01169161],
       [0.08488921, 0.09424359, 0.03753836, ..., 0.00898712, 0.01155574,
        0.00841735]])

In [132]:
sum(sum(np.isnan(X_norm[training_idx])))

0

In [110]:
pca = PCA()

pca.fit(X_nonbal)

KeyboardInterrupt: 

In [114]:
sum(sum(np.isnan(X[training_idx])))

0

In [92]:
X_train = X.copy()

prep_pipe.fit(X_train)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values