In [1]:
import os
# Pin every numerical backend to a single thread: users will probably
# want to choose the number of cores themselves (up to the maximum of
# their computer), so implicit multithreading is switched off here.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "VECLIB_MAXIMUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "1",
    )
)

# Log-level setting read by TensorFlow's C++ backend.
os.environ.update({"TF_CPP_MIN_LOG_LEVEL": "2"})

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from autoencoders.ae import AutoEncoder
from anomaly.constants import GALAXY_LINES
from anomaly.utils import specobjid_to_idx
from sdss.metadata import MetaData
from sdss.utils.managefiles import FileDirectory

# Instantiate the project's SDSS MetaData helper. `meta` is not referenced
# again in the part of the notebook visible here.
meta = MetaData()
# Inline-plotting magic, left disabled.
# %matplotlib inline

# Data ingestion

In [3]:
# Locations of the input spectra, the trained models and the paper figures.
# Each root defaults to the original author's layout but can be overridden
# through an environment variable, so the notebook can run on another
# machine without editing this cell:
#   SPECTRA_DATA_DIR   -> data_dir
#   SPECTRA_MODEL_DIR  -> model_dir
#   PAPER_FIGURES_DIR  -> paper_figures_dir (keep the trailing slash)
data_dir = os.environ.get("SPECTRA_DATA_DIR", "/home/elom/spectra")
model_dir = os.environ.get("SPECTRA_MODEL_DIR", "/home/elom/models")
# Identifier of the bin analysed in this notebook.
bin_id = "bin_03"
# Explanation outputs live next to the model of the selected bin.
explanations_dir = f"{model_dir}/{bin_id}/explanation"
paper_figures_dir = os.environ.get(
    "PAPER_FIGURES_DIR",
    "/home/elom/phd/00_paper_explain-me-why/sections/figures/",
)

In [4]:
# Metadata table for the spectra, indexed by the `specobjid` column.
meta_data_df = pd.read_csv(
    filepath_or_buffer=f"{data_dir}/0_01_z_0_5_4_0_snr_inf.csv.gz",
    index_col="specobjid",
)

In [5]:
# Array stored in wave_spectra_imputed.npy, loaded fully into memory.
wave = np.load(file=f"{data_dir}/wave_spectra_imputed.npy")

# The spectra array is memory-mapped read-only rather than loaded eagerly.
spectra = np.load(file=f"{data_dir}/spectra_imputed.npy", mmap_mode="r")

# Index/specobjid array for the selected bin.
idx_id = np.load(file=f"{data_dir}/{bin_id}/{bin_id}_index_specobjid.npy")

# Load model

In [6]:
# Reload the autoencoder stored under the selected bin's model directory
# (presumably the previously trained model for `bin_id` -- confirm against
# the AutoEncoder implementation).
ae_model = AutoEncoder(reload_from=f"{model_dir}/{bin_id}", reload=True)

# Overview of anomalies from MSE

In [None]:
# Example spectra keyed by a short descriptive label; the values are the
# integer identifiers of the corresponding spectra.
mse_anomaly_specid_dict = dict(
    strong_emission=3240467000396376064,
    artifact=637325355518027776,
)

In [None]:
# Example spectra keyed by a short descriptive label; the values are the
# integer identifiers of the corresponding spectra.
mse_97_anomaly_specid_dict = dict(
    blue=531492683672217600,
    star=1780176998165932032,
)

In [8]:
# Example spectra keyed by a short descriptive label; the values are the
# integer identifiers of the corresponding spectra.
mse_filter_anomaly_specid_dict = dict(
    broad=1924292192684238848,
    artifact=407686575611209728,
)

In [9]:
# Example spectra keyed by a short descriptive label; the values are the
# integer identifiers of the corresponding spectra.
mse_filter_97_anomaly_specid_dict = dict(
    bumpy=2501881598136313856,
    halpha_bump=2664004586678806528,
)

In [None]:
# Names of the score variants: two metric prefixes crossed with the four
# suffixes, with the filtered variants first.
scores_list = [
    f"{metric}_{variant}"
    for metric in ("mse_filter_250kms", "mse")
    for variant in ("noRel100", "noRel97", "rel100", "rel97")
]

# Ignore

# Directories

In [3]:
# Architecture tag used as the last component of the explanations path.
vae_architecture = "256_128_64/latent_12"

# Root of the project data and the bin analysed.
project_dir = "/home/elom/onedrive/phd/spectra/0_01_z_0_5_4_0_snr_inf"
bin_id = "bin_03"
# <project_dir>/<bin_id>/explanation/<vae_architecture>
explanations_dir = "/".join(
    (project_dir, bin_id, "explanation", vae_architecture)
)

# Names of the score variants: two metric prefixes crossed with the four
# suffixes, with the filtered variants first.
scores_list = [
    prefix + suffix
    for prefix in ("mse_filter_250kms_", "mse_")
    for suffix in ("noRel100", "noRel97", "rel100", "rel97")
]

# Load data

In [4]:
# Array stored in wave_spectra_imputed.npy, loaded fully into memory.
wave = np.load(file=f"{project_dir}/wave_spectra_imputed.npy")

# Metadata table, indexed by the `specobjid` column.
meta_data_df = pd.read_csv(
    filepath_or_buffer=f"{project_dir}/drop_0_01_z_0_5_4_0_snr_inf.csv.gz",
    index_col="specobjid",
)

# The spectra array is memory-mapped read-only rather than loaded eagerly.
spectra = np.load(file=f"{project_dir}/spectra_imputed.npy", mmap_mode="r")

# Index/specobjid array for the selected bin.
idx_id = np.load(file=f"{project_dir}/{bin_id}/{bin_id}_index_specobjid.npy")

# MSE scores of the most normal galaxies, i.e. the best reconstructed ones.
normal_mse_df = pd.read_csv(
    filepath_or_buffer=f"{explanations_dir}/mse_noRel100/top_normal.csv.gz",
    index_col="specobjid",
)

# Counterpart table read from top_anomalies.csv.gz for the same score.
anomalous_mse_df = pd.read_csv(
    filepath_or_buffer=f"{explanations_dir}/mse_noRel100/top_anomalies.csv.gz",
    index_col="specobjid",
)

In [6]:
# One table of top anomalies per score variant, each tagged with the name
# of the score it came from in a `score` column.
anomalies_scores_df_list = [
    pd.read_csv(
        f"{explanations_dir}/{score}/top_anomalies.csv.gz"
    ).assign(score=score)
    for score in scores_list
]

# Stack the per-score tables into one frame with a fresh 0..N-1 index.
anomalies_scores_df = pd.concat(anomalies_scores_df_list, ignore_index=True)

In [7]:
# Number of rows contributed by each score variant.
anomalies_scores_df["score"].value_counts()

score
mse_filter_250kms_noRel100    10000
mse_filter_250kms_noRel97     10000
mse_filter_250kms_rel100      10000
mse_filter_250kms_rel97       10000
mse_noRel100                  10000
mse_noRel97                   10000
mse_rel100                    10000
mse_rel97                     10000
Name: count, dtype: int64

# mederic

In [3]:
# Locations of the input spectra, the trained models and the paper figures.
# Each root defaults to the original author's layout but can be overridden
# through an environment variable, so the notebook can run on another
# machine without editing this cell:
#   SPECTRA_DATA_DIR   -> data_dir
#   SPECTRA_MODEL_DIR  -> model_dir
#   PAPER_FIGURES_DIR  -> paper_figures_dir (keep the trailing slash)
data_dir = os.environ.get("SPECTRA_DATA_DIR", "/home/elom/spectra")
model_dir = os.environ.get("SPECTRA_MODEL_DIR", "/home/elom/models")
# Identifier of the bin analysed in this notebook.
bin_id = "bin_03"
# Explanation outputs live next to the model of the selected bin.
explanations_dir = f"{model_dir}/{bin_id}/explanation"
paper_figures_dir = os.environ.get(
    "PAPER_FIGURES_DIR",
    "/home/elom/phd/00_paper_explain-me-why/sections/figures/",
)
# Metadata table for the spectra, indexed by the `specobjid` column.
meta_data_df = pd.read_csv(
    filepath_or_buffer=f"{data_dir}/0_01_z_0_5_4_0_snr_inf.csv.gz",
    index_col="specobjid",
)

# Array stored in wave_spectra_imputed.npy, loaded fully into memory.
wave = np.load(file=f"{data_dir}/wave_spectra_imputed.npy")

# The spectra array is memory-mapped read-only rather than loaded eagerly.
spectra = np.load(file=f"{data_dir}/spectra_imputed.npy", mmap_mode="r")

# Index/specobjid array for the selected bin.
idx_id = np.load(file=f"{data_dir}/{bin_id}/{bin_id}_index_specobjid.npy")

In [4]:
# Show the full metadata record of one spectrum, selected by its
# specobjid label in the table's index.
specobjid = 1780176998165932032
meta_data_df.loc[specobjid, :]

mjd                   53149
plate                  1581
fiberid                 470
run2d                    26
ra                235.10316
dec               32.865899
z                  0.053784
zErr               0.000019
class                GALAXY
subClass                NaN
z_noqso                   0
zErr_noqso                0
targetType          SCIENCE
programname          legacy
instrument             SDSS
snMedian            21.8992
ABSSB             undefined
BROAD             undefined
ebv                0.028206
Name: 1780176998165932032, dtype: object