## Open-Clusters Exploration

In this notebook we explore the open clusters available within APOGEE DR16 and generate a pickled dataset for working with them.

In [1]:
import astropy
from astropy.io import fits
import numpy as np
import matplotlib.pyplot as plt

import inspect

from apoNN.src.occam import Occam

import apogee.tools.read as apread
import apogee.tools.path as apogee_path
from apogee.tools import bitmask
import collections


from apoNN.src.datasets import ApogeeDataset
from apoNN.src.utils import generate_loss_with_masking

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn as nn

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Select data release 16 in the apogee package's path configuration.
apogee_path.change_dr(16)

## Generating filtered dataset

The first step of the process is to generate a reduced allStar containing only those stars in open clusters.

In [33]:
def prepare_occam_allStar(occam_kept,allStar,occam=None):
    """
    Given a boolean selection of OCCAM stars, generates a filtered APOGEE dataset containing those stars.

    INPUTS
    ------
    occam_kept: array
        boolean mask of same size as the occam dataset with True for used stars and False for discarded stars
    allStar:
        allStar table; must support ``allStar["Apogee_id"]`` and indexing with an integer array
    occam: optional
        object exposing ``apogee_id`` and ``cluster_id`` sequences. When None (the default)
        a new ``Occam()`` is created, which is what this function always did before.

    OUTPUTS
    -------
    (1) filtered_allStar containing only the selected open cluster stars that were found in allStar
    (2) cluster_ids of every star in the filtered dataset, in the same order as (1)

    Selected OCCAM stars whose id does not appear in allStar are dropped from both outputs.
    When an id appears several times in allStar, its first row is used.
    """
    if occam is None:
        occam = Occam()
    kept_apogee_ids = np.array(occam.apogee_id)[occam_kept]
    kept_cluster_ids = np.array(occam.cluster_id)[occam_kept]

    # Build the id -> row lookup once instead of converting the whole column to a
    # list and scanning it for every star. setdefault keeps the first occurrence,
    # matching what list.index returned.
    first_row_of = {}
    for row, star_id in enumerate(allStar["Apogee_id"]):
        first_row_of.setdefault(star_id, row)

    # -1 marks "not found". dtype=int keeps the array usable as an index even when
    # no star is selected (an empty default-dtype array is float and cannot index).
    apogee_idxs = np.array([first_row_of.get(star_id, -1) for star_id in kept_apogee_ids], dtype=int)
    found_mask = apogee_idxs != -1

    return allStar[apogee_idxs[found_mask]], kept_cluster_ids[found_mask]


In [4]:
def get_z(idx,dataset,autoencoder):
    """
    Return the autoencoder latent of ``dataset[idx]`` as a numpy array.

    The first element of the dataset item is moved to ``device``, given a leading
    axis of size one and passed through ``autoencoder``. The second of the two values
    the autoencoder returns is detached, moved to the CPU and converted to numpy.
    """
    spectrum = dataset[idx][0]
    batch = spectrum.to(device).unsqueeze(0)
    _reconstruction, latent = autoencoder(batch)
    return latent.detach().cpu().numpy()

In [5]:
# Instantiate the project's Occam helper and read the allStar table through the apogee package.
occam = Occam()
allStar= apread.allStar()



In [24]:
# Quality cuts on the full allStar table; each comparison yields one boolean per star.
# Temperature window (Teff column).
lower_temp_cut = allStar["Teff"] > 3500
upper_temp_cut = allStar["Teff"] < 7000
# Surface-gravity window (logg column).
lower_g_cut = allStar["logg"] > 1.
upper_g_cut = allStar["logg"] < 3.5
# Signal-to-noise window (SNR column).
snr_cut = allStar["SNR"] > 100
snr_highcut = allStar["SNR"] < 500
#feh_outliercut = allStar["Fe_H"]>-5

# A star is selected only when it passes every cut above.
combined_cut = (
    lower_g_cut
    & upper_g_cut
    & lower_temp_cut
    & upper_temp_cut
    & snr_cut
    & snr_highcut
)  #& feh_outliercut


In [25]:
# Select OCCAM stars by thresholding a probability attribute of the catalogue.
# The pm_prob variant is kept below for reference; the active selection uses cg_prob > 0.9.
#occam_kept = occam.pm_prob>0.99
occam_kept = occam.cg_prob>0.9
# Number of selected stars.
print(sum(occam_kept))

1165


In [32]:
# Restrict allStar to the selected OCCAM stars and keep the cluster id of each retained star.
cut_allStar,cut_cluster_idxs = prepare_occam_allStar(occam_kept,allStar)

In [34]:
# Number of selected OCCAM stars that were matched in allStar.
len(cut_allStar)

1035

### Generating the dataset

Next we must create an ```ApogeeDataset```, run the autoencoder on the dataset and extract the latents.

In this section we build the ```ApogeeDataset```, load the trained autoencoder, and discard any stars whose spectra cannot be loaded.

In [None]:
# Wrap the filtered allStar rows in an ApogeeDataset; items follow the order of `outputs`.
intermediary_dataset = ApogeeDataset(cut_allStar,outputs = ["aspcap","mask2","physical","idx"])
#autoencoder = torch.load("/share/splinter/ddm/taggingProject/apogeeFactory/outputs/guild/5/ae_8000.p")
# map_location places the checkpoint's tensors on `device`, so a model saved on a GPU
# still loads when the notebook falls back to the CPU.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
autoencoder = torch.load("/share/splinter/ddm/taggingProject/apogeeFactory/outputs/guild/z10/ae_3000.p", map_location=device)

In [None]:
# Probe every dataset item once and record which ones can be fetched.
succeded_spectra = []
for idx in range(len(intermediary_dataset)):
    try:
        intermediary_dataset[idx]
    except Exception:
        # Best effort: a star whose item cannot be built is flagged and skipped.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        succeded_spectra.append(False)
    else:
        succeded_spectra.append(True)

# dtype=bool keeps this usable as a mask even when the dataset is empty.
succeded_spectra = np.array(succeded_spectra, dtype=bool)
# One summary line instead of one printed index per star.
print("{} of {} spectra could not be loaded".format(np.count_nonzero(~succeded_spectra), len(succeded_spectra)))

In [None]:
# Rebuild the dataset from only the stars whose items could be fetched,
# and apply the same mask to the cluster labels so both stay aligned.
occam_dataset = ApogeeDataset(cut_allStar[succeded_spectra],outputs = ["aspcap","mask2","physical","idx"])
occam_cluster_idxs = cut_cluster_idxs[succeded_spectra]

In [None]:
# Number of cluster labels (one per star) left after dropping failed spectra.
len(occam_cluster_idxs)

## Getting the latent

We can now use the autoencoder to compute the latent representation of every star.

In [None]:
# Encode every star with get_z and stack the results; squeeze() removes length-1 axes
# (presumably leaving one row of latents per star — TODO confirm the resulting shape).
occam_z = np.array([get_z(idx,occam_dataset,autoencoder) for idx in range(len(occam_dataset))]).squeeze()


As a safeguard, we compare the original spectra with their reconstructions.

In [None]:
# L1 reconstruction loss wrapped by the project's masking helper.
loss = torch.nn.L1Loss()
masked_loss = generate_loss_with_masking(loss)
# Inference only: no_grad avoids building an autograd graph for every forward pass.
with torch.no_grad():
    # Inspect at most the first 30 spectra without running past the end of a smaller dataset.
    for idx in range(min(30, len(occam_dataset))):
        item = occam_dataset[idx]  # fetch once; element 0 is "aspcap", element 1 is "mask2"
        x = item[0].to(device).unsqueeze(0)
        mask = item[1].to(device).unsqueeze(0)
        x_pred,z = autoencoder(x)
        print(masked_loss(x_pred,x,mask))

In [None]:
# Overlay the reconstruction, the input spectrum and its mask for items 100-119, one figure per item.
# (The former `idx= 24` assignment was dead: the loop overwrote it immediately.)
for idx in range(100,120):
    item = occam_dataset[idx]  # fetch once; element 0 is "aspcap", element 1 is "mask2"
    x_pred,z = autoencoder(item[0].to(device).unsqueeze(0))
    plt.plot(x_pred.detach().cpu().numpy()[0], label="reconstruction")
    plt.plot(item[0], label="aspcap")
    plt.plot(item[1], label="mask2")

    # Zoom on a fixed window of the spectrum so differences are visible.
    plt.xlim(4500,4800)
    plt.xlabel("index along spectrum")
    plt.title("item %d" % idx)
    plt.legend()
    plt.show()


In [None]:
# Centre the latents of every cluster on that cluster's mean latent.
# This overwrites occam_z in place.
for cluster_name in set(occam_cluster_idxs):
    cluster_idxs = (occam_cluster_idxs==cluster_name).nonzero()
    occam_z[cluster_idxs] -= occam_z[cluster_idxs].mean(axis=0)

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA

In [None]:
# PCA model keeping 10 components.
pca = PCA(n_components=10)

In [None]:
# Element-wise check for latent entries that are exactly zero.
occam_z==0

In [None]:
# Dimensions of the latent array.
occam_z.shape

In [None]:
# Fit the PCA on occam_z (already centred per cluster when the cells are run in order).
pca.fit(occam_z)

In [None]:
# Fraction of the total variance captured by each fitted component.
pca.explained_variance_ratio_

In [None]:
# First principal axis of the fitted PCA.
pca.components_[0]

In [None]:
# Imported under its own alias: binding this module to `utils` would silently replace
# the `utils` name already imported from torchvision at the top of the notebook.
import apoNN.src.utils as apoNN_utils
# Load the object stored under the name "pca" with the project's load helper.
pca_fulldataset = apoNN_utils.load("pca")

In [None]:
# Project the OCCAM latents with the loaded PCA (presumably fitted on the full dataset,
# going by its name — TODO confirm, and confirm it was built with whitening enabled).
occam_whitened_z = pca_fulldataset.transform(occam_z)

In [None]:
# Refit the same PCA object, this time on the transformed latents.
pca.fit(occam_whitened_z)

In [None]:
# Fraction of the total variance captured by each component of the refitted PCA.
pca.explained_variance_ratio_

In [None]:
# Map one principal axis back through the loaded PCA's inverse transform.
# NOTE(review): `pca` is created with n_components=10 earlier in this notebook, so
# components_ would have at most 10 rows and index 24 would raise IndexError — confirm
# which component was intended.
pca_fulldataset.inverse_transform(pca.components_[24])

In [None]:
# First principal axis of the refitted PCA.
pca.components_[0]

### Drop zero variance entries

In [None]:
# Keep only rows with at least one non-zero entry. Testing the row *sum* against zero
# would also discard rows whose non-zero entries happen to cancel out.
dropped_z = occam_z[np.any(occam_z != 0, axis=1)]

In [None]:
# Fresh PCA model keeping 8 components.
pca = PCA(n_components=8)

In [None]:
# Fit on the rows retained in dropped_z.
pca.fit(dropped_z)

In [None]:
# Fraction of the total variance captured by each of the 8 components.
pca.explained_variance_ratio_

### Investigate OODness of clusters

In [None]:
# Number of matched stars whose spectra could be fetched.
len(cut_allStar[succeded_spectra])

In [None]:
# Open-cluster stars whose spectra could be fetched.
test_allStar = cut_allStar[succeded_spectra]

# Quality cuts on that table; each comparison yields one boolean per star.
# Temperature window (Teff column).
lower_temp_cut = test_allStar["Teff"] > 3500
upper_temp_cut = test_allStar["Teff"] < 7000
# Surface-gravity window (logg column).
lower_g_cut = test_allStar["logg"] > 1.
upper_g_cut = test_allStar["logg"] < 3.5
# Signal-to-noise window (SNR column).
snr_cut = test_allStar["SNR"] > 100
snr_highcut = test_allStar["SNR"] < 500
# Lower bound on the Fe_H column.
feh_outliercut = test_allStar["Fe_H"] > -5

# A star is selected only when it passes every cut above.
combined_cut = (
    lower_g_cut
    & upper_g_cut
    & lower_temp_cut
    & upper_temp_cut
    & snr_cut
    & snr_highcut
    & feh_outliercut
)


In [None]:
# Dataset and cluster labels restricted to the stars that pass combined_cut;
# the same mask is applied to both so they stay aligned.
cut_occam_dataset = ApogeeDataset(test_allStar[combined_cut],outputs = ["aspcap","mask2","physical","idx"])
cut_occam_cluster_idxs = occam_cluster_idxs[combined_cut]

In [None]:
# Encode every star of the cut dataset with get_z; squeeze() removes length-1 axes
# (presumably leaving one row of latents per star — TODO confirm the resulting shape).
cut_occam_z = np.array([get_z(idx,cut_occam_dataset,autoencoder) for idx in range(len(cut_occam_dataset))]).squeeze()


In [None]:
# Centre the latents of every cluster in the cut sample on that cluster's mean latent.
# This overwrites cut_occam_z in place.
for cluster_name in set(cut_occam_cluster_idxs):
    cluster_idxs = (cut_occam_cluster_idxs==cluster_name).nonzero()
    cut_occam_z[cluster_idxs] -= cut_occam_z[cluster_idxs].mean(axis=0)

In [None]:
# Shape of the latents selected by the current cluster_idxs
# (the last cluster visited by the centring loop when the cells are run in order).
cut_occam_z[cluster_idxs].shape

In [None]:
# Fresh PCA model keeping 8 components.
pca = PCA(n_components=8)

In [None]:
# Fit on the latents of the cut sample.
pca.fit(cut_occam_z)

In [None]:
# Fraction of the total variance captured by each of the 8 components.
pca.explained_variance_ratio_

In [None]:
# First principal axis of the PCA fitted on the cut sample.
pca.components_[0]

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[24]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# "Mg_FE" column for the members of that cluster.
cut_allStar[succeded_spectra][cluster_idxs]["Mg_FE"]

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[22]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# NOTE: despite its name, si1 holds the "Mg_FE" column for the members of that cluster.
si1 = cut_allStar[succeded_spectra][cluster_idxs]["Mg_FE"]

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[20]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# NOTE: despite its name, si2 holds the "Mg_FE" column for the members of that cluster.
si2 = cut_allStar[succeded_spectra][cluster_idxs]["Mg_FE"]


In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[19]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# NOTE: despite its name, si3 holds the "Mg_FE" column for the members of that cluster.
si3 = cut_allStar[succeded_spectra][cluster_idxs]["Mg_FE"]
si3

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[17]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# NOTE: despite its name, si4 holds the "Mg_FE" column for the members of that cluster.
si4 = cut_allStar[succeded_spectra][cluster_idxs]["Mg_FE"]
si4

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[5]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# "Fe_H" column for the members of that cluster.
cut_allStar[succeded_spectra][cluster_idxs]["Fe_H"]

In [None]:
# Overlaid, semi-transparent 30-bin histograms of si3 and si4
# (both are assigned from the "Mg_FE" column in earlier cells of this notebook).
# The si1 / si2 histograms are currently disabled.
#plt.hist(si1,alpha=0.5,bins=30)
#plt.hist(si2,alpha=0.5,bins=30)
plt.hist(si3,alpha=0.5,bins=30)
plt.hist(si4,alpha=0.5,bins=30)

In [None]:
# Pick one cluster by position. The labels are sorted first because the iteration order
# of a set of strings changes between interpreter runs, which made the pick irreproducible.
cluster_name = sorted(set(occam_cluster_idxs))[2]
cluster_idxs = np.where(occam_cluster_idxs==cluster_name)
# "Fe_H" column for the members of that cluster.
cut_allStar[succeded_spectra][cluster_idxs]["Fe_H"]

In [None]:
# First cluster label in sorted order. Sorting makes the result reproducible: the
# iteration order of a set of strings changes between interpreter runs.
sorted(set(occam_cluster_idxs))[0]

In [None]:
# Display the cluster label of every retained star.
occam_cluster_idxs

In [None]:
# Count how many retained stars carry each cluster label.
collections.Counter(occam_cluster_idxs)