## Experiments

Code for reproducing experiments shown in the paper

## Imports

In [4]:
%load_ext autoreload
%autoreload 2
import apoNN.src.data as apoData
import apoNN.src.utils as apoUtils
import apoNN.src.vectors as vectors
import apoNN.src.fitters as fitters
import apoNN.src.evaluators as evaluators
import apoNN.src.occam as occam_utils

import sklearn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import random
from ppca import PPCA
import inspect

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We use the [```apogee```](https://github.com/jobovy/apogee) module to interact with APOGEE data, and work with data release 16 (DR16).

In [5]:
import apogee.tools.path as apogee_path
apogee_path.change_dr(16)

### Hyperparameters

For speed we can run the results using only a subset of spectra

In [6]:
n_start = 0
n_stars = 50000 #100000
d = 100 #number of dimensions to use for compression
tol = 0.01 # tolerance to use for PPCA. Larger means faster but less accurate

## Setup

We load the AllStar file and apply the appropriate data cuts. This yields ```allStar_occamlike``` - a large dataset containing all the spectra in data release 16 that match the cuts - and ```allStar_occam```, containing those stars within ```allStar_occamlike``` that are cross-matched with the OCCAM open cluster dataset.

In [None]:
allStar = apoUtils.load("shuffled_allStar")


In [None]:
upper_temp_cut = allStar["Teff"]<5000
lower_temp_cut = allStar["Teff"]>4000
lower_g_cut = allStar["logg"]>1.5
upper_g_cut = allStar["logg"]<3.
occamlike_cut = lower_g_cut & upper_g_cut & lower_temp_cut & upper_temp_cut
allStar_occamlike =  allStar[np.where(occamlike_cut)]


occam = occam_utils.Occam()
occam_kept = occam.cg_prob>0.8
allStar_occam,cluster_idxs = occam_utils.prepare_occam_allStar(occam_kept,allStar_occamlike)



A few APOGEE IDs return NaN in the AstroNN catalogue. We remove these from our dataset.

In [None]:
bad_apogee_id = ['2M02123870+4942289', '2M18051909-3214413', '2M06134865+5518282']
good_ids = [apogee_id not in bad_apogee_id for apogee_id in allStar_occamlike["Apogee_id"]]
allStar_occamlike = allStar_occamlike[good_ids]

We convert allStar files into the continuum-normalized spectra using ```apoData.Dataset```

In [None]:
data_occamlike = apoData.Dataset(allStar_occamlike[n_start:n_start+n_stars])
data_occam = apoData.Dataset(allStar_occam)


```cluster_idxs``` contains the clusters to which entries in ```data_occam``` belong

In [None]:
assert(cluster_idxs.shape[0] == data_occam.masked_spectra.shape[0])

## Compression

```apoData.Dataset``` returns spectra for which bins with errors above a threshold are masked. We run a PCA that naturally handles missing values using the [ppca module](https://github.com/allentran/pca-magic). This is wrapped in a function ```fitters.compress_masked_spectra``` 

In [None]:
mask_interstellar, interstellar_locs = apoUtils.get_interstellar_bands()

In [None]:
z,z_occam,ppca = fitters.compress_masked_spectra(data_occamlike.masked_spectra[:,mask_interstellar],data_occam.masked_spectra[:,mask_interstellar],d,tol=tol)


In [None]:
#z,z_occam,ppca = fitters.compress_masked_spectra(data_occamlike.masked_spectra,data_occam.masked_spectra,d,tol=tol)
z,z_occam,ppca = fitters.compress_masked_spectra(data_occamlike.masked_spectra,data_occam.masked_spectra,d,tol=tol)


As stated in the paper, our evaluation method requires using our approach on unseen clusters through a cross-validation scheme. This is handled through the ```evaluators.StandardEvaluator``` class which takes ```vectors.Vector``` as inputs. It is run as follows.

In [None]:
Z_occam = vectors.OccamVector(val = z_occam,cluster_names=cluster_idxs).remove_orphans()
Z = vectors.Vector(val = z)

The vectors come with lots of handy functions. For example we can see which stars belong to which clusters using ```vectors.OccamVector.registry``` or even remove a cluster using ```vectors.OccamVector.without()```

In [None]:
Z.val.shape

In [None]:
Z_occam.registry

In [None]:
Z_occam.without("NGC 6791").registry

## Baseline

As our baseline we use the AstroNN abundances. We use the ```vectors.AstroNNVector``` to create an AstroNN vector from an allStar file. Parameters included in the vector are passed through a ```considered_parameters``` input

In [None]:
considered_parameters = ["Fe_H","C_FE","N_FE","O_FE","Na_FE","Mg_FE","Al_FE","Si_FE","S_FE","K_FE","CA_FE","Ni_FE","Cr_FE","Co_FE"] 
#considered_parameters = ["AK_targ"]

In [None]:
#element_string = "[Fe/H], "
#for param in considered_parameters[1:]:
#    s1,s2 = param.split("_")
#    element_string+=f"[{s1}/{s2}], "
#print(element_string)

In [None]:
Y = vectors.AstroNNVector(allStar_occamlike[n_start:n_start+n_stars],considered_parameters).remove_nan_cols()
#because one of the entries in the AstroNN catalogue contains nan. we remove it with Y.remove_nan_cols()
Y_occam = vectors.AstroNNVector(allStar_occam,considered_parameters)
Y_occam = vectors.OccamVector(cluster_names=cluster_idxs, val = Y_occam.val).remove_orphans()

In [None]:
Yb = vectors.AstroNNVector(allStar_occamlike[n_start:n_start+n_stars],considered_parameters+["Teff","logg"]).remove_nan_cols()


In [None]:
Yb = vectors.AstroNNVector(allStar_occamlike[n_start:n_start+n_stars],considered_parameters+["Teff","logg"]).remove_nan_cols()
#because one of the entries in the AstroNN catalogue contains nan. we remove it with Y.remove_nan_cols()
Yb_occam = vectors.AstroNNVector(allStar_occam,considered_parameters+["Teff","logg"])

Yb_occam = vectors.OccamVector(cluster_names=cluster_idxs, val = Yb_occam.val).remove_orphans()

In [None]:
print(inspect.getsource(vectors.AstroNNVector))

### Plotting performance against number of components

For our experiment we want to determine how performance compares with the number of dimensions preserved in the PCA step. We also want to compare against the performance obtained using abundances.

There are two ingredients required for assessing performance

- a ```fitters.Fitter``` object that takes vectors (spectra, stellar parameters) and scales them to encode chemical similarity. 

- an ```evaluators.Evaluator``` object that takes a fitter, an unsupervised dataset and an occam dataset, and calculates the doppelganger rates of a representation.

In [None]:
print(inspect.getsource(evaluators.BaseEvaluator))

In [None]:
print(inspect.getsource(fitters.StandardFitter))

In [None]:
def simple_fitter(z,z_occam):
    """Baseline fitter that only rescales the dimensions of the input representation.

    Builds a ``fitters.SimpleFitter`` from ``z`` and ``z_occam`` with relative
    scaling, pooling and the robust option all switched on.
    """
    fitter_options = dict(use_relative_scaling=True, is_pooled=True, is_robust=True)
    return fitters.SimpleFitter(z, z_occam, **fitter_options)


In [None]:
def standard_fitter(z,z_occam):
    """Fitter that changes to a more appropriate basis before scaling.

    Builds a ``fitters.StandardFitter`` from ``z`` and ``z_occam`` with relative
    scaling, pooling and the robust option all switched on.
    """
    fitter_options = dict(use_relative_scaling=True, is_pooled=True, is_robust=True)
    return fitters.StandardFitter(z, z_occam, **fitter_options)


In [None]:
evaluator_Y = evaluators.StandardEvaluator(Y,Y_occam,leave_out=True,fitter_class=standard_fitter)
evaluator_Y.weighted_average

In [None]:
evaluator_Y_overfit = evaluators.StandardEvaluator(Y,Y_occam,leave_out=False,fitter_class=standard_fitter)
evaluator_Y_overfit.weighted_average

In [None]:
n_components = [5,15,25,35,45,55,65,75,85,95]
evaluators_X = [evaluators.StandardEvaluator(Z[:,:n_component],Z_occam[:,:n_component],leave_out=True,fitter_class=standard_fitter) for n_component in n_components]

In [None]:
[i.weighted_average for i in evaluators_X]

We can pass ```leave_out=False```, in which case the (overfitted) results without cross-validation are shown.

In [None]:
evaluators_X_overfit = [evaluators.StandardEvaluator(Z[:,:n_component],Z_occam[:,:n_component],leave_out=False,fitter_class=standard_fitter) for n_component in n_components]

In [None]:
import matplotlib
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
font = {'family' : 'normal',
        'size'   : 12}

matplotlib.rc('font', **font)

plt.figure(figsize=[10,6])

# Solid curves: doppelganger rate of the spectral representation for each number of PCA components kept
plt.plot(n_components,np.array([i.weighted_average for i in evaluators_X]),label="with cross-validation",color="blue",marker='o',markersize=11,markeredgecolor="black")
plt.plot(n_components,np.array([i.weighted_average for i in evaluators_X_overfit]),label="without cross-validation",color="orange",marker='o',markersize=11,markeredgecolor="black")
# Dashed horizontal lines: doppelganger rate obtained from the stellar labels (evaluator_Y)
plt.axhline(y=evaluator_Y.weighted_average,c="blue",linestyle  = "--",label="stellar labels")
plt.axhline(y=evaluator_Y_overfit.weighted_average,c="orange",linestyle  = "--",label="from stellar labels")
plt.ylabel("doppelganger rate")
plt.xlabel("PCA dimensionality")
plt.minorticks_on()

# Custom legend entries: line style encodes the data source, colour encodes whether cross-validation was used
dashed_line = mlines.Line2D([], [], color="black",linestyle="--",
                          markersize=15, label='from stellar labels')
full_line = mlines.Line2D([], [], color="black",linestyle="-",
                          markersize=15, label='from spectra')
blue_patch = mpatches.Patch(color='blue', label='with cross-validation')
orange_patch = mpatches.Patch(color='orange', label='without cross-validation')


plt.legend(handles=[full_line,dashed_line,blue_patch,orange_patch],frameon=False)
#plt.legend(frameon=False)
# Set the y-range BEFORE saving, otherwise the saved PDF does not get these limits
plt.ylim(0,0.06)
plt.savefig("../../figures/global_doppelganger.pdf",format="pdf")
#plt.title("Doppelganger rate per star")

### Fine grained investigation into cluster level performance

In [None]:
n_cols = 22
fig = plt.figure(constrained_layout=True,figsize=[3*n_cols,10])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(n_cols):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i],spec_ax,x_max=80)
    abund_ax = fig.add_subplot(gspec[1, i])
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i],abund_ax,x_max=80)
    
#plt.savefig("../../figures/local_doppelganger.pdf",format="pdf")


In [None]:
n_cols = 5
fig = plt.figure(constrained_layout=True,figsize=[4*n_cols,5])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(n_cols):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i],spec_ax,x_max=30)
    abund_ax = fig.add_subplot(gspec[1, i])
    #abund_ax.set_xlabel("distance",fontsize=20)
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i],abund_ax,x_max=40)
    
plt.savefig("../../figures/local_doppelganger0.pdf",format="pdf")

In [None]:
n_cols = 5
start_idx = 5
fig = plt.figure(constrained_layout=True,figsize=[4*n_cols,5])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(n_cols):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i+start_idx],spec_ax,x_max=30)
    abund_ax = fig.add_subplot(gspec[1, i])
    #abund_ax.set_xlabel("distance",fontsize=20)
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i+start_idx],abund_ax,x_max=40)
    
plt.savefig("../../figures/local_doppelganger1.pdf",format="pdf")

In [None]:
n_cols = 5
start_idx = 10
fig = plt.figure(constrained_layout=True,figsize=[4*n_cols,5])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(n_cols):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i+start_idx],spec_ax,x_max=30)
    abund_ax = fig.add_subplot(gspec[1, i])
    #abund_ax.set_xlabel("distance",fontsize=20)
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i+start_idx],abund_ax,x_max=40)
    
plt.savefig("../../figures/local_doppelganger2.pdf",format="pdf")

In [None]:
n_cols = 5
start_idx = 15
fig = plt.figure(constrained_layout=True,figsize=[4*n_cols,5])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(n_cols):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i+start_idx],spec_ax,x_max=30)
    abund_ax = fig.add_subplot(gspec[1, i])
    #abund_ax.set_xlabel("distance",fontsize=20)
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i+start_idx],abund_ax,x_max=40)
    
plt.savefig("../../figures/local_doppelganger3.pdf",format="pdf")

In [None]:
n_cols = 5
start_idx = 20
fig = plt.figure(constrained_layout=True,figsize=[4*n_cols,5])
gspec = gridspec.GridSpec(ncols=n_cols, nrows=2, figure=fig)
#for i in range(len(sorted(spectra_evaluator.registry))):
for i in range(2):
    spec_ax = fig.add_subplot(gspec[0, i])
    evaluators_X[3].plot_cluster(sorted(evaluators_X[3].registry)[i+start_idx],spec_ax,x_max=30)
    abund_ax = fig.add_subplot(gspec[1, i])
    #abund_ax.set_xlabel("distance",fontsize=20)
    evaluator_Y.plot_cluster(sorted(evaluator_Y.registry)[i+start_idx],abund_ax,x_max=40)
    
plt.savefig("../../figures/local_doppelganger4.pdf",format="pdf")

### Plotting performance against open clusters

We plot the performance obtained against the number of open-clusters used.

In [None]:
n_repeats = 5 #How many different combinations of clusters to sample for each size
n_clusters_considered = [10,15,20,22] #How many clusters to preserve
n_component = 25

In [None]:
def get_n_random_clusters(vector_occam,n_clusters):
    """Return ``vector_occam`` restricted to ``n_clusters`` randomly drawn clusters.

    Cluster names are read from ``vector_occam.registry``, sampled without
    replacement using ``random.sample`` and handed to ``vector_occam.only``.
    """
    available_clusters = list(vector_occam.registry)
    chosen_clusters = random.sample(available_clusters, n_clusters)
    restricted = vector_occam.only(chosen_clusters)
    return restricted

def make_doppelganger_vs_clusters(n_clusters_considered,X,X_occam,n_repeats):
    """
    Collect doppelganger rates obtained with different numbers of clusters
    -------------------------------
    n_clusters_considered: list
            numbers of clusters to evaluate
    X: vector.Vector
        X dataset
    X_occam:vector.OccamVector
        occam counterpart of X, from which random subsets of clusters are drawn
    n_repeats: int
        how many random cluster subsets are drawn for each entry of n_clusters_considered

    Returns a list with one entry per element of n_clusters_considered; each entry
    is itself a list holding the n_repeats ``weighted_average`` values obtained.
    """
    def rate_for_random_subset(n_clusters):
        # draw a fresh random subset of clusters and evaluate it with cross-validation
        occam_subset = get_n_random_clusters(X_occam,n_clusters)
        subset_evaluator = evaluators.StandardEvaluator(X,occam_subset,leave_out=True,fitter_class=standard_fitter)
        return subset_evaluator.weighted_average

    return [
        [rate_for_random_subset(n_clusters) for _ in range(n_repeats)]
        for n_clusters in n_clusters_considered
    ]

In [None]:
n_dim = 15
res_X15 = make_doppelganger_vs_clusters(n_clusters_considered,Z[:,:n_dim],Z_occam[:,:n_dim],n_repeats)
n_dim = 25
res_X25 = make_doppelganger_vs_clusters(n_clusters_considered,Z[:,:n_dim],Z_occam[:,:n_dim],n_repeats)
n_dim = 35
res_X35 = make_doppelganger_vs_clusters(n_clusters_considered,Z[:,:n_dim],Z_occam[:,:n_dim],n_repeats)
res_Y = make_doppelganger_vs_clusters(n_clusters_considered,Y,Y_occam,n_repeats)

In [None]:
plt.figure(figsize=[8,6])
plt.plot(np.array(n_clusters_considered)-1,[np.mean(res_i) for res_i in res_X35],label="spectra 35",color="blue",marker='o',markersize=11,markeredgecolor="black")
plt.plot(np.array(n_clusters_considered)-1,[np.mean(res_i) for res_i in res_X25],label="spectra 25",color="purple",marker='o',markersize=11,markeredgecolor="black")
plt.plot(np.array(n_clusters_considered)-1,[np.mean(res_i) for res_i in res_X15],label="spectra 15",color="black",marker='o',markersize=11,markeredgecolor="black")

plt.plot(np.array(n_clusters_considered)-1,[np.mean(res_i) for res_i in res_Y],label="labels",color="orange",marker='o',markersize=11,markeredgecolor="black")
plt.minorticks_on()
#np.array(n_clusters_considered)-1 because one cluster removed from scaling when evaluated on leave-out=True 

plt.xlabel("Number of clusters")
plt.ylabel("Doppelganger rate")
plt.ylim(0.,0.06)
plt.legend(frameon=False)
plt.savefig("../../figures/doppelganger_vs_clusters.pdf",format="pdf")


### Plotting performance against dataset size

We investigate how much the PCA compression is affected by the dataset size.

In [None]:
def get_Zs(data,data_occam,d,tol,n_stars = 1000):
    """Compress a random subset of ``data`` together with ``data_occam``.

    ``n_stars`` distinct rows of ``data.masked_spectra`` are drawn with
    ``np.random.choice`` (no replacement). Both sets of spectra are restricted
    to the columns selected by the module-level ``mask_interstellar`` and passed
    to ``fitters.compress_masked_spectra`` with ``d`` and ``tol``.

    Returns ``(Z, Z_occam)``: a ``vectors.Vector`` for the subset and a
    ``vectors.OccamVector`` (labelled with the module-level ``cluster_idxs``,
    after ``remove_orphans()``) for the occam spectra.
    """
    n_available = len(data.masked_spectra)
    subset_rows = np.random.choice(n_available,n_stars,replace=False)
    population_spectra = data.masked_spectra[subset_rows][:,mask_interstellar]
    occam_spectra = data_occam.masked_spectra[:,mask_interstellar]
    # the fitted ppca model returned as third element is not needed here
    compressed,compressed_occam,_ = fitters.compress_masked_spectra(population_spectra,occam_spectra,d,tol=tol)

    occam_vector = vectors.OccamVector(val = compressed_occam,cluster_names=cluster_idxs).remove_orphans()
    population_vector = vectors.Vector(val = compressed)
    return population_vector,occam_vector

In [None]:
dataset_sizes = [10000,30000,50000]
n_repeats = 10
d=35

In [None]:
# Repeat the compression on 5 independent random subsets to see how much the doppelganger rate fluctuates
for _ in range(5):
    Z,Z_occam = get_Zs(data_occamlike,data_occam,d=d,tol=0.1,n_stars = 30000)
    # Rebuild the evaluator for the freshly drawn subset; without this the printed
    # value comes from a stale evaluator (or raises NameError if none exists yet)
    evaluator = evaluators.StandardEvaluator(Z,Z_occam,leave_out=True,fitter_class=standard_fitter)
    print(f"weighted_average:{evaluator.weighted_average}")

In [None]:
Z,Z_occam = get_Zs(data_occamlike,data_occam,d=d,tol=0.01,n_stars = 50000)
evaluator = evaluators.StandardEvaluator(Z,Z_occam,leave_out=True,fitter_class=standard_fitter)
evaluator.weighted_average

In [None]:
res_datasize = []
for dataset_size in dataset_sizes:
    res_datasize.append([])
    for _ in range(n_repeats):
        Z,Z_occam = get_Zs(data_occamlike,data_occam,d=d,tol=0.1,n_stars = dataset_size)
        evaluator = evaluators.StandardEvaluator(Z,Z_occam,leave_out=True,fitter_class=standard_fitter)
        res_datasize[-1].append(evaluator.weighted_average)

In [None]:
plt.plot(dataset_sizes,np.mean(np.array(res_datasize),axis=1))
plt.xlabel(r"size $X_{pop}$")
plt.ylabel("doppelganger rate")
plt.savefig("../../figures/doppelganger_vs_Xsize.pdf",format="pdf")


In [None]:
# res_datasize has one entry per element of dataset_sizes; show the results for the largest size
res_datasize[-1]

In [None]:
kept_idxs = np.random.choice(len(data_occamlike.masked_spectra),100,replace=False)
kept_idxs

### Measure correlation between similarity and other parameters

In [None]:
def similarity_ij(i,j,v):
    """Return ``np.linalg.norm`` of the difference between entries ``i`` and ``j`` of ``v``.

    Note that this is a distance: larger values mean the two entries are further apart.
    """
    return np.linalg.norm(v[i]-v[j])

def get_similarity(X,Y,n_repeats=50000,n_max=10000,use_delta=True):
    """
    Draw random pairs of stars and compare their distance in X with a variable of interest Y

    INPUTS
    ------
    X:
        representation, indexed by star, in which distances are measured
    Y:
        variable of interest, indexed by star in the same way as X
    n_repeats: int
        number of random pairs drawn
    n_max: int
        pairs are drawn among the first n_max stars. Clamped to the length of
        X and Y so that shorter inputs do not cause out-of-range indexing.
    use_delta: boolean
        if true give the absolute difference between the two variables. If false give the average.

    OUTPUTS
    -------
    similarity_list:
        contains the chemical similarity (distance in X) for random pairs of stars
    delta_list:
        contains the difference (or average) in variable of interest for these same stars
    """
    similarity_list = []
    delta_list = []
    # never sample indexes beyond what X and Y actually contain
    n_available = min(n_max,len(X),len(Y))
    if n_available < 1:
        return similarity_list,delta_list
    for _ in range(n_repeats):
        # the two indexes are drawn with replacement, so i == j is possible
        i,j = np.random.choice(n_available,2)
        # pairs where either value is <= -100 are skipped
        # (presumably a missing-value sentinel - TODO confirm)
        if  (Y[i]>-100) and (Y[j]>-100):
            similarity_list.append(similarity_ij(i,j,X))
            if use_delta:
                delta_list.append(np.abs(Y[i]-Y[j]))
            else:
                delta_list.append(np.mean([Y[i],Y[j]]))
    return similarity_list,delta_list

In [None]:
# Creating the transformed representation on which metric learning is applied

z,z_occam,ppca = fitters.compress_masked_spectra(data_occamlike.masked_spectra,data_occam.masked_spectra,d,tol=tol)
Z_occam = vectors.OccamVector(val = z_occam,cluster_names=cluster_idxs).remove_orphans()
Z = vectors.Vector(val = z)

In [None]:
z_dim = 30
ev_x = evaluators.StandardEvaluator(Z[:,:z_dim],Z_occam[:,:z_dim],leave_out=True,fitter_class=standard_fitter)
ev_x.weighted_average

In [None]:
Y_fitter = standard_fitter(Y,Y_occam)
v_y = Y_fitter.transform(Y.centered(Y_occam)).val

##########################################################

z_dim = 30
Z_fitter = standard_fitter(Z[:,:z_dim],Z_occam[:,:z_dim])
v_z = Z_fitter.transform(Z_fitter.z.centered(Z_occam[:,:z_dim])).val


In [None]:
#param = "AK_TARG"
param = "SNR"
y_interest = allStar_occamlike[n_start:n_start+n_stars][param]

In [None]:
zs,delta_zs = get_similarity(v_z,y_interest)
ys,delta_ys = get_similarity(v_y,y_interest)

In [None]:
heatmap,xedges,yedges = np.histogram2d(delta_zs,zs,bins=[80,80])
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T,extent=extent,origin='lower',cmap="Greens",aspect=0.01)
plt.plot(delta_zs,np.poly1d(np.polyfit(delta_zs, zs, 1))(delta_zs),color="orange",label="from spectra")
plt.plot(delta_ys,np.poly1d(np.polyfit(delta_ys, ys, 1))(delta_ys),color="blue",label="from labels")
plt.legend(frameon=False)
plt.xlim(0,0.5)
plt.ylim(0,40)
plt.ylabel("similarity")
plt.xlabel(rf"$\Delta$ {param}")
plt.colorbar(label="density")
plt.savefig("../../figures/extinction_trend_before.pdf",format="pdf")


In [None]:
norm_zs= zs/np.mean(zs)
norm_ys= ys/np.mean(ys)
heatmap,xedges,yedges = np.histogram2d(delta_zs,norm_zs,bins=[80,80])
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T,extent=extent,origin='lower',cmap="Greens",aspect=0.2)
plt.plot(delta_zs,np.poly1d(np.polyfit(delta_zs, norm_zs, 1))(delta_zs),color="orange",label="from spectra")
plt.plot(delta_ys,np.poly1d(np.polyfit(delta_ys, norm_ys, 1))(delta_ys),color="blue",label="from labels")
plt.legend(frameon=False)
plt.xlim(0,0.5)
plt.ylim(0,2)
plt.ylabel("similarity")
plt.xlabel(rf"$\Delta$ {param}")
plt.colorbar(label="density")
plt.savefig("../../figures/extinction_trend_after.pdf",format="pdf")


In [None]:
z,z_occam,ppca = fitters.compress_masked_spectra(data_occamlike.masked_spectra[:,mask_interstellar],data_occam.masked_spectra[:,mask_interstellar],d,tol=tol)
Z_occam = vectors.OccamVector(val = z_occam,cluster_names=cluster_idxs).remove_orphans()
Z = vectors.Vector(val = z)

In [None]:
ev_x = evaluators.StandardEvaluator(Z[:,:z_dim],Z_occam[:,:z_dim],leave_out=True,fitter_class=standard_fitter)
ev_x.weighted_average

In [None]:
Y_fitter = standard_fitter(Y,Y_occam)
v_y = Y_fitter.transform(Y.centered(Y_occam)).val

##########################################################

z_dim = 30
Z_fitter = standard_fitter(Z[:,:z_dim],Z_occam[:,:z_dim])
v_z = Z_fitter.transform(Z_fitter.z.centered(Z_occam[:,:z_dim])).val


In [None]:
zs,delta_zs = get_similarity(v_z,y_interest,use_delta=False)
ys,delta_ys = get_similarity(v_y,y_interest,use_delta=False)

In [None]:
heatmap,xedges,yedges = np.histogram2d(delta_zs,zs,bins=[80,80])
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T,extent=extent,origin='lower',cmap="Greens",aspect=0.01)
plt.plot(delta_zs,np.poly1d(np.polyfit(delta_zs, zs, 1))(delta_zs),color="orange",label="from spectra")
plt.plot(delta_ys,np.poly1d(np.polyfit(delta_ys, ys, 1))(delta_ys),color="blue",label="from labels")
plt.legend(frameon=False)
plt.xlim(0,0.5)
plt.ylim(0,40)
plt.ylabel("similarity")
plt.xlabel(rf"$\Delta$ {param}")
plt.colorbar(label="density")
#plt.savefig("../../figures/extinction_trend_after.pdf",format="pdf")


In [None]:
norm_zs= zs/np.mean(zs)
norm_ys= ys/np.mean(ys)

In [None]:
heatmap,xedges,yedges = np.histogram2d(delta_zs,norm_zs,bins=[80,80])
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T,extent=extent,origin='lower',cmap="Greens",aspect=0.2)
plt.plot(delta_zs,np.poly1d(np.polyfit(delta_zs, norm_zs, 1))(delta_zs),color="orange",label="from spectra")
plt.plot(delta_ys,np.poly1d(np.polyfit(delta_ys, norm_ys, 1))(delta_ys),color="blue",label="from labels")
plt.legend(frameon=False)
plt.xlim(0,0.5)
plt.ylim(0,2)
plt.ylabel("similarity")
plt.xlabel(rf"$\Delta$ {param}")
plt.colorbar(label="density")
#plt.savefig("../../figures/extinction_trend_after.pdf",format="pdf")


In [None]:
heatmap,xedges,yedges = np.histogram2d(delta_zs,norm_zs,bins=[50,50])
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
plt.clf()
plt.imshow(heatmap.T,extent=extent,origin='lower',cmap="Greens",aspect=400.)
plt.plot(delta_zs,np.poly1d(np.polyfit(delta_zs, norm_zs, 1))(delta_zs),color="orange",label="from spectra")
plt.plot(delta_ys,np.poly1d(np.polyfit(delta_ys, norm_ys, 1))(delta_ys),color="blue",label="from labels")
plt.legend(frameon=False)
plt.xlim(0,300)
plt.ylim(0,2)

plt.ylabel("similarity")
plt.xlabel(rf"$\Delta$ {param}")
plt.colorbar(label="density")

In [None]:
plt.hist(delta_zs)
plt.xlim(0,500)

In [None]:
plt.hist(y_interest)
plt.xlim(0,500)

### Sandbox exploration

In [None]:
allStar[0]

In [None]:
evaluator_Y.doppelganger_rates

In [None]:
sorted(evaluator_Y.registry)[0]

In [None]:
# Gather, for every cluster, its doppelganger rate, its mean of the first label column and its size
evaluator = evaluator_Y #evaluator_Y #evaluators_X[3] #evaluators_X[3] #evaluator_Y
# sort the cluster names once instead of on every iteration
cluster_names = sorted(evaluator.registry)
clust_dopps = []
clust_mean_fes = []
len_clusts = []
for i in range(len(Y_occam.registry)):
    # first column of the cluster's label vector
    # (Fe_H is the first entry of considered_parameters - assumes no column was reordered)
    cluster_fe = Y_occam.only(cluster_names[i]).val[:,0]
    clust_mean_fes.append(np.mean(cluster_fe))
    clust_dopps.append(evaluator.doppelganger_rates[i])
    len_clusts.append(len(cluster_fe))

In [None]:
len(clust_dopps)

In [None]:
plt.scatter(len_clusts,clust_dopps)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (len_clusts[i], clust_dopps[i]),fontsize=8)
plt.xlabel("cluster size")
plt.ylabel("doppelganger rate")

plt.title("from spectra")

In [None]:
plt.scatter(len_clusts,clust_dopps)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (len_clusts[i], clust_dopps[i]),fontsize=8)
plt.xlabel("cluster size")
plt.ylabel("doppelganger rate")

plt.title("from label")

In [None]:
plt.scatter(len_clusts,clust_dopps)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (len_clusts[i], clust_dopps[i]),fontsize=8)
#plt.ylim(-0.01,0.05)
plt.title("from spectra")

In [None]:
plt.scatter(clust_mean_fes,clust_dopps)
#plt.ylim(-0.01,0.03)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (clust_mean_fes[i], clust_dopps[i]),fontsize=8)

plt.title("from labels")
plt.xlabel("Fe_H")
plt.ylabel("doppelganger")

In [None]:
plt.scatter(clust_mean_fes,clust_dopps)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (clust_mean_fes[i], clust_dopps[i]),fontsize=8)
plt.ylim(-0.01,0.1)

In [None]:
plt.scatter(clust_mean_fes,clust_dopps)
for i, txt in enumerate(sorted(evaluator.registry)):
    plt.annotate(txt, (clust_mean_fes[i], clust_dopps[i]),fontsize=8)
plt.ylim(-0.01,0.03)

In [None]:
np.mean(Y_occam.only(sorted(evaluator_Y.registry)[i]).val[:,0])

In [None]:
Y_occam.only(sorted(evaluator_Y.registry)[i]).val[:,0]

In [None]:
evaluators_X[3]

In [None]:
n_components

In [None]:
np.mean([5,7])