# Error estimation

This notebook contains code that can be used to calculate the reconstruction error (MSE for example) of our different model types.


The code cells below currently compute an L1 loss (`loss = nn.L1Loss(...)`). To report an L2 (MSE) loss instead, replace `nn.L1Loss` with `nn.MSELoss` in the `loss = ...` line of each section (the alternative is left as a commented-out line next to it).


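Currently the notebook is set up to evaluate the reconstruction on those stars (`idxs_large`) in the dataset whose paired star has a large difference in physical parameters. In the FactorDis/FaderDis section this subset is only applied when `use_full_dataset = False`; otherwise the full validation set is used.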

### Reconstruction Cannon

In [None]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import numpy as np
import numpy as np
import sys
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from scipy import spatial
import pickle
import argparse
from torch.utils.data import Dataset, DataLoader
import torch
from torch.autograd import Variable
import torch.nn as nn

from tagging.paths import path_val_dataset

In [None]:
# Degree of the polynomial expansion of the stellar parameters (passed to PolynomialFeatures below)
n_degree = 4
# Pickled dataset to evaluate on; the path comes from tagging.paths
data_file = path_val_dataset

In [None]:

# Polynomial ("Cannon") reconstruction.
#
# 1. Fit every spectral bin as a degree-`n_degree` polynomial of the stellar parameters
#    by linear least squares.
# 2. Split each spectrum into fit + residual.
# 3. Build "swapped" spectra by adding the residual of star i + n_half to the fit of
#    star i, and measure the element-wise error against the real spectrum of star i.
print("number of degrees of freedom:{}".format(n_degree))

print("Loading data...")
data = pd.read_pickle(data_file)
print("dataset is:{}".format(data_file))

# Only the first n_stars rows are used; they are split into two halves of n_half stars
# that are paired index-by-index.
n_stars = 50000
n_half = n_stars // 2

# Plain 2-D ndarray instead of the deprecated np.matrix (same values, same slicing).
spectra_matrix = np.asarray(data["spectra"].tolist())[0:n_stars]
params_list = data.params.tolist()[0:n_stars]

polynomial = PolynomialFeatures(degree=n_degree)
params_matrix = polynomial.fit_transform(np.array(params_list))

# d is the pseudo-inverse of the design matrix, so s = d @ spectra holds the least-squares
# coefficients. np.linalg.pinv (SVD based) replaces the explicit inverse of X^T X, which
# squares the condition number and is fragile for a high-degree polynomial basis.
d = np.linalg.pinv(params_matrix)
s = np.dot(d, spectra_matrix)

fit_matrix = np.dot(params_matrix, s)
res_matrix = spectra_matrix - fit_matrix

# Fit of the first half combined with the residuals of the second half.
swapped_matrix = fit_matrix[:n_half] + res_matrix[n_half:n_stars]
print(fit_matrix)
print(swapped_matrix)

# reduction="none" keeps the element-wise errors (replacement for the deprecated reduce=False).
#loss = nn.MSELoss(reduction="none")
loss = nn.L1Loss(reduction="none")
real = torch.tensor(spectra_matrix[0:n_half])
fit = torch.tensor(fit_matrix[0:n_half])
swapped = torch.tensor(swapped_matrix[0:n_half])
err = loss(real, swapped)
print("error swapping is {}".format(np.mean(err.numpy())))
# To report the plain (non-swapped) fit error instead:
#err = loss(real,fit)
#print("error regular is {}".format(np.mean(err.numpy())))


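We now look only at matched pairs of stars constrained to have a significant difference in their physical parameters ($T_\mathrm{eff}$ and $\log g$). The active selection below thresholds only the first parameter column (absolute difference > 500); the commented-out alternative additionally requires an absolute difference > 1.0 in the second column.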

In [None]:
# Convert the list of parameter vectors to an ndarray so it can be sliced and subtracted below
params_list = np.array(params_list)

In [None]:
# Per-pair parameter difference: row i of the first 25000 stars minus row i of the remaining stars
params_diff = params_list[0:25000]-params_list[25000:]

In [None]:
# Mean absolute pair difference in the first parameter column (shown as the cell output)
np.abs(params_diff[:,0]).mean()

In [None]:
# Indices (into the first half) of the pairs whose first-parameter difference exceeds 500 in absolute value
idxs_large = np.argwhere(np.abs(params_diff[:,0])>500)[:,0]

In [None]:
#idxs_large = np.argwhere( (np.abs(params_diff[:,0])>500) & (np.abs(params_diff[:,1])>1.0) )[:,0]

In [None]:
# Number of selected pairs
idxs_large.shape

In [None]:
# Swapping error restricted to the pairs selected in idxs_large
real_selected = real[idxs_large]
swapped_selected = swapped[idxs_large]
err = loss(real_selected, swapped_selected)
print("error swapping is {}".format(err.numpy().mean()))


In [None]:
# Shape check on the first 10 spectra
spectra_matrix[0:10].shape

In [None]:
# Shape check on the first 10 parameter vectors
np.array(params_list[0:10]).shape

## Reconstruction FactorDis and FaderDis

In [None]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from scipy import spatial
import pickle
import sys
import argparse
import glob
import ntpath

from tagging.src.datasets import ApogeeDataset
from tagging.src.networks import ConditioningAutoencoder,Embedding_Decoder,Feedforward
from tagging.src.utils import get_batch,invert_x

# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [None]:
# Fix the numpy and torch random seeds so repeated runs of the cells below are reproducible
np.random.seed(0)
torch.manual_seed(0)

In [None]:
# Extend the selection so it contains both members of every selected pair:
# index i (first half) and its partner i + n_half (second half).
#
# The original `idxs_large = np.hstack((idxs_large, idxs_large+25000))` was not idempotent:
# running the cell twice appended indices shifted by 50000. Folding the indices back onto
# the first half before stacking gives the same result on the first run and leaves the
# array unchanged on any re-run.
n_half = 25000
idxs_first_half = np.unique(idxs_large % n_half)
idxs_large = np.hstack((idxs_first_half, idxs_first_half + n_half))

In [None]:
# Number of bins per spectrum; used for the dataset and for the encoder input / decoder output widths
n_bins = 7751 
# Number of conditioning parameters; counted in the first-layer width of both encoder and decoder
n_conditioned = 3
# Width of the encoder output / latent part of the decoder input
n_z = 20
# Selects which saved model is loaded below: "fader" or "factor"
architecture = "fader"
n_batch = 100 #number of spectra in one batch
batch_numbers = 60 #number of batches to use to calculate running average
# When False, only the rows listed in idxs_large are evaluated
use_full_dataset = True

# Reload the validation dataset (rebinds `data`)
data = pd.read_pickle(path_val_dataset)

In [None]:
# Number of selected rows after adding the partner of every selected star
idxs_large.shape

In [None]:
# Optionally restrict the evaluation to the selected pairs, renumbering the rows from 0
if not use_full_dataset:
    data = data.iloc[idxs_large].reset_index(drop=True)


In [None]:
# Wrap the (possibly subset) dataframe in the project's dataset class.
# n_bins comes from the config cell rather than a second hard-coded 7751, so the dataset
# and the network widths cannot silently diverge.
dataset = ApogeeDataset(data, n_bins=n_bins)

# Sequential, fixed-size batches: no shuffling, and a trailing incomplete batch is dropped.
evaluation_loader = torch.utils.data.DataLoader(dataset=dataset,
                                                batch_size=n_batch,
                                                shuffle=False,
                                                drop_last=True)

In [None]:
# Layer widths: the encoder takes spectrum + conditioning parameters down to n_z values,
# the decoder takes n_z values + conditioning parameters back up to a full spectrum.
encoder_widths = [n_bins + n_conditioned, 2048, 512, 128, 32, n_z]
decoder_widths = [n_z + n_conditioned, 512, 2048, 8192, n_bins]

# Encoder is built before the decoder, as in the original, so parameter initialisation order is unchanged.
encoder = Feedforward(encoder_widths, activation=nn.SELU()).to(device)
decoder = Feedforward(decoder_widths, activation=nn.SELU()).to(device)
conditioning_autoencoder = ConditioningAutoencoder(encoder, decoder, n_bins=n_bins).to(device)


In [None]:
# Load the trained model selected by `architecture`.
# map_location=device lets checkpoints saved on a GPU be loaded on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
if architecture == "fader":
    # This file stores the whole model object, so it replaces the freshly built network.
    # Alternative, loading a state dict into the network built above:
    #conditioning_autoencoder.load_state_dict(torch.load("../../outputs/models/faderDis.save"))
    conditioning_autoencoder = torch.load("../../outputs/models/faderDiswFe.save",
                                          map_location=device)
elif architecture == "factor":
    weights = torch.load("../../outputs/models/factorDis.save", map_location=device)
    # Drop the 'w.weight' entry when present; pop with a default replaces the
    # bare try/except around `del`, which also hid unrelated errors.
    weights.pop('w.weight', None)
    conditioning_autoencoder.load_state_dict(weights)
else:
    # Previously an unknown value fell through and the untrained network was evaluated.
    raise ValueError("unknown architecture: {!r} (expected 'fader' or 'factor')".format(architecture))


In [None]:
# Running sum of the per-batch mean errors
err_tot = 0
# Per-batch element-wise error arrays, concatenated after the evaluation loop
errs = []

In [None]:
# Element-wise loss (no averaging), so per-spectrum / per-bin statistics can be computed afterwards.
# reduction="none" is the supported spelling of the deprecated reduce=False.
#loss = nn.MSELoss(reduction="none")
loss = nn.L1Loss(reduction="none")

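The loop below computes both `err_rec` (the plain reconstruction error) and `err_swp` (the reconstruction-with-swapping error). Only `err_swp` is accumulated and reported; to measure the reconstruction error instead, substitute `err_rec` for `err_swp` in the two accumulation lines.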

In [None]:
# Evaluate the swapping error over `batch_numbers` batches.
#
# The accumulators are (re)initialised here so the cell can be re-run on its own:
# `errs` is rebound to an ndarray at the end, so a second run would otherwise fail on `.append`,
# and `err_tot` would keep growing.
err_tot = 0
errs = []

# The second batch of each pair is requested at an offset of half the dataset.
half_offset = data.shape[0] // 2

# Evaluation only: disable autograd so no computation graphs are built or kept alive.
with torch.no_grad():
    for i in range(batch_numbers):
        batch1 = get_batch(i * n_batch, n_batch, dataset)
        batch2 = get_batch(half_offset + i * n_batch, n_batch, dataset)

        x_test1, u_test1, v_test1, idx_test1 = batch1
        x_test2, u_test2, v_test2, idx_test2 = batch2

        # Encode both batches, conditioning on the first three columns of u.
        # NOTE(review): z2 and z1_pred are not used further down; the calls could be
        # dropped if the model's forward pass has no side effects - confirm before removing.
        _, z1 = conditioning_autoencoder(x_test1, u_test1[:, 0:3], train_decoder=False)
        _, z2 = conditioning_autoencoder(x_test2, u_test2[:, 0:3], train_decoder=False)

        # Decode z1 with its own parameters (reconstruction) and with the partner's (swap).
        x1_pred, _ = conditioning_autoencoder(z1, u_test1[:, 0:3], train_encoder=False)
        x1_pred_swp, _ = conditioning_autoencoder(z1, u_test2[:, 0:3], train_encoder=False)
        _, z1_pred = conditioning_autoencoder(x1_pred_swp, u_test2[:, 0:3], train_decoder=False)

        # Apply the project's invert_x to predictions and targets before comparing them.
        x1_pred_swp = invert_x(x1_pred_swp)
        x1_pred = invert_x(x1_pred)
        x_test2 = invert_x(x_test2)
        x_test1 = invert_x(x_test1)

        err_swp = loss(x1_pred_swp, x_test2)  # swapped prediction vs. the partner's spectrum
        err_rec = loss(x1_pred, x_test1)      # plain reconstruction; computed but not accumulated

        err_tot += err_swp.detach().cpu().float().mean()
        errs.append(err_swp.detach().cpu().numpy())

print("err_tot:{}".format(err_tot*(1/batch_numbers)))
errs = np.concatenate(errs)
#errs = np.mean(errs,axis=1)
print(errs.shape)
# NOTE(review): std is taken over every element of errs while the divisor is the number of
# rows; enable the per-row mean above if a per-spectrum uncertainty is intended.
print("err_tot, err:{}, std:{},unc:{}".format(np.mean(errs),np.std(errs),np.std(errs)/np.sqrt(errs.shape[0])))

