# Emcee GP Study

An emcee sampler notebook, testing various things about the emcee package.

In [2]:
%matplotlib notebook 
#restart the kernel if switching from inline to notebook
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
import corner

import numpy as np
import scipy.stats
import pandas as pd

import emcee
import george

import pickle
import gp_sandbox as cgp

In [3]:
# Light curves that `sample` knows how to load, keyed by asteroid number.
# Each entry is (true_log_p, path). true_log_p matches the natural log of the
# rotation period expressed in days; the period in hours is noted per entry.
_ASTEROID_LIGHTCURVES = {
    # 3200 Phaeton, period 3.603957 h
    3200: (-1.896021,
           'new_shape_models/3200_lc_49627_to_49787_realistic.txt'),
    # 1291 Phryge, period 5.58410 h
    1291: (-1.45813055,
           'new_shape_models/1291_lc_49627_to_49787_realistic.txt'),
    # 221 Eos, period 10.443 h
    221: (-0.8321219,
          'new_shape_models/221_lc_49627_to_49787_realistic.txt'),
    # 1388 Aphrodite, period 11.9432 h
    1388: (-0.69789175,
           'new_shape_models/1388_lc_49627_to_49787_realistic.txt'),
}


def sample(asteroid, days=1, delay=0, plot=True):

    """
    Samples the number of points from the provided lightcurve at a
    specified cadence.
    The total simulated light curve has around 400,000 points/~160 days.

    Parameters
    ----------
    asteroid : int
        Number of the asteroid whose simulated light curve is loaded.
        Supported values are 3200, 1291, 221 and 1388.

    days : int
        The number of nights you wish to sample. There are 2880 data
        points in each day.

    delay : int
        The number of days you wish to delay by. This allows the user to
        sample the lightcurve somewhere other than just the start.
        Note : you cannot delay by more than 158 days or the data runs
        out.

    plot : boolean
        Decides whether you want to plot the original light curve and
        the sampled data points.

    Returns
    -------
    tsample, fsample :
        Time stamps and flux of the sampled points, as returned by
        cgp.sample_data.

    time : pandas.core.series.Series
        Time stamps of the full-resolution window that was sampled.

    flux : pandas.core.series.Series
        Flux of the full-resolution window, matching ``time``.

    flux_err :
        Flux error estimate returned by cgp.sample_data.

    ax : matplotlib axes
        Only returned when ``plot`` is true: the axes drawn by
        plot_sample. With ``plot`` false the tuple has five elements.

    Raises
    ------
    AssertionError
        If ``delay`` is 159 or more, or the requested window runs past
        the end of the data.

    ValueError
        If ``asteroid`` is not one of the supported numbers.

    """
    # halt if you delay past the limit of the data
    assert delay < 159 , "You cannot delay more than 158 days."

    # Reject unknown asteroids up front instead of failing later on an
    # unbound file name.
    try:
        true_log_p, txt = _ASTEROID_LIGHTCURVES[asteroid]
    except KeyError:
        raise ValueError(
            "Unknown asteroid {!r}; expected one of {}.".format(
                asteroid, sorted(_ASTEROID_LIGHTCURVES))) from None

    # convert days to points
    span = 2880 * days
    start_pt = 2880 * delay

    # read in lightcurve data
    df = pd.read_csv(txt, delimiter=' ',
                     header=None, names=['all_time','all_flux'],
                     dtype={'all_time':float, 'all_flux':float})
    all_time = df.all_time
    all_flux = df.all_flux

    # halt if you sample past the limit of the data
    assert 2880*(delay+days) < len(df.all_time) , "End of data reached. You cannot delay that many days."

    time = all_time[start_pt:span+start_pt]
    flux = all_flux[start_pt:span+start_pt]

    # yerr is set to 0.2 magnitude but can easily be changed
    # cadence sampling is turned on
    # can be switched to random or clustered
    tsample, fsample, flux_err = cgp.sample_data(time, flux, yerr_amp=0.2, cadence_n=1)

    if plot:
        ax = plot_sample(time, flux, tsample, fsample, true_log_p)
        return tsample, fsample, time, flux, flux_err, ax

    return tsample, fsample, time, flux, flux_err


def plot_sample(time, flux, tsample, fsample, true_log_p):

    """
    Draw the full light curve as a line and overlay the sampled points.

    The legend shows ``true_log_p`` rounded to 5 decimals for the
    original curve and the number of sampled points for the sample.

    Returns
    -------
    ax : matplotlib axes
        The axes of the newly created 8x4 figure.
    """
    figure, axes = plt.subplots(1, 1, figsize=(8, 4))

    original_label = "Original : " + str(round(true_log_p, 5))
    axes.plot(time, flux, '-', alpha=0.5, label=original_label)

    sample_label = "Sample : " + str(len(tsample))
    axes.plot(tsample, fsample, 'ko', ms=3, alpha=0.5, label=sample_label)

    axes.legend()
    return axes
    

    
# Prior distributions, frozen once at definition time. `prior` is evaluated
# for every walker at every MCMC step, so the four frozen distributions are
# not rebuilt on each call.
# scipy.stats.uniform(loc, scale) has support [loc, loc + scale].
_PRIOR_MEAN = scipy.stats.uniform(0, 20)
_PRIOR_LOG_AMP = scipy.stats.uniform(-10, 30)
_PRIOR_LOG_GAMMA = scipy.stats.uniform(np.log(0.1), np.log(10)*3)
_PRIOR_LOG_PERIOD = scipy.stats.uniform(np.log(1./24), -np.log(1./24))


def prior(params):

    """
    Calculates the log of the prior values, given parameter values.

    Parameters
    ----------
    params : list
        List of all kernel parameters

    param[0] : float
        mean (uniform between 0 and 20)

    param[1] : float
        log amplitude (uniform between -10 and 20)

    param[2] : float
        gamma (log gamma uniform between log(0.1) and log(100))

    param[3] : float
        log period (uniform between log(1/24) and 0, i.e. a period
        between 1/24 and 1)

    Returns
    -------
    sum_log_prior : float
        sum of all log priors (-inf if a parameter is out of range)

    """
    gamma = params[2]

    # log(gamma) is undefined for gamma <= 0 and NaN never compares greater
    # than 0: such a point lies outside the prior, so reject it here instead
    # of letting np.log emit a warning and produce NaN/-inf.
    if not gamma > 0:
        return -np.inf

    sum_log_prior = (_PRIOR_MEAN.logpdf(params[0])
                     + _PRIOR_LOG_AMP.logpdf(params[1])
                     + _PRIOR_LOG_GAMMA.logpdf(np.log(gamma))
                     + _PRIOR_LOG_PERIOD.logpdf(params[3]))

    # A NaN input parameter propagates to the sum; treat it as out of range.
    if np.isnan(sum_log_prior):
        return -np.inf

    return sum_log_prior


def post_lnlikelihood(params):

    """
    Calculates the log posterior from the log prior and the GP
    log likelihood.

    This function reads the module-level names ``gp``, ``tsample``,
    ``fsample`` and ``flux_err``; they must be defined before it is
    called (e.g. by the sampler).

    Parameters
    ----------
    params : list
        List of all kernel parameters


    Returns
    -------
    ln_likelihood : float
        The log prior plus the GP log likelihood. Returns -inf when the
        log prior is -inf (parameters outside the prior). If the sum is
        otherwise not finite, -1e25 is returned instead.

    """
    # calculate the log_prior
    log_prior = prior(params)

    # return -inf if parameters are outside the priors
    if np.isneginf(log_prior):
        return -np.inf

    # compute lnlikelihood based on given parameters
    gp.set_parameter_vector(params)
    gp.compute(tsample, flux_err)

    # Use log_likelihood, the same GP method `optimize` calls, rather than
    # the older lnlikelihood spelling.
    ln_likelihood = gp.log_likelihood(fsample) + log_prior

    return ln_likelihood if np.isfinite(ln_likelihood) else -1e25


def optimize(x, y, yerr, gp, print_result = True):

    """
    Optimizes kernel parameters with scipy.optimize.

    Parameters
    ----------
    x : pandas.core.series.Series
        The time stamp information of the sampled lightcurve in days.

    y : pandas.core.series.Series
        Flux measurements of sampled lightcurve corresponding to the
        time stamps.

    yerr : numpy.ndarray
        Estimated flux error corresponding to the flux measurements.
        This value can be changed in the sample function.

    gp : george.gp.GP
        The GP whose parameter vector is optimized. It is modified in
        place: on return it holds the optimized parameters.

    print_result : boolean
        Toggles whether you want extra information about the
        optimization results.

    Returns
    -------
    result : scipy.optimize.optimize.OptimizeResult
        The result for optimizing. Will be using result.x for the
        initial locations and result.hess_inv for the covariance matrix
        for a multivariate (4D) normal distribution that will provide
        starting locations for our emcee walkers.

    """

    # import needed function
    from scipy.optimize import minimize

    # compute kernel with given x and yerr
    gp.compute(x, yerr)

    def neg_ln_like(p):
        """Negative log likelihood at p; +inf if the linear algebra fails."""
        gp.set_parameter_vector(p)
        try:
            return -gp.log_likelihood(y)
        except np.linalg.LinAlgError:
            return np.inf

    def grad_neg_ln_like(p):
        """Gradient of neg_ln_like at p, always shaped like p."""
        gp.set_parameter_vector(p)
        try:
            return -gp.grad_log_likelihood(y)
        except np.linalg.LinAlgError:
            # minimize expects a gradient vector with one entry per
            # parameter, so the failure value must not be a bare scalar.
            return np.full(np.shape(p), np.inf)

    # The default method is kept (the L-BFGS-B alternative was left disabled
    # in the original call).
    result = minimize(neg_ln_like, gp.get_parameter_vector(), jac=grad_neg_ln_like)
    gp.set_parameter_vector(result.x)

    if print_result:
        print(result)

    return result



def walker_params(params, cov_scale=1, plot=True):

    """
    Sets up the initial parameters for all the walkers using optimized
    parameter values as starting values. The function generates a
    scattered multivariate gaussian distribution of starting parameter
    values.

    This function reads the module-level names ``tsample``, ``fsample``,
    ``flux_err`` and ``nwalkers`` (and ``time`` when plotting); they
    must be defined before it is called.

    Parameters
    ----------
    params : list
        List of all kernel parameters, in the order
        [mean, log_amp, gamma, log_period].

    cov_scale : float
        Determines the scatter of the multivariate distribution: the
        optimizer's inverse Hessian is multiplied by this factor.

    plot : boolean
        If true, the optimized GP fit is drawn with plot_gpfit.

    Returns
    -------
    p0 : numpy.ndarray
        The initial walker parameters [nwalker, ndim]

    gp : george.gp.GP
        GP kernel set with the optimized parameter values.

    """


    mean, log_amp, gamma, log_period = params
    amp = np.exp(log_amp)

    print('amp : ' + str(amp))
    kernel = amp * george.kernels.ExpSine2Kernel(gamma = gamma, log_period = log_period)
    gp = george.GP(kernel, fit_mean=True, mean=mean)
    # The GP is computed on the time stamps (the independent variable);
    # the fluxes are only passed in when evaluating the likelihood.
    gp.compute(tsample, flux_err)

    result = optimize(tsample, fsample, flux_err, gp)
    print(result.x)
    # Scatter the walkers around the optimum using the (scaled) inverse
    # Hessian as covariance.
    p0 = np.random.multivariate_normal(mean=result.x, cov=cov_scale*result.hess_inv, size=nwalkers)

    if plot:
        plot_gpfit(time, tsample, fsample, flux_err, gp)

    return p0, gp

def plot_gpfit(time, tsample, fsample, flux_err, gp):

    """
    Plot a gp fit given a gp class and x, y, and yerr data to fit onto.

    The prediction and its one-sigma band are drawn on the module-level
    axes ``ax``, which must exist before this function is called.

    Parameters
    ----------
    time : pandas.core.series.Series
        Time stamps spanning the range to predict over.

    tsample, flux_err :
        Accepted for a uniform call signature; not used here.

    fsample :
        Flux values the prediction is conditioned on.

    gp : george.gp.GP
        GP that has already been computed on the sample time stamps.

    """

    # One prediction point per light-curve sample. The index label of the
    # last row is not a count: it grows with the delay used when slicing.
    t_possible = np.linspace(time.iloc[0], time.iloc[-1], len(time))
    pred, pred_var = gp.predict(fsample, t_possible, return_var=True)
    pred_std = np.sqrt(pred_var)

    ax.fill_between(t_possible, pred - pred_std, pred + pred_std,
                color="red", alpha=0.4)
    # The legend shows the last entry of the GP parameter vector
    # (log_period in the parameter order used in this notebook).
    ax.plot(t_possible, pred, "red", lw=1.5, alpha=0.7, label = "GP Fit : " + str(round(gp.parameter_vector[-1], 5)))
    ax.legend()


How many walkers do we want? So far there are 4 parameters/dimensions we want to study: mean, log_amp, gamma, and log_period.

In [4]:
# Sampler size: 4 parameters per walker, 100 walkers.
ndim, nwalkers = 4, 100
# Sample 5 days of the light curve with no delay from its start.
days, delay = 5,0

# Load and sample the light curve of asteroid 1291. With plot=True the call
# also returns the axes `ax`, which later cells draw on.
tsample, fsample, time, flux, flux_err, ax = sample(1291, days, delay, plot=True)
# Number of sampled data points.
data_pts = len(tsample)

FileNotFoundError: File b'new_shape_models/1291_lc_49627_to_49787_realistic.txt' does not exist

Let's see what a Lomb-Scargle periodogram predicts the period should be.

In [None]:
from lombscargle import make_lsp
from astropy.stats import LombScargle

# Periodogram of the sampled points (make_lsp is a project helper; presumably
# it returns the frequency grid and the power at each frequency).
freq, power = make_lsp(tsample, fsample, flux_err)

fig, bx = plt.subplots(1,1, figsize=(4,2))
bx.plot(freq, power)
bx.set_xlabel('Frequency')
bx.set_ylabel('Power')
# The best-fit frequency is the one with the highest power.
best_freq = freq[np.argmax(power)]
bx.vlines(best_freq, 0, 1, colors='orange', linestyles='--', 
          label = 'Best fit : ' + str(round(best_freq, 5)))
bx.legend()


# Log of the period that corresponds to the best-fit frequency.
best_log_period = np.log(1./best_freq)
# Evaluate the Lomb-Scargle model at the best frequency over the full time
# grid and overlay it on `ax`, the light-curve axes returned by sample().
y_fit = LombScargle(tsample, fsample, flux_err).model(time, best_freq)
ax.plot(time, y_fit, label = "Lomb-Scargle fit : " + str(round(best_log_period, 5)))
ax.legend()

In [None]:
### Initialize walker parameters.
# Starting guess in the order walker_params unpacks it:
# [mean, log_amp, gamma, log_period].
params = [np.mean(fsample), -0.5, 1, -1.45]
# p0 holds one starting position per walker; gp is the GP left at the
# optimized parameter values.
p0, gp = walker_params(params=params, cov_scale=10, plot=True)

In [None]:
# Ensemble sampler with nwalkers walkers over ndim parameters, scoring each
# position with post_lnlikelihood.
sampler = emcee.EnsembleSampler(nwalkers, ndim, post_lnlikelihood)

In [None]:
%%time
# Run the sampler for 100 steps starting from the walker positions p0.
mcmc_sampling = sampler.run_mcmc(p0, 100)

In [None]:
def save_chain(data, file_name):
    """
    Write a 3-D chain array to ``file_name + '.txt'`` as plain text.

    The first line is a ``#`` comment holding ``data.shape``. Each slice
    along the first axis is then written with numpy.savetxt and followed
    by a ``# New chain`` comment line.

    Parameters
    ----------
    data : numpy.ndarray
        Array to save; each ``data[i]`` must be something numpy.savetxt
        can write (1-D or 2-D).

    file_name : str
        Output path without the ``.txt`` extension.
    """
    # open() replaces the Python 2-only file() builtin.
    with open(file_name + '.txt', 'w') as outfile:
        # I'm writing a header here just for the sake of readability
        # Any line starting with "#" will be ignored by numpy.loadtxt
        outfile.write("# " + str(data.shape) + '\n')

        # Iterating through a ndimensional array produces slices along
        # the first axis, i.e. data[i,:,:] in this case
        for data_slice in data:
            np.savetxt(outfile, data_slice)

            # Writing out a break to indicate different slices...
            outfile.write('# New chain\n')

def read_chain(file_name, shape):
    """
    Load a chain from ``file_name + '.txt'`` and reshape it to ``shape``.

    The file is read as space-delimited text. The shape to pass can be
    found in the first commented line of the txt file.
    """
    path = file_name + '.txt'
    flat_values = np.genfromtxt(path, delimiter=' ')
    return flat_values.reshape(shape)


Now we can save our data for later.

In [None]:
# Write the sampler's chain to 'hello.txt' (save_chain appends the '.txt').
save_chain(sampler.chain, 'hello')

Or read it back in if we want to compare. If you want to plot the saved data, make sure to indicate it when plotting by passing `from_saved=True`.

In [None]:
# Reload 'hello.txt'. The shape must match the saved array; (100, 100, 4)
# is presumably (walkers, steps, parameters) for the run above.
test = read_chain('hello', (100,100,4))

In [None]:
# Plot the walker steps of the reloaded chain; from_saved=True flags that the
# input is a saved array rather than a live sampler.
cgp.plot_steps(test, from_saved=True)

In [None]:
# Histogram plot of the reloaded chain, again flagged as coming from a saved
# array.
cgp.plot_hist(test, from_saved=True)

In [None]:
# Corner plot of the sampler's flattened chain.
fig = corner.corner(sampler.flatchain)

## Evil plotting happening down here.

In [1]:
from plotting import plot_lightcurve, plot_folded_lightcurve
from plotting_utils import plot_sampling_results

In [None]:
# True rotation period in hours. Keep exactly one line active, matching the
# asteroid number that was passed to sample().
#true_period = 3.603957 # 3200 Phaeton
true_period = 5.58410 # 1291 Phryge
#true_period = 10.443 # 221 Eos
#true_period = 11.9432 # 1388 Aphrodite

# Summarize the run: the full-resolution light curve is passed as the "true"
# curve alongside the sampled points.
plot_sampling_results(tsample, fsample, flux_err, gp, sampler, true_lightcurve = [time, flux], true_period=true_period)

In [None]:
def plot_sampling_results(time, flux, flux_err, gp, sampler,
                          t_pred=None, true_lightcurve=None,
                          true_period=None, namestr="test", 
                          nmodels=10, npred=1000):
    """
    Plot a summary of an MCMC run and save each figure as a PDF.

    Four files are written, all prefixed with ``namestr``:
    ``_corner.pdf`` (corner plot of the flattened chain), ``_lc.pdf``
    (data with GP draws), ``_period_pdf.pdf`` (posterior histogram of
    the period in hours) and ``_folded_lc.pdf`` (folded light curve).

    Parameters
    ----------
    time : pandas.core.series.Series
        Time stamps of the data points. Must support ``.iloc`` when
        ``t_pred`` is not given.

    flux :
        Flux values corresponding to ``time``.

    flux_err :
        Flux error estimate for the data points.

    gp : george.gp.GP
        GP already computed on the data; its parameter vector is
        overwritten with sampled values.

    sampler : emcee.EnsembleSampler
        Sampler whose ``flatchain`` supplies the posterior samples. The
        last column is treated as the log period.

    t_pred : numpy.ndarray, optional
        Times at which GP draws are evaluated. Defaults to ``npred``
        evenly spaced points between the first and last ``time``.

    true_lightcurve : list, optional
        Passed through to the light-curve plotting helpers.

    true_period : float, optional
        True period in hours. Marked on the period histogram and used
        to fold the light curve. When not given, the light curve is
        folded on the posterior median period instead.

    namestr : str
        Prefix of the output file names.

    nmodels : int
        Number of posterior samples to draw GP realisations for.

    npred : int
        Number of prediction points when ``t_pred`` is not given.
    """

    # posterior samples from all walkers and steps
    new_samples = sampler.flatchain

    # make a corner plot
    corner.corner(new_samples, labels=gp.get_parameter_names())

    # save to file
    plt.savefig(namestr + "_corner.pdf", format="pdf")

    # plot some light curves with example models

    # first, get the total number of available samples
    nsamples = new_samples.shape[0]

    # pick nmodels random samples from the chain
    idx = np.random.choice(np.arange(0, nsamples, 1, dtype=int), size=nmodels)

    # if the array for the predictions isn't given, make one
    if t_pred is None:
        t_pred = np.linspace(time.iloc[0], time.iloc[-1], npred)

    # empty array for output
    m_all = np.zeros((nmodels, t_pred.shape[0]))

    # loop through the indices of samples, for each sample from the GP
    # conditional on the data points
    for i, j in enumerate(idx):
        gp.set_parameter_vector(new_samples[j])
        # condition on the flux passed in, not on a notebook global
        m_all[i] = gp.sample_conditional(flux, t_pred)

    fig, ax = plt.subplots(1, 1, figsize=(6,4))
    plot_lightcurve(time, flux, true_lightcurve=true_lightcurve, 
                        models=(t_pred, m_all), ax=ax)

    plt.tight_layout()
    plt.savefig(namestr + "_lc.pdf", format="pdf")

    # plot histogram of periods (log period -> period, times 24 for hours)
    fig, ax = plt.subplots(1, 1, figsize=(5,4))
    # density=True replaces the removed normed=True keyword
    ax.hist(np.exp(new_samples[:,-1])*24, bins=100, density=True, 
                label="posterior PDF", color="black", alpha=0.5)

    if true_period is not None:
        ylim = ax.get_ylim()
        ax.vlines(true_period, 0, ylim[-1], lw=1, color="red", linestyle="dashed", label="true period : " + str(true_period))

    ax.set_xlabel("Period in hours")
    ax.set_ylabel("Probability")
    ax.legend()

    plt.tight_layout()
    plt.savefig(namestr + "_period_pdf.pdf", format="pdf")

    # plot folded light curve

    fig, ax = plt.subplots(1, 1, figsize=(6,4))

    if true_period:
        # true_period is in hours; dividing by 24 matches the units of
        # exp(log period) used in the other branch.
        fold_period = true_period/24
        # NOTE(review): a fixed 0.01 error is used here, while the other
        # branch uses flux_err -- confirm which one is intended.
        fold_err = 0.01
    else:
        # No true period given: fold on the posterior median period.
        fold_period = np.exp(np.median(new_samples[:,-1]))
        fold_err = flux_err

    ax = plot_folded_lightcurve(time, flux, fold_period, flux_err=fold_err, 
                      models=[t_pred, m_all[:2]], 
                      true_lightcurve=true_lightcurve, ax=ax, use_radians=False)

    plt.tight_layout()
    plt.savefig(namestr + "_folded_lc.pdf", format="pdf")
