# Emcee GP Study

An emcee sampler notebook, testing various things about the emcee package.

In [80]:
%matplotlib notebook 
#restart the kernel if switching from inline to notebook
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
import corner

import numpy as np
import scipy.stats
import pandas as pd

import emcee
import george

import gp_sandbox as cgp
from subsample import subsample

In [263]:
def plot_sample(time, flux, tsample, fsample, asteroid):
    """
    Draw the full light curve together with the sub-sampled points.

    The error bars come from the notebook-level ``flux_err`` and the title
    uses the notebook-level ``days``; both must exist before calling.
    The legend entry of the full curve shows the reference log period of
    the asteroid (0 for an asteroid number that is not in the table).

    Returns
    -------
    ax : matplotlib axes
        The axes the light curve was drawn on.
    """
    # (asteroid number, reference period, reference log period)
    known_targets = (
        (3200, 3.603957, -1.896021),     # 3200 Phaeton
        (1291, 5.58410, -1.45813055),    # 1291 Phryge
        (221, 10.443, -0.8321219),       # 221 Eos
        (1388, 11.9432, -0.69789175),    # 1388 Aphrodite
    )

    true_log_p = 0
    for number, _period, log_p in known_targets:
        if asteroid == number:
            true_log_p = log_p
            break

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))

    original_label = "Original : " + str(round(true_log_p, 5))
    ax.plot(time, flux, '-', alpha=0.5, label=original_label)

    ax.set_title("%i nights, %i data points" % (days, len(fsample)))
    ax.set_xlabel("Days (JD)")

    sample_label = "Sample : " + str(len(tsample))
    ax.errorbar(tsample, fsample, yerr=flux_err, fmt="o", markersize=5,
                color="black", zorder=10, label=sample_label)
    ax.legend()
    return ax
    
    
def prior(params):

    """
    Calculate the log of the prior, given parameter values.

    Every parameter has an independent uniform prior; the result is the
    sum of the four log densities.

    Parameters
    ----------
    params : list
        List of all kernel parameters

    param[0] : float
        mean (between 0 and 20)

    param[1] : float
        log amplitude (between -10 and 20)

    param[2] : float
        gamma (uniform in log gamma, gamma between 0.1 and 100)

    param[3] : float
        log period (between log(1/24) and 0, i.e. a period of 1h to 24h
        when the period is expressed in days)

    Returns
    -------
    sum_log_prior : float
        sum of all log priors (-inf if a parameter is out of range or nan)

    """

    gamma = params[2]

    # log(gamma) is undefined for gamma <= 0 (and for nan): that is simply
    # "outside the prior", so answer directly instead of letting np.log
    # emit a RuntimeWarning on every rejected proposal.
    if not gamma > 0:
        return -np.inf

    # scipy's uniform is parameterised as [loc, loc + scale]. Calling
    # logpdf on the distribution itself avoids freezing four new
    # distribution objects on every likelihood evaluation.
    p_mean = scipy.stats.uniform.logpdf(params[0], loc=0, scale=20)
    p_log_amp = scipy.stats.uniform.logpdf(params[1], loc=-10, scale=30)
    p_log_gamma = scipy.stats.uniform.logpdf(np.log(gamma), loc=np.log(0.1),
                                             scale=np.log(10)*3)
    p_period = scipy.stats.uniform.logpdf(params[3], loc=np.log(1./24),
                                          scale=-np.log(1./24))

    sum_log_prior = p_mean + p_log_amp + p_log_gamma + p_period

    # a nan parameter propagates to a nan sum; treat it as out of range
    if np.isnan(sum_log_prior):
        return -np.inf

    return sum_log_prior


def post_lnlikelihood(params):
    """
    Log posterior (log prior + GP log likelihood) for one parameter vector.

    Uses the notebook-level ``gp``, ``tsample``, ``fsample`` and
    ``flux_err``; the parameter vector is written into ``gp`` as a side
    effect.

    Parameters
    ----------
    params : list
        List of all kernel parameters

    Returns
    -------
    ln_likelihood : float
        -inf when the prior rejects the parameters, otherwise the log
        posterior; a non-finite log posterior is replaced by -1e25.
    """
    ln_prior = prior(params)

    # outside the prior support: no need to evaluate the GP at all
    if np.isneginf(ln_prior):
        return -np.inf

    # evaluate the GP likelihood of the sampled fluxes for these parameters
    gp.set_parameter_vector(params)
    gp.compute(tsample, flux_err)
    ln_posterior = gp.lnlikelihood(fsample) + ln_prior

    if not np.isfinite(ln_posterior):
        return -1e25
    return ln_posterior



def walker_params(params, cov_scale=1, plot=True):

    """
    Sets up the initial parameters for all the walkers using optimized
    parameter values as starting values. The function generates a
    scattered multivariate gaussian distribution of starting parameter
    values.

    Relies on the notebook-level ``nwalkers``, ``tsample``, ``fsample``,
    ``flux_err`` and (when plotting) ``time``.

    Parameters
    ----------
    params : list
        List of all kernel parameters, ordered as
        [mean, log_amp, gamma, log_period].

    cov_scale : float
        Determines the scatter of the multivariate distribution.

    plot : bool
        If true, draw the GP fit for the starting parameters with
        ``plot_gpfit``.

    Returns
    -------
    p0 : numpy.ndarray
        The initial walker parameters [nwalker, ndim]

    gp : george.gp.GP
        GP kernel set with the optimized parameter values.

    """

    mean, log_amp, gamma, log_period = params
    amp = np.exp(log_amp)

    print('amp : ' + str(amp))
    kernel = amp * george.kernels.ExpSine2Kernel(gamma = gamma, log_period = log_period)
    gp = george.GP(kernel, fit_mean=True, mean=mean)
    # The GP must be factorised on the observation times (the independent
    # coordinate), not on the flux values: predictions are later requested
    # on a time grid, and post_lnlikelihood also computes on tsample.
    gp.compute(tsample, flux_err)

    # diagonal covariance whose entries are |param| / 100
    cov_matrix = np.abs(np.diag(np.asarray(params) / 100.))
    print(params)
    print(cov_matrix)

    p0 = np.random.multivariate_normal(mean=params, cov=cov_scale*cov_matrix, size=nwalkers)

    if plot:
        plot_gpfit(time, tsample, fsample, flux_err, gp)

    return p0, gp

def plot_gpfit(time, tsample, fsample, flux_err, gp):

    """
    Plot a gp fit given a gp class and x, y, and yerr data to fit onto.

    The prediction is drawn on the notebook-level axes ``ax`` (the one
    returned by ``plot_sample``), so that axes must already exist.

    Parameters
    ----------
    time : array-like
        Time stamps whose first and last value delimit the prediction
        grid; the grid has ``len(time)`` evenly spaced points.

    tsample, flux_err : array-like
        Accepted for call compatibility; not used here because ``gp`` is
        expected to have been computed on them already.

    fsample : array-like
        Observed flux values the prediction is conditioned on.

    gp : george.gp.GP
        A GP on which ``compute`` has been called.

    """

    # positional indexing ([0], [-1]) must also work when `time` is a
    # pandas Series, where [-1] would be a label lookup
    time = np.asarray(time)
    t_possible = np.linspace(time[0], time[-1], len(time))

    pred, pred_var = gp.predict(fsample, t_possible, return_var=True)
    pred_std = np.sqrt(pred_var)

    # one-sigma band around the predictive mean
    ax.fill_between(t_possible, pred - pred_std, pred + pred_std,
                color="red", alpha=0.4)
    # the legend shows the last entry of the GP parameter vector
    ax.plot(t_possible, pred, "red", lw=1.5, alpha=0.7, label = "GP Fit : " + str(round(gp.parameter_vector[-1], 5)))
    ax.legend()


First we have to extract the time stamps and flux from any data files we might have. For simulated data, we have to sample it at a cadence that would be similar to real observations, meaning we have 3 types of flux and time data.
1. __data.time & data.flux__ : all the time and flux data from the simulated data. This might span multiple days-months and has a data point for every 30 seconds using DAMIT generated data

2. __time & flux__ : the time and flux data for a smaller range of dates than data.time and data.flux. This is essentially the observing window we are working with, and time is what we will be projecting our gp fits onto

3. __tsample & fsample__ : the time and flux data sampled from the time and flux data. The sampling of this is more realistic (every 10 minutes instead of every 0.5 minutes)

In [264]:
asteroid = 1388
txt = '../data/{}_lc_49627_to_49787.txt'.format(asteroid)

data = pd.read_csv(
    txt,
    delimiter=' ',
    header=None,
    names=['time', 'flux'],
    dtype={'time': float, 'flux': float},
)

# length of the observing window and offset from the start of the file, in days
days, delay = 5, 2

# convert days to points: one point every 30 seconds is 2880 points per day
points_per_day = 2880
span = points_per_day * days
start_pt = points_per_day * delay
window = slice(start_pt, start_pt + span)

time = np.array(data.time[window])
flux = np.array(data.flux[window])

# constant uncertainty: a tenth of the flux scatter inside the window
flux_err = np.full_like(flux, np.std(flux) / 10.0)

# draw 100 random points from the window; flux_err now refers to the sample
tsample, fsample, flux_err = subsample(time, flux, flux_err=flux_err,
                                       npoints=100, kind="random")

ax = plot_sample(time, flux, tsample, fsample, asteroid)

<IPython.core.display.Javascript object>

Let's see what a Lomb-Scargle periodogram predicts the period should be based on our data so far.

In [223]:
from lombscargle import make_lsp
from astropy.stats import LombScargle

freq, power = make_lsp(tsample, fsample, flux_err, p_max=5.0)

# the strongest periodogram peak is the best-guess frequency
best_freq = freq[np.argmax(power)]
best_log_period = np.log(1./best_freq)

fig, (bx, cx, dx) = plt.subplots(1, 3, figsize=(9, 2.5))
fig.set_tight_layout('tight')

# one row per panel: axes, x values, x label, position of the peak, legend prefix
panels = [
    (bx, freq, 'Frequency', best_freq, 'Best freq : '),
    (cx, (1./freq), 'Period', 1./best_freq, 'Best period : '),
    (dx, np.log(1./freq), 'Log Period', best_log_period, 'Best log period : '),
]
for panel, xvals, xlabel, peak_x, prefix in panels:
    panel.plot(xvals, power)
    panel.set_xlabel(xlabel)
    panel.vlines(peak_x, 0, 1, colors='orange', linestyles='--',
                 label=prefix + str(round(peak_x, 5)))

bx.set_ylabel('Power')
cx.set_xlim([0, 1])
dx.set_xlim([-3.3, 0])
for panel in (bx, cx, dx):
    panel.legend()

# overlay the Lomb-Scargle model at the best frequency on the light-curve axes
y_fit = LombScargle(tsample, fsample, flux_err).model(time, best_freq)
ls_label = "Lomb-Scargle fit : " + str(round(best_log_period, 5))
ax.plot(time, y_fit, label=ls_label)
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7fe31873de80>

In [194]:
# Try two thirds of the best Lomb-Scargle frequency. The factor is written
# with float literals so it does not collapse to 1 under integer division.
new_freq = best_freq * (2. / 3.)
new_period = 1. / new_freq
new_log_period = np.log(new_period)

# overlay the model at the new frequency on the light-curve axes
y_fit = LombScargle(tsample, fsample, flux_err).model(time, new_freq)
ax.plot(time, y_fit, '--', label = "Lomb-Scargle fit : " + str(round(new_log_period, 5)))
ax.legend()

# mark the new frequency / period / log period on the periodogram panels
bx.vlines(new_freq, 0, 1, linestyles='--', 
          label = 'New fit : ' + str(round(new_freq, 5)))
bx.legend()

cx.vlines(new_period, 0, 1, linestyles='--', 
          label = 'New period : ' + str(round(new_period, 5)))
cx.legend()

# the label must report the log period that the line is drawn at
dx.vlines(new_log_period, 0, 1, linestyles='--', 
          label = 'New log period : ' + str(round(new_log_period, 5)))
dx.legend()

<matplotlib.legend.Legend at 0x7fe320ccb6a0>

How many walkers do we want? So far there are 4 parameters/dimensions we want to study: mean, log_amp, gamma, and log_period.

In [265]:
# number of sampled parameters: mean, log_amp, gamma and log_period
ndim = 4
# number of walkers in the ensemble
nwalkers = 100

In [267]:
# initialize walker parameters
# amplitude guess: log of the peak-to-peak range of the full light curve
flux_range = flux.max() - flux.min()
best_log_amp = np.log(flux_range)

# starting point ordered as [mean, log_amp, gamma, log_period]; the log
# period is hard-coded here (new_log_period is the periodogram alternative)
params = [np.mean(fsample), best_log_amp, 4, -0.696]
p0, gp = walker_params(params=params, cov_scale=1, plot=True)

amp : 0.408181
[1.0142145916638241, -0.89604457550055916, 4, -0.696]
[[ 0.01014215  0.          0.          0.        ]
 [ 0.          0.00896045  0.          0.        ]
 [ 0.          0.          0.04        0.        ]
 [ 0.          0.          0.          0.00696   ]]


In [268]:
# Ensemble sampler with nwalkers walkers in ndim dimensions;
# post_lnlikelihood is the log-probability evaluated at each walker position.
sampler = emcee.EnsembleSampler(nwalkers, ndim, post_lnlikelihood, threads=1)

### Note

If your sampler fails within the first couple of seconds, try making the cov_scale smaller.

In [269]:
%%time
# advance the walkers 100 steps, starting from the positions in p0
mcmc_sampling = sampler.run_mcmc(p0, 100)



CPU times: user 50.3 s, sys: 144 ms, total: 50.4 s
Wall time: 50.2 s


In [274]:
def plot_steps(sampler, dims = None, p0=None, data_pts=None, from_saved=False):
    """
    Plot the trace of every walker for each sampled parameter.

    Parameters
    ----------
    sampler : emcee.EnsembleSampler or numpy.ndarray
        A sampler that has been run or, with ``from_saved=True``, a saved
        chain array indexed as [walker, step, parameter].

    dims : list of str, optional
        Axis label for each parameter. Defaults to "param 0", "param 1", ...

    p0 : numpy.ndarray, optional
        Initial walker positions; the first row is printed (live sampler
        only).

    data_pts : int, optional
        Number of data points, printed for reference (live sampler only).

    from_saved : bool
        Set to True when ``sampler`` is a chain array read back from disk.
        In that case the last parameter is exponentiated before plotting
        and its legend shows both the mean of the exponentiated values and
        the mean of the raw values. A live sampler is plotted as sampled.
    """
    if from_saved:
        # a saved chain is a plain array: its shape carries all the sizes
        chain = np.asarray(sampler)
        last_transform, label_sep = np.exp, ' \n'
    else:
        chain = sampler.chain
        last_transform, label_sep = (lambda values: values), '\n'

        header = 'Data points: ' + str(data_pts)
        if p0 is not None:
            header = str(p0[0]) + '\n' + header
        print(header)
        print("Mean acceptance fraction: {0:.3f}".format(np.mean(sampler.acceptance_fraction)))

    n_params = chain.shape[2]
    if dims is None:
        dims = ['param {}'.format(i) for i in range(n_params)]

    # two panels per row; never fewer than the 2x2 grid
    n_rows = max(2, (n_params + 1) // 2)
    fig, grid = plt.subplots(n_rows, 2, squeeze=False)
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    _plot_walker_traces(grid.ravel(), chain, dims, last_transform, label_sep)


def _plot_walker_traces(axs, chain, dims, last_transform, label_sep):
    """Draw one line per walker plus the overall mean for each parameter of ``chain``."""
    steps = np.arange(chain.shape[1])

    for i in range(chain.shape[2]):
        panel = axs[i]
        panel.set_xlabel('Step Number')
        panel.set_ylabel('{}'.format(dims[i]))

        values = chain[:, :, i]
        is_last = i == len(dims) - 1
        if is_last:
            pre_mean = values.mean()
            values = last_transform(values)

        for walker_trace in values:
            panel.plot(steps, walker_trace, 'k-', alpha=0.3)

        mean_value = values.mean()
        if is_last:
            label = str(round(mean_value, 5)) + label_sep + str(round(pre_mean, 5))
        else:
            label = round(mean_value, 5)
        panel.axhline(mean_value, linestyle='--', label=label)
        panel.legend(loc=1)
            

In [275]:
# Trace plots for the sampler that was just run. The last panel is labelled
# 'period', but for a live sampler the values are plotted as sampled.
plot_steps(sampler, dims = ['mean', 'log_amp', 'gamma', 'period'], p0=p0, data_pts=len(fsample))

<IPython.core.display.Javascript object>

[ 0.84847743 -0.90842392  3.99629461 -0.58144492]
Data points: 100
Mean acceptance fraction: 0.349


In [None]:
def save_chain(data, file_name):
    """
    Write a 3-dimensional chain array to ``<file_name>.txt``.

    The first line is a comment holding ``data.shape``; every slice along
    the first axis is then written with ``np.savetxt`` and followed by a
    ``# New chain`` comment line.

    Parameters
    ----------
    data : numpy.ndarray
        Array to save; each ``data[i]`` must be something ``np.savetxt``
        can write (1-D or 2-D).

    file_name : str
        Output path without the ``.txt`` extension.
    """
    # open() replaces the Python-2-only file() builtin
    with open(file_name + '.txt', 'w') as outfile:
        # I'm writing a header here just for the sake of readability
        # Any line starting with "#" will be ignored by numpy.loadtxt
        outfile.write("# " + str(data.shape) + '\n')

        # Iterating through a ndimensional array produces slices along
        # the first axis. This is equivalent to data[i,:,:] in this case
        for data_slice in data:
            np.savetxt(outfile, data_slice)

            # Writing out a break to indicate different slices...
            outfile.write('# New chain\n')

def read_chain(file_name, shape=None):
    """
    Read a chain array back from ``<file_name>.txt``.

    Parameters
    ----------
    file_name : str
        Input path without the ``.txt`` extension.

    shape : tuple of int, optional
        Shape to give the returned array. When omitted, the shape is taken
        from the first line of the file, which is expected to be a comment
        of the form ``# (n0, n1, n2)``.

    Returns
    -------
    new_data : numpy.ndarray
        The file contents reshaped to ``shape``.

    Raises
    ------
    ValueError
        If ``shape`` is omitted and the first line holds no shape tuple.
    """
    path = file_name + '.txt'
    if shape is None:
        shape = _chain_shape_from_header(path)

    # lines starting with '#' are comments and are skipped by genfromtxt
    new_data = np.genfromtxt(path, delimiter=' ')
    return new_data.reshape(shape)


def _chain_shape_from_header(path):
    """Parse the ``# (n0, n1, ...)`` comment on the first line of ``path``."""
    import ast

    with open(path) as infile:
        header = infile.readline()

    try:
        shape = ast.literal_eval(header.lstrip('#').strip())
    except (ValueError, SyntaxError):
        shape = None

    if not (isinstance(shape, tuple) and all(isinstance(n, int) for n in shape)):
        raise ValueError("no shape header found on the first line of " + path
                         + "; pass `shape` explicitly")
    return shape


Now we can save our data for later.

In [None]:
# write the sampler's chain array to '2001SC170.txt' (save_chain appends the extension)
save_chain(sampler.chain, '2001SC170')

Or read it out if we want to compare. If you want to plot the saved data, make sure to indicate it when plotting by setting __from_saved__ to True.

In [None]:
# load 'hello.txt' and reshape it to (100, 100, 4); plot_steps indexes a
# saved chain as [walker, step, parameter]
test = read_chain('hello', (100,100,4))

In [None]:
# plot_hist comes from the gp_sandbox module (imported as cgp), not shown in
# this notebook; presumably it draws histograms of the samples — TODO confirm
cgp.plot_hist(sampler)

## Evil plotting happening down here.

In [276]:
from plotting import plot_lightcurve, plot_folded_lightcurve


In [280]:
# Reference rotation period of the target being analysed; keep exactly one
# line active. plot_mcmc_sampling_results plots it on a "Period in hours" axis.
#true_period = 3.603957 # 3200 Phaeton
#true_period = 5.58410 # 1291 Phryge
#true_period = 10.443 # 221 Eos
true_period = 11.9432 # 1388 Aphrodite

# NOTE: plot_mcmc_sampling_results is defined in the cell below, which has to
# be run before this one.
plot_mcmc_sampling_results(tsample, fsample, flux_err, gp, sampler, namestr='test', true_lightcurve = [time, flux], true_period=true_period)

<IPython.core.display.Javascript object>

  samples = np.random.multivariate_normal(mean, matrix, N)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [279]:
def plot_mcmc_sampling_results(time, flux, flux_err, gp, sampler,
                          t_pred=None, true_lightcurve=None,
                          true_period=None, namestr="test", 
                          nmodels=10, npred=1000):
    """
    Produce the summary plots for a finished MCMC run and save them as PDFs.

    Four figures are written: ``<namestr>_corner.pdf`` (corner plot of the
    flattened chain), ``<namestr>_lc.pdf`` (data with GP draws),
    ``<namestr>_period_pdf.pdf`` (histogram of the sampled periods) and
    ``<namestr>_folded_lc.pdf`` (folded light curve).

    Parameters
    ----------
    time, flux : array-like
        Observation times and fluxes the GP is conditioned on.

    flux_err : float or array-like
        Flux uncertainties, used for the folded light curve when no
        ``true_period`` is given.

    gp : george.gp.GP
        GP that has been computed on ``time``; its parameter vector is
        overwritten with sampled values.

    sampler : emcee.EnsembleSampler
        Sampler that has been run; ``sampler.flatchain`` is plotted. The
        last column is treated as the log period, with the period in days.

    t_pred : numpy.ndarray, optional
        Times at which GP draws are evaluated. Defaults to ``npred`` evenly
        spaced points between the first and last entry of ``time``.

    true_lightcurve : optional
        Passed through to ``plot_lightcurve`` / ``plot_folded_lightcurve``.

    true_period : float, optional
        Reference period in hours. Marked on the period histogram and used
        to fold the light curve; without it the light curve is folded at
        the posterior median period.

    namestr : str
        Prefix of the output file names.

    nmodels : int
        Number of posterior samples for which a GP draw is generated.

    npred : int
        Number of points in the default prediction grid.
    """
    # positional indexing below must also work for pandas Series input
    time = np.asarray(time)
    flux = np.asarray(flux)

    # all walkers and steps flattened into one (nsamples, ndim) array
    new_samples = sampler.flatchain

    # make a corner plot and save it to file
    corner.corner(new_samples, labels=gp.get_parameter_names())
    plt.savefig(namestr + "_corner.pdf", format="pdf")

    # pick random posterior samples (with replacement) for the example models
    nsamples = new_samples.shape[0]
    idx = np.random.choice(nsamples, size=nmodels)

    # if the array for the predictions isn't given, make one
    if t_pred is None:
        t_pred = np.linspace(time[0], time[-1], npred)

    # for each selected sample, draw one realisation of the GP conditional
    # on the data points
    m_all = np.zeros((nmodels, t_pred.shape[0]))
    for i, j in enumerate(idx):
        gp.set_parameter_vector(new_samples[j])
        m_all[i] = gp.sample_conditional(flux, t_pred)

    # light curve with the example models
    fig, ax = plt.subplots(1, 1, figsize=(6,4))
    plot_lightcurve(time, flux, true_lightcurve=true_lightcurve, 
                        models=(t_pred, m_all), ax=ax)

    plt.tight_layout()
    plt.savefig(namestr + "_lc.pdf", format="pdf")

    # histogram of periods, converted from log(days) to hours
    fig, ax = plt.subplots(1, 1, figsize=(5,4))
    # `density` is the replacement for the removed `normed` keyword
    ax.hist(np.exp(new_samples[:,-1])*24, bins=100, density=True, 
                label="posterior PDF", color="black", alpha=0.5)

    if true_period is not None:
        ylim = ax.get_ylim()
        ax.vlines(true_period, 0, ylim[-1], lw=1, color="red", linestyle="dashed", label="true period : " + str(true_period))

    ax.set_xlabel("Period in hours")
    ax.set_ylabel("Probability")
    ax.legend()

    plt.tight_layout()
    plt.savefig(namestr + "_period_pdf.pdf", format="pdf")

    # folded light curve
    fig, ax = plt.subplots(1, 1, figsize=(6,4))

    if true_period:
        # hours -> days, the unit of `time`
        fold_period = true_period/24
        # NOTE(review): a fixed 0.01 error bar is used here rather than
        # flux_err — confirm this is intentional
        fold_err = 0.01
    else:
        # no reference period: fold at the posterior median period (days)
        fold_period = np.exp(np.median(new_samples[:,-1]))
        fold_err = flux_err

    ax = plot_folded_lightcurve(time, flux, fold_period, flux_err=fold_err, 
                      models=[t_pred, m_all[:2]], 
                      true_lightcurve=true_lightcurve, ax=ax, use_radians=False)

    plt.tight_layout()
    plt.savefig(namestr + "_folded_lc.pdf", format="pdf")


# Misc stuff down here


## Skip the following cell if you are working with simulated data.

Here's an alternative way of doing things if you're working with real data measurements. Since we're working with real data, we are essentially given __tsample__ and __fsample__. So we don't need to sample any points but this also means we don't know what the real light curve looks like. We can still generate a __time__ since it just needs to span from the first observation of __tsample__ to the last.

In [None]:
txt = '../data/asteroid_csv/2018LF05.csv'
data = pd.read_csv(txt)

# with real measurements the observations themselves are the samples
tsample, fsample, flux_err = data.jd, data.magpsf, data.sigmapsf
data_pts = len(tsample)

# bare expression: nothing is displayed mid-cell
tsample.iloc[-1]

# evenly spaced grid of 1000 points from the first to the last observation
first_obs, last_obs = tsample.iloc[0], tsample.iloc[-1]
time = pd.Series(np.linspace(first_obs, last_obs, 1000))

fig, ax = plt.subplots(1, 1, figsize=(8, 4))
sample_label = "Sample : " + str(len(tsample))
ax.errorbar(tsample, fsample, yerr=flux_err, fmt='k.', label=sample_label)
ax.legend()