In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats
import GP_class
import george
import numpy.random as rnd
import corner

import sys
import time as tm
import multiprocessing as mp

### NOTE TO SELF ###

# When testing the speed of the prior sampling (numpy vs scipy), numpy took ~32 ms for 10**6 samples 
# while scipy took 386 microseconds (so negligible) to set up the distribution and 23 ms to create the 10**6 samples.
# For 10**8 samples, numpy took 3.03 seconds while scipy took 2.35 seconds. So I'll be sticking with scipy for now.

# numpy code
#%timeit J_log_period = np.random.normal(size=100000000)*0.5 + np.log(4/24.) 

# scipy code
#nsamples = 100000000
#%timeit prior_log_period = scipy.stats.norm(np.log(4./24.), (12./24.))
#%timeit J_log_period_2= prior_log_period.rvs(nsamples)

In [11]:
def read_data(filename, whitespace=False, datadir="./"):
    """
    Read in light curve data from asteroid.

    Parameters
    ----------
    filename : str
        Name (or relative path) of a headerless text file with at least
        three columns.
    whitespace : bool, optional
        If True, columns are separated by runs of whitespace; otherwise the
        file is parsed as comma-separated.
    datadir : str, optional
        Directory that `filename` is resolved against.  A trailing path
        separator is optional.

    Returns
    -------
    tsample, fsample, flux_err : pandas.Series
        Columns 0, 1 and 2 of the file, in that order.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    # os.path.join only inserts a separator when one is missing, so both
    # datadir="./" and datadir="data" resolve to the intended file.
    path = os.path.join(datadir, filename)

    # `sep` replaces the deprecated `delim_whitespace` keyword; r"\s+" is the
    # documented equivalent of delim_whitespace=True and "," is the default.
    separator = r"\s+" if whitespace else ","
    data = pd.read_csv(path, header=None, sep=separator)

    tsample = data[0]
    fsample = data[1]
    flux_err = data[2]

    return tsample, fsample, flux_err

# Path of the light-curve file analysed in this notebook (comma-separated by default).
lightcurve_file = "../data/simulation_results_new/3200/3200_lc_49627_to_49787.txt_sampled_talc_14days.txt"
time, flux, flux_err = read_data(lightcurve_file)

In [13]:
# 1 : set up the prior distribution
# Every prior is a frozen normal distribution, written as (loc=mean, scale=std).
prior_mean = scipy.stats.norm(loc=1, scale=0.5)
prior_log_amp = scipy.stats.norm(loc=np.log(0.15), scale=np.log(2))
prior_log_gamma = scipy.stats.norm(loc=np.log(10), scale=np.log(2))
prior_log_period = scipy.stats.norm(loc=np.log(4./24.), scale=(12./24.))

In [22]:
# Number of draws taken from each prior.
nsamples = 10**7

# Use one explicitly seeded generator for all prior draws so that the samples
# (and every result derived from them) are reproducible between runs.
PRIOR_SEED = 42
prior_rng = np.random.RandomState(PRIOR_SEED)

# One array of length `nsamples` per parameter; index j across the four
# arrays forms the j-th joint prior sample.
J_mean = prior_mean.rvs(nsamples, random_state=prior_rng)
J_log_amp = prior_log_amp.rvs(nsamples, random_state=prior_rng)
J_log_gamma = prior_log_gamma.rvs(nsamples, random_state=prior_rng)
J_log_period = prior_log_period.rvs(nsamples, random_state=prior_rng)

In [18]:
# Placeholder hyper-parameters: unit amplitude (np.exp(0)) times a periodic
# ExpSine2 kernel. The actual values are overwritten later through
# gp.set_parameter_vector().
periodic_part = george.kernels.ExpSine2Kernel(gamma=1, log_period=0)
kernel = np.exp(0) * periodic_part

# fit_mean=True exposes the mean as a parameter of the GP.
gp = george.GP(kernel, mean=1, fit_mean=True)

# Pre-compute the GP for the observation times and their uncertainties.
gp.compute(time, flux_err)

In [25]:
class DataManager:
    """Bundle a light curve with a george GP so likelihoods can be evaluated
    through a single-argument method (e.g. as the target of ``Pool.map``)."""

    def __init__(self, time, flux, flux_err):
        """Build the GP and pre-compute it for the given observations.

        `time` and `flux_err` are handed to ``GP.compute``; `flux` is stored
        and used as the data vector in `calculate_likelihood`.
        """
        periodic_part = george.kernels.ExpSine2Kernel(gamma = 1, log_period = 0)

        self.flux = flux
        self.gp = george.GP(np.exp(0) * periodic_part, fit_mean=True, mean=1)
        self.gp.compute(time, flux_err)

    def calculate_likelihood(self, params):
        """Return the GP log-likelihood of the stored flux for `params`.

        `params` is passed unchanged to ``GP.set_parameter_vector``.  If the
        likelihood evaluation raises ``numpy.linalg.LinAlgError`` a very
        low sentinel value (-1e25) is returned instead of propagating.
        """
        self.gp.set_parameter_vector(params)

        try:
            return self.gp.log_likelihood(self.flux)
        except np.linalg.LinAlgError:
            # Treat a failed factorisation as an (effectively) impossible sample.
            return -1e25

In [None]:
#%%prun
# 2: for each j, calculate the log likelihood

# One log-likelihood per prior sample; every entry is overwritten in the loop.
L_results = np.ones(nsamples)

start_time = tm.time()

for i in range(nsamples):

    # Progress report at whole percentages.  Integer arithmetic is used on
    # purpose: the float test ((i+1)/nsamples*100).is_integer() misses
    # percentages whose float product is not exactly integral.
    done = i + 1
    if (done * 100) % nsamples == 0:
        sys.stdout.write('\r' + str(done * 100 // nsamples) + "%")
        sys.stdout.flush()

    # gamma is sampled in log space but passed to the GP in linear space.
    params = [J_mean[i], J_log_amp[i], np.exp(J_log_gamma[i]), J_log_period[i]]

    gp.set_parameter_vector(params)

    try:
        lnlike = gp.log_likelihood(flux)
    except np.linalg.LinAlgError:
        # Treat a failed factorisation as an (effectively) impossible sample.
        lnlike = -1e25

    L_results[i] = lnlike

end_time = tm.time()

print("\ntotal time taken for this loop: %.2f seconds" %(end_time - start_time))


In [24]:
import multiprocessing as mp

# Number of CPUs reported by the system; shown as the cell's output.
mp.cpu_count()

8

In [26]:
# Parallelizing using Pool.map()
manager = DataManager(time, flux, flux_err)

start_time = tm.time()

# One parameter tuple per prior draw.  gamma is sampled in log space but
# handed to the GP in linear space, hence the np.exp().
param_sets = zip(J_mean, J_log_amp, np.exp(J_log_gamma), J_log_period)

pool = mp.Pool(7)
try:
    # Pool.map preserves input order, so L_results[j] belongs to prior sample j.
    L_results = np.array(pool.map(manager.calculate_likelihood, param_sets))
finally:
    # Release the worker processes even if a task raised, and wait for them
    # to exit so no workers are left behind in the kernel.
    pool.close()
    pool.join()

end_time = tm.time()
print("\ntotal time taken for this loop: %.2f seconds" %(end_time - start_time))


total time taken for this loop: 1399.32 seconds


In [36]:
# 3 : Pick a random number r out of a uniform distribution between 0 and Lmax
uu = rnd.uniform(size=len(L_results))

# NOTE(review): `uu` is drawn but never compared with the likelihoods, so the
# selection below is a fixed relative-likelihood cut rather than rejection
# sampling.  Confirm which of the two is intended before interpreting the
# selected samples as posterior draws.
REL_LIKELIHOOD_CUT = 0.00000001

# Likelihood of every sample relative to the best one (values in (0, 1]).
# ndarray.max() is used instead of the builtin max(), which would iterate
# over the whole array element by element in Python.
rel_likelihood = np.exp(L_results - L_results.max())

good_samples_bool = REL_LIKELIHOOD_CUT < rel_likelihood
good_samples_idx, = np.where(good_samples_bool)

len(good_samples_idx)

134

In [None]:
fig, ax = plt.subplots(2,2, figsize=(12,12))

# Index of the highest-likelihood sample.  np.argmax always yields exactly one
# index, so the best-fit values below are plain scalars that can safely be
# formatted with "%f" (indexing with np.where(...) gives arrays and breaks
# when several samples tie for the maximum).
best_idx = int(np.argmax(L_results))
lnlike_lo, lnlike_hi = L_results.min(), L_results.max()

L_max_period = float(np.exp(J_log_period[best_idx])*24.)
L_max_mean = float(J_mean[best_idx])
L_max_amp = float(np.exp(J_log_amp[best_idx]))
L_max_gamma = float(np.exp(J_log_gamma[best_idx]))


def _plot_panel(axis, values, best_value, title):
    """Scatter log-likelihood against `values` and mark the best-fit value."""
    axis.scatter(values, L_results)
    axis.vlines(best_value, lnlike_lo, lnlike_hi, label="Lmax: %f" %best_value, linestyles='dashed', alpha=0.7)
    axis.legend()
    axis.set_title(title)


_plot_panel(ax[0,0], np.exp(J_log_period)*24., L_max_period, "Period (hours)")
_plot_panel(ax[0,1], J_mean, L_max_mean, "Mean")
# The amplitude is scattered in linear space so that it shares its x-axis
# with the best-fit line, which is drawn at exp(log_amp).
_plot_panel(ax[1,0], np.exp(J_log_amp), L_max_amp, "Amplitude")
_plot_panel(ax[1,1], np.exp(J_log_gamma), L_max_gamma, "Gamma")

Text(0.5,1,'Gamma')

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/home/christina/anaconda3/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/christina/anaconda3/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/christina/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/christina/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/christina/anaconda3/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/christina/anaconda3/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_hist

In [None]:
# Assemble one row per selected sample and one column per plotted quantity
# (period scaled by 24, gamma, amplitude, mean), then draw the corner plot.
keep = good_samples_idx
columns = [
    np.exp(J_log_period[keep])*24,
    np.exp(J_log_gamma[keep]),
    np.exp(J_log_amp[keep]),
    J_mean[keep],
]
data = np.array(columns).T

figure = corner.corner(data, labels=["period", 'gamma', 'amp', 'mean'])
plt.savefig("posterior_corner.pdf", format="pdf")

In [1]:
# Period (scaled by 24, i.e. the "hours" axis used in the plots) of the
# sample(s) with the highest log-likelihood.  ndarray.max() replaces the
# builtin max(), which would walk the whole array element by element in Python.
L_max_period = np.exp(J_log_period[np.where(L_results==L_results.max())])*24.
L_max_period

NameError: name 'np' is not defined

In [None]:
# Index of the highest-likelihood sample; np.argmax returns a single index, so
# the best-fit period is a scalar that "%f" can format even when values tie.
best_idx = int(np.argmax(L_results))

# Likelihood relative to the best sample, plotted against period * 24.
plt.scatter(np.exp(J_log_period)*24., np.exp(L_results-L_results[best_idx]))

# plot l_max
L_max_period = float(np.exp(J_log_period[best_idx])*24.)
plt.vlines(L_max_period, 0, 1, label="Lmax: %f" %L_max_period, linestyles='dashed', alpha=0.7)
plt.legend()

In [None]:
# Likelihood relative to the best sample, plotted against the sampled mean.
plt.scatter(J_mean, np.exp(L_results-L_results.max()))

# plot l_max
# np.argmax returns a single index, so the best-fit mean is a scalar.
L_max_mean = float(J_mean[int(np.argmax(L_results))])
# The label reports the best-fit *mean* (it previously formatted L_max_period).
plt.vlines(L_max_mean, 0, 1, label="Lmax: %f" %L_max_mean, linestyles='dashed', alpha=0.7)
plt.legend()

In [None]:
# Number of entries in good_samples_idx (the indices kept by the selection step).
good_samples_idx.size

In [None]:
import plotting

In [None]:
# Fold the light curve on the period of the highest-likelihood sample(s).
# The period is passed as exp(log_period), without the *24 scaling used in the plots.
best_fit_mask = L_results==L_results.max()
best_log_period = J_log_period[np.where(best_fit_mask)]
plotting.plot_folded_lightcurve(time, flux, period = np.exp(best_log_period))

In [None]:
import multiprocessing as mp

# Report how many CPUs the system exposes.
n_processors = mp.cpu_count()
print("Number of processors: ", n_processors)

In [None]:
import numpy as np
from time import time

# Prepare data
# Draw from an explicitly seeded generator.  Merely constructing
# np.random.RandomState(100) and discarding it seeds nothing, so the array
# would differ on every run.
rng = np.random.RandomState(100)
arr = rng.randint(0, 10, size=[200000, 5])

# Plain nested Python lists: 200000 rows of 5 integers in [0, 10).
data = arr.tolist()
data[:5]

In [None]:
# Solution Without Parallelization

def howmany_within_range(row, minimum, maximum):
    """Count the entries of `row` that satisfy `minimum <= entry <= maximum`.

    Both bounds are inclusive.  Returns 0 for an empty `row`.
    """
    return sum(1 for value in row if minimum <= value <= maximum)

# Serial baseline: evaluate every row in turn, in order.
results = [howmany_within_range(row, minimum=4, maximum=8) for row in data]

print(results[:10])
#> [3, 1, 4, 4, 4, 2, 1, 1, 3, 3]

In [None]:
# Parallelizing using Pool.apply()

import multiprocessing as mp

# Step 1: Init multiprocessing.Pool()
pool = mp.Pool(mp.cpu_count())

try:
    # Step 2: `pool.apply` the `howmany_within_range()`
    # Note: Pool.apply blocks until each result is ready, so the rows are
    # still processed one after another.
    results = [pool.apply(howmany_within_range, args=(row, 4, 8)) for row in data]
finally:
    # Step 3: Don't forget to close -- even if a task raised -- and wait for
    # the workers to exit so none are left behind in the kernel.
    pool.close()
    pool.join()

print(results[:10])
#> [3, 1, 4, 4, 4, 2, 1, 1, 3, 3]

In [None]:
# Parallelizing using Pool.map()
import multiprocessing as mp

# Redefine, with only 1 mandatory argument.
def howmany_within_range_rowonly(row, minimum=4, maximum=8):
    """Count the entries of `row` lying in the inclusive range [minimum, maximum].

    The bounds default to 4 and 8 so the function can be called with the row
    alone (as ``Pool.map`` does).
    """
    in_range = [value for value in row if minimum <= value <= maximum]
    return len(in_range)

pool = mp.Pool(mp.cpu_count())

try:
    # `data` is already a list of rows, so it is passed to map() directly
    # instead of being copied element by element first.  Pool.map keeps
    # the input order in its result.
    results = pool.map(howmany_within_range_rowonly, data)
finally:
    # Release the workers even if a task raised, and wait for them to exit.
    pool.close()
    pool.join()

print(results[:10])
#> [3, 1, 4, 4, 4, 2, 1, 1, 3, 3]