In [1]:
# imports
import sys
import numpy as np
import matplotlib.pyplot as plt
import emcee 
import pandas as pd
from time import time
from tqdm import tqdm_notebook

## Task c: Reproduce Table III

We want to sample our parameter space for our features $\{a\}$, for different values of $k$ and $k_{max}$, using both our uniform and naturaleness priors as defined in the basic tasks. However, the main difference is that we need to marginalize out the extra parameters for higher values of $k_{max}$. 

In [2]:
# Define a modular model that generates features up to k_max: however we 
# then marginalize all larger k. 
def modular_model(a, x, kmax):
    '''
    Returns model of order kmax. a is a vector containing the model features.
    Note the kmax+1 in the for loop. This is due to range being from 0 to
    kmax-1.
    '''
    model = 0
    for k in range(kmax+1):
        model += a[k]*x**k
    return model

$\chi^2/\text{dof}$ means the value of $\chi^2$ per degree of freedom. 

Copy most of the code from the basic part.

In [3]:
# Load dataset
def load_data(file):
    d = {
        "x": [],
        "d": [],
        "sigma": []
    }
    # Skip first two rows, which are the header:
    with open(file) as f:
        for idx,line in enumerate(f):
            if idx < 3:
                pass
            else:
                val = line.split()
                d["x"].append(np.float(val[0]))
                d["d"].append(np.float(val[1]))
                d["sigma"].append(np.float(val[2]))
    # cast to numpy arrays
    d["x"] = np.array(d["x"])
    d["d"] = np.array(d["d"])
    d["sigma"] = np.array(d["sigma"])
    return d


file = 'D1_c_5.dat'
data = load_data(file)
display(data)

{'x': array([0.03183, 0.06366, 0.09549, 0.12732, 0.15915, 0.19099, 0.22282,
        0.25465, 0.28648, 0.31831]),
 'd': array([0.31694, 0.33844, 0.42142, 0.57709, 0.56218, 0.68851, 0.73625,
        0.8727 , 1.0015 , 1.0684 ]),
 'sigma': array([0.01585 , 0.01692 , 0.02107 , 0.02885 , 0.02811 , 0.03443 ,
        0.03681 , 0.04364 , 0.050075, 0.05342 ])}

In [4]:
def log_uniform_prior(a):
    '''
    Uniform prior, returns a log(1) if the values in a are in abs(a)<100. Note that this
    prior is not normalized. We take care of this later.
    '''
    if np.all(np.abs(a)<=100):
        return 0  # log(1)
    else: 
        return -np.inf  # log(0)
    

def log_naturaleness_prior(a, bar_a=5):
    '''Naturaleness prior implemented according to equation 24 with bar(a)=5. This ensures'''
    return -len(a)*np.log(np.sqrt(2*np.pi)*bar_a) - 1/2*(a.dot(a)/bar_a**2)

# Tests 
print(0 == log_uniform_prior([-1,2,50]))
print(-np.inf == log_uniform_prior([-1,2,500]))
print(-7.8651 == np.round(np.log((1/(np.sqrt(2*np.pi)*5))**3 * np.exp(-(1+4+9)/(2*5**2))),4) == np.round(log_naturaleness_prior(np.array([1,2,3]), 5),4))  # Calculate expression exact and log

True
True
True


In [29]:
print(sigmas)

[0.01585  0.01692  0.02107  0.02885  0.02811  0.03443  0.03681  0.04364
 0.050075 0.05342 ]


In [5]:
def chi_squared(a, d, x, sigmas, kmax):
    '''
    Returns the chi squared measure for the datapoints d and x. The standard deviation is 
    assumed to be constant for all datapoints.
    '''
    chi_vec = (d-modular_model(a, x, kmax))/sigmas
    return np.sum(chi_vec**2)


def log_likelihood(a, d, x, sigmas, kmax):
    '''
    Returns log likelihood based on a Gaussian with di as the center values and 
    a standard deviation of sigma. a is the feature vector for our model.
    '''
    chi_sq = chi_squared(a, d, x, sigmas, kmax)
    like = -np.sum(np.log(np.sqrt(2*np.pi)*sigmas)) - 1/2*chi_sq
    return like


# Tests
a = np.array([1,2,3,4])
x = np.array([1,2])
d = np.array([2,4])
sigmas = np.array([2,4])
kmax = len(a)-1  # Sanity check that the model still works in the basic task case
print(f'Model kmax={kmax}: {[10, 49] == modular_model(a,x, kmax)}')
print(f'chi_squared: {142.5625==chi_squared(a,d,x,sigmas, kmax)}')
exact_likelihood = np.prod(1/(np.sqrt(2*np.pi)*sigmas)) * np.exp(-chi_squared(a,d,x,sigmas, kmax)/2)
print(f'log likelihood: {np.round(np.log(exact_likelihood),4) == np.round(log_likelihood(a, d, x, sigmas, kmax),4)}')  # exact calculation, and then log
# Also test in kmax = 6 case
a = np.array([1,2,3,4,5,6,7])
kmax = 6
print(f'Model kmax={kmax}: {[28, 769] == modular_model(a, x, kmax)}')

Model kmax=3: [ True  True]
chi_squared: True
log likelihood: True
Model kmax=6: [ True  True]


#### Ok our new model passes this sanity check. All of our tests from the basic problem returns ok with the new modular model.

In [6]:
def log_post_uniform(a,d,x,sigma, kmax):
    return log_likelihood(a,d,x,sigma,kmax) + log_uniform_prior(a)


def log_post_natural(a,d,x,sigma, kmax, bar_a):
    return log_likelihood(a,d,x,sigma,kmax) + log_naturaleness_prior(a, bar_a=bar_a)

In [7]:
# Sampling

# Define constants and data
bar_a = 5
x = data["x"]
d = data["d"]
sigmas = data["sigma"]
kmaxs = [i for i in range(0,6+1)]  # 0,1,2,3,4,5,6
samples_dict = {}  # Define dict to save samples in
for kmax in tqdm_notebook(kmaxs):
    samples_dict[kmax] = {}
    k = min(kmax,2)  # k+1 is the numbers of relevant features, so our model is at most of degree 2.
    samples_dict[kmax]["k"] = k
    print(f'kmax: {kmax}, k: {k}')
    
    # Define dimensions and walkers
    ndim = kmax+1  # 0,1,...,kmax
    nwalkers = ndim*2
    # Initial guess
    p0 = np.random.rand(ndim * nwalkers).reshape((nwalkers, ndim))


    nburn = 200  # nbr of burning steps
    nsamples = 2000  # nbr of final samples 

    # additional arguments to our sampler: d, x, sigma and d,x,sigma, bar_a respectively
    arglist_uniform = (d, x, sigmas, kmax)
    arglist_natural = (d, x, sigmas, kmax, bar_a)

    # Define samplers
    sampler_uniform = emcee.EnsembleSampler(nwalkers, ndim, log_post_uniform, args=arglist_uniform)
    sampler_natural = emcee.EnsembleSampler(nwalkers, ndim, log_post_natural, args=arglist_natural)
    # Start sampler on posteriors. Use first few hundred iterations as burn in. 
    t0 = time()  # start time
    sampler_uniform.run_mcmc(p0, nburn + nsamples)
    sampler_natural.run_mcmc(p0, nburn + nsamples)
    t1 = time()  # end time
    print(f'Sampling time: {t1-t0} seconds.')
    print()
    
    samples_uniform = sampler_uniform.chain[:,nburn:,:].reshape((-1,ndim))  # reshape to all samples per dim
    samples_natural = sampler_natural.chain[:,nburn:,:].reshape((-1,ndim)) 
    
    # Save samples
    samples_dict[kmax]["Uniform"] = samples_uniform
    samples_dict[kmax]["Natural"] = samples_natural

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

kmax: 0, k: 0
Sampling time: 1.4924135208129883 seconds.

kmax: 1, k: 1
Sampling time: 2.207188844680786 seconds.

kmax: 2, k: 2
Sampling time: 3.223501205444336 seconds.

kmax: 3, k: 2
Sampling time: 3.160998582839966 seconds.

kmax: 4, k: 2
Sampling time: 3.8262534141540527 seconds.

kmax: 5, k: 2
Sampling time: 4.686177015304565 seconds.

kmax: 6, k: 2
Sampling time: 5.6132001876831055 seconds.




Now, marginalize out all features higher than order k=2, and compute the mean for our feature. We just do this by ignoring the other data corresponding to the higher features, since we are just interested in the number of times that we "land" on our relevant features anyway.

We show central values and 68% confidence interval: mean +- 1 std for the Gaussian posterior.

In [24]:
measure, a_mean, a_std = calculate_feature_estimates(samples_natural, sampler_natural, 'Natural', k=2, kmax=2)
print(measure)

8499.187533759798


In [33]:
def calculate_feature_estimates(samples, prior_name, k, kmax):
    a_mean = np.zeros((k+1))
    a_std = np.zeros((k+1))
    for i in range(k+1):
        a_mean[i] = samples[:,i].mean()
        a_std[i] =  samples[:,i].std()
    if prior_name == 'Uniform':
        # TODO should I send in k here? Our model only consists of features up to k, since we marginalize the rest.
        chi_dof = chi_squared(a_mean, d, x, sigmas, k)/(len(x)-1)  # len(x) - 1, since we compute a mean in chi^2
        measure = chi_dof
    else:
        # Integrate over all a to get evidence
        log_posterior = np.zeros(samples.shape[0])
        for i, sample in enumerate(samples):
            log_posterior[i] = log_post_natural(sample, d, x, sigmas, k, bar_a)
            print(log_posterior[i])
            break
        posterior = np.exp(log_posterior)
        print(posterior)
        evidence = np.sum(posterior) # np.sum(np.exp(log_posterior for every sample))
        measure = evidence
    return measure, a_mean, a_std


data_dict = {
    "Uniform": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    },
    "Natural": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    }
}
for kmax in samples_dict:
    k = samples_dict[kmax]["k"]
    for prior in data_dict:
        samples = samples_dict[kmax][prior]
        measure, a_mean, a_std = calculate_feature_estimates(samples, prior, k, kmax)
        data_dict[prior]["k"].append(k)
        data_dict[prior]["kmax"].append(kmax)
        data_dict[prior]["measure"].append(str(np.round(measure,2)))
        for idx, a_i in enumerate(a_mean):
            data_dict[prior]["a"+str(idx)][kmax] = str(np.round(a_i,2)) + ' +- ' + str(np.round(a_std[idx],2))
        
uniform_dataframe = pd.DataFrame(data_dict["Uniform"])
natural_dataframe = pd.DataFrame(data_dict["Natural"])
print('********** Uniform **********')
display(uniform_dataframe)
print('********** Natural **********')
display(natural_dataframe)

-276.9841164464923
[5.09714106e-121 1.00000000e+000 1.00000000e+000 ... 1.00000000e+000
 1.00000000e+000 1.00000000e+000]
-92.57625671139823
[6.23221683e-41 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
 1.00000000e+00 1.00000000e+00]
11.07209466602908
[6.43501579e+04 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
 1.00000000e+00 1.00000000e+00]
8.264522500494317
[3.88361811e+03 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
 1.00000000e+00 1.00000000e+00]
0.3035391203047908
[1.35464458 1.         1.         ... 1.         1.         1.        ]
-28.242653902429463
[5.42464417e-13 1.00000000e+00 1.00000000e+00 ... 1.00000000e+00
 1.00000000e+00 1.00000000e+00]
1.4029015687846709
[4.0669835 1.        1.        ... 1.        1.        1.       ]
********** Uniform **********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,66.56,0.48 +- 0.01,,
1,1,1,1.99,0.2 +- 0.02,2.55 +- 0.13,
2,2,2,1.28,0.25 +- 0.03,1.64 +- 0.46,3.08 +- 1.51
3,2,3,9.76,0.27 +- 0.04,1.01 +- 1.1,7.72 +- 8.05
4,2,4,18.62,0.27 +- 0.04,0.91 +- 1.23,9.17 +- 11.03
5,2,5,31.39,0.27 +- 0.04,0.74 +- 1.25,10.92 +- 11.55
6,2,6,32.33,0.27 +- 0.04,0.69 +- 1.26,11.14 +- 11.23


********** Natural **********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,3999.0,0.48 +- 0.01,,
1,1,1,7999.0,0.22 +- 0.05,2.42 +- 0.35,
2,2,2,76349.16,0.25 +- 0.02,1.64 +- 0.39,3.15 +- 1.28
3,2,3,19882.62,0.25 +- 0.02,1.61 +- 0.43,3.25 +- 2.18
4,2,4,20000.35,0.25 +- 0.03,1.65 +- 0.46,2.94 +- 2.33
5,2,5,23999.0,0.25 +- 0.02,1.65 +- 0.43,2.98 +- 2.29
6,2,6,28003.07,0.25 +- 0.02,1.63 +- 0.48,3.04 +- 2.46
