In [124]:
# imports
import sys
import numpy as np
import matplotlib.pyplot as plt
import emcee 
import pandas as pd
from time import time

## Task c: Reproduce Table III

We want to sample our parameter space for our features $\{a\}$, for different values of $k$ and $k_{max}$, using both our uniform and naturaleness priors as defined in the basic tasks. However, the main difference is that we need to marginalize out the extra parameters for higher values of $k_{max}$. 

In [139]:
# Define a modular model that generates features up to k_max: however we 
# then marginalize all larger k. 
def modular_model(a, x, kmax):
    '''
    Returns model of order kmax. a is a vector containing the model features.
    Note the kmax+1 in the for loop. This is due to range being from 0 to
    kmax-1.
    '''
    model = 0
    for k in range(kmax+1):
        model += a[k]*x**k
    return model

$\chi^2/\text{dof}$ means the value of $\chi^2$ per degree of freedom. 

Copy most of the code from the basic part.

In [140]:
# Load dataset
def load_data(file):
    d = {
        "x": [],
        "d": [],
        "sigma": []
    }
    # Skip first two rows, which are the header:
    with open(file) as f:
        for idx,line in enumerate(f):
            if idx < 3:
                pass
            else:
                val = line.split()
                d["x"].append(np.float(val[0]))
                d["d"].append(np.float(val[1]))
                d["sigma"].append(np.float(val[2]))
    # cast to numpy arrays
    d["x"] = np.array(d["x"])
    d["d"] = np.array(d["d"])
    d["sigma"] = np.array(d["sigma"])
    return d


file = 'D1_c_5.dat'
data = load_data(file)
display(data)

{'x': array([0.03183, 0.06366, 0.09549, 0.12732, 0.15915, 0.19099, 0.22282,
        0.25465, 0.28648, 0.31831]),
 'd': array([0.31694, 0.33844, 0.42142, 0.57709, 0.56218, 0.68851, 0.73625,
        0.8727 , 1.0015 , 1.0684 ]),
 'sigma': array([0.01585 , 0.01692 , 0.02107 , 0.02885 , 0.02811 , 0.03443 ,
        0.03681 , 0.04364 , 0.050075, 0.05342 ])}

In [141]:
def log_uniform_prior(a):
    '''
    Uniform prior, returns a log(1) if the values in a are in abs(a)<100. Note that this
    prior is not normalized. We take care of this later.
    '''
    if np.all(np.abs(a)<=100):
        return 0  # log(1)
    else: 
        return -np.inf  # log(0)
    

def log_naturaleness_prior(a, bar_a=5):
    '''Naturaleness prior implemented according to equation 24 with bar(a)=5. This ensures'''
    return -len(a)*np.log(np.sqrt(2*np.pi)*bar_a) - 1/2*(a.dot(a)/bar_a**2)

# Tests 
print(0 == log_uniform_prior([-1,2,50]))
print(-np.inf == log_uniform_prior([-1,2,500]))
print(-7.8651 == np.round(np.log((1/(np.sqrt(2*np.pi)*5))**3 * np.exp(-(1+4+9)/(2*5**2))),4) == np.round(log_naturaleness_prior(np.array([1,2,3]), 5),4))  # Calculate expression exact and log

True
True
True


In [142]:
def chi_squared(a, d, x, sigmas, kmax):
    '''
    Returns the chi squared measure for the datapoints d and x. The standard deviation is 
    assumed to be constant for all datapoints.
    '''
    chi_vec = (d-modular_model(a, x, kmax))/sigmas
    return np.sum(chi_vec**2)


def log_likelihood(a, d, x, sigmas, kmax):
    '''
    Returns log likelihood based on a Gaussian with di as the center values and 
    a standard deviation of sigma. a is the feature vector for our model.
    '''
    chi_sq = chi_squared(a, d, x, sigmas, kmax)
    like = -np.sum(np.log(np.sqrt(2*np.pi)*sigmas)) - 1/2*chi_sq
    return like


# Tests
a = np.array([1,2,3,4])
x = np.array([1,2])
d = np.array([2,4])
sigmas = np.array([2,4])
kmax = len(a)-1  # Sanity check that the model still works in the basic task case
print(f'Model: {[10, 49] == modular_model(a,x, kmax)}')
print(f'chi_squared: {142.5625==chi_squared(a,d,x,sigmas, kmax)}')
exact_likelihood = np.prod(1/(np.sqrt(2*np.pi)*sigmas)) * np.exp(-chi_squared(a,d,x,sigmas, kmax)/2)
print(f'log likelihood: {np.round(np.log(exact_likelihood),4) == np.round(log_likelihood(a, d, x, sigmas, kmax),4)}')  # exact calculation, and then log

Model: [ True  True]
chi_squared: True
log likelihood: True


#### Ok our new model passes this sanity check. All of our tests from the basic problem returns ok with the new modular model.

In [143]:
def log_post_uniform(a,d,x,sigma, kmax):
    return log_likelihood(a,d,x,sigma,kmax) + log_uniform_prior(a)


def log_post_natural(a,d,x,sigma, kmax, bar_a):
    return log_likelihood(a,d,x,sigma,kmax) + log_naturaleness_prior(a, bar_a=bar_a)

In [144]:
# Sampling

# Define constants and data
bar_a = 5
x = data["x"]
d = data["d"]
sigmas = data["sigma"]
kmaxs = [i for i in range(0,6+1)]  # 0,1,2,3,4,5,6
samples_dict = {}  # Define dict to save samples in
for kmax in kmaxs:
    samples_dict[kmax] = {}
    k = min(kmax,2)  # k+1 is the numbers of relevant features, so our model is at most of degree 2.
    samples_dict[kmax]["k"] = k
    print(f'kmax: {kmax}, k: {k}')
    
    # Define dimensions and walkers
    ndim = kmax+1  # 0,1,...,kmax
    nwalkers = ndim*2
    # Initial guess
    p0 = np.random.rand(ndim * nwalkers).reshape((nwalkers, ndim))


    nburn = 200  # nbr of burning steps
    nsamples = 1000  # nbr of final samples 

    # additional arguments to our sampler: d, x, sigma and d,x,sigma, bar_a respectively
    arglist_uniform = (d, x, sigma, kmax)
    arglist_natural = (d, x, sigma, kmax, bar_a)

    # Define samplers
    sampler_uniform = emcee.EnsembleSampler(nwalkers, ndim, log_post_uniform, args=arglist_uniform)
    sampler_natural = emcee.EnsembleSampler(nwalkers, ndim, log_post_natural, args=arglist_natural)
    # Start sampler on posteriors. Use first few hundred iterations as burn in. 
    t0 = time()  # start time
    sampler_uniform.run_mcmc(p0, nburn + nsamples)
    sampler_natural.run_mcmc(p0, nburn + nsamples)
    t1 = time()  # end time
    print(f'Sampling time: {t1-t0} seconds.')
    print()
    
    samples_uniform = sampler_uniform.chain[:,nburn:,:].reshape((-1,ndim))  # reshape to all samples per dim
    samples_natural = sampler_natural.chain[:,nburn:,:].reshape((-1,ndim)) 
    
    # Save samples
    samples_dict[kmax]["Uniform"] = samples_uniform
    samples_dict[kmax]["Natural"] = samples_natural

kmax: 0, k: 0
Sampling time: 1.0418190956115723 seconds.

kmax: 1, k: 1
Sampling time: 1.6863429546356201 seconds.

kmax: 2, k: 2
Sampling time: 2.0812902450561523 seconds.

kmax: 3, k: 2
Sampling time: 2.80195951461792 seconds.

kmax: 4, k: 2
Sampling time: 3.3598268032073975 seconds.

kmax: 5, k: 2
Sampling time: 4.231670379638672 seconds.

kmax: 6, k: 2
Sampling time: 4.933030128479004 seconds.



Now, marginalize out all features higher than order k=2, and compute the mean for our feature. We just do this by ignoring the other data corresponding to the higher features, since we are just interested in the number of times that we "land" on our relevant features anyway.

We show central values and 68% confidence interval: mean +- 1 std for the Gaussian posterior.

In [153]:
def calculate_feature_estimates(samples, prior_name, k, kmax):
    a_mean = np.zeros((k+1))
    a_std = np.zeros((k+1))
    for i in range(k+1):
        a_mean[i] = samples[:,i].mean()
        a_std[i] =  samples[:,i].std()
    if prior_name == 'Uniform':
        # TODO should I send in k here? Our model only consists of features up to k, since we marginalize the rest.
        chi_dof = chi_squared(a_mean, d, x, sigmas, k)/(k+1)  # k features in our model and k dof, they have been fitted at kmax features
        measure = chi_dof
    else:
        # TODO same as above
        evidence = np.exp(log_likelihood(a_mean, d, x, sigmas, k))
        measure = evidence
    return measure, a_mean, a_std


data_dict = {
    "Uniform": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    },
    "Natural": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    }
}
for kmax in samples_dict:
    k = samples_dict[kmax]["k"]
    for prior in data_dict:
        samples = samples_dict[kmax][prior]
        measure, a_mean, a_std = calculate_feature_estimates(samples, prior, k, kmax)
        data_dict[prior]["k"].append(k)
        data_dict[prior]["kmax"].append(kmax)
        data_dict[prior]["measure"].append(str(np.round(measure,2)))
        for idx, a_i in enumerate(a_mean):
            data_dict[prior]["a"+str(idx)][kmax] = str(np.round(a_i,2)) + ' +- ' + str(np.round(a_std[idx],2))
        
uniform_dataframe = pd.DataFrame(data_dict["Uniform"])
natural_dataframe = pd.DataFrame(data_dict["Natural"])
print('********** Uniform **********')
display(uniform_dataframe)
print('********** Natural **********')
display(natural_dataframe)

********** Uniform **********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,599.01,0.48 +- 0.01,,
1,1,1,8.94,0.2 +- 0.01,2.54 +- 0.11,
2,2,2,3.84,0.25 +- 0.02,1.64 +- 0.4,3.16 +- 1.36
3,2,3,28.02,0.27 +- 0.04,0.96 +- 1.05,7.82 +- 7.84
4,2,4,65.62,0.27 +- 0.04,0.85 +- 1.19,9.68 +- 10.6
5,2,5,66.95,0.26 +- 0.04,1.01 +- 1.14,9.23 +- 10.36
6,2,6,57.97,0.26 +- 0.04,0.96 +- 1.28,9.12 +- 11.84


********** Natural **********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,0.0,0.48 +- 0.01,,
1,1,1,19795322.13,0.2 +- 0.01,2.56 +- 0.1,
2,2,2,476234875.16,0.25 +- 0.02,1.63 +- 0.36,3.12 +- 1.2
3,2,3,376749718.23,0.24 +- 0.02,1.71 +- 0.45,2.7 +- 2.32
4,2,4,286532602.67,0.25 +- 0.02,1.69 +- 0.44,2.63 +- 2.23
5,2,5,398130613.54,0.24 +- 0.02,1.71 +- 0.44,2.72 +- 2.29
6,2,6,397150771.41,0.25 +- 0.02,1.71 +- 0.46,2.73 +- 2.26


## Task 2

In [181]:
# Generate new data using polynomial
def exact_data(x):
    '''Returns exact data using the given function'''
    if (np.abs(x) <= 1/np.pi).all() and (x>0).all():
        g = (1/2+np.tan(np.pi/2*x))**2
        return g
    else:
        return None

# Tests
x_f = np.array([0.1, 0.4])
x_ok = np.array([0.1, 0.2])
g_f = exact_data(x_f)
g_ok = np.round(exact_data(x_ok),4)
print(f'Return none if invalid x: {None == g_f}')
print(f'Return values if ok x: {[0.4335, 0.6805] == g_ok}')

Return none if invalid x: True
Return values if ok x: [ True  True]


In [269]:
# Add higher noise to data
np.random.seed(1)  # Fix seed to get same random variables every time from 
def add_noise(g, c):
    random_vector = np.random.normal(loc=0, scale=1, size=len(g))
    return g*(1+c*random_vector)

print(g)
print(add_noise(g, 0.05))

[0.26595604 0.30130017 0.33897427 0.37909135 0.4217866  0.4672192
 0.51557464 0.56706759 0.62194543 0.68049251]
[0.28755626 0.29208405 0.33002243 0.35875369 0.44003747 0.41345305
 0.56055367 0.5454848  0.63186667 0.67200777]
