# Description of notebook

This notebook includes the code for the solution of the extra task specific to reproducing table III. For the analysis where the effects of varying the data uncertainty and quantity, both this notebook and the one for the main task were used. The variations of data were made with the notebook *Project 1 - data generator*.

In [4]:
# Standard libraries
import sys
from time import time
import pickle

# Third party libraries
import emcee
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
from tqdm import tqdm_notebook

In [5]:
# This parameter is just for us to control the names of saved figures. Please disregard it. :)
task_type = 'basic'

In [3]:
# This cell is for the redone analysis in the extra part of the project. Here, we just control what data is being fed to
# the functions and scripts in this notebook. 
data = pickle.load(open(f'{task_type}_data.pkl',"rb" ))  # Unpickles generated and pickled data

FileNotFoundError: [Errno 2] No such file or directory: 'basic_data.pkl'

## Reproduce Table III

We want to sample our parameter space for our features $\{a\}$, for different values of $k$ and $k_{max}$, using both our uniform and naturaleness priors as defined in the basic tasks. However, the main difference is that we need to marginalize out the extra parameters for higher values of $k_{max}$. 

Copy most of the code from the basic part.

In [6]:
def load_data(file):
    '''Load data given 'file' into a dictionary'''
    d = {
        "x": [],
        "d": [],
        "sigma": []
    }
    # Skip first two rows, which are the header:
    with open(file) as f:
        for idx,line in enumerate(f):
            if idx < 3:
                pass
            else:
                val = line.split()
                d["x"].append(np.float(val[0]))
                d["d"].append(np.float(val[1]))
                d["sigma"].append(np.float(val[2]))
    # Cast to numpy arrays
    d["x"] = np.array(d["x"])
    d["d"] = np.array(d["d"])
    d["sigma"] = np.array(d["sigma"])
    return d


def modular_model(a, x, kmax):
    '''
    Returns model of order kmax. a is a vector containing the model features.
    Note the kmax+1 in the for loop. This is due to range being from 0 to
    kmax-1.
    '''
    model = 0
    for k in range(kmax+1):
        model += a[k]*x**k
    return model


def log_uniform_prior(a):
    '''
    Flat, infinite uniform prior for a. This is used in this part of 
    the assignment to highlight the overfitting for higher model orders.
    '''
    return 0
    

def log_naturalness_prior(a, bar_a=5):
    '''Naturaleness prior implemented according to equation 24 with bar(a)=5. This ensures'''
    return -len(a)*np.log(np.sqrt(2*np.pi)*bar_a) - 1/2*(a.dot(a)/bar_a**2)


def chi_squared(a, d, x, sigmas, kmax):
    '''
    Returns the chi squared measure for the datapoints d and x. The standard deviation is 
    assumed to be constant for all datapoints.
    '''
    chi_vec = (d-modular_model(a, x, kmax))/sigmas
    return np.sum(chi_vec**2)


def log_likelihood(a, d, x, sigmas, kmax):
    '''
    Returns log likelihood based on a Gaussian with di as the center values and 
    a standard deviation of sigma. a is the feature vector for our model.
    '''
    chi_sq = chi_squared(a, d, x, sigmas, kmax)
    like = -np.sum(np.log(np.sqrt(2*np.pi)*sigmas)) - 1/2*chi_sq
    return like


def log_post_uniform(a,d,x,sigma, kmax):
    '''The log posterior corresponding to the uniform prior'''
    return log_likelihood(a,d,x,sigma,kmax) + log_uniform_prior(a)


def log_post_natural(a,d,x,sigma, kmax, bar_a):
    '''The log posterior corresponding to the naturalness prior'''
    return log_likelihood(a,d,x,sigma,kmax) + log_naturalness_prior(a, bar_a=bar_a)

In [7]:
# Unit test suite 1
test1 = log_uniform_prior([-1,2,50]) == 0
test2 = log_uniform_prior([-1,2,500]) == -np.inf
test3 = -7.8651 == np.round(np.log((1/(np.sqrt(2*np.pi)*5))**3 * \
        np.exp(-(1+4+9)/(2*5**2))),4) == np.round(log_naturalness_prior(np.array([1,2,3]), 5),4) # Calculate expression exact and log
                          
print(f'Passed test 1: {test1}')
print(f'Passed test 2: {test2}')
print(f'Passed test 3: {test3}')

# Unit test suite 2
a = np.array([1,2,3,4])
x = np.array([1,2])
d = np.array([2,4])
sigmas = np.array([2,4])
kmax = len(a)-1  # Sanity check that the model still works in the basic task case
a = np.array([1,2,3,4])
x = np.array([1,2])
d = np.array([2,4])
sigmas = np.array([2,4])
test4 = [10, 49] == modular_model(a,x,kmax)
test5 = 142.5625==chi_squared(a,d,x,sigmas,kmax)
exact_likelihood = np.prod(1/(np.sqrt(2*np.pi)*sigmas)) * np.exp(-chi_squared(a,d,x,sigmas,kmax)/2)
test_likelihood = np.round(log_likelihood(np.array([1,2,3,4]), np.array([2,4]), np.array([1,2]), np.array([2,4]), kmax),4)
test6 = np.round(np.log(exact_likelihood),4) == np.round(test_likelihood, 4) # exact calculation, and then log
a = np.array([1,2,3,4,5,6,7])
kmax = 6
test7 = ([28, 769] == modular_model(a, x, kmax))
print(f'Passed test 4: {test4}')
print(f'Passed test 5: {test5}')
print(f'Passed test 6: {test6}')
print(f'Passed test 7: {test7}')

Passed test 1: True
Passed test 2: False
Passed test 3: True
Passed test 4: [ True  True]
Passed test 5: True
Passed test 6: True
Passed test 7: [ True  True]


In [8]:
# Load data
file = 'D1_c_5.dat'
data = load_data(file)

# Define constants and data
bar_a = 5
x = data["x"]
d = data["d"]
sigmas = data["sigma"]
kmaxs = [i for i in range(0,6+1)]  # 0,1,2,3,4,5,6
samples_dict = {}  # Define dict to save samples in

for kmax in tqdm_notebook(kmaxs):
    samples_dict[kmax] = {}
    k = min(kmax,2)  # k+1 is the numbers of relevant features, so our model is at most of degree 2.
    samples_dict[kmax]["k"] = k
    print(f'kmax: {kmax}, k: {k}')
    
    # Define dimensions and walkers
    ndim = kmax+1  # 0,1,...,kmax
    nwalkers = ndim*2
    # Initial guess
    p0 = np.random.rand(ndim * nwalkers).reshape((nwalkers, ndim))


    nburn = 200  # number of burn-in steps
    nsamples = 2000  # number of final samples 

    # Additional arguments to our sampler: d, x, sigma and d,x,sigma, bar_a respectively
    arglist_uniform = (d, x, sigmas, kmax)
    arglist_natural = (d, x, sigmas, kmax, bar_a)

    # Define samplers
    sampler_uniform = emcee.EnsembleSampler(nwalkers, ndim, log_post_uniform, args=arglist_uniform)
    sampler_natural = emcee.EnsembleSampler(nwalkers, ndim, log_post_natural, args=arglist_natural)
    # Start sampler on posteriors. Use first few hundred iterations as burn-in. 
    t0 = time()  # Start time
    sampler_uniform.run_mcmc(p0, nburn + nsamples)
    sampler_natural.run_mcmc(p0, nburn + nsamples)
    t1 = time()  # End time
    print(f'Sampling time: {t1-t0} seconds.')
    print()
    
    # Extract the samples and ignore burn in. Reshape to a single matrix
    samples_uniform = sampler_uniform.chain[:,nburn:,:].reshape((-1,ndim))  # reshape to all samples per dim
    samples_natural = sampler_natural.chain[:,nburn:,:].reshape((-1,ndim)) 
    
    # Save samples
    samples_dict[kmax]["Uniform"] = samples_uniform
    samples_dict[kmax]["Natural"] = samples_natural

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

kmax: 0, k: 0
Sampling time: 1.711735486984253 seconds.

kmax: 1, k: 1
Sampling time: 2.6064202785491943 seconds.

kmax: 2, k: 2
Sampling time: 3.276283025741577 seconds.

kmax: 3, k: 2
Sampling time: 4.181361198425293 seconds.

kmax: 4, k: 2
Sampling time: 5.250135898590088 seconds.

kmax: 5, k: 2
Sampling time: 6.414647340774536 seconds.

kmax: 6, k: 2
Sampling time: 7.775234222412109 seconds.




Now, marginalize out all features higher than order $k=2$. We just do this by ignoring the other data corresponding to the higher order model parameters, since we are just interested in the number of times that we "land" on our relevant parameters anyway.

Note that we show central values and 68% confidence interval: mean +- 1 std in the table (same as in the report). This is due to the posterior being Gaussian in both casses.

In [9]:
def calculate_feature_estimates(samples, prior_name, k, kmax):
    a_mean = np.zeros((kmax+1))
    a_std = np.zeros((kmax+1))
    for i in range(kmax+1):
        a_mean[i] = samples[:,i].mean()
        a_std[i] =  samples[:,i].std()
    if prior_name == 'Uniform':
        # Calculate chi^2/dof for each "model", i.e. sets of {a_i}. Then take the mean. 
        chi_dof_arr = np.zeros((samples.shape[0],1))
        for i in tqdm_notebook(range(samples.shape[0])):
            # Iterate over all versions of the model 
            model = samples[i,:]
            chi_dof_i = chi_squared(model, d, x, sigmas, kmax)/(len(x)-(k+1))  # len(x) - (k+1), since we compute (k+1) parameters in our model.
            chi_dof_arr[i] = chi_dof_i
        measure = chi_dof_arr.mean()
    else:
        # Integrate over all a to get evidence.
        # Do this using Laplace's method, as described in the lecture notes (Learning from Data: Model selection).
        cov = np.cov(samples.T)
        if kmax == 0:
            # Cast cov to a (1,1)-matrix for numpy.linalg.det to function.
            cov = np.array(cov).reshape(1,1)
        # Use the approximate expression for the evidence from the lecture notes (Learning from Data: Model selection).
        # This approximate form is valid if the posterior is a Gaussian, which is the case here since our likelihood 
        # is a Gaussian and our priors are either Gaussian or uniform.
        evidence = np.exp(log_post_natural(a_mean, d, x, sigmas, kmax, bar_a)) * np.sqrt((2*np.pi)**(kmax+1)/np.linalg.det(np.linalg.inv(cov)))
        measure = evidence
    return measure, a_mean[0:k+1], a_std[0:k+1]


# Define dict to save table data in 
data_dict = {
    "Uniform": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    },
    "Natural": {
        "k": [],
        "kmax": [],
        "measure": [],
        "a0": ['' for i in range(kmaxs[-1]+1)],
        "a1": ['' for i in range(kmaxs[-1]+1)],
        "a2": ['' for i in range(kmaxs[-1]+1)]
    }
}

# Generate table data, and populate the tables
for kmax in samples_dict:
    k = samples_dict[kmax]["k"]
    for prior in data_dict:
        samples = samples_dict[kmax][prior]
        measure, a_mean, a_std = calculate_feature_estimates(samples, prior, k, kmax)
        data_dict[prior]["k"].append(k)
        data_dict[prior]["kmax"].append(kmax)
        data_dict[prior]["measure"].append(str(np.round(measure,2)))
        for idx, a_i in enumerate(a_mean):
            data_dict[prior]["a"+str(idx)][kmax] = str(np.round(a_i,2)) + ' +- ' + str(np.round(a_std[idx],2))
        
uniform_dataframe = pd.DataFrame(data_dict["Uniform"])
natural_dataframe = pd.DataFrame(data_dict["Natural"])

# Show the tables
print('******** Uniform ********')
display(uniform_dataframe)
print('******** Natural ********')
display(natural_dataframe)

HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=12000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=16000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=24000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=28000), HTML(value='')))


******** Uniform ********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,66.68,0.48 +- 0.01,,
1,1,1,3.32,0.21 +- 0.02,2.5 +- 0.25,
2,2,2,2.09,0.25 +- 0.02,1.6 +- 0.41,3.23 +- 1.3
3,2,3,2.16,0.27 +- 0.04,1.09 +- 1.05,7.2 +- 7.77
4,2,4,2.13,0.32 +- 0.07,-1.25 +- 2.7,36.75 +- 32.36
5,2,5,1.75,0.52 +- 0.15,-12.31 +- 7.91,231.73 +- 135.55
6,2,6,1.86,0.54 +- 0.26,-13.58 +- 15.64,256.71 +- 334.33


******** Natural ********


Unnamed: 0,k,kmax,measure,a0,a1,a2
0,0,0,0.0,0.48 +- 0.01,,
1,1,1,692.12,0.21 +- 0.03,2.37 +- 0.39,
2,2,2,2983.5,0.25 +- 0.02,1.63 +- 0.37,3.13 +- 1.24
3,2,3,2800.18,0.25 +- 0.02,1.6 +- 0.44,3.2 +- 2.27
4,2,4,2851.46,0.25 +- 0.02,1.71 +- 0.46,2.56 +- 2.45
5,2,5,2695.28,0.25 +- 0.03,1.66 +- 0.48,2.79 +- 2.35
6,2,6,2617.1,0.24 +- 0.02,1.7 +- 0.45,2.87 +- 2.33
