# Section 1.4.2.1
Normal random numbers

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st

In [None]:
# Silence UserWarning messages (seen in pymc3 output) so they do not clutter the notebook.
import warnings
warnings.filterwarnings(action="ignore", category=UserWarning)

In [None]:
%matplotlib inline

The data set has been downloaded from https://www.itl.nist.gov/div898/handbook/datasets/RANDN.DAT

In [None]:
# Read the numeric table from the file, skipping its first 25 lines
# (presumably the descriptive header of the NIST file -- confirm against the file),
# then collapse the table into a single 1-D array of observations.
filename = 'RANDN.DAT'
data = np.loadtxt(fname=filename, skiprows=25).flatten()

In [None]:
# Number of observations in the flattened (1-D) data array.
N = data.shape[0]
print(N)

In [None]:
data[0:5]  # peek at the first five observations

Check some summary statistics to see if they agree with the published values.  The published mean is -0.00294 and the published standard deviation is 1.021042.

In [None]:
np.mean(data)  # sample mean of the observations

In [None]:
np.std(data, ddof=1)  # ddof=1 -> sample standard deviation (n-1 divisor); ddof=0 would give the population value

Reproduce the plots in the source to check for consistency

In [None]:
# Run-sequence plot: each observation drawn against its position in the array.
fig, ax = plt.subplots()
ax.plot(data)
ax.set_title('Data sequence');

In [None]:
# Histogram of the observations, binned into 25 bins.
fig, ax = plt.subplots()
ax.hist(data, bins=25)
ax.set_title('Data histogram');

In [None]:
# Lag-1 plot: every observation (x axis) against the one that follows it (y axis).
fig, ax = plt.subplots()
ax.plot(data[:-1], data[1:], '.')
ax.set(xlabel='$x_i$', ylabel='$x_{i+1}$', title='Lag plot');

## Posterior of the mean, assuming fixed variance

Since the data are nominally normally distributed, if we assume a flat prior, analytically we'll get a posterior of
$$
p(\theta \mid y) \approx N(\theta \mid \bar{y}, \sigma^2/n)
$$
(ref. Gelman pg 52)

In [None]:
# Analytic posterior of the mean under a flat prior, with the standard deviation
# fixed at its sample estimate: Normal with mean ybar and variance sigma^2 / n.
# scipy's `scale` argument is a standard deviation, so it must be
# sqrt(sigma^2 / n) = sigma / sqrt(n) -- not sqrt(sigma / n).
posterior_sd = data.std(ddof=1) / np.sqrt(N)
x = np.linspace(-1,1,100)
plt.plot(x, st.norm.pdf(x, data.mean(), posterior_sd))
plt.title('Posterior of the mean')
plt.xlabel(r'$ \theta $');

### Model using MCMC

In [None]:
import pymc3 as pm

In [None]:
# Fixed-variance model: the likelihood's standard deviation is pinned to the
# sample estimate, leaving the mean `theta` (flat prior) as the only free parameter.
sigma = np.std(data, ddof=1)
with pm.Model() as model1:
    theta = pm.Flat('theta')
    yobs = pm.Normal('yobs', mu=theta, sigma=sigma, observed=data)

In [None]:
# Draw 1000 posterior samples from model1.
# A fixed seed makes the MCMC draws (and every plot derived from them)
# reproducible under Restart Kernel -> Run All.
RANDOM_SEED = 42
with model1:
    trace1=pm.sample(1000, random_seed=RANDOM_SEED)

In [None]:
# Visual check of the sampled trace; the trailing ';' suppresses the returned object's repr.
pm.traceplot(trace1);

In [None]:
# Compare numerical results to analytic results.
# The analytic posterior is Normal with mean ybar and variance sigma^2 / n;
# scipy's `scale` argument is a standard deviation, so use sigma / sqrt(n)
# rather than sqrt(sigma / n).
pm.plot_posterior(trace1)
analytic_sd = data.std(ddof=1) / np.sqrt(N)
x = np.linspace(-0.15,0.15,100)
# Overlay the analytic density (red) on the axes created by plot_posterior.
plt.plot(x, st.norm.pdf(x, data.mean(), analytic_sd), 'r')
plt.legend(['MCMC','HPD','Analytic']);

### TODOs
* Run again without fixed sigma
* Add linear drift and compare the fixed mean model with linear drift model
* Compare to Gaussian Process model