-
Notifications
You must be signed in to change notification settings - Fork 12
/
ANOVAOnewayPyMC.py
161 lines (115 loc) · 4.62 KB
/
ANOVAOnewayPyMC.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# -*- coding: utf-8 -*-
'''Hierarchical Model for estimation of oneway ANOVA parameters via MCMC.
Python (PyMC) adaptation of the R code from "Doing Bayesian Data Analysis",
by John K. Kruschke.
More info: http://doingbayesiandataanalysis.blogspot.com.br/
'''
from __future__ import division
import pymc
import numpy as np
from matplotlib import pyplot as plot
from plot_post import plot_post
from normalize import (normalize, convert_baseline, convert_deflection,
convert_sigma)
from math import ceil
from os import path
# Resolve the data file relative to this script's own directory, so the
# script works regardless of the current working directory.
file_name = 'McDonaldSK1991data.txt'
scr_dir = path.dirname(__file__)
comp_dir = path.join(scr_dir, 'Data', file_name)
# Using data from the book for easier comparison.
# Data from McDonald (1991) study about geographical location and muscle size
# in mussels.
# Column 0 is the group label (x), column 1 the measurement (y); the file's
# 19-line header is skipped.
x, y = np.genfromtxt(comp_dir, delimiter=' ', skip_header=19,
                     usecols=(0, 1), unpack=True)
# Planned comparisons among the five group means: each row is one contrast
# vector over the deflection parameters, and every row sums to zero.
# TODO: use dictionary for easier retrieval of contrast description.
contrasts = np.array([
    [-1/3, -1/3, 1/2, -1/3, 1/2],
    [1, -1, 0, 0, 0],
    [-1/2, -1/2, 1, 0, 0],
    [-1/2, -1/2, 1/2, 1/2, 0],
    [1/3, 1/3, 1/3, -1, 0],
    [-1/4, -1/4, -1/4, -1/4, 1],
    [1/3, 1/3, 1/3, -1/2, -1/2],
    [0, 0, 0, -1, 1],
])
# Synthetic-data generator, kept for debugging:
# y_truesd = 4.0
# a0_true = 100
# atrue = [15, -10, -7, 8, -6]
# x = [1] * 3 + [2] * 4 + [3] * 3 + [4] * 5 + [5] * 3
# y = [a0_true + atrue[i - 1] + np.random.normal(0, y_truesd) for i in x]
# Standardize the responses so the sampler mixes better, count the levels
# of the categorical predictor, and keep the raw-scale mean and SD needed
# to convert the estimates back to the original scale afterwards.
zy = normalize(y)
x_levels = np.unique(x).size
y_mean = np.mean(y)
y_sd = np.std(y)
# Begin the definition of the model.
# Hyperprior: a Gamma distribution on the standard deviation of the
# deflection parameters (the shape/rate constants follow the book's R code).
a_sd = pymc.Gamma('a_sd', 1.01005, 0.1005)
# PyMC parameterizes Normal by precision, so convert the SD: tau = 1/sd**2.
@pymc.deterministic
def a_tau(a_sd=a_sd):
    return 1.0 / a_sd**2
# Vague normal prior on the baseline, and a hierarchical normal prior on
# the x_levels deflections, whose common spread is learned through a_tau.
a0 = pymc.Normal('a0', mu=0.0, tau=0.001)
a = pymc.Normal('a', mu=0.0, tau=a_tau, size=x_levels)
# Prior on the within-cell SD of the (normalized) data, again converted
# to a precision for use in the likelihood.
sigma = pymc.Uniform('sigma', 0, 10)
@pymc.deterministic
def tau(sigma=sigma):
    return 1.0 / sigma**2
# The priors are all set!  Linear predictor on the standardized scale:
# baseline plus the deflection of the group each observation belongs to
# (group labels in x are 1-based floats, hence the int(i - 1) index).
# A comprehension builds one deterministic node per observation, exactly
# as the original loop did.
mu = [a0 + a[int(i - 1)] for i in x]
# Likelihood of the standardized observations.
like_y = pymc.Normal('like_y', mu=mu, tau=tau, value=zy, observed=True)
# Now we build the model, fit the MAP estimate first (a good starting
# point for the chain), then sample the posterior with MCMC.
model = pymc.Model([like_y, a0, a, sigma, a_tau, a_sd])
map_ = pymc.MAP(model)
map_.fit()
mcmc = pymc.MCMC(model)
# 80k iterations, the first 20k discarded as burn-in, keeping every 10th
# draw -> 6000 retained samples per traced variable.
mcmc.sample(iter=80000, burn=20000, thin=10)
# Extract the samples.
a0_sample = mcmc.trace('a0')[:]
a_sample = mcmc.trace('a')[:]
sigma_sample = mcmc.trace('sigma')[:]
a_sd_sample = mcmc.trace('a_sd')[:]
# Convert the values from the normalized scale back to the original data
# scale (helpers imported from the local normalize module).
b0_sample = convert_baseline(a0_sample, a_sample, x_levels, y)
b_sample = convert_deflection(a0_sample, a_sample, x_levels, y)
sig_sample = convert_sigma(y, sigma_sample)
b_sd_sample = convert_sigma(y, a_sd_sample)
# Posterior of the cell SD and of the deflection SD, stacked vertically.
plot.figure(figsize=(6.0, 4.0))
for panel, (sample, label) in enumerate(
        ((sig_sample, r'$\sigma$ (cell SD) posterior'),
         (b_sd_sample, r'$aSD$ posterior')), start=211):
    plot.subplot(panel)
    plot_post(sample, title=label)
plot.subplots_adjust(wspace=0.2, hspace=0.5)
# One wide row: the baseline posterior followed by one panel per group
# deflection.  The three-digit subplot code is 1 row x (groups + 1) cols.
plot.figure(figsize=(18.0, 3.0))
total_subplot = b_sample.shape[1]
plot_n = 100 + (total_subplot + 1) * 10 + 1
plot.subplot(plot_n)
plot_post(b0_sample, title=r'$\beta_0$ posterior')
for offset in range(1, total_subplot + 1):
    plot.subplot(plot_n + offset)
    plot_post(b_sample[:, offset - 1],
              title=r'$\beta_{1%i}$ posterior' % offset)
plot.subplots_adjust(wspace=0.2)
# One panel per planned contrast, on a grid of at most 5 panels per row;
# skipped entirely when no contrasts are defined.
n_cons = len(contrasts)
if n_cons > 0:
    plot_per_rows = 5
    plot_rows = ceil(n_cons / plot_per_rows)
    plot_cols = ceil(n_cons / plot_rows)
    plot.figure(figsize=(3.75 * plot_cols, 2.5 * plot_rows))
    for idx, contrast in enumerate(contrasts):
        # Posterior of the contrast: project every deflection sample
        # onto the contrast vector.
        comp = np.dot(b_sample, contrast)
        plot.subplot(plot_rows, plot_cols, idx + 1)
        plot_post(comp, title='Contrast %i' % (idx + 1), comp=0.0)
    plot.subplots_adjust(wspace=0.2, hspace=0.5)
plot.show()