-
Notifications
You must be signed in to change notification settings - Fork 6
/
midterm.py
153 lines (103 loc) · 5.43 KB
/
midterm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
'''
ASTR 503/703 Midterm Challenge
October 2016
This take-home test is intended to be completed in 3 hours. Please do not spend
more than 5 hours on it. If a blank region is 3+ spaces long, you are expected
to fill in code. If it's only one space, then a text comment is all you need.
'''
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from astroML.utils import check_random_state
# you are given a file of input data xx and yy; the uncertainties are unknown
# load the data and plot it
input = np.load('xydata.npz')
xx = input['xx']
yy = input['yy']
# determine whether there is a correlation using both the Pearson & the
# Spearman Rank tests and print out the probability that there *is* a
# correlation expressed as a number between 0-100%
# check whether your results are invariant under swapping xx and yy
# since you do not know the relative error in xx and yy
# select the preferred frequentist linear fit between the forward, inverse,
# and bisector fits, plot it, and print out its coefficients (slope and
# intercept expressed assuming yy = slope*xx + intercept)
# use bootstrap resampling of the data to determine the 68% confidence
# intervals for the slope and intercept you computed above
# (hint: you cannot use the pre-made bootstrap/smoothedbootstrap functions
# but must construct a custom bootstrap as Ivezic Fig. 3.24 had to do)
nboot=100
Ndata=len(xx)
sloperesults = np.zeros(nboot)
intresults = np.zeros(nboot)
random_state=None
rng = check_random_state(random_state)
ind = rng.randint(Ndata, size=(Ndata,nboot))
#for k in range(nboot):
#slopesort=np.argsort(sloperesults)
#intsort=np.argsort(intresults)
#print "68%% confidence interval for slope: %0.2f -- %0.2f" % (,)
#print "68%% confidence interval for intercept: %0.2f -- %0.2f" % (,)
# now suppose we wish to determine whether a 2nd order model is
# superior to a 1st order one for this data set -- this problem is too
# hard for a 3-hour test if we allow for errors in both variables, so
# we'll assume from now on that all the scatter is in the yy direction
# perform 1st and 2nd order polyfit results
# compute the reduced chi^2 to compare the fits: here please note that
# since you don't know the error bars, you'll have to assume some value
# for them -- you can't estimate them from the rms around the fit as
# that would give a different answer for the 1st and 2nd order fits,
# biasing the comparison -- so let's first assume error bars = 2.
errs = 2.
#print "reduced chi^2 for 1st order fit = %0.2f and for 2nd order fit = %0.2f" % (redchisq1,redchisq2)
# what is wrong with the reduced chi^2 values? adjust your error bar assumption to fix the problem
# plot the two fits
# which fit order is preferred based on this analysis? does your preferred
# fit shed any light on why the Spearman rank test yielded lower confidence
# in a correlation than the Pearson test?
# use stats.chi2.ppf to determine whether the fit order you did not
# prefer is rejected at >68% or >95% one-sided confidence, i.e., "one
# sigma" or "two sigma" confidence in the language of Gaussians
# (watch out that ppf returns chi^2, not reduced chi^2)
# how confident are you in your choice of fit order? explain
'''
side note: if we had wanted to assume a mix of error in xx and error in yy,
we could have used scipy.odr (orthogonal distance regression) which supports
polynomial fitting and user-supplied functions -- for details see
http://docs.scipy.org/doc/scipy/reference/odr.html and
http://blog.rtwilson.com/orthogonal-distance-regression-in-python/
'''
# below is a code block you can uncomment with two Bayesian likelihood grid
# calculations for the 1st and 2nd order model parameters, assuming flat priors
'''
ndata=len(xx)
nalpha=100
nbeta=100
alphaposs = np.linspace(pfit1[0]-4.*np.sqrt(covp1[0,0]),pfit1[0]+4.*np.sqrt(covp1[0,0]),nalpha)
betaposs = np.linspace(pfit1[1]-4.*np.sqrt(covp1[1,1]),pfit1[1]+4.*np.sqrt(covp1[1,1]),nbeta)
prior=1.
modelgridterm1 = alphaposs.reshape(1,nalpha) * xx.reshape(ndata,1)
modelgrid = modelgridterm1.reshape(ndata,nalpha,1) + betaposs.reshape(nbeta,1,1).T
residgrid = yy.reshape(ndata,1,1) - modelgrid
chisqgrid = np.sum(residgrid**2/errs**2,axis=0)
lnpostprob1 = (-1./2.)*chisqgrid + np.log(prior)
ndata=len(xx)
np0=100
np1=100
np2=100
p0poss = np.linspace(pfit2[0]-4.*np.sqrt(covp2[0,0]),pfit2[0]+4.*np.sqrt(covp2[0,0]),np0)
p1poss = np.linspace(pfit2[1]-4.*np.sqrt(covp2[1,1]),pfit2[1]+4.*np.sqrt(covp2[1,1]),np1)
p2poss = np.linspace(pfit2[2]-4.*np.sqrt(covp2[2,2]),pfit2[2]+4.*np.sqrt(covp2[2,2]),np2)
prior=1.
modelgridterm1 = p0poss.reshape(1,np0) * xx.reshape(ndata,1)**2
modelgridterm2 = modelgridterm1.reshape(ndata,np0,1) + (p1poss.reshape(np1,1,1).T * xx.reshape(ndata,1,1))
modelgrid = modelgridterm2.reshape(ndata,np0,np1,1) + p2poss.reshape(np2,1,1,1).T
residgrid = yy.reshape(ndata,1,1,1) - modelgrid
chisqgrid = np.sum(residgrid**2/errs**2,axis=0)
lnpostprob2 = (-1./2.)*chisqgrid + np.log(prior)
'''
# marginalize over all parameters in the two posterior distributions to
# decide whether a Bayesian odds calculation favors a 1st or 2nd order model
#print "odds favoring a 1st order over a 2nd order model: %0.2f" % odds
# consult Ivezic section 5.4 -- does your result agree with your earlier result
# based on chi^2 analysis? discuss the confidence levels in each analysis