#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Functions useful for calculating trends and their confidence levels.

Ported from the JavaScript trend calculator at
https://skepticalscience.com/trend.php
(c) Skeptical Science 2012
Attribution-ShareAlike 3.0 Unported (CC BY-SA 3.0)
https://creativecommons.org/licenses/by-sa/3.0/

This module accepts Pandas DataFrames with datetime indexes for data.

@author: dan
"""

import pandas as pd
import numpy as np


class Holder:
    """ Simple class to hold data as attributes.
    """


def padded(s, size, ptype='linear'):
    """Takes a series and returns a version padded at both ends.

    Parameters
    ----------
    s : pd.Series
        Series to be padded
    size : int
        Size of window being used. The returned series will be
        (size - 2) bigger than the supplied series.
    ptype : str ['linear' | 'mirror'] default 'linear'
        Type of padding to use. 'linear' fits a line to the end data of
        length ``size`` and uses that to fill the start and end padding.
        'mirror' copies and reflects the data instead.

    Notes
    -----
    'mirror' uses actual data for padding, but results in zero-slope
    (horizontal) end points. 'linear' will usually give better results.
    """
    if type(s) != pd.Series:  # check for DataFrame
        s = s.iloc[:, 0]  # use first column as a series
    n = len(s)
    hw = size//2  # half-window size
    tx = np.array(s.index)
    ty = np.array(s.values)
    x = np.zeros(n + 2 * hw, dtype=s.index.dtype)
    y = np.zeros(n + 2 * hw)
    x[hw:hw+n] = tx  # add actual data
    y[hw:hw+n] = ty
    # x-value intervals are mirrored in both cases
    for i in range(hw):  # pad beginning
        x[i] = tx[0] - (tx[hw-i] - tx[0])
    for i in range(hw):  # pad end
        x[i+hw+n] = tx[n-1] + (tx[n-1] - tx[n-2-i])
    if ptype.lower() == 'mirror':
        # pad data as a reflection of original data, e.g. use index values:
        # 2, 1, 0, 1, 2, 3, 4, 5, ... and
        # ..., n-3, n-2, n-1, n-2, n-3, n-4
        for i in range(hw):  # pad beginning
            y[i] = ty[hw-i]
        for i in range(hw):  # pad end
            y[i+hw+n] = ty[n-2-i]
    else:
        # use 'linear' for any other input.
        # Note that x values may be dates, so normalize them so fit line
        # coefficients are not too large
        ix = x.copy()
        if type(x[0]) == np.datetime64:
            # make days, then floats
            tx = (tx - x[0]).astype('m8[D]').astype(np.float64)
            ix = (ix - x[0]).astype('m8[D]').astype(np.float64)
        # fit a line to start
        c = np.polyfit(tx[:hw], ty[:hw], 1)
        p = np.poly1d(c)
        y[:hw] = p(ix[:hw])
        # fit a line to end
        c = np.polyfit(tx[-hw:], ty[-hw:], 1)
        p = np.poly1d(c)
        y[-hw:] = p(ix[-hw:])
    return pd.Series(y, index=x)
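
# Usage sketch (illustrative only; `ts` is a hypothetical monthly series):
#     ts = pd.Series(np.arange(24.),
#                    index=pd.date_range('2000-01-01', periods=24, freq='MS'))
#     padded(ts, 12)            # linear padding: 6 extra points at each end
#     padded(ts, 12, 'mirror')  # reflect the end data instead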


def lowess(data, f=2./3., pts=None, itns=3, order=1,
           pad='linear', **kwargs):
    """ Locally-Weighted Scatterplot Smoothing. Fits a nonparametric
    regression curve to a scatterplot.

    Parameters
    ----------
    data : pandas.Series
        Data points in the scatterplot. The
        function returns the estimated (smooth) values of y.
    f : float default 2/3
        The fraction of the data set to use for smoothing. A
        larger value for f will result in a smoother curve.
    pts : int default None
        The explicit number of data points to be used for
        smoothing instead of f.
    itns : int default 3
        The number of robustifying iterations. The function will run
        faster with a smaller number of iterations.
    order : int default 1
        The order of the polynomial used for fitting. Defaults to 1
        (straight line). Values < 1 are made 1. Larger values should be
        chosen based on the shape of the data (# of peaks and valleys + 1).
    pad : str ['linear' | 'mirror' | None] default 'linear'
        Type of padding to use. If no padding is desired, use ``None``.

    Returns
    -------
    pandas.Series containing the smoothed data.

    Notes
    -----
    Also works with pd.DatetimeIndex index values.
    """
    # Authors: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
    #              original
    #          Dan Neuman <https://github.com/dneuman>
    #              converted to Pandas series, extended to polynomials,
    #              and added padding option.
    # License: BSD (3-clause)
    n = len(data)
    if pts is None:
        f = np.min([f, 1.0])
        r = int(np.ceil(f * n))
    else:  # allow use of number of points to determine smoothing
        r = int(np.min([pts, n]))
    r = min([r, n-1])
    order = max([1, order])
    if pad:
        s = padded(data, r*2, ptype=pad)
        x = np.array(s.index)
        y = np.array(s.values)
        n = len(y)
    else:
        x = np.array(data.index)
        y = np.array(data.values)
    # condition x-values to be between 0 and 1 to reduce errors in linalg
    x = x - x.min()
    x = x / x.max()
    # Create matrix of 1, x, x**2, x**3, etc, by row
    xm = np.array([x**j for j in range(order+1)])
    # Create weight matrix, one column per data point
    h = [np.sort(np.abs(x - x[i]))[r] for i in range(n)]
    w = np.clip(np.abs((x[:, None] - x[None, :]) / h), 0.0, 1.0)
    w = (1 - w ** 3) ** 3
    # Set up output
    yEst = np.zeros(n)
    delta = np.ones(n)  # additional weights for iterations
    for iteration in range(itns):
        for i in range(n):
            weights = delta * w[:, i]
            xw = np.array([weights * x**j for j in range(order+1)])
            b = xw.dot(y)
            a = xw.dot(xm.T)
            beta = np.linalg.solve(a, b)
            yEst[i] = sum([beta[j] * x[i]**j for j in range(order+1)])
        # Set up weights to reduce effect of outlier points on next iteration
        residuals = y - yEst
        s = np.median(np.abs(residuals))
        delta = np.clip(residuals / (6.0 * s), -1, 1)
        delta = (1 - delta ** 2) ** 2
    if pad:
        n = len(data)
        return pd.Series(yEst[r:n+r], index=data.index,
                         name='Locally Weighted Smoothing')
    else:
        return pd.Series(yEst, index=data.index,
                         name='Locally Weighted Smoothing')
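
# Usage sketch (illustrative only; `noisy` is a hypothetical monthly series):
#     noisy = pd.Series(np.random.normal(size=120),
#                       index=pd.date_range('2000-01-01', periods=120, freq='MS'))
#     smooth = lowess(noisy, f=0.3)             # use 30% of the points per fit
#     smooth = lowess(noisy, pts=24, pad=None)  # fixed 24-point window, no padding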


def convertYear(data):
    """ Add a 'Year' column, if it does not already exist, containing
        the fractional year. ``data`` should have a pd.DatetimeIndex;
        an integer year index is also accepted. This method accounts
        for leap years.
    """
    if 'Year' in data.columns: return
    if not hasattr(data.index, 'year'):  # integer year index
        data['Year'] = data.index
        return
    dy = data.index.year
    doy = data.index.day_of_year - 1
    ey = data.index.is_leap_year * 1 + 365  # 366 in leap years, else 365
    data['Year'] = dy + doy / ey
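
# For example, with a hypothetical DataFrame `df` indexed by date,
# convertYear(df) adds a 'Year' column in place; 1 July 2000 becomes
# 2000 + 182/366 ≈ 2000.497 (2000 is a leap year).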


def selectData(data, cn=None, start=None, end=None):
    """ Extract xy data from the temperature data object by date range.

        data: (Pandas DataFrame) with an index of datetime
        cn: (string) Column name to use. Default is the first column.
        start: (float) start date as fractional year
        end: (float) end date as fractional year

        Returns the x and y data as Pandas Series
    """
    if not start: start = -np.inf
    if not end: end = np.inf
    if not cn: cn = data.columns[0]
    convertYear(data)
    d = data.loc[data['Year'].between(start, end)]
    return d['Year'], d[cn]
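
# Usage sketch (illustrative only; `df` is a hypothetical DataFrame with a
# datetime index and a 'Temp' column):
#     x, y = selectData(df, 'Temp', start=1990.0, end=2020.0)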


def linearFit(x, y):
    """ Determines the linear fit to the supplied Numpy arrays. Use
        `df['Column Name'].values` to extract an array from a DataFrame.
        Returns a Holder object of results.

        x, y: (numpy array) Input x and y data
    """
    # accumulate sums
    n = len(x)
    sx = x.mean()
    sy = y.mean()
    sxx = (x*x).mean()
    sxy = (x*y).mean()
    # trend
    if sxx > sx*sx:
        b = (sxy - sx*sy)/(sxx - sx*sx)
    else:
        b = 0
    a = sy - b*sx
    # uncertainty
    sd2 = ((y - (a+b*x))**2).sum()/(n - 2)  # residual variance
    sb2 = sd2/(n*(sxx-sx*sx))  # slope variance
    sa2 = sxx*sb2  # intercept variance
    # package results
    lsq = Holder()  # empty object
    lsq.x = x
    lsq.y = y
    lsq.n = n
    lsq.sx = sx
    lsq.sy = sy
    lsq.sxx = sxx
    lsq.sxy = sxy
    lsq.slope = b
    lsq.intercept = a
    lsq.slopeVar = sb2
    lsq.interceptVar = sa2
    lsq.residualVar = sd2
    lsq.xline = np.array([x.min(), x.max()])
    lsq.yline = a + b * lsq.xline
    lsq.res = lsq.y - (a + b * x)
    return lsq
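
# Usage sketch (illustrative only, with made-up arrays):
#     x = np.arange(10.)
#     y = 2.0*x + 1.0 + np.random.normal(0, 0.1, 10)
#     lsq = linearFit(x, y)
#     lsq.slope, lsq.intercept   # close to 2.0 and 1.0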


def dataPerDegreeOfFreedom(xdata, ydata, lsq=None):
    """ Returns the number of data points per degree of freedom
        based on the autocovariance of the residuals. See:
        Foster and Rahmstorf, 2011,
        "Global temperature evolution 1979–2010"
        https://iopscience.iop.org/article/10.1088/1748-9326/6/4/044022#erl408263app1

        xdata, ydata: (numpy array) Input x and y data
        lsq: (Holder) Optional least squares results. Will be calculated
             if not supplied.
    """
    if not lsq:
        lsq = linearFit(xdata, ydata)
    yres = ydata - lsq.slope * xdata  # detrended data
    cov = autocovariance(yres, 0)
    rho1 = autocovariance(yres, 1) / cov
    rho2 = autocovariance(yres, 2) / cov
    nu = 1.0 + (2.0*rho1)/(1.0 - rho2/rho1)
    if nu < 0.:
        nu = (1. + rho1)/(1. - rho1)  # fall back to an AR(1)-style correction
    return nu
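
# As implemented above, nu = 1 + 2*rho1/(1 - rho2/rho1), where rho1 and rho2
# are the lag-1 and lag-2 autocorrelations of the detrended data, e.g.:
#     nu = dataPerDegreeOfFreedom(x, y)   # x, y as in the linearFit sketch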


def autocovariance(data, j):
    """ Return the autocovariance of the data at lag j

        data: (numpy array) data to be analyzed
        j: (int) lag at which to calculate the covariance
    """
    # Original Javascript:
    # var n = data.length, sx = 0.0, cx = 0.0;
    # for ( var i = 0; i < n; i++ )
    #     sx += data[i];
    # sx /= n;
    # for ( var i = 0; i < n-j; i++ )
    #     cx += (data[i]-sx)*(data[i+j]-sx);
    # return cx/n;
    n = len(data)
    rx = data - data.mean()
    cx = (rx[:n-j] * rx[j:]).mean()
    return cx
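
# For example, autocovariance(data, 0) is the (biased) variance of `data`,
# and autocovariance(data, 1) / autocovariance(data, 0) is its lag-1
# autocorrelation, as used in dataPerDegreeOfFreedom above.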


def movingAverage(xdata, ydata, period):
    """ Return the moving average of the supplied data. Assumes
        that the data is at regular intervals.

        xdata, ydata: (numpy array) data to be analyzed
        period: (int) number of values per average

        Returns x, y: (lists of floats)
    """
    x = []
    y = []
    for i in range(len(xdata)-period+1):
        x.append(xdata[i:i+period].mean())
        y.append(ydata[i:i+period].mean())
    return x, y
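
# Usage sketch (illustrative only; x, y as returned by selectData above):
#     mx, my = movingAverage(x, y, 12)   # 12-point average (annual for monthly data)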


def confidenceInterval(xdata, ydata, stdDevs, lsq=None):
    """ Calculate the confidence limits around the fitted line for the
        supplied number of standard deviations.

        xdata, ydata: (numpy array) data to be analyzed
        stdDevs: (float) number of standard deviations to use for interval
        lsq: (Holder) Optional least squares data. Will be calculated
             if not supplied.

        Returns lsq with lower and upper limits y1 and y2 added
    """
    if not lsq:
        lsq = linearFit(xdata, ydata)
    y = lsq.intercept + lsq.slope * xdata
    xvar = (xdata - lsq.sx)**2
    dy = stdDevs * (lsq.residualVar * (1 + xvar/(lsq.sxx - lsq.sx**2))/lsq.n)**0.5
    lsq.y1 = y - dy
    lsq.y2 = y + dy
    return lsq
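
# Usage sketch (illustrative only):
#     lsq = confidenceInterval(x, y, 2.0)  # ~95% band around the fitted line
#     lsq.y1, lsq.y2                       # lower and upper limits at each x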


def analyzeData(xdata, ydata, stdDevs=2.):
    """ Get the trend of the supplied data, along with the confidence
        interval at the supplied number of standard deviations.

        xdata, ydata: (numpy array) data to be analyzed
        stdDevs: (float) Number of standard deviations for confidence
                 interval. Default is 2.0 (95%).

        Returns lsq with analysis values and confidence limits y1, y2
    """
    if hasattr(xdata, 'values'): x = xdata.values
    else: x = xdata
    if hasattr(ydata, 'values'): y = ydata.values
    else: y = ydata
    lsq = linearFit(x, y)
    nu = dataPerDegreeOfFreedom(x, y, lsq)
    nu_adj = max(1., nu)  # never reduce the uncertainty
    lsq.sigma = (nu_adj * lsq.slopeVar)**0.5  # adjusted slope standard deviation
    lsq.nu = nu
    lsq = confidenceInterval(x, y, stdDevs * nu_adj, lsq)
    return lsq
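
# Usage sketch (illustrative only):
#     lsq = analyzeData(x, y, stdDevs=2.)
#     lsq.slope, 2.*lsq.sigma   # trend and ~95% uncertainty, in y-units per x-unit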


def analyzeRate(df, cn, window, stdDevs=2.):
    """ Analyze the slope at each point in the supplied data, and
        include the adjusted confidence interval. Adds 'Rate', 'R1',
        and 'R2' columns to ``df`` in place.

        df: (Pandas DataFrame) data to be analyzed
        cn: (string) column name to analyze
        window: (float) size of the window in years, centred on
                each point, to use for calculating the slope.
        stdDevs: (float) Number of standard deviations for confidence
                 interval. Default is 2.0 (95%).
    """
    yn = 'Year'
    if yn not in df.columns:
        convertYear(df)
    w2 = window/2
    for i, yr in zip(df.index, df[yn]):
        d = df.loc[df[yn].between(yr-w2, yr+w2)]
        x = d[yn].values
        y = d[cn].values
        lsq = linearFit(x, y)
        nu = dataPerDegreeOfFreedom(x, y, lsq)
        nu_adj = max(1., nu)
        sigma = (nu_adj * lsq.slopeVar)**0.5 * stdDevs
        df.loc[i, 'Rate'] = lsq.slope
        df.loc[i, 'R1'] = lsq.slope - sigma
        df.loc[i, 'R2'] = lsq.slope + sigma
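
# Usage sketch (illustrative only; `df` is a hypothetical DataFrame with a
# datetime index and a 'Temp' column):
#     analyzeRate(df, 'Temp', window=10.)
#     # df now has 'Rate', 'R1' and 'R2' columns: the 10-year centred trend
#     # at each point and its lower/upper confidence limits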


def example():
    """ Build a synthetic series with a known trend (0.12/year) and
        autocorrelated noise, then plot the analysis results.
    """
    import matplotlib.pyplot as plt

    # Create trend line with autocorrelated noise
    w = np.ones(21)/21.  # filtering window
    d = pd.DataFrame(index=pd.date_range(start='1980-1-1',
                                         end='2020-1-1',
                                         freq='MS'))
    d['n'] = np.random.normal(0, 2., len(d))
    d.n = np.convolve(d.n, w, 'same')  # filter with a moving average
    d.n -= d.n.mean()
    d['ya'] = 0.01  # 0.01/month gives a 0.12/year trend
    d.ya = d.ya.cumsum()
    d['y'] = d.ya + d.n
    # Do analysis
    stdDevs = 2.
    x, y = selectData(d, 'y', start=1990.5)
    lsq = analyzeData(x, y, stdDevs)
    mx, my = movingAverage(x, y, 12)
    # save data for inspection
    lsq.xdata = x
    lsq.ydata = y
    # Plot results
    fig, ax = plt.subplots(num='example', clear=True)
    ax.plot(x, y, 'k+', alpha=0.3)  # data
    ax.plot(mx, my, 'g-', lw=2)  # moving average
    ax.plot(lsq.xline, lsq.yline, 'b-', lw=3)  # trend
    ax.plot(x, lsq.y1, 'b-', lw=1)  # lower limit
    ax.plot(x, lsq.y2, 'b-', lw=1)  # upper limit
    # add labels with info
    error = stdDevs * lsq.sigma
    text = (f'Actual: 0.12/year\nTrend: {lsq.slope:.2f}±{error:.3f}\n' +
            f'nu: {lsq.nu:.3f}')
    ax.text(0.5, 0.25, text, va='top', ma='left', ha='center',
            transform=ax.transAxes)
    plt.show()
    return lsq


if __name__ == '__main__':
    lsq = example()