In [1]:
import json

from pymongo import MongoClient

import numpy as np
import matplotlib.pyplot as plt
import line_profiler

from helpers import *
from imputation import *

from bokeh.io import output_notebook, push_notebook
from bokeh.plotting import figure, show
from bokeh.layouts import column
from bokeh.models import *

from ipywidgets.widgets import Button, HBox
from IPython.display import display

# Connect to the MongoDB server on localhost:27017 and select the 'usgs' database;
# `db` is the handle passed to get_data() in the cells below.
client = MongoClient("localhost", 27017)
db = client['usgs']
# Send Bokeh output to the notebook and render matplotlib figures inline.
output_notebook()
%matplotlib inline

In [2]:
def extreme_values(y, continuity, high = True, low = True):
    """Flag samples that a large gap separates from the bulk of the data.

    The absolute values of ``y`` are sorted and the differences between
    consecutive sorted values are scanned outwards from the middle element
    (index ``y.size // 2``). On each enabled side, the first difference
    larger than ``continuity`` defines a threshold placed halfway across
    that gap. When no such gap exists (or the side is disabled) the
    threshold is placed outside the range of ``|y|`` so nothing is flagged
    on that side.

    Parameters
    ----------
    y : numpy.ndarray
        One-dimensional numeric data.
    continuity : float
        Largest difference between consecutive sorted ``|y|`` values that is
        still treated as continuous.
    high : bool, optional
        Look for a gap above the middle element and flag values beyond it.
    low : bool, optional
        Look for a gap below the middle element and flag values beyond it.

    Returns
    -------
    numpy.ndarray of bool
        ``True`` where ``y`` lies below the low threshold or above the high
        threshold. An empty input yields an empty mask.
    """
    # An empty series has no extremes; without this guard z[0] raises IndexError.
    if y.size == 0:
        return np.zeros(y.shape, dtype = bool)

    n = y.size // 2
    z = np.sort(np.abs(y))
    dz = z[1:] - z[:-1]

    # Defaults sit outside the range of |y|, i.e. "no threshold on this side".
    thr_min = z[0] - 1
    thr_max = z[-1] + 1

    if low:
        # Reversed so that argmax returns the qualifying gap closest to the middle.
        gaps_below = dz[:n][::-1] > continuity
        if gaps_below.any():
            i_min = n - np.argmax(gaps_below)
            thr_min = 0.5*(z[i_min] + z[i_min - 1])

    if high:
        gaps_above = dz[n:] > continuity
        if gaps_above.any():
            i_max = n + np.argmax(gaps_above)
            thr_max = 0.5*(z[i_max] + z[i_max + 1])

    # NOTE(review): the thresholds are derived from |y| but compared against
    # the signed values of y, so the two differ for negative inputs. Kept
    # as in the original; confirm whether np.abs(y) was intended here.
    return np.logical_or(y < thr_min, y > thr_max)

In [None]:
# Load the site numbers stored under 'to_review'. The context manager closes
# the file; json.load(open(...)) left the handle open until garbage collection.
with open('revision_list_d.json') as fh:
    sid = json.load(fh)['to_review']

In [None]:
# mis-alignment
# NOTE(review): presumably site numbers whose series showed mis-alignment - confirm.
# The bare tuple is the cell's last expression, so the notebook echoes it as output.
2198000, 2422500, 2318700, 15743850

In [None]:
# Site numbers collected under the name `good`, one per line for easier diffs.
# NOTE(review): presumably sites that passed manual inspection - confirm.
good = [
    5124480,
    2378500,
    11057500,
    6061500,
    9447800,
    2361000,
    1022500,
    3550000,
    2329500,
    11152000,
    1372500,
    3049000,
    12043300,
    10293000,
    1193500,
    9124500,
]

In [None]:
# Index into `sid` of the site examined by this cell.
si = 0
print(sid[si])

# Measured (Tm, Zm) and computed (Tc, Zc) series for the site; both value
# arrays are passed through feet_to_meters.
Tm, Zm, Tc, Zc = get_data(db, sid[si])
Zm = feet_to_meters(Zm)
Zc = feet_to_meters(Zc)

# Working copy of the measured series; Zm itself stays untouched.
z = Zm.copy()

# Align the unmodified series first: this provides `dt`, which the first
# fill_gaps call below needs. The cell used to read `dt` before assigning
# it, so it failed on a fresh kernel (NameError) or silently reused the
# value left over from a previously processed site.
# NOTE(review): assumes align_measurements does not modify its inputs - confirm.
dt, Ym, Yc = align_measurements(Tm, Zm, Tc, Zc)

# Mark values separated from the bulk of the data with -1.
mask = extreme_values(z, continuity = 0.1, high = True, low = True)
z[mask] = -1

# 7 * 86400 // dt is the number of samples in 7 days when dt is in seconds.
z = fill_gaps(z,  max_gap = 7 * 86400 // dt, spike_size = 2, window_size = 10)

# Centered difference scaled by 3600 (per hour if Tm is in seconds), padded
# with zeros at both ends so that dz has the same length as z.
dz = np.hstack([0, (z[2:] - z[:-2]) / (Tm[2:] - Tm[:-2]) * 3600 , 0])
mask = extreme_values(dz, continuity = 0.1, high = True, low = True)
z[mask] = -1

# Aligned version of the cleaned series; the computed output is discarded.
dt, y, _ = align_measurements(Tm, z, Tc, Zc)

# 30 * 86400 // dt is the number of samples in 30 days when dt is in seconds.
Ym = fill_gaps(Ym, max_gap = 30 * 86400 // dt, spike_size = 2, window_size = 10)
Yc = fill_gaps(Yc, max_gap = 30 * 86400 // dt, spike_size = 2, window_size = 10)
y  = fill_gaps(y,  max_gap = 30 * 86400 // dt, spike_size = 2, window_size = 10)

print(Ym.size, Yc.size, dt)

In [None]:
# Threshold on the rate of change (differences are divided by dt and scaled by 3600).
thr = 0.2

# rate[j] is the rate of change between samples j and j + 1.
rate = (y[1:] - y[:-1]) / dt * 3600
slope_prev = rate[:-1]   # backward difference at interior samples 1 .. size - 2
slope_next = rate[1:]    # forward difference at the same samples

rising = (slope_prev > 0) & (slope_prev > thr)
# NOTE(review): this mixes the sign of the forward difference with the size
# of the backward difference, exactly as in the original loop - confirm
# whether the forward difference was meant in both places.
falling = ~rising & (slope_next < 0) & (slope_prev < -thr)

# The first and last samples have only one neighbour and are never flagged.
bw_front = np.zeros(y.size, dtype = bool)
fw_front = np.zeros(y.size, dtype = bool)
bw_front[1:-1] = rising
fw_front[1:-1] = falling

In [None]:
# Overlay the aligned computed series (cyan), the cleaned series (black) and
# the two front masks (green / red) on one axis, zoomed to a fixed window.
fig, ax = plt.subplots(figsize = (16,4))
ax.plot(Yc, 'c-')
#ax.plot((y[1:] - y[:-1]) * 4, 'r-')
ax.plot(y, 'k-')
ax.plot(bw_front, 'g-')
ax.plot(fw_front, 'r-')
ax.set_ylim(0.5,3)
ax.set_xlim(91000,98000)

In [None]:
# Interactive Bokeh view of the measured vs. computed series for site sid[si].
# Non-positive samples are overwritten in place with -1 before gap filling.
# NOTE(review): -1 is presumably the missing-value marker fill_gaps expects - confirm.
Ym[Ym <= 0] = -1
Yc[Yc <= 0] = -1

Ym = fill_gaps(Ym, max_gap = 4, spike_size = 2, window_size = 10)
Yc = fill_gaps(Yc, max_gap = 4, spike_size = 2, window_size = 10)

# NOTE(review): presumably flags samples where the two series disagree, with 0.05 as the tolerance - confirm.
y_true = mark_anomaly(Ym, Yc, 0.05)

#Ym[Ym <= 0] = np.nan
#Yc[Yc <= 0] = np.nan

# n, k, i, count and idx are assigned here but not referenced again in this cell.
n = 1000
k = 50
i = k
count = 0
idx = Ym > 30
# Sample index times dt, divided by 3600 (hours if dt is in seconds; matches the x-axis label).
t = np.arange(Ym.size) * dt / 3600

# Columns shared by both figures: x = time, m = measured, c = computed,
# a = anomaly flag, o = mask of measured values above 30.
source = ColumnDataSource(
    data = dict(
        x = t, 
        m = Ym, 
        c = Yc, 
        a = y_true,
        o = Ym > 30
    )
)

# Separate wheel-zoom tools for the horizontal and the vertical axis.
gh_tools = [BoxZoomTool(), WheelZoomTool(dimensions=['width']), WheelZoomTool(dimensions=['height']), PanTool(), 
            SaveTool()]

# The upper y limit comes from the unaligned computed series Zc.
gh_plot = figure(title = 'Site %i' % sid[si], 
                 plot_height = 300, 
                 plot_width = 950, 
                 tools = gh_tools, 
                 toolbar_location = "above", 
                 x_axis_label = "Time, hours",
                 y_axis_label = "Gage Height, meters",
                 y_range = (-1, Zc.max()))

# Shares gh_plot's x_range, so both figures pan and zoom together horizontally.
anomaly_plot = figure(title = 'Site %i. Is Anomaly' % sid[si], plot_height = 200, plot_width = 950, y_range = (-0.5,1.5), 
                      x_range = gh_plot.x_range, toolbar_location = "above")

gh_plot.line("x", "m",  source = source, color = '#ff0000', line_width = 2, legend="GH Measured")
gh_plot.line("x", "c",  source = source, color = '#0000ff', line_width = 2, legend="GH Computed", alpha = 0.5)
#gh_plot.circle(t[idx], Ym[idx],  source = source, color = '#00ff00', legend="Outlier")

gh_plot.ygrid.minor_grid_line_color = 'navy'
gh_plot.ygrid.minor_grid_line_alpha = 0.1

gh_plot.xgrid.minor_grid_line_color = 'navy'
gh_plot.xgrid.minor_grid_line_alpha = 0.1

# Black: anomaly flag; green: mask of measured values above 30.
anomaly_plot.line  ("x", "a", source = source, color = '#000000', line_width = 2)
anomaly_plot.line  ("x", "o", source = source, color = '#00ff00', line_width = 2)

# Stack the two figures vertically and display them.
show(column(gh_plot, anomaly_plot))

In [None]:
# Rebuild `sid` from every document in the 'cites' collection; this replaces
# any value `sid` held before this cell ran.
# NOTE(review): 'cites' may be a typo for 'sites' - confirm against the database.
cursor = db['cites'].find()
sid = [record['site_no'] for record in cursor]

In [None]:
# Per-site maximum (M) and minimum (m) of the computed series, in meters.
M = list()
m = list()
for si in sid:
    try:
        _, _, _, Zc = get_data(db, si)
        Zc = feet_to_meters(Zc)
        # Compute both before appending so M and m always stay the same length.
        zc_max, zc_min = Zc.max(), Zc.min()
        M.append(zc_max)
        m.append(zc_min)
    except Exception as err:
        # Still best-effort: a failing site is skipped, but it is now reported,
        # and KeyboardInterrupt is no longer swallowed by a bare `except`.
        print('Skipped SID: ', si, repr(err))

In [None]:
# One row per anomalous sample: the 5 measured values centered on it.
# X is preallocated; k counts the rows filled so far.
X = np.zeros((6500000, 5))
k = 0

for si in sid:
    try:
        Tm, Zm, Tc, Zc = get_data(db, si)
        Zm = feet_to_meters(Zm)
        Zc = feet_to_meters(Zc)
        dt, Ym, Yc = align_measurements(Tm, Zm, Tc, Zc)
        # Only sites with a sampling step of exactly 900 are used.
        if dt != 900:
            print('Bad dt: ', si, dt)
            continue

        Ym[Ym <= 0] = -1
        Yc[Yc <= 0] = -1

        Ym = fill_gaps(Ym, max_gap = 4, spike_size = 2, window_size = 10)
        Yc = fill_gaps(Yc, max_gap = 4, spike_size = 2, window_size = 10)

        y_true = mark_anomaly(Ym, Yc, 0.05)
        idx = np.where(y_true > 0)[0]
        # Keep only indices with a full 5-sample window. Near either end,
        # Ym[i-2:i+3] is shorter than 5 (or empty for i < 2), the assignment
        # raised ValueError, and the handler below dropped the rest of the site.
        idx = idx[(idx >= 2) & (idx < Ym.size - 2)]
        for i in idx:
            X[k] = Ym[i-2:i+3]
            k += 1

            if k % 100000 == 0:
                print(k)
    except Exception as err:
        # Still best-effort per site, but the cause is now shown and
        # KeyboardInterrupt can stop the loop.
        print('Bad SID: ', si, repr(err))

In [None]:
# Number of rows of X to use.
# NOTE(review): 773891 is presumably the final value of the row counter from
# the collection loop in one particular run - confirm before re-running.
n = 773891

# Column views of the first n rows: two samples before, the centre sample,
# and two samples after.
x_m2, x_m1, x_c, x_p1, x_p2 = X[:n].T

f0 = x_c                                        # centre value
f1 = (x_p1 - x_m1) / 8                          # centered first difference, scaled
f2 = (x_c - x_m1) / 4                           # backward first difference, scaled
f3 = (x_p1 - 2*x_c + x_m1) * 16                 # second difference, scaled
f4 = (x_m1 + x_p1) / 2 - x_c                    # mean of 2 neighbours minus centre
f5 = (x_m2 + x_m1 + x_p1 + x_p2) / 4 - x_c      # mean of 4 neighbours minus centre

In [None]:
# Hand-set threshold constants.
# NOTE(review): the numeric suffixes presumably pair each threshold with the
# feature of the same index (f0, f1, f3, f4, f5), and each two-element list
# looks like a [lower, upper] pair - confirm both.
thr0 = 27.62 # max(y_computed)
thr1_pos = [0.50, 1.10]
thr1_neg = [0.05, 0.90]
thr3_pos = [1.00, 5.00]
thr3_neg = []  # left empty
thr4_abs = [0.10, 0.30]
thr5_abs = [0.20, 1.00]

In [None]:
# Sorted measured values at anomalous samples. Only the commented-out plot
# below used them; `y` and `t` are overwritten before anything is drawn.
y = np.sort(Ym[y_true > 0])
t = np.linspace(0.6, 1, y.size)

fig, ax = plt.subplots(figsize = (16,4))
#ax.semilogy(t, y)

# Sorted f0 against the fraction 0..1 of samples, on a log y axis.
y = np.sort(f0)
t = np.linspace(0, 1, y.size)
ax.semilogy(t, y)

ax.grid(which = 'both')
#ax.plot([0,n], [32,32], 'r--')
# Zoom on the top 10% of samples, values between 1 and 100.
ax.set_ylim(1e+0,1e+2)
ax.set_xlim(0.9, 1.0)

In [None]:
# Histogram of log10(f0), restricted to values above 10.
fig, ax = plt.subplots(figsize = (16,4))
f0_large = f0[f0 > 10]
ax.hist(np.log10(f0_large), bins = 100);