In [2]:
import json
import time
import os

from pymongo import MongoClient

import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import line_profiler

from imputation import *
from solve_cubic import *

%load_ext line_profiler
%load_ext Cython
%matplotlib inline

# Connect to the MongoDB server on localhost:27017 and select the `usgs`
# database; every later cell reads its data through `db`.
client = MongoClient(host="localhost", port=27017)
db = client['usgs']

In [4]:
# Handles to the three collections read from the `usgs` database.
# NOTE(review): the site collection is addressed as 'cites' — confirm this is
# the real collection name and not a typo of 'sites'.
sites, measured, corrected = (
    db[name] for name in ('cites', 'measured', 'corrected')
)

# `site_no` of every document in the site collection, in cursor order.
cursor = sites.find()
sid = list(map(lambda site_doc: site_doc['site_no'], cursor))

In [5]:
def confusion_matrix(y_true, y_pred):
    """Tally the four outcomes of a binary prediction.

    Both arguments are combined element-wise with NumPy's logical operators,
    so any non-zero entry counts as a positive.

    Parameters
    ----------
    y_true : array_like
        Reference labels.
    y_pred : array_like
        Predicted labels, broadcastable against ``y_true``.

    Returns
    -------
    numpy.ndarray
        2x2 array ``[[TN, FP], [FN, TP]]``: rows follow the true class and
        columns the predicted class, negative class first.
    """
    # Negate each input once and reuse the result for the cells that need it.
    neg_true = np.logical_not(y_true)
    neg_pred = np.logical_not(y_pred)

    counts = [
        [np.logical_and(neg_true, neg_pred).sum(), np.logical_and(neg_true, y_pred).sum()],
        [np.logical_and(y_true, neg_pred).sum(), np.logical_and(y_true, y_pred).sum()],
    ]
    return np.array(counts)

def mark_anomaly(y_m, y_c, anomaly_thresh):
    """Flag samples whose two series disagree by more than a threshold.

    A sample is flagged when ``y_m`` is strictly positive and
    ``|y_c - y_m|`` is strictly greater than ``anomaly_thresh``.

    Returns the element-wise boolean mask.
    """
    is_positive = y_m > 0
    deviation = np.abs(y_c - y_m)
    return np.logical_and(is_positive, deviation > anomaly_thresh)

In [None]:
def _site_series(db, site_no):
    """Load one site and derive the inputs needed by the threshold sweep.

    Parameters
    ----------
    db : database handle passed straight to ``get_data``.
    site_no : site identifier passed straight to ``get_data``.

    Returns
    -------
    tuple
        ``(Qm, dQm, y_true)`` where ``Qm`` is the gap-filled first series,
        ``dQm`` is its first difference divided by the sampling step (with a
        leading 0 so it has the same length as ``Qm``), and ``y_true`` is
        ``mark_anomaly(Qm, Qc, 0.05)``.

    Raises
    ------
    Whatever ``get_data``, ``align_measurements`` or ``fill_gaps`` raise.
    """
    Tm, Zm, Tc, Zc = get_data(db, site_no)
    Ym, Yc = align_measurements(Tm, Zm, Tc, Zc)
    # Non-positive values are overwritten with the sentinel -1 before gap filling.
    Ym[Ym <= 0] = -1
    Yc[Yc <= 0] = -1

    # Smallest spacing between consecutive timestamps in either series.
    # Deliberately not called `dt`: that name is the `datetime` module alias
    # from the import cell, and rebinding it here would clobber the alias.
    step = min((Tc[1:] - Tc[:-1]).min(), (Tm[1:] - Tm[:-1]).min())
    # presumably one day's worth of samples (timestamps in seconds) — TODO confirm
    window = 1 * 86400 // step

    Qm = fill_gaps(Ym, max_gap = 2000, spike_size = 2, window_size = window)
    Qc = fill_gaps(Yc, max_gap = 2000, spike_size = 2, window_size = window)
    dQm = np.hstack([0, (Qm[1:] - Qm[:-1]) / step])
    y_true = mark_anomaly(Qm, Qc, 0.05)
    return Qm, dQm, y_true


def _sweep_thresholds(db, site_ids, thresholds, crosses):
    """Build a confusion matrix for every (site, threshold) pair.

    Parameters
    ----------
    db : database handle forwarded to ``_site_series``.
    site_ids : sequence of site identifiers.
    thresholds : 1-D sequence of candidate thresholds on ``dQm``.
    crosses : callable ``(dQm, threshold) -> bool array`` deciding which
        samples trip the threshold (e.g. ``np.greater`` or ``np.less``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(site_ids), len(thresholds), 2, 2)``. Rows of
        sites whose data could not be prepared are left as zeros.
    """
    cms = np.zeros((len(site_ids), len(thresholds), 2, 2))
    for i, site_no in enumerate(site_ids):
        try:
            Qm, dQm, y_true = _site_series(db, site_no)
        except Exception as exc:
            # Report and skip scoring. Falling through to the scoring loop
            # would reuse the previous site's series (or hit a NameError when
            # the very first site fails) and store a bogus matrix.
            # `Exception` rather than a bare except so Ctrl-C still stops the run.
            print(site_no, '- error', repr(exc))
        else:
            valid = Qm > 0  # only samples with a positive filled value may be flagged
            for j, thr in enumerate(thresholds):
                y_pred = np.logical_and(crosses(dQm, thr), valid)
                cms[i, j] = confusion_matrix(y_true, y_pred)

        # Progress marker every 50 sites, printed whether or not the site loaded.
        if i % 50 == 0:
            print(i, site_no)
    return cms


# Sweep positive thresholds: a sample is predicted anomalous when dQm rises above the threshold.
# NOTE(review): each sweep reloads every site; merging both sweeps into one
# pass would halve the data loading if run time becomes a concern.
thr_max = np.linspace(1e-4, 5e-3, 50)
CM_max = _sweep_thresholds(db, sid, thr_max, np.greater)

# Sweep negative thresholds: a sample is predicted anomalous when dQm falls below the threshold.
thr_min = np.linspace(-5e-3, -1e-4, 50)
CM_min = _sweep_thresholds(db, sid, thr_min, np.less)

# NOTE(review): if re-enabled, np.argmax over the (n_thresholds, 2, 2) mean
# returns a flat index into all cells, not a threshold index.
#dy_thr_max = thr_max[np.argmax(CM_max[:,:].mean(0))]
#dy_thr_min = thr_min[np.argmax(CM_min[:,:].mean(0))]

In [11]:
# Fixed thresholds on dQm used by the detection cell below: a sample is
# flagged when dQm is above `dy_thr_max` or below `dy_thr_min`.
dy_thr_max = 7e-4
dy_thr_min = -6e-4

In [12]:
# Apply the fixed-threshold detector to a single site (entry 23 of `sid`).
si = sid[23]

# presumably Tm/Zm are the measured times/values and Tc/Zc the corrected ones — verify against get_data
Tm, Zm, Tc, Zc = get_data(db, si)
# Smallest spacing between consecutive timestamps in either series.
# NOTE(review): `dt` rebinds the alias created by `import datetime as dt` in the import cell.
dt = min((Tc[1:] - Tc[:-1]).min(), (Tm[1:] - Tm[:-1]).min())

# Median of Zm plus 15 times the spread between its 90th and 10th percentiles.
# NOTE(review): not used by any of the lines visible in this cell.
abs_thresh = np.median(Zm) + 15*np.subtract(*np.percentile(Zm, [90, 10]))
Ym, Yc = align_measurements(Tm, Zm, Tc, Zc)
# Overwrite non-positive values with the sentinel -1 (in place).
Ym[Ym <= 0] = -1
Yc[Yc <= 0] = -1

# Gap-fill both series with the same settings; the window is 86400 // dt samples.
Qm = fill_gaps(Ym, max_gap = 2000, spike_size = 2, window_size = 1 * 86400 // dt)
Qc = fill_gaps(Yc, max_gap = 2000, spike_size = 2, window_size = 1 * 86400 // dt)
# First difference of Qm divided by dt, with a leading 0 so it matches Qm in length.
dQm = np.hstack([0, (Qm[1:] - Qm[:-1]) / dt])
# Reference labels: Qm positive and |Qc - Qm| greater than 0.05.
y_true = mark_anomaly(Qm, Qc, 0.05)
# Prediction: dQm outside the band [dy_thr_min, dy_thr_max].
# NOTE(review): the threshold sweep cell also requires `Qm > 0` for a positive
# prediction; that mask is absent here — confirm this is intended.
y_pred = np.logical_or(dQm > dy_thr_max, dQm < dy_thr_min)