<a href="https://colab.research.google.com/github/geoskimoto/SNOTEL_ErrorAndOutlier_Tools/blob/fix_root/errorFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Library Imports

In [73]:
from sklearn.linear_model import LassoCV, RidgeCV, HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
# import statsmodels.api as sm
from functools import reduce

import requests
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import datetime


General format of "Error Finder Object":


*   outlierDetection
  *   tempOutliers

*   logicTests
  *   SWEoutpacePREC
  *   Filters
  





# Helper Functions

In [2]:
# Web Call to Access and Download Data of a Single Station from AWDB Web Service (SOAP API)

def SOAP_Call(stationtriplets, elementCD, begindate, enddate):

  # Create a dictionaries to store the data
  headers = {'Content-type': 'text/soap'}
  # current_dictionary = {}
  
  # Define Web Service URL
  URL = "https://wcc.sc.egov.usda.gov/awdbWebService/services?WSDL"

  # Define Parameters for SOAP Elements (getData:current and getCentralTendencyData:normals)
  SOAP_current = '''
  <?xml version="1.0" encoding="UTF-8"?>
  <SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:q0="http://www.wcc.nrcs.usda.gov/ns/awdbWebService" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <SOAP-ENV:Body>
      <q0:getData>
        <stationTriplets>STATIONTRIPLETS</stationTriplets>
        <elementCd>ELEMENTCD</elementCd>   
        <ordinal>1</ordinal>
        <duration>DAILY</duration>
        <getFlags>false</getFlags>
        <beginDate>BEGINDATE</beginDate>
        <endDate>ENDDATE</endDate>
        <alwaysReturnDailyFeb29>false</alwaysReturnDailyFeb29>   
      </q0:getData>
    </SOAP-ENV:Body>
  </SOAP-ENV:Envelope>

  '''.strip()
  #Read GetData documents - If <alwaysReturnDailyFeb29> is set to true, will set a null for every non leap year on the 29th,  
  #which breaks this request when selecting date ranges that include Feb 29.
  #Possible element codes: PREC, WTEQ (Water Equivalent/SWE)
  

  # Post SOAP Elements to AWDB Web Service and process results - getData
  SOAP_current = SOAP_current.replace("ELEMENTCD", elementCD)
  SOAP_current = SOAP_current.replace("STATIONTRIPLETS", stationtriplets)
  SOAP_current = SOAP_current.replace("BEGINDATE", begindate)
  SOAP_current = SOAP_current.replace("ENDDATE", enddate)

  response_current = requests.post(URL, data=SOAP_current, headers=headers)
  xmldoc = minidom.parseString(response_current.text)

  val_length = len(xmldoc.getElementsByTagName('values'))
  data = pd.DataFrame([xmldoc.getElementsByTagName('values')[i].firstChild.data for i in range(0,val_length)])

  date = datetime.datetime.strptime(begindate, "%m/%d/%Y").date()  #https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior; .date() after .strptime just tells it to make it a datetime object.  Def necessary.
  # print(date)
  Date = []                                                       
  for i in range(0, val_length): 
    date += datetime.timedelta(days=1)
    Date.append((date))

  # {str(xmldoc.getElementsByTagName('stationTriplet')[0].firstChild.data):{Date[j]:xmldoc.getElementsByTagName('values')[j].firstChild.data} for j in range(3)}

  data['Date'] = Date
  data.columns = [f'{elementCD}','Date']
  data.set_index('Date', inplace=True)
  
  data[f'{elementCD}'] = list(map(lambda x: float(x), data[f'{elementCD}']))
  
  return data
  # data.tail(7)

In [3]:
# Function to Download Multiple Stations at a time from AWDB Web Service

# stations = ['401:OR:SNTL', '471:ID:SNTL', '591:WA:SNTL']
def getData(stations, parameter_of_interest, begindate, enddate):
  # data = []
  # data_singleDF
  # for i in stations:
  #   data.append(SOAP_Call(stationtriplets=i,elementCD=parameter_of_interest,begindate=begindate,enddate=enddate))

  # data_singleDF =
 
  data_singleDF = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True, how='outer'), [SOAP_Call(stationtriplets=j,elementCD=parameter_of_interest,begindate=begindate,enddate=enddate) for j in stations])
  # data_singleDF.rename(columns='bla')
  data_singleDF.columns = [f'{j}' for j in stations]

  # index = np.array([data_singleDF.columns.values, data_singleDF.iloc[1].values])
  # data_singleDF = pd.MultiIndex.from_arrays(index)
  # data_singleDF = data_singleDF.iloc[1:]

  # data_singleDF.xs('price', axis=1, level=1, drop_level=False)


  return data_singleDF

# Creating errorFinder Class

In [84]:
SWE = getData(['401:OR:SNTL'], 'WTEQ', '01/01/2014', '04/01/2020')
PREC = getData(['401:OR:SNTL'], 'PREC', '01/01/2014', '04/01/2020')
df = pd.merge(SWE, PREC, left_index=True, right_index=True).reset_index()
df.columns = ['Date', 'SWE (in)', 'PREC (in)']

#Mask to select only data between Nov - Apr
winter_filter = ~pd.to_datetime(df['Date']).dt.month.between(5,10)
df2 = df.where(winter_filter).dropna()
# df2.set_index('Date', inplace = True)

#Add column specifying the water year (Oct 1 - Sep 30) of each record
df2.reset_index(inplace=True)  #Have to reset the index so the dates become just a regular datetime object to get .dt.year to work.
pd.to_datetime(df2['Date'])
df2['water_year'] = pd.to_datetime(df2['Date']).dt.year.where(pd.to_datetime(df2['Date']).dt.month < 10, pd.to_datetime(df2['Date']).dt.year + 1)
df2['water_year'] = list(map(lambda x: str(x), df2['water_year']))

#Plot the data
fig = go.Figure()

fig = px.line(df2, x="PREC (in)",hover_name=df2['Date'], y="SWE (in)", color='water_year')

fig.update_layout(
    yaxis=dict(scaleanchor="x", scaleratio=1),
    height=800,
    width=1100)

fig.show()

In [80]:
df2

Unnamed: 0,index,Date,SWE (in),PREC (in),water_year
0,0,2014-01-02,0.0,9.0,2014
1,1,2014-01-03,0.0,9.0,2014
2,2,2014-01-04,0.0,9.0,2014
3,3,2014-01-05,0.0,9.1,2014
4,4,2014-01-06,0.0,9.1,2014
...,...,...,...,...,...
1174,2278,2020-03-29,5.7,21.4,2020
1175,2279,2020-03-30,5.8,21.5,2020
1176,2280,2020-03-31,5.7,21.6,2020
1177,2281,2020-04-01,6.7,22.4,2020


In [111]:
class errorChecking():
  
  """
  """
  class logicalTests():
    def __init__(self, stationtriplets, begindate, enddate):   
      self.stations = stationtriplets
      self.begindate = begindate
      self.enddate = enddate


    def SWEvsPREC(self, param1, param2):  

      #Create dataframe with WTEQ and Precip Accum for selected station
      SWE = getData(self.stations, param1, self.begindate, self.enddate)
      PREC = getData(self.stations, param2, self.begindate, self.enddate)

      # SWE = getData(['401:OR:SNTL'], 'WTEQ', '01/01/2014', '04/01/2020')
      # PREC = getData(['401:OR:SNTL'], 'PREC', '01/01/2014', '04/01/2020')
      df = pd.merge(SWE, PREC, left_index=True, right_index=True).reset_index()
      df.columns = ['Date', 'SWE (in)', 'PREC (in)']

      #Mask to select only data between Nov - Apr
      winter_filter = ~pd.to_datetime(df['Date']).dt.month.between(5,10)
      df2 = df.where(winter_filter).dropna()
      # df2.set_index('Date', inplace = True)

      #Add column specifying the water year (Oct 1 - Sep 30) of each record
      df2.reset_index(inplace=True)  #Have to reset the index so the dates become just a regular datetime object to get .dt.year to work.
      pd.to_datetime(df2['Date'])
      df2['water_year'] = pd.to_datetime(df2['Date']).dt.year.where(pd.to_datetime(df2['Date']).dt.month < 10, pd.to_datetime(df2['Date']).dt.year + 1)
      df2['water_year'] = list(map(lambda x: str(x), df2['water_year']))

      #Plot the data
      fig = go.Figure()

      fig = px.line(df2, x="PREC (in)",hover_name=df2['Date'], y="SWE (in)", color='water_year')

      fig.update_layout(
          yaxis=dict(scaleanchor="x", scaleratio=1),
          height=800,
          width=1100)

      fig.show()
    
    def precipitationLessThanZero(self):
      PREC = getData(self.stations, 'PREC', self.begindate, self.enddate)
      PREC[PREC.iloc[:,0] < 0]


In [103]:
PREC = getData(['401:OR:SNTL'], 'PREC', '01/01/2014', '04/01/2020')

In [107]:
PREC[PREC.iloc[:,0] < 0]

Unnamed: 0_level_0,401:OR:SNTL
Date,Unnamed: 1_level_1


In [112]:
OR401 = errorChecking.logicalTests(['401:OR:SNTL'],'01/01/2014', '04/01/2020')

In [114]:
OR401.SWEvsPREC('WTEQ', 'PREC')