<a href="https://colab.research.google.com/github/geoskimoto/AWDB_SOAP_Request/blob/main/Regression_connected_to_webservice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Imports

In [1]:
try:
    import xmltodict
    print("module 'xmltodict' is installed")
except ModuleNotFoundError:
  !pip install xmltodict

Collecting xmltodict
  Downloading https://files.pythonhosted.org/packages/28/fd/30d5c1d3ac29ce229f6bdc40bbc20b28f716e8b363140c26eff19122d8a5/xmltodict-0.12.0-py2.py3-none-any.whl
Installing collected packages: xmltodict
Successfully installed xmltodict-0.12.0


In [2]:
from sklearn.linear_model import LassoCV, RidgeCV, HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
# import statsmodels.api as sm
from functools import reduce

import requests
import xmltodict
import datetime

# Helper Functions

In [3]:
# Web Call to Access and Download Data of a Single Station from AWDB Web Service (SOAP API)

def SOAP_Call(stationtriplets, elementCD, begindate, enddate):
  """Download daily values of one element for one station from the AWDB SOAP service.

  Args:
    stationtriplets: Station identifier placed in <stationTriplets>,
      e.g. '642:WA:SNTL'.
    elementCD: Element code placed in <elementCd>, e.g. 'PREC' or 'WTEQ'
      (water equivalent / SWE). Also used as the name of the returned column.
    begindate: First day requested, as a string, e.g. '01/10/2020'.
    enddate: Last day requested, as a string, e.g. '04/01/2021'.

  Returns:
    A DataFrame with a single float column named after `elementCD`, indexed
    by a daily DatetimeIndex called 'Date' that runs from `begindate` to
    `enddate`. Values the service reports as null become NaN.

  Raises:
    requests.HTTPError: The service answered with an HTTP error status.
    KeyError: The response does not have the expected getDataResponse layout.
    ValueError: The number of returned values does not match the number of
      days between `begindate` and `enddate`.
  """
  headers = {'Content-type': 'text/soap'}

  # Define Web Service URL
  URL = "https://wcc.sc.egov.usda.gov/awdbWebService/services?WSDL"

  # SOAP envelope for getData; the upper-case tokens are placeholders.
  # <alwaysReturnDailyFeb29> stays false: when true the service inserts a null
  # for Feb 29 of every non-leap year, which breaks the one-value-per-day
  # alignment with the date index built below.
  SOAP_current = '''
  <?xml version="1.0" encoding="UTF-8"?>
  <SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:q0="http://www.wcc.nrcs.usda.gov/ns/awdbWebService" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <SOAP-ENV:Body>
      <q0:getData>
        <stationTriplets>STATIONTRIPLETS</stationTriplets>
        <elementCd>ELEMENTCD</elementCd>
        <ordinal>1</ordinal>
        <duration>DAILY</duration>
        <getFlags>false</getFlags>
        <beginDate>BEGINDATE</beginDate>
        <endDate>ENDDATE</endDate>
        <alwaysReturnDailyFeb29>false</alwaysReturnDailyFeb29>
      </q0:getData>
    </SOAP-ENV:Body>
  </SOAP-ENV:Envelope>

  '''.strip()

  # Fill in the placeholders
  SOAP_current = SOAP_current.replace("ELEMENTCD", elementCD)
  SOAP_current = SOAP_current.replace("STATIONTRIPLETS", stationtriplets)
  SOAP_current = SOAP_current.replace("BEGINDATE", begindate)
  SOAP_current = SOAP_current.replace("ENDDATE", enddate)

  # Send request to server and receive the xml document
  response = requests.post(URL, data=SOAP_current, headers=headers)
  # Fail here with the HTTP status rather than later with an opaque KeyError.
  response.raise_for_status()

  # Convert the xml document to nested dictionaries and pull out the values.
  parsed = xmltodict.parse(response.text)
  raw_values = parsed['soap:Envelope']['soap:Body']['ns2:getDataResponse']['return']['values']

  # xmltodict only builds a list when an element repeats; a single-day request
  # yields one bare item, which must not be iterated character by character.
  if not isinstance(raw_values, list):
    raw_values = [raw_values]

  # Real readings arrive as strings; nulls arrive as mappings of attributes.
  values = [float(v) if isinstance(v, str) else None for v in raw_values]

  # The response carries no per-value dates, so rebuild them from the request.
  dates = pd.date_range(begindate, enddate, freq='D', name='Date')
  if len(values) != len(dates):
    raise ValueError(
        f'AWDB returned {len(values)} values for {stationtriplets} but '
        f'{begindate} to {enddate} spans {len(dates)} days')

  # dtype=float turns None into NaN even when every value is missing.
  return pd.Series(values, index=dates, dtype='float64', name=f'{elementCD}').to_frame()


In [4]:
# Function to Download Multiple Stations at a time from AWDB Web Service

def getData(stations, parameter_of_interest, begindate, enddate):
  """Download one element for several stations and combine them column-wise.

  Args:
    stations: Iterable of station triplets (e.g. ['642:WA:SNTL', ...]). A
      single triplet given as a plain string is treated as one station.
    parameter_of_interest: Element code passed to SOAP_Call as `elementCD`.
    begindate: First day requested, passed through to SOAP_Call.
    enddate: Last day requested, passed through to SOAP_Call.

  Returns:
    A DataFrame with one column per station, in the order given and named
    after the station, indexed by the union of the per-station date indexes.

  Raises:
    ValueError: `stations` is empty.
  """
  # A bare string would otherwise be iterated one character at a time.
  if isinstance(stations, str):
    stations = [stations]
  stations = list(stations)
  if not stations:
    raise ValueError('stations must contain at least one station triplet')

  per_station = []
  for station in stations:
    station_df = SOAP_Call(stationtriplets=station, elementCD=parameter_of_interest,
                           begindate=begindate, enddate=enddate)
    # Name each series after its station before combining: every frame comes
    # back with the same column name, and suffix-based merges start producing
    # duplicate column labels from the fourth station on.
    per_station.append(station_df.iloc[:, 0].rename(f'{station}'))

  return pd.concat(per_station, axis=1, join='outer').sort_index()

In [5]:
#Choose Regression Model

def regressionModel(regression_model):
  """Return a new, unfitted regressor for the given model name.

  Args:
    regression_model: One of 'Lasso', 'Ridge', 'SVM', 'Huber',
      'GradientBoost', 'AdaBoost' or 'Random Forest'.

  Returns:
    A freshly constructed scikit-learn estimator.

  Raises:
    ValueError: `regression_model` is not one of the supported names.
  """
  # Builders are wrapped in lambdas so only the requested model is constructed.
  builders = {
      'Lasso': lambda: LassoCV(alphas=(0.001,0.01,0.1,1,10,100,1000)),
      'Ridge': lambda: RidgeCV(alphas=(0.001,0.01,0.1,1,10,100,1000)),
      'SVM': lambda: svm.SVR(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1),
      'Huber': lambda: HuberRegressor(),
      'GradientBoost': lambda: GradientBoostingRegressor(random_state=0, n_estimators=100),
      'AdaBoost': lambda: AdaBoostRegressor(random_state=0, n_estimators=100),
      # Seeded like the boosting models so repeated runs give the same forest.
      'Random Forest': lambda: RandomForestRegressor(random_state=0, n_estimators=100),
  }

  try:
    build = builders[regression_model]
  except (KeyError, TypeError):
    # Raise instead of returning None, which would only fail later at .fit().
    raise ValueError('Choose either Lasso, Ridge, SVM, Huber, GradientBoost, AdaBoost, or Random Forest') from None

  return build()

# Regression with selected data

### Function to check model fit

In [8]:
def checkModel(stations, parameter_of_interest, begindate, enddate, regression_model, test_size):
  """Fit a regression on a chronological split and plot fit against observations.

  The first station in `stations` is the target; the remaining stations are
  the features. Rows are split in date order (no shuffling), the model is
  fitted on the earlier part and evaluated on the later part.

  Args:
    stations: Station triplets; first is the target, the rest are features.
    parameter_of_interest: Element code to download, e.g. 'WTEQ'.
    begindate: First day of data to download.
    enddate: Last day of data to download.
    regression_model: Model name understood by regressionModel.
    test_size: Passed to train_test_split as `test_size`.

  Returns:
    Tuple `(regr, data)`: the fitted regressor and the downloaded DataFrame
    exactly as returned by getData (including any rows with missing values).

  Raises:
    ValueError: No date has observations for every station, or
      `regression_model` is not recognised.
  """
  # Download data from the AWDB web service
  data = getData(stations, parameter_of_interest, begindate, enddate)

  # scikit-learn estimators generally reject NaN input, so model only the days
  # on which every station reported. `data` itself is returned untouched.
  complete = data.dropna()
  if complete.empty:
    raise ValueError('No dates have observations for every station in the requested range')

  # Define target and features (dependent and independent variables)
  target = complete.iloc[:, 0]
  features = complete.iloc[:, 1:]

  # Split into training and test sets; shuffle=False keeps the time order.
  features_train, features_test, target_train, target_test = train_test_split(
      features, target, test_size=test_size, shuffle=False)

  # Choose regression model
  regr = regressionModel(regression_model)
  if regr is None:
    raise ValueError(f'Unknown regression_model: {regression_model!r}')

  # Fit model on training set
  regr.fit(features_train, target_train)

  # Run predictions on training features and test features
  target_train_pred = regr.predict(features_train)
  target_test_pred = regr.predict(features_test)

  # Report the fit quality on both parts of the split
  rmse_train = np.sqrt(mean_squared_error(target_train, target_train_pred))
  rmse_test = np.sqrt(mean_squared_error(target_test, target_test_pred))
  print(f'RMSE (train): {rmse_train:.3f}')
  print(f'RMSE (test): {rmse_test:.3f}')

  # Plot data: training period on top, test period below
  fig = make_subplots(rows=2, cols=1)

  fig.add_trace(go.Scatter(
      y = target_train,
      x = target_train.index,
      mode = 'lines',
      name = 'Training'
  ),
  row=1, col=1)

  fig.add_trace(go.Scatter(
      y = target_train_pred,
      x = target_train.index,
      mode = 'lines',
      name = 'Model Predictions on Training Data'
  ),
  row=1, col=1)

  fig.add_trace(go.Scatter(
      y = target_test,
      x = target_test.index,
      mode = 'lines',
      name = 'Test Data'
  ),
  row=2, col=1)

  fig.add_trace(go.Scatter(
      y = target_test_pred,
      x = target_test.index,
      mode = 'lines',
      name = 'Model Predictions on Test Data'
  ),
  row=2, col=1)

  fig.update_layout(
      showlegend=True,
      height=700,
      width=900
  )
  fig.show()

  return regr, data

### Make Predictions with New Date Range Using Fitted Model

In [9]:
def makePredictions(stations, parameter_of_interest, train_begindate, train_enddate, predict_begindate, predict_enddate, regression_model):
  """Train on one date range, predict the target station over another, and plot.

  The first station in `stations` is the target; the remaining stations are
  the features. The model is fitted on the training range and then applied to
  the feature stations over the prediction range. The figure overlays the
  target station's observations with the model's predictions.

  Args:
    stations: Station triplets; first is the target, the rest are features.
    parameter_of_interest: Element code to download, e.g. 'WTEQ'.
    train_begindate: First day of the training range.
    train_enddate: Last day of the training range.
    predict_begindate: First day of the prediction range.
    predict_enddate: Last day of the prediction range.
    regression_model: Model name understood by regressionModel.

  Returns:
    The fitted regressor.

  Raises:
    ValueError: No training date has observations for every station, or
      `regression_model` is not recognised.
  """
  # Download data from the AWDB web service
  train_data = getData(stations, parameter_of_interest, train_begindate, train_enddate)
  predict_data = getData(stations, parameter_of_interest, predict_begindate, predict_enddate)

  # Train only on days where every station reported. Substituting 0 for a
  # missing reading would teach the model values that were never observed.
  train_complete = train_data.dropna()
  if train_complete.empty:
    raise ValueError('No training dates have observations for every station')

  # Define target and features (dependent and independent variables)
  train_target = train_complete.iloc[:, 0]
  train_features = train_complete.iloc[:, 1:]

  # Observations are left unfilled so that gaps in the record plot as gaps.
  predict_target = predict_data.iloc[:, 0]
  predict_features = predict_data.iloc[:, 1:]

  # Choose regression model
  regr = regressionModel(regression_model)
  if regr is None:
    raise ValueError(f'Unknown regression_model: {regression_model!r}')

  # Fit model to training set
  regr.fit(train_features, train_target)

  # Predict only where every feature station has a value; other days stay NaN.
  usable = predict_features.notna().all(axis=1)
  predictions = pd.Series(np.nan, index=predict_features.index, dtype='float64')
  if usable.any():
    predictions.loc[usable] = regr.predict(predict_features.loc[usable])

  fig = go.Figure()

  fig.add_trace(go.Scatter(
      y = predict_target,
      x = predict_target.index,
      mode = 'lines',
      name = f'{stations[0]} {parameter_of_interest}'
  ))

  fig.add_trace(go.Scatter(
      y = predictions,
      x = predictions.index,
      mode = 'lines',
      name = 'Model Predictions'
  ))

  fig.update_layout(
    showlegend=True,
    height=650,
    width=1400,
    title={
        'text': "Model Predictions",
        'xanchor': 'center',
        'yanchor': 'top',
        'y':0.9,
        'x':0.4},
    xaxis_title = "Date",
    yaxis_title = f"{parameter_of_interest} (in)"
  )

  fig.show()

  return regr

#### Test Functions

In [None]:
# Smoke test: fetch daily WTEQ for a single station and display the frame.
SOAP_Call(
    stationtriplets='642:WA:SNTL',
    elementCD='WTEQ',
    begindate='01/10/2020',
    enddate='04/01/2021',
)

Unnamed: 0_level_0,WTEQ
Date,Unnamed: 1_level_1
2020-01-10,20.8
2020-01-11,22.3
2020-01-12,23.2
2020-01-13,24.5
2020-01-14,25.0
...,...
2021-03-28,51.6
2021-03-29,55.3
2021-03-30,
2021-03-31,


In [None]:
# Smoke test: fetch daily WTEQ for three stations and display the combined frame.
getData(
    stations=['642:WA:SNTL', '1085:WA:SNTL', '679:WA:SNTL'],
    parameter_of_interest='WTEQ',
    begindate='01/10/2020',
    enddate='04/01/2021',
)

Unnamed: 0_level_0,642:WA:SNTL,1085:WA:SNTL,679:WA:SNTL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-10,20.8,21.5,26.4
2020-01-11,22.3,24.1,28.6
2020-01-12,23.2,25.3,31.4
2020-01-13,24.5,27.4,34.3
2020-01-14,25.0,28.1,34.9
...,...,...,...
2021-03-28,51.6,66.6,90.8
2021-03-29,55.3,67.2,92.2
2021-03-30,,67.3,92.0
2021-03-31,,67.4,92.0


## Estimating Missing Data

In [48]:
#Morse Lake Predictions

# The first triplet is the station being estimated; the rest are predictors.
stations = [
    '642:WA:SNTL',
    '1085:WA:SNTL',
    '679:WA:SNTL',
]
parameter_of_interest = 'WTEQ'
train_begindate, train_enddate = '01/01/2020', '01/01/2021'
predict_begindate, predict_enddate = '01/04/2021', '04/29/2021'

regr2 = makePredictions(
    stations=stations,
    parameter_of_interest=parameter_of_interest,
    train_begindate=train_begindate,
    train_enddate=train_enddate,
    predict_begindate=predict_begindate,
    predict_enddate=predict_enddate,
    regression_model='Ridge',
)

In [51]:
#Blewett Pass Predictions

# The first triplet is the station being estimated; the rest are predictors.
stations = [
    '352:WA:SNTL',
    '734:WA:SNTL',
    '507:WA:SNTL',
    '841:WA:SNTL',
    '478:WA:SNTL',
]
parameter_of_interest = 'WTEQ'
train_begindate, train_enddate = '01/01/2021', '02/26/2021'
predict_begindate, predict_enddate = '01/01/2021', '04/29/2021'

regr2 = makePredictions(
    stations=stations,
    parameter_of_interest=parameter_of_interest,
    train_begindate=train_begindate,
    train_enddate=train_enddate,
    predict_begindate=predict_begindate,
    predict_enddate=predict_enddate,
    regression_model='Random Forest',
)

In [45]:
 #Mt. Crag Predictions

# The first triplet is the station being estimated; the rest are predictors.
stations = [
    '648:WA:SNTL',
    '1107:WA:SNTL',
    '974:WA:SNTL',
]
parameter_of_interest = 'PREC'
train_begindate, train_enddate = '01/01/2014', '01/01/2021'
predict_begindate, predict_enddate = '01/01/2021', '04/29/2021'

regr2 = makePredictions(
    stations=stations,
    parameter_of_interest=parameter_of_interest,
    train_begindate=train_begindate,
    train_enddate=train_enddate,
    predict_begindate=predict_begindate,
    predict_enddate=predict_enddate,
    regression_model='Ridge',
)

In [41]:
 #Olallie Meadows Predictions

# The first triplet is the station being estimated; the rest are predictors.
stations = [
    '672:WA:SNTL',
    '788:WA:SNTL',
    '898:WA:SNTL',
]
parameter_of_interest = 'WTEQ'
train_begindate, train_enddate = '01/01/2021', '03/31/2021'
predict_begindate, predict_enddate = '01/01/2021', '04/29/2021'

regr2 = makePredictions(
    stations=stations,
    parameter_of_interest=parameter_of_interest,
    train_begindate=train_begindate,
    train_enddate=train_enddate,
    predict_begindate=predict_begindate,
    predict_enddate=predict_enddate,
    regression_model='Ridge',
)

In [42]:
 #Thunder Basin Predictions

# The first triplet is the station being estimated; the rest are predictors.
stations = [
    '817:WA:SNTL',
    '681:WA:SNTL',
    '711:WA:SNTL',
    '975:WA:SNTL',
]
parameter_of_interest = 'WTEQ'
train_begindate, train_enddate = '01/01/2021', '02/28/2021'
predict_begindate, predict_enddate = '01/01/2021', '04/29/2021'

regr2 = makePredictions(
    stations=stations,
    parameter_of_interest=parameter_of_interest,
    train_begindate=train_begindate,
    train_enddate=train_enddate,
    predict_begindate=predict_begindate,
    predict_enddate=predict_enddate,
    regression_model='Ridge',
)