<a href="https://colab.research.google.com/github/geoskimoto/AWDB_SOAP_Request/blob/main/GetData_AWDB_SOAP_call.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Library Imports

In [1]:
from sklearn.linear_model import LassoCV, RidgeCV, HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
# import statsmodels.api as sm
from functools import reduce

import requests
import xml.dom.minidom as minidom
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
import datetime


# Helper Functions

In [2]:
# Web Call to Access and Download Data of a Single Station from AWDB Web Service (SOAP API)

def SOAP_Call(stationtriplets, elementCD, begindate, enddate):

  # Create a dictionaries to store the data
  headers = {'Content-type': 'text/soap'}
  # current_dictionary = {}
  
  # Define Web Service URL
  URL = "https://wcc.sc.egov.usda.gov/awdbWebService/services?WSDL"

  # Define Parameters for SOAP Elements (getData:current and getCentralTendencyData:normals)
  SOAP_current = '''
  <?xml version="1.0" encoding="UTF-8"?>
  <SOAP-ENV:Envelope xmlns:SOAP-ENV="http://schemas.xmlsoap.org/soap/envelope/" xmlns:q0="http://www.wcc.nrcs.usda.gov/ns/awdbWebService" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <SOAP-ENV:Body>
      <q0:getData>
        <stationTriplets>STATIONTRIPLETS</stationTriplets>
        <elementCd>ELEMENTCD</elementCd>   
        <ordinal>1</ordinal>
        <duration>DAILY</duration>
        <getFlags>false</getFlags>
        <beginDate>BEGINDATE</beginDate>
        <endDate>ENDDATE</endDate>
        <alwaysReturnDailyFeb29>false</alwaysReturnDailyFeb29>   
      </q0:getData>
    </SOAP-ENV:Body>
  </SOAP-ENV:Envelope>

  '''.strip()
  #Read GetData documents - If <alwaysReturnDailyFeb29> is set to true, will set a null for every non leap year on the 29th,  
  #which breaks this request when selecting date ranges that include Feb 29.
  #Possible element codes: PREC, WTEQ (Water Equivalent/SWE)
  

  # Post SOAP Elements to AWDB Web Service and process results - getData
  SOAP_current = SOAP_current.replace("ELEMENTCD", elementCD)
  SOAP_current = SOAP_current.replace("STATIONTRIPLETS", stationtriplets)
  SOAP_current = SOAP_current.replace("BEGINDATE", begindate)
  SOAP_current = SOAP_current.replace("ENDDATE", enddate)

  response_current = requests.post(URL, data=SOAP_current, headers=headers)
  xmldoc = minidom.parseString(response_current.text)

  val_length = len(xmldoc.getElementsByTagName('values'))
  data = pd.DataFrame([xmldoc.getElementsByTagName('values')[i].firstChild.data for i in range(0,val_length)])

  date = datetime.datetime.strptime(begindate, "%m/%d/%Y").date()  #https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior; .date() after .strptime just tells it to make it a datetime object.  Def necessary.
  # print(date)
  Date = []                                                       
  for i in range(0, val_length): 
    date += datetime.timedelta(days=1)
    Date.append((date))

  # {str(xmldoc.getElementsByTagName('stationTriplet')[0].firstChild.data):{Date[j]:xmldoc.getElementsByTagName('values')[j].firstChild.data} for j in range(3)}

  data['Date'] = Date
  data.columns = [f'{elementCD}','Date']
  data.set_index('Date', inplace=True)
  
  data[f'{elementCD}'] = list(map(lambda x: float(x), data[f'{elementCD}']))
  
  return data
  # data.tail(7)

In [3]:
# Function to Download Multiple Stations at a time from AWDB Web Service

# stations = ['401:OR:SNTL', '471:ID:SNTL', '591:WA:SNTL']
def getData(stations, parameter_of_interest, begindate, enddate):
  # data = []
  # data_singleDF
  # for i in stations:
  #   data.append(SOAP_Call(stationtriplets=i,elementCD=parameter_of_interest,begindate=begindate,enddate=enddate))

  # data_singleDF =
 
  data_singleDF = reduce(lambda left,right: pd.merge(left,right,left_index=True, right_index=True, how='outer'), [SOAP_Call(stationtriplets=j,elementCD=parameter_of_interest,begindate=begindate,enddate=enddate) for j in stations])
  # data_singleDF.rename(columns='bla')
  data_singleDF.columns = [f'{j}' for j in stations]

  # index = np.array([data_singleDF.columns.values, data_singleDF.iloc[1].values])
  # data_singleDF = pd.MultiIndex.from_arrays(index)
  # data_singleDF = data_singleDF.iloc[1:]

  # data_singleDF.xs('price', axis=1, level=1, drop_level=False)


  return data_singleDF

In [4]:
#Choose Regression Model

def regressionModel(regression_model):
  if regression_model == 'Lasso':
    regr0 = LassoCV(alphas=(0.001,0.01,0.1,1,10,100,1000))
    return regr0
  elif regression_model == 'Ridge':
    regr0 = RidgeCV(alphas=(0.001,0.01,0.1,1,10,100,1000))
    return regr0
  elif regression_model == 'SVM':
    regr0 = svm.SVR(kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=0.001, C=1.0, epsilon=0.1, shrinking=True, cache_size=200, verbose=False, max_iter=-1)
    return regr0
  elif regression_model == 'Huber':
    regr0 = HuberRegressor()
    return regr0
  elif regression_model == 'GradientBoost':
    regr0 = GradientBoostingRegressor(random_state=0, n_estimators=100)
    return regr0
  elif regression_model == 'AdaBoost':
    regr0 = AdaBoostRegressor(random_state=0, n_estimators=100)
    return regr0
  elif regression_model == 'Random Forest':
    regr0 = RandomForestRegressor(n_estimators=100)
    return regr0
  else:
    print('Choose either Lasso, Ridge, SVM, Huber, GradientBoost, AdaBoost, or Random Forest')

# Regression with selected data

### Fit Model to Parameter and Date Range of Choice

In [12]:
def trainModel(stations, parameter_of_interest, begindate, enddate, regression_model, test_size):

  # global data, target, features, features_test, features_train, target_test, target_train, target_test_pred, target_train_pred, df_train, df_test, regr, regr0


#Download Data from AWDB Web Service
  data = getData(stations, parameter_of_interest, begindate, enddate)

#Define Targets and Features (e.g. Dependent and Independent Variables)
  target = data.iloc[:,0]
  features = data.iloc[:,1:]

#Split into training and test sets
  features_train, features_test, target_train, target_test = train_test_split(features,target, test_size=test_size,shuffle=False)
  # x_train, x_test, y_train, y_test
# Choose Regression Model:

  regr = regressionModel(regression_model)

#Fit model on training set
  regr.fit(features_train, target_train)

#Run predictions on training features and test features
  target_train_pred = regr.predict(features_train)
  target_test_pred = regr.predict(features_test)
  
#Combine everything into a training and test Dataframes to be quickly referenced when plotting.

  # df_train = features_train.iloc[:,:]
  # df_train['Target'] = target_train
  # df_train['Predictions'] = target_train_pred.tolist()

  # df_test = features_test
  # df_test['Target'] = target_test
  # df_test['Predictions'] = target_test_pred.tolist()



  fig = make_subplots(
      rows=4, cols=1,
      # subplot_titles=('Predictions vs. Actual - Training Set','Predictions vs. Actual - Test Set', 'Regression Fit')
      )# - Training Set', 'Regression Fit - Test Set')


  # 'Predictions vs. Actual - Training Set'
  # fig.add_trace(go.Scatter(
  #     y = target,
  #     x = target.index,
  #     mode = 'lines',
  #     name = f'{stations[0]}'
  # ), 
  # row=1, col=1)

  fig.add_trace(go.Scatter(
      y = target_train,
      x = target_train.index,
      mode = 'lines',
      name = 'Training'
  ), 
  row=1, col=1)

  fig.add_trace(go.Scatter(
      y = target_train_pred,
      x = target_train.index,
      mode = 'lines',
      name = 'Model Predictions on Training Data'
  ), 
  row=1, col=1)


  fig.add_trace(go.Scatter(
      y = target_test,
      x = target_test.index,
      mode = 'lines',
      name = 'Test Data'
  ), 
  row=2, col=1)

  fig.add_trace(go.Scatter(
      y = target_test_pred,
      x = target_test.index,
      mode = 'lines',
      name = 'Model Predictions on Training Data'
  ), 
  row=2, col=1)

  # # # # # Regression Plot # # # 

  # fig.add_trace(go.Scatter(
  #   x=features_train.iloc[:,0],
  #   y=target_train,
  #   mode='markers',
  #   # hovertext = ytrain_pred.tolist(),  
  # ), 
  # row=3, col=1)

  # fig.add_trace(go.Scatter(
  #   x=features_train.iloc[:,0],
  #   y=pd.DataFrame(target_train_pred.tolist()).iloc[:,0],
  #   mode='markers',
  #   # hovertext = ytrain_pred.tolist(),  
  # ), 
  # row=3, col=1)
  
  # # fig.add_trace(go.Scatter(
  # #   x=X_test,
  # #   y=ytest_pred.tolist(),
  # #   mode='lines',
  # #   hovertext = ytest_pred,  
  # # ), 
  # # row=3, col=1)

  # # fig.update_xaxes(title_text = f"{parameter_of_interest}", row=1, col=1)
  # # fig.update_yaxes(title_text = 'Date', row=1, col=1) 

  # # # fig.update_xaxes(title_text = f"{site_list.columns[0]}", row=2, col=1)
  # # # fig.update_yaxes(title_text = f'{site_list.columns[1:len(stations)', row=2, col=1) 

  fig.update_layout(
      showlegend=True,
      height=2000,
      width=1200 
  )
  fig.show()

  return regr, data

### Make Predictions with New Date Range Using Fitted Model

In [75]:
def makePredictions(stations, parameter_of_interest, train_begindate, train_enddate, predict_begindate, predict_enddate, regression_model):

  global predict_target, predict_features, train_target, train_features

#Download Data from AWDB Web Service
  train_data = getData(stations, parameter_of_interest, train_begindate, train_enddate)
  predict_data  = getData(stations, parameter_of_interest, predict_begindate, predict_enddate)
#Define Targets and Features (e.g. Dependent and Independent Variables)
  train_target = train_data.iloc[:,0]
  train_features = train_data.iloc[:,1:]

  predict_target = predict_data.iloc[:,0]
  predict_features = predict_data.iloc[:,1:]

#Split into training and test sets
  # features_train, features_test, target_train, target_test = train_test_split(features,target, test_size=test_size,shuffle=False)
  # x_train, x_test, y_train, y_test
# Choose Regression Model:

  regr = regressionModel(regression_model)

#Fit model to training set
  regr.fit(train_features, train_target)

#Run predictions
  predictions = regr.predict(predict_features)

  fig = make_subplots(
        rows=2, cols=1,
        # subplot_titles=('Predictions vs. Actual - Training Set','Predictions vs. Actual - Test Set', 'Regression Fit')
        )# - Training Set', 'Regression Fit - Test Set')

  fig.add_trace(go.Scatter(
      y = predict_target,
      x = predict_target.index,
      mode = 'lines',
      # name = ''
  ), 
  row=1, col=1)

  fig.add_trace(go.Scatter(
      y = predictions,
      x = predict_features.index,
      mode = 'lines',
      name = 'Model Predictions on Training Data'
  ), 
  row=1, col=1)

  fig.update_layout(
    showlegend=False,
    height=1550,
    width=1150 
  ) 

  fig.show()

In [38]:
# def makePredictions(startdate, enddate):

#   startdate = pd.to_datetime(startdate).date()
#   enddate = pd.to_datetime(enddate).date()
#   features = regr2[1].iloc[:,1:]
#   features = features.reset_index()
#   features_filtered_by_daterange = features[(features['Date'] > startdate) & (features['Date'] < enddate)]

# #Run predictions

#   # predictions = regr2[0].predict(features_filtered_by_daterange)

#   return features_filtered_by_daterange
#   # return predictions
  

In [77]:
stations = ['401:OR:SNTL', '471:ID:SNTL','591:WA:SNTL']


parameter_of_interest = 'WTEQ'
begindate='01/01/2015'
enddate='01/01/2018'

regr2 = trainModel(stations, parameter_of_interest, begindate = begindate, enddate=enddate, regression_model='Random Forest', test_size=0.1)

In [76]:
stations = ['401:OR:SNTL', '471:ID:SNTL','591:WA:SNTL']


parameter_of_interest = 'WTEQ'
train_begindate = '01/01/2015'
train_enddate = '01/01/2018'
predict_begindate = '01/02/2018'
predict_enddate = '01/01/2019'


regr2 = makePredictions(stations, parameter_of_interest, train_begindate, train_enddate, predict_begindate, predict_enddate, regression_model='AdaBoost')

In [72]:
predict_features

Unnamed: 0_level_0,471:ID:SNTL,591:WA:SNTL
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-03,5.9,13.8
2018-01-04,5.8,13.9
2018-01-05,5.9,14.0
2018-01-06,6.0,14.1
2018-01-07,6.0,14.3
...,...,...
2018-12-29,5.5,9.3
2018-12-30,5.4,10.0
2018-12-31,5.6,10.0
2019-01-01,6.3,10.4


To do's:

*   Predict based on date range
*   regression plots

