In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import linregress
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyRegressor


In [2]:
# Load the combined Comal springflow / well-level table.
# (An earlier variant of this cell read "ComalRv_Spgfl.csv" from the same folder instead.)
csv_path = os.path.join("..", "..", "data", "streamflow", "Springflow_All_Comal.csv")
df = pd.read_csv(csv_path)

# Index the rows by parsed timestamp, discard rows with any missing value,
# then remove the text 'datetime' column that the index has made redundant.
df = (
    df.set_index(pd.to_datetime(df['datetime']))
    .dropna()
    .drop(columns='datetime')
)
df.head()

Unnamed: 0_level_0,Comal Rv Discharge (cfs),Springflow (cfs),J17 (ft above msl),J27 (ft above msl)
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986-10-04,238.0,245.0,659.2,873.15
1986-10-05,254.0,245.0,659.2,873.15
1986-10-08,262.0,250.0,662.56,873.45
1986-10-14,286.0,278.0,669.96,874.05
1986-10-15,278.0,277.0,670.75,874.15


# Let's compare J-17 to Comal Springs

In [3]:
# Outlier filter: for each column in turn, keep only the rows whose value lies
# strictly within m standard deviations of that column's mean. The frame is
# narrowed after every column, so later columns are judged against statistics
# of the already-filtered rows. Row counts are printed before and after.
m = 3
print(len(df))
for col in df.columns:
    spread = np.std(df[col])
    centre = np.nanmean(df[col])
    within_band = (df[col] - centre).abs() < m * spread
    df = df.loc[within_band]
print(len(df))

9619
9568


In [4]:
# J-27 well level as a plain NumPy array (the following cell assigns this same name again).
X_J27 = np.asarray(df['J27 (ft above msl)'].tolist())

In [5]:
# Pull each series out of the frame as a list-backed NumPy array.
# y (springflow) is what the later cells predict; the X_* arrays hold the individual input columns.
y = np.asarray(df['Springflow (cfs)'].tolist())
X_J17 = np.asarray(df['J17 (ft above msl)'].tolist())
X_ComRv = np.asarray(df['Comal Rv Discharge (cfs)'].tolist())
X_J27 = np.asarray(df['J27 (ft above msl)'].tolist())

In [6]:
# Feature matrix: one row per observation, columns ordered
# [J17 level, Comal River discharge, J27 level].
#
# Built with a single vectorized column selection rather than df.iterrows():
# the row-by-row loop was slow and, as a side effect, rebound X_J17 / X_ComRv /
# X_J27 from full per-column arrays to the scalar values of the last row.
feature_cols = ['J17 (ft above msl)', "Comal Rv Discharge (cfs)", 'J27 (ft above msl)']
X = df[feature_cols].to_numpy(dtype=float)
X[0]

array([659.2 , 238.  , 873.15])

In [7]:
# Hold out half of the observations for testing; the fixed seed keeps the
# shuffled split reproducible from run to run.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

# Report the shape of each of the four pieces.
print("Labels for training and testing data")
for caption, part in (
    ("Train X shape: ", train_X),
    ("Train Y shape: ", train_y),
    ("Test  X shape: ", test_X),
    ("Test  Y shape: ", test_y),
):
    print(caption, part.shape)

Labels for training and testing data
Train X shape:  (4784, 3)
Train Y shape:  (4784,)
Test  X shape:  (4784, 3)
Test  Y shape:  (4784,)


In [None]:
# Linear-kernel support vector regressor; fit() hands back the fitted estimator.
svr = SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0).fit(train_X, train_y)
print('finished training svr')

# Lasso (L1-penalised linear regression). Despite the name `clf`, this is a regressor.
clf = linear_model.Lasso(alpha=0.1).fit(train_X, train_y)
print('finished training clf')

In [None]:
# Single 1x1 input value; not referenced by any of the cells shown here.
future_j17 = np.array([[400]])  # Springflow at Comal River

# Fresh array copies of the full feature matrix and target vector, row for row.
x_list = np.array(list(X))
y_list = np.array(list(y))

print(x_list.shape)
print(y_list.shape)

In [None]:
# Predict springflow for every observation (training and test rows alike).
y_pred_clf = clf.predict(x_list)
y_pred_svr = svr.predict(x_list)

print('--------------')
# score() on a scikit-learn regressor reports R^2; evaluated on the held-out half.
for label, model in (("Lasso: ", clf), ("SVR: ", svr)):
    print(label, model.score(test_X, test_y))

In [None]:
# Show the observed row for one date as a reference point.
print(df.loc['2014-01-01'])

# One hand-entered sample in feature order [J17, Comal River discharge, J27].
# NOTE(review): presumably meant to mirror the row printed above - confirm the values.
x_list_test = [[640.68, 164, 836]]

y_pred_clf_test = clf.predict(x_list_test)
y_pred_svr_test = svr.predict(x_list_test)

# Held-out scores again, followed by the two single-sample predictions (SVR first).
for tag, estimator in (("Lasso: ", clf), ("SVR: ", svr)):
    print(tag, estimator.score(test_X, test_y))
print(y_pred_svr_test, y_pred_clf_test)

In [None]:
# Store the SVR prediction for every observation alongside the observed data.
# y_pred_svr holds one prediction per row of X, which was built from df row by
# row, so it lines up with the frame. The previous right-hand side,
# y_pred_svr_test, is the prediction for the single hand-entered sample; a
# length-1 array cannot be assigned as a column of the full frame and raises
# a length-mismatch ValueError.
df['Springflow_SVR'] = y_pred_svr



In [None]:
# Lasso predictions against observations, with a least-squares line through the cloud.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_list, y_pred_clf, alpha=0.3)

# Regress predicted on observed springflow and report the fit statistics.
slope, intercept, r_value, p_value, std_err = linregress(y_list, y_pred_clf)
for caption, value in (
    ("Slope: ", slope),
    ("Intercept: ", intercept),
    ("R Value: ", r_value),
    ("P Value: ", p_value),
):
    print(caption, value)

# Draw the fitted line over a fixed span of integer x values from 100 to 399.
regx = np.arange(100, 400)
regy = intercept + slope * regx
ax.plot(regx, regy, color='r')

ax.set_xlabel("Observed Springflow (cfs)", size=20)
ax.set_ylabel("Predicted Springflow (cfs)", size=20)
ax.set_title('Lasso: Observed and Predicted Springflow', size=20)
ax.grid()

# Save the figure to disk before showing it.
out_path = os.path.join("..", "..", "img", "Springflow_pred_obs.png")
plt.savefig(out_path)
plt.show()

In [None]:
# SVR predictions against observations, with a least-squares line through the cloud.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_list, y_pred_svr, alpha=0.3)

# Regress predicted on observed springflow and report the fit statistics.
slope, intercept, r_value, p_value, std_err = linregress(y_list, y_pred_svr)
for caption, value in (
    ("Slope: ", slope),
    ("Intercept: ", intercept),
    ("R Value: ", r_value),
    ("P Value: ", p_value),
):
    print(caption, value)

# Draw the fitted line over a fixed span of integer x values from 100 to 399.
regx = np.arange(100, 400)
regy = intercept + slope * regx
ax.plot(regx, regy, color='r')

ax.set_xlabel("Observed Springflow (cfs)", size=20)
ax.set_ylabel("Predicted Springflow (cfs)", size=20)
ax.set_title('SVR: Observed and Predicted Springflow', size=20)
ax.grid()

# Save the figure to disk before showing it.
out_path = os.path.join("..", "..", "img", "svr.png")
plt.savefig(out_path)
plt.show()

In [None]:
# Overlay river discharge (second feature column), Lasso-predicted springflow
# and observed springflow on a shared time axis taken from the frame's index.
datetime = pd.to_datetime(df.index)
fig, ax = plt.subplots(figsize=(15, 8))
for series, legend_text in (
    (X[:, 1], 'Comal River'),
    (y_pred_clf, 'Comal Springs (Predicted)'),
    (y_list, 'Comal Springs (Observed)'),
):
    ax.plot(datetime, series, label=legend_text)

ax.set_xlabel("Year", size=20)
ax.set_ylabel("Comal Springflow", size=20)
ax.set_title('Time Series of Springflow Output', size=25)
ax.legend()
ax.grid()

# Save the figure, show it, then echo the Lasso predictions as the cell's output.
out_path = os.path.join("..", "..", "img", "obs_lasso.png")
plt.savefig(out_path)
plt.show()
y_pred_clf