In [36]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import joblib
import GPy

In [39]:
dir = 'C:/Users/fitzpatrick/Desktop/Data/Input/'

## Read in PCP data from CFSR
data_1 = pd.read_csv(dir+'CFSR_APCP_Basin_Sums.csv',sep=',')

## Read in EVAP data from CFSR
data_2 = pd.read_csv(dir+'CFSR_EVAP_Basin_Sums.csv',sep=',')

## Read in TMP data from CFSR
data_3 = pd.read_csv(dir+'CFSR_TMP_Basin_Avgs.csv',sep=',')

In [3]:
# Read in GLCC RNBS data
data_4 = pd.read_csv(dir + 'rnbs_glcc.csv', sep=',')

# Ensure 'Date' column is treated as string
date_strs = data_4['Date'].astype(str)

date_rnbs = []

for date_str in date_strs:
    # Insert leading zero before month
    if len(date_str) == 6:
        date_str = date_str[:4] + '0' + date_str[4:]

    # Insert leading zero before day
    if len(date_str) == 7:
        date_str = date_str[:6] + '0' + date_str[6:]

    # Convert to datetime object
    datetime_obj = datetime.strptime(date_str, '%Y%m%d')

    # Format the datetime object as a string in the desired format
    date_tmp = datetime_obj.strftime('%Y-%m-%d')

    # Append formatted date to list
    date_rnbs.append(date_tmp)

# Replace the original 'Date' column with formatted dates
data_4['Date'] = date_rnbs

# Convert 'Date' column to datetime format
data_4['Date'] = pd.to_datetime(data_4['Date'])

# Add a new column 'Month' extracting month from 'Date'
#data_4['Month'] = data_4['Date'].dt.month

In [19]:
# If we used the averages, we need to acurately average MI and HU together
# If we used the totals, we could simply add them together and skip this cell

mi_pcp = data_1['WaterMichigan'] / 1000 #convert mm to m
hu_pcp = data_1['WaterHuron'] / 1000 # convert mm to m
mi_evap = data_2['WaterMichigan'] / 1000 #convert mm to m
hu_evap = data_2['WaterHuron'] / 1000 # convert mm to m

mi_sa = 57757 * 1000000 #convert km2 to m2
hu_sa = 59570 * 1000000

mh_pcp = ((mi_pcp*mi_sa)+(hu_pcp*hu_sa))/(mi_sa+hu_sa) * 1000
mh_evap = ((mi_evap*mi_sa)+(hu_evap*hu_sa))/(mi_sa+hu_sa) * 1000
print(mh_pcp, data_1['WaterMichigan']+data_1['WaterHuron'])

0      100.277815
1       54.918940
2      122.878389
3      123.388742
4       87.940773
          ...    
379     42.861435
380    102.988742
381     61.165806
382     75.185430
383     75.792605
Length: 384, dtype: float64 0      200.57
1      109.76
2      246.14
3      246.30
4      175.46
        ...  
379     85.56
380    205.50
381    122.14
382    150.00
383    151.59
Length: 384, dtype: float64


In [40]:
# Prepare the data 
# Features
X = pd.DataFrame({
    'su_pcp': data_1['WaterSuperior'],
    'er_pcp': data_1['WaterErie'],
    'on_pcp': data_1['WaterOntario'],
    'mh_pcp': data_1['WaterMichigan']+data_1['WaterHuron'],
    #'mh_pcp': mh_pcp,
    'su_evap': data_2['WaterSuperior'],
    'er_evap': data_2['WaterErie'],
    'on_evap': data_2['WaterOntario'],
    'mh_evap': data_2['WaterMichigan']+data_2['WaterHuron'],
    #'mh_evap': mh_evap,
    'su_tmp': data_3['WaterSuperior'],
    'er_tmp': data_3['WaterErie'],
    'on_tmp': data_3['WaterOntario'],
    'mh_tmp': (data_3['WaterMichigan']+data_3['WaterHuron'])/2 # the average of the two lakes
})

X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets
rnbs = pd.DataFrame({
    'su_rnbs': data_4['sup'],
    'er_rnbs': data_4['eri'],
    'on_rnbs': data_4['ont'],
    'mh_rnbs': data_4['mic_hur']
})

rnbs.set_index(pd.to_datetime(data_4['Date']), inplace=True)

merged_df = pd.merge(X, rnbs, left_index=True, right_index=True, how='inner')

y = merged_df[['su_rnbs', 'er_rnbs', 'on_rnbs', 'mh_rnbs']]

## IF we want to include the month as a categorical feature
# Merge on 'Month'
#X = pd.concat([X, pd.get_dummies(month, prefix='Month')], axis=1)

# Drop any rows with NaN values (if any)
#X.dropna(inplace=True)

In [42]:
# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_start_date = '1979-01-01'
train_end_date = '2004-12-01'
val_start_date = '2005-01-01'
val_end_date = '2011-01-01'

X_train = X[train_start_date:train_end_date]
y_train = y[train_start_date:train_end_date]
X_test = X[val_start_date:val_end_date]
y_test = y[val_start_date:val_end_date]

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = scaler.fit_transform(y_train)
X_test_scaled = scaler.fit_transform(X_test)
y_test_scaled = scaler.fit_transform(y_test)


In [43]:
# Gaussian Process Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic

#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10, random_state=42)
#gpr = GPy.models.GPRegression(X_train_scaled, y_train_scaled, kernel)
#gpr.optimize(messages=True)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
#joblib.dump(gpr, 'GP_trained_model.pkl')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.8314989490180902
Mean Squared Error: 0.16850105098190982


: 

In [21]:
## Random Forest Regressor Model

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'RF_trained_model.pkl')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.7532091929741378
Mean Squared Error: 1986.6362203456965


In [22]:
# Get feature importances
importances = model.feature_importances_
feature_names = X.columns

# Create a DataFrame to display feature importances
feature_importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort features by importance
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Display the feature importances
print(feature_importances_df)

    Feature  Importance
5   er_evap    0.179649
6   on_evap    0.170103
7   mh_evap    0.149443
1    er_pcp    0.125893
2    on_pcp    0.076537
4   su_evap    0.054185
10   on_tmp    0.053889
11   mh_tmp    0.047326
3    mh_pcp    0.041667
9    er_tmp    0.040468
0    su_pcp    0.030809
8    su_tmp    0.030030


In [23]:
## Neural Network

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4)  # Number of targets
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
model.save('NN_trained_model.keras')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 20525.1172 - val_loss: 14462.3145
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 20971.2480 - val_loss: 14373.3867
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 20461.3008 - val_loss: 14258.5166
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 23151.5488 - val_loss: 14109.7363
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 20024.0000 - val_loss: 13909.6211
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 19084.5273 - val_loss: 13640.1533
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 20318.1738 - val_loss: 13277.8369
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 19857.1133 - val_loss: 12811.0176
Epoch 9