In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [6]:
dir = 'C:/Users/fitzpatrick/Desktop/Data/Input/'

## CFSR precipitation (APCP) table: month column plus one column per lake
data_1 = pd.read_csv(dir + 'CFSR_APCP_Basin_Avgs.csv', sep=',')
month = data_1['month']
su_pcp, er_pcp, on_pcp = (
    data_1[lake] for lake in ('WaterSuperior', 'WaterErie', 'WaterOntario')
)
# Michigan and Huron are combined into a single series by adding their columns
mh_pcp = data_1['WaterMichigan'] + data_1['WaterHuron']

## CFSR evaporation (EVAP) table, same lake columns
data_2 = pd.read_csv(dir + 'CFSR_EVAP_Basin_Avgs.csv', sep=',')
su_evap, er_evap, on_evap = (
    data_2[lake] for lake in ('WaterSuperior', 'WaterErie', 'WaterOntario')
)
mh_evap = data_2['WaterMichigan'] + data_2['WaterHuron']

## CFSR temperature (TMP) table, same lake columns
data_3 = pd.read_csv(dir + 'CFSR_TMP_Basin_Avgs.csv', sep=',')
su_tmp, er_tmp, on_tmp = (
    data_3[lake] for lake in ('WaterSuperior', 'WaterErie', 'WaterOntario')
)
# NOTE(review): the two temperature columns are summed here, not averaged --
# confirm that a sum is the intended Michigan-Huron temperature feature.
mh_tmp = data_3['WaterMichigan'] + data_3['WaterHuron']

In [7]:
# Read in GLCC RNBS data
data_4 = pd.read_csv(dir + 'rnbs_glcc.csv', sep=',')

# Work on the 'Date' column as text so missing leading zeros can be restored
date_strs = data_4['Date'].astype(str)


def _normalize_rnbs_date(raw):
    """Zero-pad a compact date string to 'YYYYMMDD' and return it as 'YYYY-MM-DD'.

    A 6-character value gets a '0' inserted after the 4-digit year (month pad);
    a 7-character value (including one produced by the first step) gets a '0'
    inserted after the sixth character (day pad). Values of any other length
    are passed to the parser unchanged.

    NOTE(review): a 7-character input is always read as YYYYMMD; a value
    written as YYYYMDD would be parsed as a different date -- confirm the
    file never contains that form.
    """
    padded = raw
    if len(padded) == 6:
        padded = padded[:4] + '0' + padded[4:]
    if len(padded) == 7:
        padded = padded[:6] + '0' + padded[6:]
    return datetime.strptime(padded, '%Y%m%d').strftime('%Y-%m-%d')


# ISO-formatted date strings, one per row of data_4
date_rnbs = [_normalize_rnbs_date(raw) for raw in date_strs]

# Swap the raw 'Date' values for parsed timestamps (same row labels as data_4)
data_4['Date'] = pd.to_datetime(pd.Series(date_rnbs, index=data_4.index))

# Calendar month number derived from the parsed dates
data_4['Month'] = data_4['Date'].dt.month

# Keep only the rows inside the CFSR period; both end points are included
start_date = '1979-01-01'
end_date = '2010-12-01'
in_cfsr_period = data_4['Date'].between(start_date, end_date)
selected_data = data_4[in_cfsr_period]

# Per-lake RNBS series; these keep data_4's original row labels
su_rnbs, er_rnbs, on_rnbs, mh_rnbs = (
    selected_data[col] for col in ('sup', 'eri', 'ont', 'mic_hur')
)

In [11]:
# Prepare the data
# Features: CFSR precipitation, evaporation and temperature for each lake
X = pd.DataFrame({
    'su_pcp': su_pcp,
    'er_pcp': er_pcp,
    'on_pcp': on_pcp,
    'mh_pcp': mh_pcp,
    'su_evap': su_evap,
    'er_evap': er_evap,
    'on_evap': on_evap,
    'mh_evap': mh_evap,
    'su_tmp': su_tmp,
    'er_tmp': er_tmp,
    'on_tmp': on_tmp,
    'mh_tmp': mh_tmp
})

# Targets: RNBS for each lake
y = pd.DataFrame({
    'su_rnbs': su_rnbs,
    'er_rnbs': er_rnbs,
    'on_rnbs': on_rnbs,
    'mh_rnbs': mh_rnbs
})

# Append one-hot month indicator columns (prefix 'Month_') to the features
X = pd.concat([X, pd.get_dummies(month, prefix='Month')], axis=1)

# The target Series come from a date-filtered frame and therefore keep that
# frame's row labels, which need not match the labels of the CFSR features.
# Rows are paired by position, so first make sure the two frames have the
# same number of rows, then renumber both from 0 so they share one index.
# NOTE(review): positional pairing assumes both sources are sorted by date
# and cover the same months -- confirm against the input files.
if len(X) != len(y):
    raise ValueError(
        f"Feature rows ({len(X)}) and target rows ({len(y)}) differ; "
        "the CFSR inputs and the selected RNBS period must cover the same months."
    )
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

# Drop incomplete rows from BOTH frames together. Dropping from X alone would
# leave y with extra rows, so the split below would either fail on the length
# mismatch or pair features with the wrong targets.
complete_rows = X.notna().all(axis=1) & y.notna().all(axis=1)
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
## Random Forest Regressor Model

# Build the forest with a fixed seed and train it on the training split
# (fit() hands back the fitted estimator, so both steps chain)
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions: R^2 and mean squared error
r_squared = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.7448461407142164
Mean Squared Error: 2069.382874323705


In [14]:
## Neural Network

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling repeat from run to run
tf.keras.utils.set_random_seed(42)

# Standardize the data; the scaler is fitted on the training split only and
# then applied unchanged to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture.
# The network is bound to `nn_model`, not `model`: `model` holds the fitted
# RandomForestRegressor, and the feature-importance cell reads
# `model.feature_importances_`, which a Keras model does not provide.
# Rebinding `model` here made that cell fail when the notebook is run top to
# bottom.
nn_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4)  # Output layer, 4 outputs su_rnbs, er_rnbs, on_rnbs, mh_rnbs
])

# Compile the model
nn_model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data; 20% of the training rows are held out
# for the per-epoch validation loss
nn_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
nn_model.save('NN_trained_model.keras')

# Predict on the test set
y_pred = nn_model.predict(X_test_scaled)

# Calculate Mean Squared Error and R^2 on the held-out rows
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - loss: 22084.7051 - val_loss: 14481.4434
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 22906.7012 - val_loss: 14387.1602
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 20088.3770 - val_loss: 14273.9316
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 21278.1543 - val_loss: 14125.9189
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 18831.7207 - val_loss: 13928.0098
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 19421.0332 - val_loss: 13655.2959
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 21188.7891 - val_loss: 13284.7695
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 19965.7305 - val_loss: 12807.7041
Epoch 

In [13]:
# Feature importances of the fitted model, one per feature column.
# `model` must expose `feature_importances_` (a fitted RandomForestRegressor
# does), so run this while `model` still refers to the random forest.
feature_names = X.columns
importances = model.feature_importances_

# Two-column table (feature name, importance), ordered from most to least
# important
feature_importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_importances_df)

     Feature  Importance
5    er_evap    0.178067
6    on_evap    0.167814
7    mh_evap    0.147316
1     er_pcp    0.126525
2     on_pcp    0.074411
10    on_tmp    0.056962
4    su_evap    0.053589
11    mh_tmp    0.045427
3     mh_pcp    0.041161
9     er_tmp    0.035763
0     su_pcp    0.030635
8     su_tmp    0.027422
15   Month_4    0.004118
14   Month_3    0.002445
22  Month_11    0.002057
23  Month_12    0.001406
13   Month_2    0.000743
20   Month_9    0.000708
18   Month_7    0.000699
16   Month_5    0.000604
21  Month_10    0.000595
19   Month_8    0.000574
12   Month_1    0.000525
17   Month_6    0.000434
