# Model Training Script
Lindsay Fitzpatrick
ljob@umich.edu
08/19/2024

This script reads in CFSR data from 1979 - 2010 and trains different machine learning
models to target RNBS from GLCC across the 5 Great Lakes simultaeously.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
import joblib
import calendar

## Functions

User Input

In [2]:
# This is the directory where the CFSR and GLCC files are located
dir = 'C:/Users/fitzpatrick/Desktop/Data/Input/'

In [4]:
def seconds_in_month(year, month):
    # Number of days in the month
    num_days = calendar.monthrange(year, month)[1]
    # Convert days to seconds
    return num_days * 24 * 60 * 60

In [5]:
def convert_mm_to_m3_per_sec(df):

    # Calculate the number of seconds for each month
    df['seconds'] = df.apply(lambda row: seconds_in_month(int(row['year']), int(row['month'])), axis=1)

    # Convert millimeters to meters
    df['WaterErie_m3'] = df['WaterErie'] / 1000
    df['WaterOntario_m3'] = df['WaterOntario'] / 1000
    df['WaterSuperior_m3'] = df['WaterSuperior'] / 1000
    df['WaterMichHuron_m3'] = (df['WaterMichigan'] + df['WaterHuron']) / 1000

    # Convert the data to cubic meters per second
    df['WaterErie_cms'] = df['WaterErie_m3'] / df['seconds']
    df['WaterOntario_cms'] = df['WaterOntario_m3'] / df['seconds']
    df['WaterSuperior_cms'] = df['WaterSuperior_m3'] / df['seconds']
    df['WaterMichHuron_cms'] = df['WaterMichHuron_m3'] / df['seconds']
    
    return df

In [None]:
def process_rnbs_data(csv_file):
    """
    Processes a DataFrame by melting it from wide to long format, creating a date column,
    converting it to a specific format, and renaming columns.
    
    Args:
    - df (pd.DataFrame): The input DataFrame with columns ['Year', 'Month', ...].

    Returns:
    - pd.DataFrame: The processed DataFrame with columns ['Date', 'RNBS'].
    """

    df = pd.read_csv(csv_file, skiprows=11)

    # Melt DataFrame to long format
    df_melted = df.melt(id_vars=['Year'], var_name='Month', value_name='Value')
    
    # Combine Year and Month to create Date in the format YYYYMM
    df_melted['Date'] = df_melted['Year'].astype(str) + df_melted['Month'].str[1:4] + '01'  # Extract month abbreviation and concatenate

    # Convert the Date column to datetime
    df_melted['Date'] = pd.to_datetime(df_melted['Date'], format='%Y%b%d')

    # Rename the 'Value' column to 'RNBS'
    df_melted.rename(columns={'Value': 'RNBS'}, inplace=True)

    # Drop the original 'Year' and 'Month' columns
    df_final = df_melted.drop(columns=['Year', 'Month'])

    # Reset the index
    df_final = df_final.set_index('Date')

    print(df_final)
    return df_final

## Begin Script

In [None]:
## Read in PCP data from CFSR
data_1 = pd.read_csv(dir+'CFSR_APCP_Basin_Sums.csv',sep=',')

## Read in EVAP data from CFSR
data_2 = pd.read_csv(dir+'CFSR_EVAP_Basin_Sums.csv',sep=',')

## Read in TMP data from CFSR
data_3 = pd.read_csv(dir+'CFSR_TMP_Basin_Avgs.csv',sep=',')

In [6]:
# Convert Total Precipitation to cms
data_1 = convert_mm_to_m3_per_sec(data_1)

# Convert Total Evaporation to cms
data_2 = convert_mm_to_m3_per_sec(data_2)

Read in the GLCC RNBS CSVs file in [cms]

https://www.greatlakescc.org/en/coordinating-committee-products-and-datasets/#:~:text=Monthly%20Residual%20Net%20Basin%20Supplies%3A

In [8]:
sup_file = dir + 'LakeSuperior_MonthlyNetBasinSupply_1900to2024.csv'
eri_file = dir + 'LakeErie_MonthlyNetBasinSupply_1900to2024.csv'
mih_file = dir + 'LakeMichiganHuron_MonthlyNetBasinSupply_1900to2024.csv'
ont_file = dir + 'LakeOntario_MonthlyNetBasinSupply_1900to2024.csv'

sup_rnbs = process_rnbs_data(sup_file)
eri_rnbs = process_rnbs_data(eri_file)
mih_rnbs = process_rnbs_data(mih_file)
ont_rnbs = process_rnbs_data(ont_file)

               RNBS
Date               
1900-01-01   -520.0
1901-01-01  -1130.0
1902-01-01   -890.0
1903-01-01   -340.0
1904-01-01   -110.0
...             ...
2020-12-01  -1600.0
2021-12-01    980.0
2022-12-01   -460.0
2023-12-01    830.0
2024-12-01 -99999.0

[1500 rows x 1 columns]
               RNBS
Date               
1900-01-01   -150.0
1901-01-01    250.0
1902-01-01   -130.0
1903-01-01    460.0
1904-01-01    940.0
...             ...
2020-12-01    650.0
2021-12-01   1280.0
2022-12-01    560.0
2023-12-01    920.0
2024-12-01 -99999.0

[1500 rows x 1 columns]
               RNBS
Date               
1900-01-01   -180.0
1901-01-01    430.0
1902-01-01   -230.0
1903-01-01   1490.0
1904-01-01   1590.0
...             ...
2020-12-01   2480.0
2021-12-01   2140.0
2022-12-01    650.0
2023-12-01   2840.0
2024-12-01 -99999.0

[1500 rows x 1 columns]
               RNBS
Date               
1900-01-01    770.0
1901-01-01    660.0
1902-01-01    770.0
1903-01-01    350.0
1904-01-01    340.0
...  

Here we prepare the data for training and testing. We set the features 'X' as total over lake
precipitation, total over lake evaporation, and the average air temperature over each lake. The
targets 'y' are RNBS for each lake simultaeously.

In [23]:
# Features
X = pd.DataFrame({
    'su_pcp': data_1['WaterSuperior_cms'],
    'er_pcp': data_1['WaterErie_cms'],
    'on_pcp': data_1['WaterOntario_cms'],
    'mh_pcp': data_1['WaterMichHuron_cms'], #data_1['WaterMichigan']+data_1['WaterHuron'], # add the sums
    'su_evap': data_2['WaterSuperior_cms'],
    'er_evap': data_2['WaterErie_cms'],
    'on_evap': data_2['WaterOntario_cms'],
    'mh_evap': data_2['WaterMichHuron_cms'], #data_2['WaterMichigan']+data_2['WaterHuron'], # add the sums
    'su_tmp': data_3['WaterSuperior'],
    'er_tmp': data_3['WaterErie'],
    'on_tmp': data_3['WaterOntario'],
    'mh_tmp': (data_3['WaterMichigan']+data_3['WaterHuron'])/2 # take the average temp
})

# Set the index by date
X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets
rnbs = pd.DataFrame({
    'su_rnbs': sup_rnbs['RNBS'],
    'er_rnbs': eri_rnbs['RNBS'],
    'on_rnbs': ont_rnbs['RNBS'],
    'mh_rnbs': mih_rnbs['RNBS']
})

# Set the index of the targets RNBS to be the same index as the individual RNBS datasets
rnbs.index = sup_rnbs.index

# Merge the features and targets to align with the dates
# Drops the dates in the RNBS where we don't have CFS data 
merged_df = pd.merge(X, rnbs, left_index=True, right_index=True, how='inner')

# Pull the target variables back out 
y = merged_df[['su_rnbs', 'er_rnbs', 'on_rnbs', 'mh_rnbs']]

print(y)

            su_rnbs  er_rnbs  on_rnbs  mh_rnbs
1979-01-01   -640.0    890.0   1700.0   1770.0
1979-02-01   1140.0   1530.0    700.0   1920.0
1979-03-01   4280.0   2640.0   3200.0  10210.0
1979-04-01   5620.0   3090.0   3210.0  11910.0
1979-05-01   7660.0   1410.0   1470.0   8090.0
...             ...      ...      ...      ...
2010-08-01   2330.0   -800.0    180.0   1110.0
2010-09-01   2340.0  -1010.0    150.0   1000.0
2010-10-01  -1040.0   -430.0    830.0  -2070.0
2010-11-01    120.0   -100.0   1050.0   -630.0
2010-12-01  -1630.0      0.0   1240.0   -130.0

[384 rows x 4 columns]


Split the data into training and testing data sets. We could do it as a random 80/20 split
but instead we set split the data set by date ranges. This can easily be adjusted.

In [10]:
# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_start_date = '1979-01-01'
train_end_date = '2004-12-01'
# Testing dataset
val_start_date = '2005-01-01'
val_end_date = '2011-01-01'

X_train = X[train_start_date:train_end_date]
y_train = y[train_start_date:train_end_date]
X_test = X[val_start_date:val_end_date]
y_test = y[val_start_date:val_end_date]

In [36]:
# Verify shapes
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (312, 12)
Shape of y_train: (312, 4)
Shape of X_test: (72, 12)
Shape of y_test: (72, 4)


It is best practice to standardize the data from 0-1 before training

In [57]:
# Standardize the data
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.fit_transform(X_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.fit_transform(y_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(312, 12)
(72, 12)


## Training
Below we train different models using the same data and calculate the r squared values on the 
test data to compare performance.

In [59]:
# Testing Different Kernels
# Basic kernel using ConstantKernel: r2 = 0.8259
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Matt's optimal kernel: r2 = 0.8321
kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()

# Test to add a seasonality component: r2 = 0.8279
#period = 3.0  # Period of the season
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + ExpSineSquared(length_scale=1.0, periodicity=period, periodicity_bounds=(1e-2, 1e2))

# Set up the model
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=10, random_state=42)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
joblib.dump(gpr, 'GP_trained_model.joblib')
joblib.dump(x_scaler, 'x_scaler.joblib')
joblib.dump(y_scaler, 'y_scaler.joblib')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.8320987950154999
Mean Squared Error: 0.16790120498450012


In [13]:
## Random Forest Regressor Model: r2 = 0.7389

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'RF_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.7388005673888829
Mean Squared Error: 879987.1633680556


In [14]:
## Neural Network: r2 = 0.4002

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4)  # Number of targets
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
joblib.dump(model,'NN_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 8631859.0000 - val_loss: 8135449.5000
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8376133.0000 - val_loss: 8133447.5000
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8280033.5000 - val_loss: 8130782.5000
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 9098262.0000 - val_loss: 8127111.0000
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8604907.0000 - val_loss: 8122127.0000
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8342619.5000 - val_loss: 8115297.5000
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 8966826.0000 - val_loss: 8105810.5000
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 8399102.0000 