# Model Training Script
Lindsay Fitzpatrick
ljob@umich.edu
12/18/2024

This script reads in CFSR data from 1979 - 2010 and trains machine learning models to target CNBS from L2SWBM across the 5 Great Lakes simultaeously.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic, ExpSineSquared
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
import calendar

## User Input

In [6]:
# This is the directory where you cloned the repo
path_to_repo = '/Users/fitzpatrick/Desktop/'

dir = path_to_repo + 'cnbs-predictor-1/data/input/'

## Functions

In [3]:
def seconds_in_month(year, month):
    # Number of days in the month
    num_days = calendar.monthrange(year, month)[1]
    # Convert days to seconds
    return num_days * 24 * 60 * 60

In [4]:
def shift_variables(df, lag=0, lead=0):
    """
    Create the variables columns to include lags (last month values) and lead variables
    
    Parameters:
    - df (pd.DataFrame): The DataFrame containing the time series data.
    - lag (int): The number of months you want to include lagged variables. Default = 0
    - lead (int): The number of months for the advance variables. Default = 0
    
    Returns:
    - pd.DataFrame: The DataFrame with added variable columns for lags and leading.
    """
    df = df.copy()  # To avoid modifying the original DataFrame

    new_columns = []  # List to store the new lag and lead columns

    # Generate target columns for the lag and lead months
    for column in df.columns:
        for lag_month in range(1, lag + 1):
            new_columns.append(df[column].shift(lag_month).rename(f'{column}_mo-{lag_month}'))
        for lead_month in range(1, lead + 1):
            new_columns.append(df[column].shift(-lead_month).rename(f'{column}_mo{lead_month}'))

    # Concatenate the new columns with the original DataFrame
    df = pd.concat([df] + new_columns, axis=1)

    # Drop rows with any NaN values generated by shifting for the target
    df = df.dropna()

    return df

## Begin Script

In [8]:
## Read in PCP data from CFSR [mm]
data_1 = pd.read_csv(dir+'training/CFSR_APCP_Basin_Avgs_05.csv',sep=',')

## Read in EVAP data from CFSR [mm]
data_2 = pd.read_csv(dir+'training/CFSR_EVAP_Basin_Avgs_031.csv',sep=',')

## Read in TMP data from CFSR [K]
data_3 = pd.read_csv(dir+'training/CFSR_TMP_Basin_Avgs_031.csv',sep=',')

Read in L2SWBM in [mm]

https://zenodo.org/records/13883098

In [9]:
sup_evap = pd.read_csv(dir + 'l2swbm/superiorEvap_MonthlyRun.csv')
sup_runoff = pd.read_csv(dir + 'l2swbm/superiorRunoff_MonthlyRun.csv')
sup_precip = pd.read_csv(dir + 'l2swbm/superiorPrecip_MonthlyRun.csv')

eri_evap = pd.read_csv(dir + 'l2swbm/erieEvap_MonthlyRun.csv')
eri_runoff = pd.read_csv(dir + 'l2swbm/erieRunoff_MonthlyRun.csv')
eri_precip = pd.read_csv(dir + 'l2swbm/eriePrecip_MonthlyRun.csv')

ont_evap = pd.read_csv(dir + 'l2swbm/ontarioEvap_MonthlyRun.csv')
ont_runoff = pd.read_csv(dir + 'l2swbm/ontarioRunoff_MonthlyRun.csv')
ont_precip = pd.read_csv(dir + 'l2swbm/ontarioPrecip_MonthlyRun.csv')

mih_evap = pd.read_csv(dir + 'l2swbm/miHuronEvap_MonthlyRun.csv')
mih_runoff = pd.read_csv(dir + 'l2swbm/miHuronRunoff_MonthlyRun.csv')
mih_precip = pd.read_csv(dir + 'l2swbm/miHuronPrecip_MonthlyRun.csv')

Here we prepare the data for training and testing. We set the features 'X' as total over lake
precipitation, total over lake evaporation, and the average air temperature over each lake. The
targets 'y' are RNBS for each lake simultaeously.

In [10]:
# Features
X = pd.DataFrame({
    'sup_pcp_w': data_1['sup_lake'],
    'eri_pcp_w': data_1['eri_lake'],
    'ont_pcp_w': data_1['ont_lake'],
    'mih_pcp_w': data_1['mih_lake'],
    'sup_pcp_l': data_1['sup_land'],
    'eri_pcp_l': data_1['eri_land'],
    'ont_pcp_l': data_1['ont_land'],
    'mih_pcp_l': data_1['mih_land'],
    'sup_evap_w': data_2['sup_lake'],
    'eri_evap_w': data_2['eri_lake'],
    'ont_evap_w': data_2['ont_lake'],
    'mih_evap_w': data_2['mih_lake'],
    'sup_evap_l': data_2['sup_land'],
    'eri_evap_l': data_2['eri_land'],
    'ont_evap_l': data_2['ont_land'],
    'mih_evap_l': data_2['mih_land'],
    'sup_tmp_w': data_3['sup_lake'],
    'eri_tmp_w': data_3['eri_lake'],
    'ont_tmp_w': data_3['ont_lake'],
    'mih_tmp_w': data_3['mih_lake'],
    'sup_tmp_l': data_3['sup_land'],
    'eri_tmp_l': data_3['eri_land'],
    'ont_tmp_l': data_3['ont_land'],
    'mih_tmp_l': data_3['mih_land']
})

# Set the index by date
X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets are the components of NBS (P, E, R)
targets = pd.DataFrame({
    'sup_evap_t': sup_evap['Median'],
    'sup_pcp_t': sup_precip['Median'],
    'sup_rnoff_t': sup_runoff['Median'],
    'eri_evap_t': eri_evap['Median'],
    'eri_pcp_t': eri_precip['Median'],
    'eri_rnoff_t': eri_runoff['Median'],
    'ont_evap_t': ont_evap['Median'],
    'ont_pcp_t': ont_precip['Median'],
    'ont_rnoff_t': ont_runoff['Median'],
    'mih_evap_t': mih_evap['Median'],
    'mih_pcp_t': mih_precip['Median'],
    'mih_rnoff_t': mih_runoff['Median']
})

# Set the index of the targets
targets.set_index(pd.to_datetime(eri_evap[['Year', 'Month']].assign(day=1)), inplace=True)

In [11]:
shifted_X = shift_variables(X ,lag=0, lead=0)

In [12]:
shifted_targets = shift_variables(targets, lag=0, lead=0)

In [13]:
# Make sure the indices/dates align after the shifts
aligned_y = shifted_targets.loc[shifted_X.index]

print(f'Number of Targets: {aligned_y.shape[1]}')
print(f'Number of Features: {shifted_X.shape[1]}')

Number of Targets: 12
Number of Features: 24


Split the data into training and testing data sets. We could do it as a random 80/20 split
but instead we set split the data set by date ranges. This can easily be adjusted.

In [14]:
# Split dataset by date ranges into training and testing sets
train_start_date = '1979-01-01'
train_end_date = '2004-12-01'
# Testing dataset
val_start_date = '2005-01-01'
val_end_date = '2011-01-01'

X_train = shifted_X[train_start_date:train_end_date]
y_train = aligned_y[train_start_date:train_end_date]
X_test = shifted_X[val_start_date:val_end_date]
y_test = aligned_y[val_start_date:val_end_date]

It is best practice to standardize the data from 0-1 before training

In [15]:
# Standardize the data
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.fit_transform(X_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.fit_transform(y_test)

Training

Below we train different models using the same data and calculate the r squared values on the 
test data to compare performance.

In [None]:
# Testing Different Kernels
# Basic kernel using ConstantKernel: r2 = 0.8259
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Matt's optimal kernel: 
# # MVP r2 = 0.8159
# # Geo mask r2: 0.8176
# # 
kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()

# Test to add a seasonality component: r2 = 0.8279
#period = 3.0  # Period of the season
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + ExpSineSquared(length_scale=1.0, periodicity=period, periodicity_bounds=(1e-2, 1e2))

#kernel = 1.0 * ExpSineSquared(periodicity=12)

#kernel = 1.0 * RBF() + 1.0 * Matern(nu=2.5) + 1.0 * RationalQuadratic()

# Set up the model
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=10, random_state=42)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
joblib.dump(gpr, dir +'GP_trained_model.joblib')
joblib.dump(x_scaler, dir + 'x_scaler.joblib')
joblib.dump(y_scaler, dir + 'y_scaler.joblib')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.8420223920088205
Mean Squared Error: 0.15797760799117966


In [None]:
## Random Forest Regressor Model:

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, dir + 'RF_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.5129044536132797
Mean Squared Error: 614.908055784716


In [None]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(144)  # **** Number of targets ****
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
joblib.dump(model,dir + 'NN_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - loss: 6831.1494 - val_loss: 6295.4204
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6645.6333 - val_loss: 6203.2036
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 6553.0889 - val_loss: 5992.6641
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 6358.4233 - val_loss: 5567.8203
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 5769.0596 - val_loss: 4821.2827
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 4925.3877 - val_loss: 3730.0469
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 3651.0073 - val_loss: 2587.1331
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2534.3660 - val_loss: 2005.3083
Epoch 9/50
[1m8/8[0m 