# Model Training Script
Lindsay Fitzpatrick
ljob@umich.edu
12/18/2024

This script reads in CFSR data from 1979 - 2010 and trains machine learning models to target CNBS from L2SWBM across the 5 Great Lakes simultaeously.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic, ExpSineSquared
from sklearn.metrics import mean_squared_error, r2_score
#import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
import calendar

## Functions

User Input

In [3]:
# This is the directory where the CFSR and L2SWBM files are located
dir = 'C:/Users/fitzpatrick/Desktop/Data/'

In [4]:
def seconds_in_month(year, month):
    # Number of days in the month
    num_days = calendar.monthrange(year, month)[1]
    # Convert days to seconds
    return num_days * 24 * 60 * 60

In [5]:
def shift_variables(df, lag=0, forecast=0):
    """
    Create the variables columns to include lags (last month values) and forecast variables
    
    Parameters:
    - df (pd.DataFrame): The DataFrame containing the time series data.
    - lag (int): The number of months you want to include lagged variables. Default = 0
    - forecast (int): The number of months for the forecast variables. Default = 0
    
    Returns:
    - pd.DataFrame: The DataFrame with added variable columns for lags and forecasting.
    """
    df = df.copy()  # To avoid modifying the original DataFrame

    # Generate target columns for the next `forecast_months` months
    for column in df.columns:
        for lag in range(1, lag + 1):
            df[f'{column}_mo-{lag}'] = df[column].shift(lag)
        for month in range(1, forecast+1):
            df[f'{column}_mo{month}'] = df[column].shift(-month)

    # Drop rows with any NaN values generated by shifting for the target
    df = df.dropna()

    return df

## Begin Script

In [6]:
## Read in PCP data from CFSR [mm]
data_1 = pd.read_csv(dir+'CFSR/CFSR_APCP_Basin_Avgs_05.csv',sep=',')

## Read in EVAP data from CFSR [mm]
data_2 = pd.read_csv(dir+'CFSR/CFSR_EVAP_Basin_Avgs_031.csv',sep=',')

## Read in TMP data from CFSR [K]
data_3 = pd.read_csv(dir+'CFSR/CFSR_TMP_Basin_Avgs_031.csv',sep=',')

## Read in SWE data from CFSR [mm]
data_4 = pd.read_csv(dir+'CFSR/CFSR_SWE_Basin_Avgs_031.csv',sep=',')

## Read in SST data from CFSR [K]
data_5 = pd.read_csv(dir+'CFSR/CFSR_STMP_Basin_Avgs_031.csv',sep=',')

Read in L2SWBM in [mm]

https://zenodo.org/records/13883098

In [7]:
sup_evap = pd.read_csv(dir + 'L2SWBM/superiorEvap_MonthlyRun.csv')
sup_runoff = pd.read_csv(dir + 'L2SWBM/superiorRunoff_MonthlyRun.csv')
sup_precip = pd.read_csv(dir + 'L2SWBM/superiorPrecip_MonthlyRun.csv')

eri_evap = pd.read_csv(dir + 'L2SWBM/erieEvap_MonthlyRun.csv')
eri_runoff = pd.read_csv(dir + 'L2SWBM/erieRunoff_MonthlyRun.csv')
eri_precip = pd.read_csv(dir + 'L2SWBM/eriePrecip_MonthlyRun.csv')

ont_evap = pd.read_csv(dir + 'L2SWBM/ontarioEvap_MonthlyRun.csv')
ont_runoff = pd.read_csv(dir + 'L2SWBM/ontarioRunoff_MonthlyRun.csv')
ont_precip = pd.read_csv(dir + 'L2SWBM/ontarioPrecip_MonthlyRun.csv')

mih_evap = pd.read_csv(dir + 'L2SWBM/miHuronEvap_MonthlyRun.csv')
mih_runoff = pd.read_csv(dir + 'L2SWBM/miHuronRunoff_MonthlyRun.csv')
mih_precip = pd.read_csv(dir + 'L2SWBM/miHuronPrecip_MonthlyRun.csv')

Here we prepare the data for training and testing. We set the features 'X' as total over lake
precipitation, total over lake evaporation, and the average air temperature over each lake. The
targets 'y' are RNBS for each lake simultaeously.

In [8]:
# Features
X = pd.DataFrame({
    'sup_pcp_w': data_1['sup_lake'],
    'eri_pcp_w': data_1['eri_lake'],
    'ont_pcp_w': data_1['ont_lake'],
    'mih_pcp_w': data_1['mih_lake'],
    'sup_pcp_l': data_1['sup_land'],
    'eri_pcp_l': data_1['eri_land'],
    'ont_pcp_l': data_1['ont_land'],
    'mih_pcp_l': data_1['mih_land'],
    'sup_evap_w': data_2['sup_lake'],
    'eri_evap_w': data_2['eri_lake'],
    'ont_evap_w': data_2['ont_lake'],
    'mih_evap_w': data_2['mih_lake'],
    'sup_evap_l': data_2['sup_land'],
    'eri_evap_l': data_2['eri_land'],
    'ont_evap_l': data_2['ont_land'],
    'mih_evap_l': data_2['mih_land'],
    'sup_tmp_w': data_3['sup_lake'],
    'eri_tmp_w': data_3['eri_lake'],
    'ont_tmp_w': data_3['ont_lake'],
    'mih_tmp_w': data_3['mih_lake'],
    'sup_tmp_l': data_3['sup_land'],
    'eri_tmp_l': data_3['eri_land'],
    'ont_tmp_l': data_3['ont_land'],
    'mih_tmp_l': data_3['mih_land']
})

# Set the index by date
X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets are the components of NBS (P, E, R)
targets = pd.DataFrame({
    'sup_evap_t': sup_evap['Median'],
    'sup_pcp_t': sup_precip['Median'],
    'sup_rnoff_t': sup_runoff['Median'],
    'eri_evap_t': eri_evap['Median'],
    'eri_pcp_t': eri_precip['Median'],
    'eri_rnoff_t': eri_runoff['Median'],
    'ont_evap_t': ont_evap['Median'],
    'ont_pcp_t': ont_precip['Median'],
    'ont_rnoff_t': ont_runoff['Median'],
    'mih_evap_t': mih_evap['Median'],
    'mih_pcp_t': mih_precip['Median'],
    'mih_rnoff_t': mih_runoff['Median'],
})

# Set the index of the targets
targets.set_index(pd.to_datetime(eri_evap[['Year', 'Month']].assign(day=1)), inplace=True)

In [9]:
shifted_X = shift_variables(X ,lag=0, forecast=8)

  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{co

In [10]:
shifted_targets = shift_variables(targets, lag=0, forecast=11)

  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{column}_mo{month}'] = df[column].shift(-month)
  df[f'{co

In [11]:
# Current Basin conditions
X2 = pd.DataFrame({
    'sup_swe_l': data_4['sup_land'],
    'eri_swe_l': data_4['eri_land'],
    'ont_swe_l': data_4['ont_land'],
    'mih_swe_l': data_4['mih_land'],
    'mih_sst_w': data_5['mih_lake'],
    'eri_sst_w': data_5['eri_lake'],
    'ont_sst_w': data_5['ont_lake'],
    'mih_sst_w': data_5['mih_lake'],
})

# Set the index by date
X2.set_index(pd.to_datetime(data_4[['year', 'month']].assign(day=1)), inplace=True)

In [12]:
shifted_X2 = shift_variables(X2, lag=0, forecast=0)

In [13]:
merged_X = pd.merge(shifted_X, shifted_X2, left_index=True, right_index=True, how='inner')

In [17]:
# Make sure the indices/dates align after the shifts
aligned_y = shifted_targets.loc[shifted_X.index]

print(f'Number of Targets: {aligned_y.shape[1]}')
print(f'Number of Features: {merged_X.shape[1]}')

Number of Targets: 144
Number of Features: 223


Split the data into training and testing data sets. We could do it as a random 80/20 split
but instead we set split the data set by date ranges. This can easily be adjusted.

In [218]:
# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(lagged_X, y, test_size=0.2, random_state=42)
train_start_date = '1979-01-01'
train_end_date = '2004-12-01'
# Testing dataset
val_start_date = '2005-01-01'
val_end_date = '2011-01-01'

#X_train = shifted_X[train_start_date:train_end_date]
X_train = merged_X[train_start_date:train_end_date]
y_train = aligned_y[train_start_date:train_end_date]
#X_test = shifted_X[val_start_date:val_end_date]
X_test = merged_X[val_start_date:val_end_date]
y_test = aligned_y[val_start_date:val_end_date]

In [219]:
# Verify shapes
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (312, 223)
Shape of y_train: (312, 144)
Shape of X_test: (64, 223)
Shape of y_test: (64, 144)


It is best practice to standardize the data from 0-1 before training

In [220]:
# Standardize the data
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.fit_transform(X_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.fit_transform(y_test)

(312, 223)
(64, 223)


## Training
Below we train different models using the same data and calculate the r squared values on the 
test data to compare performance.

In [221]:
# Testing Different Kernels
# Basic kernel using ConstantKernel: r2 = 0.8259
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Matt's optimal kernel: 
# # MVP r2 = 0.8159
# # Geo mask r2: 0.8176
# # 
kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()

# Test to add a seasonality component: r2 = 0.8279
#period = 3.0  # Period of the season
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + ExpSineSquared(length_scale=1.0, periodicity=period, periodicity_bounds=(1e-2, 1e2))

#kernel = 1.0 * ExpSineSquared(periodicity=12)

#kernel = 1.0 * RBF() + 1.0 * Matern(nu=2.5) + 1.0 * RationalQuadratic()

# Set up the model
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=10, random_state=42)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
#joblib.dump(gpr, 'GP_trained_model.joblib')
#joblib.dump(x_scaler, 'x_scaler.joblib')
#joblib.dump(y_scaler, 'y_scaler.joblib')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.7427189240305423
Mean Squared Error: 0.2572810759694576




In [44]:
## Random Forest Regressor Model: r2 = 0.7389

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'RF_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.6928700987238622
Mean Squared Error: 388.8125664945563


In [46]:
## Linear Regression Model 

# Initialize Linear Regression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'LR_trained_model.joblib')

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

# Get feature names from the DataFrame
feature_names = lagged_X.columns

# Get coefficients
coefficients = model.coef_

# Create a DataFrame for better visualization
importance_df = pd.DataFrame()

# Iterate through each target variable and store its coefficients
for i, target in enumerate(y.columns):
    target_importance_df = pd.DataFrame({
        'Feature': lagged_X.columns,
        'Coefficient': coefficients[i]  # Get coefficients for the i-th target
    })
    target_importance_df['Absolute Importance'] = target_importance_df['Coefficient'].abs()
    target_importance_df = target_importance_df.sort_values(by='Absolute Importance', ascending=False)
    target_importance_df['Target'] = target  # Add the target variable name
    importance_df = pd.concat([importance_df, target_importance_df], ignore_index=True)

# Set display option to show all rows
pd.set_option('display.max_rows', None)

# Print ranked features for each target
print(importance_df[['Target', 'Feature', 'Coefficient', 'Absolute Importance']])

The r squared value for the model is 0.7789350664714867
Mean Squared Error: 307.8098635510663
        Target         Feature  Coefficient  Absolute Importance
0    su_evap_y   er_tmp_w_lag1     6.193755             6.193755
1    su_evap_y   mh_tmp_w_lag1    -3.811819             3.811819
2    su_evap_y   er_tmp_l_lag1    -3.754319             3.754319
3    su_evap_y   su_tmp_w_lag1     3.658583             3.658583
4    su_evap_y   mh_tmp_l_lag1     3.413491             3.413491
5    su_evap_y        er_tmp_w    -2.474675             2.474675
6    su_evap_y        er_tmp_l    -2.022807             2.022807
7    su_evap_y   on_tmp_l_lag1    -1.640946             1.640946
8    su_evap_y        on_tmp_l     1.601433             1.601433
9    su_evap_y   on_tmp_w_lag1    -1.464392             1.464392
10   su_evap_y        su_tmp_l     1.430730             1.430730
11   su_evap_y        su_tmp_w    -1.394511             1.394511
12   su_evap_y   su_tmp_l_lag1     0.985434             0.985

In [27]:
## Neural Network: r2 = 0.4002

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(12)  # Number of targets
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
joblib.dump(model,'NN_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1869960.5000 - val_loss: 1924885.3750
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1916225.1250 - val_loss: 1924183.1250
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1858305.2500 - val_loss: 1923263.7500
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1906919.0000 - val_loss: 1921910.0000
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1808742.3750 - val_loss: 1919836.3750
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1861943.6250 - val_loss: 1916625.0000
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1873305.7500 - val_loss: 1911846.3750
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1854955.8750 