# Model Training Script
Lindsay Fitzpatrick
ljob@umich.edu
08/19/2024

This script reads in CFSR data from 1979 - 2010 and trains different machine learning
models to target RNBS from GLCC across the 5 Great Lakes simultaeously.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic, ExpSineSquared
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import joblib
import calendar

## Functions

User Input

In [4]:
# This is the directory where the CFSR and GLCC files are located
dir = 'C:/Users/fitzpatrick/Desktop/Data/Input/'

In [5]:
def seconds_in_month(year, month):
    # Number of days in the month
    num_days = calendar.monthrange(year, month)[1]
    # Convert days to seconds
    return num_days * 24 * 60 * 60

In [6]:
def convert_kg_to_cms_cfsr(df):

    # Calculate the number of seconds for each month
    df['seconds'] = df.apply(lambda row: seconds_in_month(int(row['year']), int(row['month'])), axis=1)

    # Convert kg to meters cubed and divide by the seconds in the month
    df['WaterErie_cms'] = (df['WaterErie'] / 1000) / df['seconds']
    df['WaterOntario_cms'] = (df['WaterOntario'] / 1000) / df['seconds']
    df['WaterSuperior_cms'] = (df['WaterSuperior'] / 1000) / df['seconds']
    df['WaterMichHuron_cms'] = ((df['WaterMichigan'] + df['WaterHuron']) / 1000) / df['seconds']
    df['LandErie_cms'] = (df['LandErie'] / 1000) / df['seconds']
    df['LandOntario_cms'] = (df['LandOntario'] / 1000) / df['seconds']
    df['LandSuperior_cms'] = (df['LandSuperior'] / 1000) / df['seconds']
    df['LandMichHuron_cms'] = ((df['LandMichigan'] + df['LandHuron']) / 1000) / df['seconds']
    
    return df

In [7]:
def convert_mm_to_cms_l2(df, lake):

    # Define the lake surface areas directly within the function
    lake_sa_dict = {
        'Superior': 82097*1000000,
        'MichHuron': (57753 + 5956)*1000000,
        'Erie': 25655*1000000,
        'Ontario': 19009*1000000
    }
    
    # Get the surface area for the specified lake
    lake_sa = lake_sa_dict.get(lake, None)
    
    # Check if the lake surface area was found
    if lake_sa is None:
        raise ValueError(f"Lake '{lake}' is not recognized. Please provide a valid lake name, either 'Superior', 'Erie', 'Ontario', or 'MichHuron'.")
    
    # Calculate the number of seconds for each month
    df['seconds'] = df.apply(lambda row: seconds_in_month(int(row['Year']), int(row['Month'])), axis=1)

    # Convert millimeters to meters cubed and divide by seconds
    df['Median_cms'] = (df['Median'] / 1000) / df['seconds'] * lake_sa
    df['2.5_cms'] = (df['2.5 Percentile'] / 1000) / df['seconds'] * lake_sa
    df['97.5_cms'] = (df['97.5 Percentile'] / 1000) / df['seconds'] * lake_sa
    
    return df

In [8]:
def create_lagged_variables(df, lags):
    """
    Create lagged variables for specified columns in a DataFrame.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame containing the time series data.
    - columns (list of str): The list of column names for which to create lagged variables.
    - lags (int): The number of lagged variables to create.
    
    Returns:
    - pd.DataFrame: A DataFrame with the original columns and the new lagged variables.
    """
    
    df = df.copy()  # To avoid modifying the original DataFrame
    
    for column in df.columns:
        for lag in range(1, lags + 1):
            df[f'{column}_lag{lag}'] = df[column].shift(lag)

    # Drop rows with any NaN values
    df = df.dropna()
    
    return df

## Begin Script

In [9]:
## Read in PCP data from CFSR [kg]
data_1 = pd.read_csv(dir+'CFSR_APCP_Basin_Sums.csv',sep=',')

## Read in EVAP data from CFSR [kg]
data_2 = pd.read_csv(dir+'CFSR_EVAP_Basin_Sums.csv',sep=',')

## Read in TMP data from CFSR [K]
data_3 = pd.read_csv(dir+'CFSR_TMP_Basin_Avgs.csv',sep=',')

In [10]:
# Convert Total Precipitation to cms
data_1 = convert_kg_to_cms_cfsr(data_1)

# Convert Total Evaporation to cms
data_2 = convert_kg_to_cms_cfsr(data_2)

Read in the GLCC RNBS CSVs file in [cms]

https://www.greatlakescc.org/en/coordinating-committee-products-and-datasets/#:~:text=Monthly%20Residual%20Net%20Basin%20Supplies%3A

In [11]:
sup_evap_file = pd.read_csv(dir + 'SupEvap_analysis19502022_prior19001969_1m.csv')
sup_runoff_file = pd.read_csv(dir + 'SupRunoff_analysis19502022_prior19001969_1m.csv')
sup_precip_file = pd.read_csv(dir + 'SupPrecip_analysis19502022_prior19001969_1m.csv')

eri_evap_file = pd.read_csv(dir + 'ErieEvap_analysis19502022_prior19001969_1m.csv')
eri_runoff_file = pd.read_csv(dir + 'ErieRunoff_analysis19502022_prior19001969_1m.csv')
eri_precip_file = pd.read_csv(dir + 'EriePrecip_analysis19502022_prior19001969_1m.csv')

ont_evap_file = pd.read_csv(dir + 'OntEvap_analysis19502022_prior19001969_1m.csv')
ont_runoff_file = pd.read_csv(dir + 'OntRunoff_analysis19502022_prior19001969_1m.csv')
ont_precip_file = pd.read_csv(dir + 'OntPrecip_analysis19502022_prior19001969_1m.csv')

mih_evap_file = pd.read_csv(dir + 'MiHurEvap_analysis19502022_prior19001969_1m.csv')
mih_runoff_file = pd.read_csv(dir + 'MiHurRunoff_analysis19502022_prior19001969_1m.csv')
mih_precip_file = pd.read_csv(dir + 'MiHurPrecip_analysis19502022_prior19001969_1m.csv')

In [12]:
print(sup_evap_file)

       Year  Month      Median  2.5 Percentile  97.5 Percentile
0    1950.0    1.0   67.245050       51.865840        84.204657
1    1950.0    2.0   35.728114       20.270061        50.587482
2    1950.0    3.0   12.926938       -2.109247        28.563223
3    1950.0    4.0   12.655983        3.428071        22.222697
4    1950.0    5.0   -3.229512       -9.853121         3.477014
..      ...    ...         ...             ...              ...
863  2021.0   12.0  112.903571       99.068128       126.926402
864  2022.0    1.0  139.083649      125.006839       153.776202
865  2022.0    2.0   68.063946       54.649119        81.100302
866  2022.0    3.0   39.705374       26.086656        53.065299
867  2022.0    4.0   12.252169        3.359856        21.576276

[868 rows x 5 columns]


In [13]:
sup_evap = convert_mm_to_cms_l2(sup_evap_file, 'Superior')
sup_runoff = convert_mm_to_cms_l2(sup_runoff_file, 'Superior')
sup_precip = convert_mm_to_cms_l2(sup_precip_file, 'Superior')

eri_evap = convert_mm_to_cms_l2(eri_evap_file, 'Erie')
eri_runoff = convert_mm_to_cms_l2(eri_runoff_file, 'Erie')
eri_precip = convert_mm_to_cms_l2(eri_precip_file, 'Erie')

ont_evap = convert_mm_to_cms_l2(ont_evap_file, 'Ontario')
ont_runoff = convert_mm_to_cms_l2(ont_runoff_file, 'Ontario')
ont_precip = convert_mm_to_cms_l2(ont_precip_file, 'Ontario')

mih_evap = convert_mm_to_cms_l2(mih_evap_file, 'MichHuron')
mih_runoff = convert_mm_to_cms_l2(mih_runoff_file, 'MichHuron')
mih_precip = convert_mm_to_cms_l2(mih_precip_file, 'MichHuron')

Here we prepare the data for training and testing. We set the features 'X' as total over lake
precipitation, total over lake evaporation, and the average air temperature over each lake. The
targets 'y' are RNBS for each lake simultaeously.

In [14]:
# Features
X = pd.DataFrame({
    'su_pcp_w': data_1['WaterSuperior_cms'],
    'er_pcp_w': data_1['WaterErie_cms'],
    'on_pcp_w': data_1['WaterOntario_cms'],
    'mh_pcp_w': data_1['WaterMichHuron_cms'], #data_1['WaterMichigan']+data_1['WaterHuron'], # add the sums
    'su_pcp_l': data_1['LandSuperior_cms'],
    'er_pcp_l': data_1['LandErie_cms'],
    'on_pcp_l': data_1['LandOntario_cms'],
    'mh_pcp_l': data_1['LandMichHuron_cms'],
    'su_evap_w': data_2['WaterSuperior_cms'],
    'er_evap_w': data_2['WaterErie_cms'],
    'on_evap_w': data_2['WaterOntario_cms'],
    'mh_evap_w': data_2['WaterMichHuron_cms'], #data_2['WaterMichigan']+data_2['WaterHuron'], # add the sums
    'su_evap_l': data_2['LandSuperior_cms'],
    'er_evap_l': data_2['LandErie_cms'],
    'on_evap_l': data_2['LandOntario_cms'],
    'mh_evap_l': data_2['LandMichHuron_cms'],
    'su_tmp_w': data_3['WaterSuperior'],
    'er_tmp_w': data_3['WaterErie'],
    'on_tmp_w': data_3['WaterOntario'],
    'mh_tmp_w': (data_3['WaterMichigan']+data_3['WaterHuron'])/2,
    'su_tmp_l': data_3['LandSuperior'],
    'er_tmp_l': data_3['LandErie'],
    'on_tmp_l': data_3['LandOntario'],
    'mh_tmp_l': (data_3['LandMichigan']+data_3['LandHuron'])/2 # take the average temp
})

# Set the index by date
X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets are the components of NBS (P, E, R)
targets = pd.DataFrame({
    'su_evap_y': sup_evap['Median_cms'],
    'su_precip': sup_precip['Median_cms'],
    'su_runoff': sup_runoff['Median_cms'],
    'er_evap_y': eri_evap['Median_cms'],
    'er_precip': eri_precip['Median_cms'],
    'er_runoff': eri_runoff['Median_cms'],
    'on_evap_y': ont_evap['Median_cms'],
    'on_precip': ont_precip['Median_cms'],
    'on_runoff': ont_runoff['Median_cms'],
    'mh_evap_y': mih_evap['Median_cms'],
    'mh_precip': mih_precip['Median_cms'],
    'mh_runoff': mih_runoff['Median_cms'],
})

# Set the index of the targets
targets.set_index(pd.to_datetime(eri_evap[['Year', 'Month']].assign(day=1)), inplace=True)

In [19]:
lagged_X = create_lagged_variables(X, lags=1)
print (lagged_X)

               su_pcp_w    er_pcp_w    on_pcp_w     mh_pcp_w     su_pcp_l  \
1979-02-01  1725.202776  307.523427  174.595630  1647.644403  2994.613464   
1979-03-01  2985.410905  502.023507  216.720716  3366.383104  6179.396933   
1979-04-01  1833.850353  771.898492  361.687857  3439.840263  3900.078255   
1979-05-01  2518.523016  599.135634  264.458043  2364.981209  5220.389769   
1979-06-01  2927.137756  395.267754  202.659087  1864.819393  5602.199907   
...                 ...         ...         ...          ...          ...   
2010-08-01  1537.159708  156.390801  133.331169  1160.518649  3422.201604   
2010-09-01  2935.872420  281.306113  243.609570  2866.455046  7201.816009   
2010-10-01  1620.451353  390.675933  200.661594  1651.151799  3658.468471   
2010-11-01  2178.428373  477.138485  278.313391  2092.985524  4640.400044   
2010-12-01  1106.622457  311.816611  284.541208  2060.855935  2527.814144   

               er_pcp_l     on_pcp_l      mh_pcp_l    su_evap_w   er_evap_w

In [20]:
# Merge the features and targets to align with the dates
# Drops the dates where we don't have CFS data 
merged_df = pd.merge(lagged_X, targets, left_index=True, right_index=True, how='inner')

# Pull the target variables back out 
y = merged_df[['su_evap_y', 'su_precip', 'su_runoff', 'er_evap_y', 'er_precip', 'er_runoff',
               'on_evap_y', 'on_precip', 'on_runoff', 'mh_evap_y', 'mh_precip', 'mh_runoff']]

print(X)

               su_pcp_w    er_pcp_w    on_pcp_w     mh_pcp_w     su_pcp_l  \
1979-01-01  1418.815517  480.493648  435.053300  2731.419734  1908.337758   
1979-02-01  1725.202776  307.523427  174.595630  1647.644403  2994.613464   
1979-03-01  2985.410905  502.023507  216.720716  3366.383104  6179.396933   
1979-04-01  1833.850353  771.898492  361.687857  3439.840263  3900.078255   
1979-05-01  2518.523016  599.135634  264.458043  2364.981209  5220.389769   
...                 ...         ...         ...          ...          ...   
2010-08-01  1537.159708  156.390801  133.331169  1160.518649  3422.201604   
2010-09-01  2935.872420  281.306113  243.609570  2866.455046  7201.816009   
2010-10-01  1620.451353  390.675933  200.661594  1651.151799  3658.468471   
2010-11-01  2178.428373  477.138485  278.313391  2092.985524  4640.400044   
2010-12-01  1106.622457  311.816611  284.541208  2060.855935  2527.814144   

               er_pcp_l     on_pcp_l      mh_pcp_l    su_evap_w   er_evap_w

Split the data into training and testing data sets. We could do it as a random 80/20 split
but instead we set split the data set by date ranges. This can easily be adjusted.

In [21]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(lagged_X, y, test_size=0.2, random_state=42)
#train_start_date = '1979-01-01'
#train_end_date = '2004-12-01'
# Testing dataset
#val_start_date = '2005-01-01'
#val_end_date = '2011-01-01'

#X_train = X[train_start_date:train_end_date]
#y_train = y[train_start_date:train_end_date]
#X_test = X[val_start_date:val_end_date]
#y_test = y[val_start_date:val_end_date]

In [22]:
# Verify shapes
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (306, 48)
Shape of y_train: (306, 12)
Shape of X_test: (77, 48)
Shape of y_test: (77, 12)


It is best practice to standardize the data from 0-1 before training

In [23]:
# Standardize the data
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.fit_transform(X_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.fit_transform(y_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(306, 48)
(77, 48)


## Training
Below we train different models using the same data and calculate the r squared values on the 
test data to compare performance.

In [28]:
# Testing Different Kernels
# Basic kernel using ConstantKernel: r2 = 0.8259
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Matt's optimal kernel: r2 = 0.8321
#kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()

# Test to add a seasonality component: r2 = 0.8279
#period = 3.0  # Period of the season
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + ExpSineSquared(length_scale=1.0, periodicity=period, periodicity_bounds=(1e-2, 1e2))

#kernel = 1.0 * ExpSineSquared(periodicity=12)

kernel = 1.0 * RBF() + 1.0 * Matern(nu=2.5) + 1.0 * RationalQuadratic()

# Set up the model
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=10, random_state=42)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
#joblib.dump(gpr, 'GP_trained_model.joblib')
#joblib.dump(x_scaler, 'x_scaler.joblib')
#joblib.dump(y_scaler, 'y_scaler.joblib')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.8083878321974806
Mean Squared Error: 0.1916121678025194




In [30]:
## Random Forest Regressor Model: r2 = 0.7389

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'RF_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.6781358056116463
Mean Squared Error: 106837.73060150853


AttributeError: 'RandomForestRegressor' object has no attribute 'coef_'

In [38]:
## Linear Regression Model 

# Initialize Linear Regression model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'LR_trained_model.joblib')

# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

# Get feature names from the DataFrame
feature_names = lagged_X.columns

# Get coefficients
coefficients = model.coef_

# Create a DataFrame for better visualization
importance_df = pd.DataFrame()

# Iterate through each target variable and store its coefficients
for i, target in enumerate(y.columns):
    target_importance_df = pd.DataFrame({
        'Feature': lagged_X.columns,
        'Coefficient': coefficients[i]  # Get coefficients for the i-th target
    })
    target_importance_df['Absolute Importance'] = target_importance_df['Coefficient'].abs()
    target_importance_df = target_importance_df.sort_values(by='Absolute Importance', ascending=False)
    target_importance_df['Target'] = target  # Add the target variable name
    importance_df = pd.concat([importance_df, target_importance_df], ignore_index=True)

# Print ranked features for each target
print(importance_df[['Target', 'Feature', 'Coefficient', 'Absolute Importance']])

The r squared value for the model is 0.776036270114854
Mean Squared Error: 78431.12444419645
        Target         Feature  Coefficient  Absolute Importance
0    su_evap_y   on_tmp_w_lag1  -286.617750           286.617750
1    su_evap_y   er_tmp_w_lag1   189.563720           189.563720
2    su_evap_y   on_tmp_l_lag1   180.238058           180.238058
3    su_evap_y   er_tmp_l_lag1  -135.715438           135.715438
4    su_evap_y        er_tmp_w   -90.550113            90.550113
..         ...             ...          ...                  ...
571  mh_runoff  mh_evap_l_lag1    -0.044336             0.044336
572  mh_runoff   su_pcp_w_lag1    -0.042586             0.042586
573  mh_runoff   er_pcp_l_lag1     0.035229             0.035229
574  mh_runoff   su_pcp_l_lag1     0.004302             0.004302
575  mh_runoff       on_evap_l    -0.001247             0.001247

[576 rows x 4 columns]


In [27]:
## Neural Network: r2 = 0.4002

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(12)  # Number of targets
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
joblib.dump(model,'NN_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1869960.5000 - val_loss: 1924885.3750
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1916225.1250 - val_loss: 1924183.1250
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1858305.2500 - val_loss: 1923263.7500
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1906919.0000 - val_loss: 1921910.0000
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1808742.3750 - val_loss: 1919836.3750
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1861943.6250 - val_loss: 1916625.0000
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1873305.7500 - val_loss: 1911846.3750
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1854955.8750 