# Model Training Script
Lindsay Fitzpatrick
ljob@umich.edu
08/19/2024

This script reads in CFSR data from 1979 - 2010 and trains different machine learning
models to target RNBS from GLCC across the 5 Great Lakes simultaeously.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF, Matern, RationalQuadratic
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from sklearn.ensemble import RandomForestRegressor
import joblib
import calendar

## Functions

User Input

In [2]:
# This is the directory where the CFSR and GLCC files are located
dir = 'C:/Users/fitzpatrick/Desktop/Data/Input/'

In [5]:
def seconds_in_month(year, month):
    # Number of days in the month
    num_days = calendar.monthrange(year, month)[1]
    # Convert days to seconds
    return num_days * 24 * 60 * 60

In [18]:
def convert_kg_to_cms_cfsr(df):

    # Calculate the number of seconds for each month
    df['seconds'] = df.apply(lambda row: seconds_in_month(int(row['year']), int(row['month'])), axis=1)

    # Convert kg to meters cubed and divide by the seconds in the month
    df['WaterErie_m3'] = (df['WaterErie'] / 1000) / df['seconds']
    df['WaterOntario_m3'] = (df['WaterOntario'] / 1000) / df['seconds']
    df['WaterSuperior_m3'] = (df['WaterSuperior'] / 1000) / df['seconds']
    df['WaterMichHuron_m3'] = ((df['WaterMichigan'] + df['WaterHuron']) / 1000) / df['seconds']

    # Convert the data to cubic meters per second
    #df['WaterErie_cms'] = df['WaterErie_m3'] / df['seconds']
    #df['WaterOntario_cms'] = df['WaterOntario_m3'] / df['seconds']
    #df['WaterSuperior_cms'] = df['WaterSuperior_m3'] / df['seconds']
    #df['WaterMichHuron_cms'] = df['WaterMichHuron_m3'] / df['seconds']
    
    return df

In [20]:
def convert_mm_to_cms_l2(df, lake):

    # Define the lake surface areas directly within the function
    lake_sa_dict = {
        'Superior': 82097*1000000,
        'MichHuron': (57753 + 5956)*1000000,
        'Erie': 25655*1000000,
        'Ontario': 19009*1000000
    }
    
    # Get the surface area for the specified lake
    lake_sa = lake_sa_dict.get(lake, None)
    
    # Check if the lake surface area was found
    if lake_sa is None:
        raise ValueError(f"Lake '{lake}' is not recognized. Please provide a valid lake name, either 'Superior', 'Erie', 'Ontario', or 'MichHuron'.")
    
    # Calculate the number of seconds for each month
    df['seconds'] = df.apply(lambda row: seconds_in_month(int(row['Year']), int(row['Month'])), axis=1)

    # Convert millimeters to meters cubed and divide by seconds
    df['Median_cms'] = (df['Median'] / 1000) / df['seconds'] * lake_sa
    df['2.5_cms'] = (df['2.5 Percentile'] / 1000) / df['seconds'] * lake_sa
    df['97.5_cms'] = (df['97.5 Percentile'] / 1000) / df['seconds'] * lake_sa
    
    return df

## Begin Script

In [3]:
## Read in PCP data from CFSR [kg]
data_1 = pd.read_csv(dir+'CFSR_APCP_Basin_Sums.csv',sep=',')

## Read in EVAP data from CFSR [kg]
data_2 = pd.read_csv(dir+'CFSR_EVAP_Basin_Sums.csv',sep=',')

## Read in TMP data from CFSR [K]
data_3 = pd.read_csv(dir+'CFSR_TMP_Basin_Avgs.csv',sep=',')

In [19]:
# Convert Total Precipitation to cms
data_1 = convert_kg_to_cms_cfsr(data_1)

# Convert Total Evaporation to cms
data_2 = convert_kg_to_cms_cfsr(data_2)

     year  month     BasinErie     WaterErie      LandErie  BasinOntario  \
0    1979      1  1.869364e+12  1.286954e+12  7.145087e+12  1.032279e+13   
1    1979      2  1.102083e+12  7.439607e+11  4.435970e+12  4.059830e+12   
2    1979      3  2.140239e+12  1.344620e+12  8.363889e+12  5.413565e+12   
3    1979      4  2.926690e+12  2.000761e+12  1.089467e+13  8.373868e+12   
4    1979      5  2.524354e+12  1.604725e+12  9.559551e+12  6.477845e+12   
..    ...    ...           ...           ...           ...           ...   
379  2010      8  5.752132e+11  4.188771e+11  2.225233e+12  4.348570e+12   
380  2010      9  1.088025e+12  7.291454e+11  4.074307e+12  6.778341e+12   
381  2010     10  1.371390e+12  1.046386e+12  4.754692e+12  5.580925e+12   
382  2010     11  2.093816e+12  1.236743e+12  8.388468e+12  6.453425e+12   
383  2010     12  1.213677e+12  8.351696e+11  4.662361e+12  6.970965e+12   

     WaterOntario   LandOntario    BasinHuron    WaterHuron  ...  \
0    1.165247e+12  

Read in the GLCC RNBS CSVs file in [cms]

https://www.greatlakescc.org/en/coordinating-committee-products-and-datasets/#:~:text=Monthly%20Residual%20Net%20Basin%20Supplies%3A

In [8]:
sup_evap_file = pd.read_csv(dir + 'SupEvap_analysis19502022_prior19001969_1m.csv')
sup_runoff_file = pd.read_csv(dir + 'SupRunoff_analysis19502022_prior19001969_1m.csv')
sup_precip_file = pd.read_csv(dir + 'SupPrecip_analysis19502022_prior19001969_1m.csv')

eri_evap_file = pd.read_csv(dir + 'ErieEvap_analysis19502022_prior19001969_1m.csv')
eri_runoff_file = pd.read_csv(dir + 'ErieRunoff_analysis19502022_prior19001969_1m.csv')
eri_precip_file = pd.read_csv(dir + 'EriePrecip_analysis19502022_prior19001969_1m.csv')

ont_evap_file = pd.read_csv(dir + 'OntEvap_analysis19502022_prior19001969_1m.csv')
ont_runoff_file = pd.read_csv(dir + 'OntRunoff_analysis19502022_prior19001969_1m.csv')
ont_precip_file = pd.read_csv(dir + 'OntPrecip_analysis19502022_prior19001969_1m.csv')

mih_evap_file = pd.read_csv(dir + 'MiHurEvap_analysis19502022_prior19001969_1m.csv')
mih_runoff_file = pd.read_csv(dir + 'MiHurRunoff_analysis19502022_prior19001969_1m.csv')
mih_precip_file = pd.read_csv(dir + 'MiHurPrecip_analysis19502022_prior19001969_1m.csv')

In [11]:
print(sup_evap_file)

       Year  Month      Median  2.5 Percentile  97.5 Percentile
0    1950.0    1.0   67.245050       51.865840        84.204657
1    1950.0    2.0   35.728114       20.270061        50.587482
2    1950.0    3.0   12.926938       -2.109247        28.563223
3    1950.0    4.0   12.655983        3.428071        22.222697
4    1950.0    5.0   -3.229512       -9.853121         3.477014
..      ...    ...         ...             ...              ...
863  2021.0   12.0  112.903571       99.068128       126.926402
864  2022.0    1.0  139.083649      125.006839       153.776202
865  2022.0    2.0   68.063946       54.649119        81.100302
866  2022.0    3.0   39.705374       26.086656        53.065299
867  2022.0    4.0   12.252169        3.359856        21.576276

[868 rows x 5 columns]


In [22]:
sup_evap = convert_mm_to_cms_l2(sup_evap_file, 'Superior')
sup_runoff = convert_mm_to_cms_l2(sup_runoff_file, 'Superior')
sup_precip = convert_mm_to_cms_l2(sup_precip_file, 'Superior')

eri_evap = convert_mm_to_cms_l2(eri_evap_file, 'Erie')
eri_runoff = convert_mm_to_cms_l2(eri_runoff_file, 'Erie')
eri_precip = convert_mm_to_cms_l2(eri_precip_file, 'Erie')

ont_evap = convert_mm_to_cms_l2(ont_evap_file, 'Ontario')
ont_runoff = convert_mm_to_cms_l2(ont_runoff_file, 'Ontario')
ont_precip = convert_mm_to_cms_l2(ont_precip_file, 'Ontario')

mih_evap = convert_mm_to_cms_l2(mih_evap_file, 'MichHuron')
mih_runoff = convert_mm_to_cms_l2(mih_runoff_file, 'MichHuron')
mih_precip = convert_mm_to_cms_l2(mih_precip_file, 'MichHuron')

In [23]:
print(mih_evap)

       Year  Month    Median  2.5 Percentile  97.5 Percentile  seconds  \
0    1950.0    1.0   74.9800         61.8440          87.8480  2678400   
1    1950.0    2.0   47.2140         35.0710          58.9410  2419200   
2    1950.0    3.0   34.2850         21.9170          46.7320  2678400   
3    1950.0    4.0   13.0790          4.9639          21.0750  2592000   
4    1950.0    5.0   -1.3051         -7.6169           5.3237  2678400   
..      ...    ...       ...             ...              ...      ...   
863  2021.0   12.0  101.9500         91.1620         113.0600  2678400   
864  2022.0    1.0   97.0030         86.1040         107.4900  2678400   
865  2022.0    2.0   50.9510         40.9880          60.8990  2419200   
866  2022.0    3.0   28.0770         14.6670          41.8310  2678400   
867  2022.0    4.0    5.2800         -2.7979          14.0110  2592000   

      Median_cms      2.5_cms     97.5_cms  
0    1783.490450  1471.034721  2089.571473  
1    1243.368356   92

Here we prepare the data for training and testing. We set the features 'X' as total over lake
precipitation, total over lake evaporation, and the average air temperature over each lake. The
targets 'y' are RNBS for each lake simultaeously.

In [31]:
# Features
X = pd.DataFrame({
    'su_pcp': data_1['WaterSuperior_cms'],
    'er_pcp': data_1['WaterErie_cms'],
    'on_pcp': data_1['WaterOntario_cms'],
    'mh_pcp': data_1['WaterMichHuron_cms'], #data_1['WaterMichigan']+data_1['WaterHuron'], # add the sums
    'su_evap': data_2['WaterSuperior_cms'],
    'er_evap': data_2['WaterErie_cms'],
    'on_evap': data_2['WaterOntario_cms'],
    'mh_evap': data_2['WaterMichHuron_cms'], #data_2['WaterMichigan']+data_2['WaterHuron'], # add the sums
    'su_tmp': data_3['WaterSuperior'],
    'er_tmp': data_3['WaterErie'],
    'on_tmp': data_3['WaterOntario'],
    'mh_tmp': (data_3['WaterMichigan']+data_3['WaterHuron'])/2 # take the average temp
})

# Set the index by date
X.set_index(pd.to_datetime(data_1[['year', 'month']].assign(day=1)), inplace=True)

# Targets are the components of NBS (P, E, R)
targets = pd.DataFrame({
    'su_evap_y': sup_evap['Median_cms'],
    'su_precip': sup_precip['Median_cms'],
    'su_runoff': sup_runoff['Median_cms'],
    'er_evap_y': eri_evap['Median_cms'],
    'er_precip': eri_precip['Median_cms'],
    'er_runoff': eri_runoff['Median_cms'],
    'on_evap_y': ont_evap['Median_cms'],
    'on_precip': ont_precip['Median_cms'],
    'on_runoff': ont_runoff['Median_cms'],
    'mh_evap_y': mih_evap['Median_cms'],
    'mh_precip': mih_precip['Median_cms'],
    'mh_runoff': mih_runoff['Median_cms'],
})

# Set the index of the targets
targets.set_index(pd.to_datetime(eri_evap[['Year', 'Month']].assign(day=1)), inplace=True)

# Merge the features and targets to align with the dates
# Drops the dates where we don't have CFS data 
merged_df = pd.merge(X, targets, left_index=True, right_index=True, how='inner')

# Pull the target variables back out 
y = merged_df[['su_evap_y', 'su_precip', 'su_runoff', 'er_evap_y', 'er_precip', 'er_runoff',
               'on_evap_y', 'on_precip', 'on_runoff', 'mh_evap_y', 'mh_precip', 'mh_runoff']]

print(y)

              su_evap_y    su_precip    su_runoff    er_evap_y    er_precip  \
1979-01-01  2680.892922  1494.874063   257.166155   312.941786   786.010043   
1979-02-01   910.966970  1788.406043   281.665468    93.454255   655.479311   
1979-03-01   332.175362  3351.435924   682.915606   128.148602   739.937556   
1979-04-01   113.220853  1821.843920  2949.727473     0.291687  1099.047531   
1979-05-01   -68.105721  3272.661548  4323.088740    85.882325   825.281810   
...                 ...          ...          ...          ...          ...   
2010-08-01   685.641445  2311.430246   615.482288  1175.825652   425.954992   
2010-09-01  2307.301820  3786.534086  1019.244390  1840.633000   692.348476   
2010-10-01  2614.566215  1281.846080   856.096629  1781.961770   795.875877   
2010-11-01  3466.873675  2140.159834   862.461925  1370.139640   745.499460   
2010-12-01  4341.156823  1239.853513   629.275467   992.495179   446.836078   

              er_runoff   on_evap_y   on_precip    

Split the data into training and testing data sets. We could do it as a random 80/20 split
but instead we set split the data set by date ranges. This can easily be adjusted.

In [32]:
# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_start_date = '1979-01-01'
train_end_date = '2004-12-01'
# Testing dataset
val_start_date = '2005-01-01'
val_end_date = '2011-01-01'

X_train = X[train_start_date:train_end_date]
y_train = y[train_start_date:train_end_date]
X_test = X[val_start_date:val_end_date]
y_test = y[val_start_date:val_end_date]

In [33]:
# Verify shapes
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (312, 12)
Shape of y_train: (312, 12)
Shape of X_test: (72, 12)
Shape of y_test: (72, 12)


It is best practice to standardize the data from 0-1 before training

In [34]:
# Standardize the data
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_test_scaled = x_scaler.fit_transform(X_test)
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.fit_transform(y_test)

print(X_train_scaled.shape)
print(X_test_scaled.shape)

(312, 12)
(72, 12)


## Training
Below we train different models using the same data and calculate the r squared values on the 
test data to compare performance.

In [35]:
# Testing Different Kernels
# Basic kernel using ConstantKernel: r2 = 0.8259
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))

# Matt's optimal kernel: r2 = 0.8321
kernel = 1.0 * Matern(nu=1.5) * RationalQuadratic()

# Test to add a seasonality component: r2 = 0.8279
#period = 3.0  # Period of the season
#kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e2)) + ExpSineSquared(length_scale=1.0, periodicity=period, periodicity_bounds=(1e-2, 1e2))

# Set up the model
gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.1, n_restarts_optimizer=10, random_state=42)

# Fit the model
gpr.fit(X_train_scaled, y_train_scaled)

# Save the trained model
joblib.dump(gpr, 'GP_trained_model.joblib')
joblib.dump(x_scaler, 'x_scaler.joblib')
joblib.dump(y_scaler, 'y_scaler.joblib')

# Predictions
y_pred, sigma = gpr.predict(X_test_scaled, return_std=True)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test_scaled, y_pred)
r_squared = r2_score(y_test_scaled, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.8280261388798332
Mean Squared Error: 0.1719738611201668


In [36]:
## Random Forest Regressor Model: r2 = 0.7389

# Initialize RandomForestRegressor
model = RandomForestRegressor(random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'RF_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

The r squared value for the model is 0.6900094629075637
Mean Squared Error: 120295.37954958137


In [40]:
## Neural Network: r2 = 0.4002

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(12)  # Number of targets
])

# Compile the model
model.compile(optimizer='adam', loss='mse')  # Using mean squared error (mse) as the loss function

# Fit the model to the training data
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Save the trained model
joblib.dump(model,'NN_trained_model.joblib')

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print(f"The r squared value for the model is {r_squared}")
print(f"Mean Squared Error: {mse}")

Epoch 1/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1916398.7500 - val_loss: 1924345.6250
Epoch 2/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1904733.6250 - val_loss: 1923514.3750
Epoch 3/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1846397.5000 - val_loss: 1922388.6250
Epoch 4/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1930362.6250 - val_loss: 1920776.1250
Epoch 5/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1859909.2500 - val_loss: 1918465.6250
Epoch 6/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1890100.2500 - val_loss: 1915144.3750
Epoch 7/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1890050.3750 - val_loss: 1910493.0000
Epoch 8/50
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1917566.3750 