## TITLE: Train Models
### AUTHOR: Harrison
### DATE: 2025-04-23
##### DESCRIPTION: Trains Multiple Linear Regression and LSTM model on CAMELS data for discharge prediction


In [None]:
import os
import logging
import sys
from pathlib import Path

# Project directory structure
PROJECT_DIR = os.path.dirname(os.path.abspath(''))
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
USGS_DATA_DIR = os.path.join(DATA_DIR, 'raw', 'usgs_streamflow')
FORCING_DATA_DIR = os.path.join(DATA_DIR, 'raw', 'basin_mean_forcing', 'daymet')
FORCING_DATA_DIR = Path(FORCING_DATA_DIR)
STATIC_DATA_DIR = os.path.join(DATA_DIR, 'raw', 'basin_metadata')
FIGURE_DIR = os.path.join(PROJECT_DIR, 'outputs', 'figures')
MODEL_DIR = os.path.join(PROJECT_DIR, 'models')
os.chdir(PROJECT_DIR)

In [13]:
import pandas as pd

# try:
#     master_df = pd.read_csv(os.path.join(DATA_DIR, 'processed', 'CAMELS.csv'), index_col=0)
#     master_df.index = pd.to_datetime(master_df.index)
# except:
print("Preprocessed data not found. Running preprocessing script.")
# Create dictionary for storing data
data_dict = {}

# Define column names
columns = ['STAID', 'YEAR', 'MONTH', 'DAY', 'Q', 'QAQC']
# Read in all data
for huc in os.listdir(USGS_DATA_DIR):
    for basin_data in os.listdir(os.path.join(USGS_DATA_DIR, huc)):
        # Read in fixed width .txt file as a dataframe
        basin_df = pd.read_fwf(os.path.join(USGS_DATA_DIR, huc, basin_data), 
                            header=None, names=columns,
                            dtype={'STAID': str, 'QAQC': str})
        # Convert date columns to datetime and set as index
        basin_df['DATE'] = pd.to_datetime(basin_df[['YEAR', 'MONTH', 'DAY']])
        basin_df.set_index('DATE', inplace=True)
        # Drop unnecessary columns
        basin_df.drop(columns=['YEAR', 'MONTH', 'DAY', 'QAQC'], inplace=True)
        # Add dataframe to dictionary
        data_dict[basin_df['STAID'].iloc[0]] = basin_df
        
import matplotlib.pyplot as plt
import re

# Read in forcing data
COLS = ["Year", "Mnth", "Day", "Hr",
        "dayl(s)", "prcp(mm/day)", "srad(W/m2)",
        "swe(mm)", "tmax(C)", "tmin(C)", "vp(Pa)"]

for huc in os.listdir(FORCING_DATA_DIR):
    for basin_data in os.listdir(os.path.join(FORCING_DATA_DIR, huc)):
        # Get basin number
        basin_no = basin_data.split('_')[0]
        
        # Check to see if the first line looks like YYYY MM DD, if so there are no headers
        # Get the first non-empty line
        with open(os.path.join(FORCING_DATA_DIR, huc, basin_data), 'r') as f:
            for line in f:
                line = line.strip()
                if line:            # skip any empty lines just in case
                    break
                
        has_header_row = not re.match(r"^\d{4}\s+\d{2}\s+\d{2}", line)
        
        if has_header_row:
            basin_df = pd.read_csv(os.path.join(FORCING_DATA_DIR, huc, basin_data), skiprows=3, sep=r"\s+")
        else:
            basin_df = pd.read_csv(os.path.join(FORCING_DATA_DIR, huc, basin_data), sep=r"\s+", header=None, names=COLS)
            
        basin_df.rename(columns={'Mnth': 'Month'}, inplace=True)

        # Convert date columns to datetime and set as index
        basin_df['DATE'] = pd.to_datetime(basin_df[['Year', 'Month', 'Day']])
        basin_df.set_index('DATE', inplace=True)
        # Drop unnecessary columns
        basin_df.drop(columns=['Year', 'Month', 'Day', 'Hr'], inplace=True)
        
        # Make sure all dtypes are numeric
        for dtype in basin_df.dtypes:
            assert dtype != object
            
        # Concatenate forcing dataframe with discharge dataframe
        try:
            data_dict[basin_no] = pd.concat([data_dict[basin_no], basin_df], axis=1)
        except Exception as e:
            print(f"STAID not found: {e}")
            
# Read in annual average hydrometeorological data
HMET_STATIC_COLS = ['HUC', 'STAID', 'Annual Runoff (mm d-1)', 'Annual Precip (mm d-1)', 'Annual PET (mm d-1)', 'Annual Temp (C)']
STATIC_COLS = ['HUC', 'STAID', 'DA (km2)', 'Elevation (m)', 'Slope (m km-1)', 'Frac Forest (%)']
hmet_static_df = pd.read_csv(os.path.join(STATIC_DATA_DIR, 'basin_annual_hydrometeorology_characteristics_nldas.txt'), 
                            sep=r'\s+', names=HMET_STATIC_COLS, dtype={'STAID': str}, header=0)

static_df = pd.read_csv(os.path.join(STATIC_DATA_DIR, 'basin_physical_characteristics.txt'), sep=r'\s+',
                        names=STATIC_COLS, dtype={'STAID': str}, header=0)

# Append static data to respective basin dataframe
for basin, df in data_dict.items():
    row_hmet = hmet_static_df[hmet_static_df['STAID'] == basin]
    row = static_df[static_df['STAID'] == basin]
    row_hmet.name = basin
    if not row_hmet.empty:
        static_values_hmet = row_hmet.iloc[0]
        static_values = row.iloc[0]
        
    # Append HMET static variables
    data_dict[basin]['Annual Runoff (mm d-1)'] = static_values_hmet['Annual Runoff (mm d-1)']
    data_dict[basin]['Annual Precip (mm d-1)'] = static_values_hmet['Annual Precip (mm d-1)']
    data_dict[basin]['Annual PET (mm d-1)'] = static_values_hmet['Annual PET (mm d-1)']
    data_dict[basin]['Annual Temp (C)'] = static_values_hmet['Annual Temp (C)']
    
    # Append static variables
    data_dict[basin]['DA (km2)'] = static_values['DA (km2)']
    data_dict[basin]['Elevation (m)'] = static_values['Elevation (m)']
    data_dict[basin]['Slope (m km-1)'] = static_values['Slope (m km-1)']
    data_dict[basin]['Frac Forest (%)'] = static_values['Frac Forest (%)']
    
# Turn data dict into master dataframe
master_df = pd.concat(data_dict.values(), axis=0)
master_df.dropna(inplace=True)

master_df.head()
master_df.to_csv(os.path.join(DATA_DIR, 'processed', 'CAMELS_daymet.csv'), index=True)

Preprocessed data not found. Running preprocessing script.
01013500
<bound method NDFrame.head of        Year  Month  Day  Hr   dayl(s)  prcp(mm/day)  srad(W/m2)  swe(mm)  \
0      1980      1    1  12  30172.51          0.00      153.40      0.0   
1      1980      1    2  12  30253.10          0.00      145.27      0.0   
2      1980      1    3  12  30344.18          0.00      146.96      0.0   
3      1980      1    4  12  30408.33          0.00      146.20      0.0   
4      1980      1    5  12  30413.48          0.00      170.43      0.0   
...     ...    ...  ...  ..       ...           ...         ...      ...   
12779  2014     12   27  12  30042.52          0.00      103.01      0.0   
12780  2014     12   28  12  30062.75          2.79      104.63      0.0   
12781  2014     12   29  12  30067.20          0.02      193.62      0.0   
12782  2014     12   30  12  30067.89          0.00      180.57      0.0   
12783  2014     12   31  12  30107.30          0.00      185.32   

In [4]:
import numpy as np
import random

def train_test_split_evenSites(df, split_pct, seed):
    # Set random seed
    np.random.seed(seed)
    random.seed(seed)
    
    # Get unique site identifiers
    sites = df['STAID'].unique()
    
    # Store splits here
    train_splits = []
    test_splits  = []
    
    for site in sites:
        temp_df = df[df['STAID'] == site]
        split_ind = int(np.floor((1-split_pct)*int(len(temp_df))))
        start_ind = np.random.randint(0, len(temp_df) - split_ind)
        end_ind = start_ind + split_ind
        test_df = temp_df.iloc[start_ind:end_ind, :]
        train_df = pd.concat([temp_df.iloc[:start_ind, :], temp_df.iloc[end_ind:, :]])
        train_splits.append(train_df)
        test_splits.append(test_df)

    # Zip lists together, shuffle them, then unzip them
    zipped_list = list(zip(train_splits, test_splits))
    random.shuffle(zipped_list)
    train_splits, test_splits = zip(*zipped_list)
    
    Train = pd.concat(train_splits)
    Test = pd.concat(test_splits)
    
    Train.drop("STAID", axis=1, inplace=True)
    Test.drop('STAID', axis=1, inplace=True)
    
    X_train = Train.drop('Q', axis=1)
    y_train = Train['Q']
    X_test = Test.drop('Q', axis=1)
    y_test = Test['Q']
    
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = train_test_split_evenSites(master_df, 0.8, 42)

In [5]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(6751320, 15) (6751320,) (1687180, 15) (1687180,)


In [9]:
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf

feat_scaler = MinMaxScaler()
X_train = pd.DataFrame(
    feat_scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

y_scaler = MinMaxScaler()
y_train = pd.Series(y_scaler.fit_transform(y_train.values.reshape(-1,1)).squeeze(),
    index=y_train.index, name='Q')

# apply the scalers to the validation set
X_val  = pd.DataFrame(feat_scaler.transform(X_test),
                      columns=X_train.columns, 
                      index=X_test.index)

y_val  = pd.Series(y_scaler.transform(y_test.values.reshape(-1,1)).squeeze(),
                   index=y_test.index, name='Q')

### Train multiple linear regression model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

print(f"R^2: {r2_score(y_scaler.inverse_transform(np.array(y_val).reshape(-1,1)),
      y_scaler.inverse_transform(np.array(y_pred).reshape(-1, 1)))}")

# Save trained model
import joblib
import pickle
fpath = os.path.join(MODEL_DIR, 'linear_regression_weights.npz')
np.savez(
    fpath,
    coef=model.coef_.astype(np.float32),     
    intercept=np.asarray(model.intercept_, dtype=np.float32),
    n_features_in=np.int32(model.n_features_in_),
    feature_names_in=getattr(model, "feature_names_in_", None)
)

R^2: -0.03969659402824233


In [11]:
def data_generator(X, y, window_length, batch_size):
    X_array = X.to_numpy(dtype=np.float32)
    y_array = y.to_numpy(dtype=np.float32)
    n_samples = len(X)

    while True:
        indices = np.arange(n_samples - window_length + 1)
        X_batch = []
        y_batch = []
        
        # for site, indices in valid_sequences.items():
        for i in indices:
            # Get start and end dates of sequence
            sequence_start = X.index[i]
            sequence_end = X.index[i + window_length - 1]
            if (sequence_end - sequence_start).days == window_length - 1:
                X_seq = X_array[i:i + window_length]
                y_seq = y_array[i + window_length - 1]
                
                X_batch.append(X_seq)
                y_batch.append(y_seq)
                
                if len(X_batch) == batch_size:
                    y_batch_array = np.array(y_batch)
                    yield np.array(X_batch), np.array(y_batch)
                    X_batch, y_batch = [], []
            
        if X_batch and y_batch:  # If there are any remaining sequences not yielded yet
            yield np.array(X_batch), np.array(y_batch)

def get_steps_per_epoch(X, window_length, batch_size):
    """A function to count the number of valid sequences in the training data, X

    Args:
        X (pd.DataFrame or np.array): The training data
        window_length (int): The window length for input to the LSTM
        batch_size (int): The batch size for the LSTM

    Returns:
        int: the number of steps per epoch
    """
    n_samples = len(X)
    indices = np.arange(n_samples - window_length + 1)
    valid_sequence_count = 0

    for i in indices:
        sequence_start = X.index[i]
        sequence_end = X.index[i + window_length - 1]
        if (sequence_end - sequence_start).days == window_length - 1:
            valid_sequence_count += 1
    print(valid_sequence_count)
    steps_per_epoch = valid_sequence_count // batch_size
    
    return steps_per_epoch

In [None]:
import keras
import tensorflow as tf
from keras import layers, metrics
# from sklearn.metrics import mean_squared_error, r2_score

def build_model(hidden_layers, hidden_units, optimizer, window_length, 
                n_features, lr, dropout=0,  activation_dense="relu"):
    loss_fn = 'mean_squared_error'
    optimizer = optimizer
    model = keras.Sequential()
    model.add(layers.Input(shape=(window_length, n_features)))
    for i in range(hidden_layers):
        if i != hidden_layers - 1:
            model.add(layers.LSTM(units=hidden_units, return_sequences=True))
            model.add(layers.Dropout(dropout))
        else:
            model.add(layers.LSTM(units=hidden_units))
            model.add(layers.Dropout(dropout))

    model.add(layers.Dense(units=1, activation=activation_dense))
    
    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[metrics.R2Score(), metrics.RootMeanSquaredError(), metrics.MeanAbsoluteError()])
    model.summary()
              
    return model

### Hyperparameters

In [23]:
import keras 

seed = 42
hidden_layers = 1
hidden_units = 32
window_length = 180
n_features = len(X_train.columns)
dropout = 0.3
lr = 0.0001
batch_size = 256
n_epochs = 150
optimizer = keras.optimizers.Adam(learning_rate=lr)

In [24]:
train_gen = data_generator(X_train, y_train, window_length, batch_size)
steps_per_epoch = get_steps_per_epoch(X_train, window_length, batch_size)

6511799


In [25]:
test_gen = data_generator(X_test, y_test, window_length, batch_size)
val_steps = get_steps_per_epoch(X_test, window_length, batch_size)

1566641


In [26]:
from keras import layers, metrics
from sklearn.metrics import mean_squared_error, r2_score

model = build_model(hidden_layers, hidden_units, optimizer, window_length, n_features, lr, dropout)

In [27]:
import tensorflow as tf
from keras.callbacks import EarlyStopping

keras.utils.set_random_seed(seed)
model_name = 'River_LSTM'
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=5, min_lr=0.0000001, verbose=1)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(PROJECT_DIR,'models', f'{model_name}.weights.h5'), save_best_only=True, save_weights_only=True, monitor='val_loss', mode='min', verbose=0)

callbacks = [earlyStopping, lr_scheduler, checkpoint_callback]
history = model.fit(train_gen, steps_per_epoch=steps_per_epoch, batch_size=batch_size, 
                    epochs=n_epochs, verbose=1, validation_data=test_gen, 
                    validation_steps=val_steps, callbacks=callbacks)

Epoch 1/150
[1m25436/25436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1913s[0m 75ms/step - loss: 1.7974e-04 - mean_absolute_error: 0.0103 - r2_score: -9.1669 - root_mean_squared_error: 0.0132 - val_loss: 1.5980e-04 - val_mean_absolute_error: 0.0103 - val_r2_score: -1.9534 - val_root_mean_squared_error: 0.0126 - learning_rate: 1.0000e-04
Epoch 2/150
[1m25436/25436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1889s[0m 74ms/step - loss: 1.5491e-04 - mean_absolute_error: 0.0101 - r2_score: -2.3029 - root_mean_squared_error: 0.0124 - val_loss: 1.5979e-04 - val_mean_absolute_error: 0.0103 - val_r2_score: -1.9532 - val_root_mean_squared_error: 0.0126 - learning_rate: 1.0000e-04
Epoch 3/150
[1m25436/25436[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1683s[0m 66ms/step - loss: 1.5491e-04 - mean_absolute_error: 0.0101 - r2_score: -2.1822 - root_mean_squared_error: 0.0124 - val_loss: 1.5972e-04 - val_mean_absolute_error: 0.0103 - val_r2_score: -1.9544 - val_root_mean_squared_error: 


KeyboardInterrupt



In [None]:
plt.figure(figsize=(6,4), dpi=400)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('LSTM Model Loss')
plt.ylabel('Loss (MSE)')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.savefig(os.path.join(FIGURE_DIR, 'LSTM_loss.png'))
plt.show()