In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
import joblib

In [2]:
# Notebook configuration: folder holding the input CSV files and the lake
# identifiers used to build file names.
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because later cells reference it.
dir = "C:/Users/fitzpatrick/Desktop/Data/Input/"
lake = [
    "MiHur",
    "Erie",
    "Ont",
    "Sup",
]

In [3]:
# Step 1: Load the historical inputs for the selected lake (lake[1])
# P, E and R come from a CSV prepared ahead of time; Date is parsed to
# datetime and a numeric Month column is derived from it.
data_1 = pd.read_csv(f"{dir}{lake[1]}_dataset_6mo_cms.csv", sep=',')
data_1 = data_1.assign(Date=pd.to_datetime(data_1['Date']))
data_1 = data_1.assign(Month=data_1['Date'].dt.month)

# NBS from L2SWBM for the same lake, indexed by its parsed Date
data_2 = pd.read_csv(f"{dir}{lake[1]}_l2swbm_nbs.csv", sep=',')
data_2 = data_2.assign(Date=pd.to_datetime(data_2['Date'])).set_index('Date')

# Join the two sources on Date so both cover the same time frame
df = pd.merge(data_1, data_2, on='Date')
df.head()

Unnamed: 0,Date,R(m),P(m),E(m),R(m-1),P(m-1),E(m-1),R(m-2),P(m-2),E(m-2),...,P(m-5),E(m-5),R(m-6),P(m-6),E(m-6),Month,NBS(m),NBS(m-1),NBS(m-2),NBS(m-3)
0,1979-07-01,270.147413,838.679304,447.521306,325.820659,799.979063,289.695399,524.612246,1229.248943,199.110481,...,688.470282,396.122241,0.0,1054.20189,456.765696,7,48.9967,79.0767,152.4003,278.7354
1,1979-08-01,273.041284,862.776304,581.730925,270.147413,838.679304,447.521306,325.820659,799.979063,289.695399,...,1088.281147,82.955132,369.239992,688.470282,396.122241,8,29.2268,48.9967,79.0767,152.4003
2,1979-09-01,485.873656,651.082724,562.513794,273.041284,862.776304,581.730925,270.147413,838.679304,447.521306,...,1655.896637,145.850495,652.335147,1088.281147,82.955132,9,-13.9793,29.2268,48.9967,79.0767
3,1979-10-01,338.298057,752.020394,609.128485,485.873656,651.082724,562.513794,273.041284,862.776304,581.730925,...,1229.248943,199.110481,1275.329971,1655.896637,145.850495,10,-61.0342,-13.9793,29.2268,48.9967
4,1979-11-01,485.438656,1342.701253,405.993138,338.298057,752.020394,609.128485,485.873656,651.082724,562.513794,...,799.979063,289.695399,524.612246,1229.248943,199.110481,11,41.2668,-61.0342,-13.9793,29.2268


In [None]:
# Step 2: Prepare the data
# Features: current-month precipitation, evaporation and runoff plus the
# calendar month. Target: net basin supply ('NBS(m)').
# The raw column names are renamed to the descriptive names that the
# preprocessing cell and the prediction example select by
# ('Precipitation', 'Runoff', 'Evaporation', 'Month'); without the rename the
# column lookup in the pipeline cannot find its inputs.
# Previous-month water levels are not part of the feature set.
features = (
    df[['P(m)', 'E(m)', 'R(m)']]
    .rename(columns={'P(m)': 'Precipitation', 'E(m)': 'Evaporation', 'R(m)': 'Runoff'})
    # assign() returns a new frame, so no column is written onto a slice of
    # `df` (which is what raised SettingWithCopyWarning before).
    .assign(Month=pd.to_datetime(df['Date']).dt.month)
)

target = df['NBS(m)']

# Preview only the first rows, features and target side by side, instead of
# printing the complete objects.
features.join(target).head()

In [None]:
# Step 3: Split data into train and test sets
# The split takes the `features` / `target` objects built in the
# data-preparation cell. `X` / `y` are not assigned in any cell above, so
# referencing them raised NameError on a fresh kernel.
# 20% of the rows are held out; random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Step 4: Define preprocessing pipeline
# Columns are selected by name: three numeric inputs and one categorical input.
numeric_features = ['Precipitation', 'Runoff', 'Evaporation']
categorical_features = ['Month']

# Numeric columns are passed through unchanged; replace 'passthrough' with a
# scaler if scaling is wanted.
numeric_transformer = Pipeline([('scaler', 'passthrough')])

# Month is one-hot encoded; handle_unknown='ignore' avoids an error when a
# category not seen during fit shows up at transform time.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Step 5: Define Random Forest model
# 100 trees; the fixed random_state makes repeated fits reproducible.
rf_model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

In [None]:
# Step 6: Create the full pipeline
# Preprocessing runs first and its output feeds the random forest, so the
# fitted object accepts the raw feature columns directly.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model),
])

In [None]:
# Step 7: Train the model
# Fits the preprocessing steps and the regressor on the training split.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Step 8: Evaluate the model using test data
# Predict on the held-out rows, then score the predictions with mean absolute
# error; `mae` is printed in the final cell.
y_pred = pipeline.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Step 9: Save the model
# Persists the whole fitted pipeline (preprocessing + regressor) to a file in
# the current working directory.
joblib.dump(value=pipeline, filename='nb_supply_model_with_month.pkl')

In [None]:
# Step 10: Load the model
# NOTE(review): joblib.load unpickles the file, so only load model files from
# a trusted source.
loaded_pipeline = joblib.load(filename='nb_supply_model_with_month.pkl')

# Step 11: Predict net basin supply for new values
# A single-row frame whose column names match those the preprocessor selects.
new_values = pd.DataFrame(
    [{'Precipitation': 100, 'Runoff': 50, 'Evaporation': 20, 'Month': 6}]
)
predicted_nbs = loaded_pipeline.predict(new_values)

# Step 12: Print the predicted net basin supply
# predict() returns one value per input row; only the first is shown.
print("Predicted Net Basin Supply:", predicted_nbs[0])

# Step 13: Print the MAE
# `mae` was computed in the evaluation cell.
print("Mean Absolute Error (MAE) on test data:", mae)