In [None]:
import numpy as np
import pandas as pd
from copulas.multivariate import GaussianMultivariate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.stats import anderson
import matplotlib.pyplot as plt
import joblib

In [None]:
# Train and validate a Gaussian copula for net basin supply.
#
# The copula is fitted on ONE joint table (scaled features + scaled target).
# GaussianMultivariate has no supervised fit(X, y) / predict API, so point
# predictions are obtained by sampling the target conditioned on the features
# and averaging the draws.

# Load historical data (replace this with your actual historical data)
historical_data = pd.read_csv('historical_data.csv')

# Shift the water level so the model trains on the previous month's water level
historical_data['Previous_Month_Water_Level'] = historical_data['Water_Level'].shift(1)
# Drop rows with missing values; this removes the first row, whose lagged level is NaN
historical_data = historical_data.dropna()

NUMERIC_FEATURES = ['Precipitation', 'Evaporation', 'Runoff', 'Previous_Month_Water_Level']
TARGET_COLUMN = 'Net_Basin_Supply'

# Month of each observation, kept as a one-column frame for the encoder
months = pd.to_datetime(historical_data['Date']).dt.month.to_frame('Month')
target = historical_data[TARGET_COLUMN]

# Normalize numeric features and target to [0, 1]
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()
numeric_normalized = scaler_features.fit_transform(historical_data[NUMERIC_FEATURES])
target_normalized = scaler_target.fit_transform(target.values.reshape(-1, 1)).flatten()

# One-hot encode the month. The first category is dropped: a full set of
# indicator columns always sums to 1, which makes the copula's correlation
# matrix singular when conditioning on every month column at once.
month_encoder = OneHotEncoder(drop='first')
month_encoded = month_encoder.fit_transform(months)
month_columns = [f'Month_{int(m)}' for m in month_encoder.categories_[0][1:]]
feature_columns = NUMERIC_FEATURES + month_columns

# Concatenate encoded month feature with the scaled numeric features
features_normalized = np.concatenate([numeric_normalized, month_encoded.toarray()], axis=1)
assert features_normalized.shape[1] == len(feature_columns)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features_normalized, target_normalized, test_size=0.2, random_state=42
)

# Fit the copula on the joint distribution of features and target.
# The target is deliberately the LAST column of the table.
train_table = pd.DataFrame(X_train, columns=feature_columns)
train_table[TARGET_COLUMN] = y_train

copula = GaussianMultivariate(random_state=42)
copula.fit(train_table)
# Save the trained copula model for net basin supply prediction
joblib.dump(copula, 'copula_model_net_basin_supply.pkl')


def predict_conditional_mean(model, feature_rows, target_column, num_samples=500):
    """Estimate E[target | features] for each row by conditional sampling.

    Args:
        model: fitted GaussianMultivariate whose columns include every column
            of ``feature_rows`` plus ``target_column``.
        feature_rows: DataFrame with one row per prediction, on the same
            scale the model was fitted on.
        target_column: name of the column to predict.
        num_samples: number of conditional draws averaged per row.

    Returns:
        1-D numpy array with one prediction per row, on the model's scale.
    """
    # Conditions are passed in the model's own column order.
    ordered = [c for c in model.columns if c != target_column]
    predictions = []
    for _, row in feature_rows[ordered].iterrows():
        draws = model.sample(num_rows=num_samples, conditions=row.to_dict())
        predictions.append(draws[target_column].mean())
    return np.asarray(predictions)


# Validate copula model on testing data
test_features = pd.DataFrame(X_test, columns=feature_columns)
predicted_normalized = predict_conditional_mean(copula, test_features, TARGET_COLUMN)

# Back to original units for plotting and error reporting
predicted_values = scaler_target.inverse_transform(predicted_normalized.reshape(-1, 1)).flatten()
actual_values = scaler_target.inverse_transform(y_test.reshape(-1, 1)).flatten()

# Visual inspection: actual vs. predicted net basin supply
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(actual_values, predicted_values, label='Empirical vs Fitted')
ax.set_xlabel('Actual Net Basin Supply')
ax.set_ylabel('Predicted Net Basin Supply')
ax.set_title('Actual vs Predicted Net Basin Supply')
ax.legend()

# Anderson-Darling test. NOTE: as called here it tests whether the (scaled)
# test target is normally distributed; it does not assess the copula fit itself.
test_statistic, critical_values, significance_level = anderson(y_test)
print('Anderson-Darling test statistic:', test_statistic)
print('Critical values:', critical_values)
print('Significance level:', significance_level)

# Out-of-sample testing: Compare predictions with actual values
mse = np.mean((actual_values - predicted_values) ** 2)
print('Mean Squared Error (MSE):', mse)

In [None]:
# Forecast next month's net basin supply with the trained copula.
#
# This cell reuses the transformers fitted in the training cell
# (`scaler_features`, `scaler_target`, `month_encoder`). Refitting them on the
# forecast rows would produce a different scale and a different one-hot width
# than the model was trained on.

# Load the trained copula model for net basin supply prediction.
# joblib.load unpickles arbitrary objects: only load files you trust.
copula_model = joblib.load('copula_model_net_basin_supply.pkl')

# Prepare forecasted data
forecasted_runoff = pd.read_csv('forecasted_runoff.csv')  # Assuming your forecasted runoff data is stored in a CSV file
forecasted_precipitation = pd.read_csv('forecasted_precipitation.csv')  # Assuming your forecasted precipitation data is stored in a CSV file
forecasted_evaporation = pd.read_csv('forecasted_evaporation.csv')  # Assuming your forecasted evaporation data is stored in a CSV file
last_month_water_level = pd.read_csv('last_month_water_level.csv')['water_level'].values[0]  # Assuming your last month water level data is stored in a CSV file

# Concatenate the forecasted frames side by side. The water level is a scalar,
# so it is added as a column instead of being passed to pd.concat.
forecasted_data = pd.concat([forecasted_runoff, forecasted_precipitation, forecasted_evaporation], axis=1)
# If several files carry the same column (e.g. 'month'), keep the first copy.
forecasted_data = forecasted_data.loc[:, ~forecasted_data.columns.duplicated()].copy()
# NOTE(review): the same last observed level is applied to every forecast row;
# this is only meaningful for a one-month-ahead forecast.
forecasted_data['water_levels'] = last_month_water_level

# Extract features from the forecasted data (same order as in training)
numeric_forecast = forecasted_data[['precipitation', 'evaporation', 'runoff', 'water_levels']]
month_forecast = forecasted_data[['month']]


def _with_fitted_names(frame, transformer):
    """Return `frame` relabelled with the column names `transformer` was fitted on.

    scikit-learn transformers fitted on a DataFrame remember its column names and
    complain when given different ones; the forecast CSVs use different names
    than the historical data, so the columns are matched by position.
    """
    fitted_names = getattr(transformer, 'feature_names_in_', None)
    if fitted_names is None:
        return frame.to_numpy()
    return pd.DataFrame(frame.to_numpy(), columns=fitted_names, index=frame.index)


# Apply the training-time scaling and month encoding (transform only, no refit).
# Only the numeric columns are scaled; one-hot columns are left as 0/1.
numeric_normalized = scaler_features.transform(_with_fitted_names(numeric_forecast, scaler_features))
month_encoded = month_encoder.transform(_with_fitted_names(month_forecast, month_encoder))
features_normalized = np.concatenate([numeric_normalized, month_encoded.toarray()], axis=1)

# Assumes the copula was fitted on a joint table whose LAST column is the
# (scaled) net basin supply and whose other columns are the features in the
# order built above -- verify against the training cell.
*feature_columns, target_column = list(copula_model.columns)
if features_normalized.shape[1] != len(feature_columns):
    raise ValueError(
        f'Forecast has {features_normalized.shape[1]} feature columns but the '
        f'copula model expects {len(feature_columns)}.'
    )
forecast_features = pd.DataFrame(features_normalized, columns=feature_columns)

# Predict net basin supply for the next month.
# GaussianMultivariate has no predict(); the prediction is the mean of draws
# of the target sampled conditionally on the forecast features.
NUM_CONDITIONAL_SAMPLES = 1000
predicted_net_basin_supply_normalized = np.array([
    copula_model.sample(
        num_rows=NUM_CONDITIONAL_SAMPLES, conditions=row.to_dict()
    )[target_column].mean()
    for _, row in forecast_features.iterrows()
])

# Inverse transform the predicted net basin supply to obtain the original scale
predicted_net_basin_supply = scaler_target.inverse_transform(predicted_net_basin_supply_normalized.reshape(-1, 1))

# Create a DataFrame to store the predicted net basin supply
predicted_net_basin_supply_df = pd.DataFrame(predicted_net_basin_supply, columns=['predicted_net_basin_supply'])

print("Predicted net basin supply for the next month:")
print(predicted_net_basin_supply_df)