# Predictability of (Shannon) Entropy

In the data preprocessing step, we computed the entropy of the distribution of each individual streamflow time series in bits per sample.  We'll now use an ensemble decision tree method called XGBoost (eXtreme Gradient Boosting) {cite}`chen2016xgboost` to see if the entropy (or uncertainty) of a distribution can be predicted from catchment attributes.  The dictionary size (number of quantization levels) is varied to test whether the additional information in the distribution can be exploited by the model.  The model input features are added in successive model tests to compare the contribution of catchment attribute groups related to climate, terrain, land cover, and soil.

In [9]:
import os
import pandas as pd
import numpy as np

In [10]:
# Read the catchment attribute table from the local `data` folder.
fname = 'BCUB_HYSETS_properties_with_climate_with_entropy.csv'
data_path = os.path.join('data', fname)
df = pd.read_csv(data_path)

Subdivide the attributes into related classes: terrain, land cover, soil, climate.

In [7]:
# Show every column name so the attributes can be grouped by hand below.
print(list(df.columns))

['official_id', 'watershed_id', 'name', 'centroid_lat_deg_n', 'centroid_lon_deg_e', 'drainage_area_km2', 'drainage_area_gsim_km2', 'flag_gsim_boundaries', 'flag_artificial_boundaries', 'elevation_m', 'slope_deg', 'gravelius', 'perimeter', 'flag_shape_extraction', 'aspect_deg', 'flag_terrain_extraction', 'land_use_forest_frac_2010', 'land_use_grass_frac_2010', 'land_use_wetland_frac_2010', 'land_use_water_frac_2010', 'land_use_urban_frac_2010', 'land_use_shrubs_frac_2010', 'land_use_crops_frac_2010', 'land_use_snow_ice_frac_2010', 'flag_land_use_extraction', 'logk_ice_x100', 'porosity_x100', 'flag_subsoil_extraction', 'year_from', 'year_to', 'record_length', 'agency', 'status', 'updated_official_basin', 'in_bcub', 'prcp', 'srad', 'swe', 'tmax', 'tmin', 'vp', 'high_prcp_freq', 'high_prcp_duration', 'low_prcp_freq', 'low_prcp_duration', 'H_4_bits', 'H_6_bits', 'H_8_bits']


In [8]:
# Column names of the catchment attributes, grouped by theme.
terrain = [
    'drainage_area_km2',
    'elevation_m',
    'slope_deg',
    'gravelius',
    'perimeter',
    'aspect_deg',
]
land_cover = [
    'land_use_forest_frac_2010',
    'land_use_grass_frac_2010',
    'land_use_wetland_frac_2010',
    'land_use_water_frac_2010',
    'land_use_urban_frac_2010',
    'land_use_shrubs_frac_2010',
    'land_use_crops_frac_2010',
    'land_use_snow_ice_frac_2010',
]
soil = [
    'logk_ice_x100',
    'porosity_x100',
]
climate = [
    'prcp',
    'srad',
    'swe',
    'tmax',
    'tmin',
    'vp',
    'high_prcp_freq',
    'high_prcp_duration',
    'low_prcp_freq',
    'low_prcp_duration',
]
# Every attribute, in the order terrain -> land cover -> soil -> climate.
all_attributes = [*terrain, *land_cover, *soil, *climate]

In [11]:
def train_test_split(input_data, target_column, holdout_pct):
    """Randomly split a table into training and hold-out feature/target arrays.

    Args:
        input_data: DataFrame holding the feature columns plus the target
            column. Rows are selected by index label, so the index is
            assumed to be unique (callers reset it first -- confirm).
        target_column: Name of the column to predict; every other column is
            used as a feature.
        holdout_pct: Fraction (0-1) of rows to set aside for testing. The
            number of held-out rows is truncated to an integer.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)`` of NumPy arrays.
        Note the order: both training arrays come first.

    Raises:
        ValueError: From ``np.random.choice`` when more rows are requested
            than the table contains (``holdout_pct`` > 1).
    """
    n_holdout = int(holdout_pct * len(input_data))
    # Sample without replacement so no row is held out twice.
    holdout_index = np.random.choice(
        input_data.index.values, n_holdout, replace=False
    )
    feature_columns = [c for c in input_data.columns if c != target_column]

    # Build the membership mask once and reuse it for all four slices.
    is_holdout = input_data.index.isin(holdout_index)
    train_rows = input_data.loc[~is_holdout]
    test_rows = input_data.loc[is_holdout]

    X_train = train_rows[feature_columns].to_numpy()
    Y_train = train_rows[target_column].to_numpy()
    X_test = test_rows[feature_columns].to_numpy()
    Y_test = test_rows[target_column].to_numpy()
    return X_train, Y_train, X_test, Y_test

In [None]:

class XGBInstance:
    """Train and evaluate XGBoost binary classifiers on proxy/target station pairs.

    Each row of the input table describes one (proxy, target) station pair.
    An instance is callable with a ``trial`` object exposing
    ``suggest_float`` (presumably an Optuna trial -- confirm), so it can be
    used as a hyperparameter-search objective that returns the mean
    hold-out AUC.

    NOTE(review): ``xgb``, ``roc_auc_score`` and ``accuracy_score`` are used
    by ``run_model`` but are not imported in the cells visible here; they
    must be in scope before ``run_model`` trains anything.
    """

    def __init__(
        self,
        df,
        cv_param,
        bitrate,
        feature_columns,
        loss,
        concurrent_data,
        test_dir,
        target_column,
        n_simulations,
    ):
        """Store the experiment configuration.

        Args:
            df: Pairwise table; must contain ``proxy`` and ``target`` columns
                (and ``centroid_distance`` plus the feature/target columns
                for the later steps).
            cv_param: Number of cross-validation folds passed to ``xgb.cv``.
            bitrate: Quantization bit depth; only used in the model id.
            feature_columns: Names of the model input columns.
            loss: Stored on the instance; not otherwise used by this class.
            concurrent_data: Label used in output folder and file names.
            test_dir: Directory stored as ``self.test_dir``; used as the
                output root when no ``project_dir`` attribute is set.
            target_column: Name of the binary label column.
            n_simulations: Number of random hold-out repetitions per call.
        """
        self.input_df = df
        self.bitrate = bitrate
        self.loss = loss
        self.target_column = target_column
        self.features = feature_columns
        # every station id appearing on either side of a pair
        self.stations = pd.unique(df[["proxy", "target"]].values.ravel("K"))
        self.cv_param = cv_param
        self.K = 100  # final leave out test set
        self.max_dist = 1000  # km
        self.n_simulations = n_simulations
        self.feature_scores = []
        self.concurrent_data = concurrent_data
        self.test_dir = test_dir

    def create_feature_diff_cols(self):
        """Add a ``<name>_diff`` column (proxy minus target) per base feature.

        The first ``_``-separated token of every entry in ``self.features``
        is stripped to get the base name (``proxy_x`` -> ``x``); the base
        name ``distance`` is skipped. ``self.input_df`` is modified in place.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # new columns are added in a reproducible order (a set has none).
        base_names = dict.fromkeys(
            name.partition("_")[2] for name in self.features
        )
        for base in base_names:
            if base == "distance":
                continue
            self.input_df[f"{base}_diff"] = (
                self.input_df[f"proxy_{base}"] - self.input_df[f"target_{base}"]
            )

    def pairwise_leave_out_at_random(self):
        """Hold out ``self.K`` randomly chosen stations and split the pairs.

        A pair goes to the test set only when both of its stations are held
        out, and to the training set only when neither is; pairs mixing a
        held-out and a retained station are dropped from both.

        Returns:
            Tuple ``(train_df, test_df)`` with fresh 0-based indexes.

        Raises:
            ValueError: From ``np.random.choice`` when ``self.K`` exceeds the
                number of known stations.
        """
        excluded_station_ids = np.random.choice(self.stations, self.K, replace=False)

        # Vectorised membership tests instead of a Python call per row.
        proxy_held_out = self.input_df["proxy"].isin(excluded_station_ids)
        target_held_out = self.input_df["target"].isin(excluded_station_ids)
        test_mask = proxy_held_out & target_held_out
        train_mask = ~proxy_held_out & ~target_held_out

        train_df = self.input_df[train_mask].copy().reset_index(drop=True)
        test_df = self.input_df[test_mask].copy().reset_index(drop=True)

        train_stations_unique = list(
            set(train_df[["proxy", "target"]].values.flatten())
        )
        test_stations_unique = list(set(test_df[["proxy", "target"]].values.flatten()))
        # internal sanity check: no station may appear on both sides
        assert len(np.intersect1d(train_stations_unique, test_stations_unique)) == 0
        print(
            f"    {len(train_stations_unique)} unique stations in training set, {len(test_stations_unique)} in test set."
        )
        return train_df, test_df

    def filter_training_data(self):
        """Drop distant pairs, then split into training and hold-out pairs.

        Pairs whose ``centroid_distance`` exceeds ``self.max_dist`` are
        removed from ``self.input_df`` (the filtered table replaces the
        stored one).

        Returns:
            Tuple ``(train_df, test_df)`` from
            ``pairwise_leave_out_at_random``.
        """
        self.input_df = self.input_df[
            self.input_df["centroid_distance"] <= self.max_dist
        ]
        return self.pairwise_leave_out_at_random()

    def prepare_input_data(self):
        """Create a random station hold-out and format the model inputs.

        The filtered training and test tables are kept on the instance as
        ``X_train_filtered`` / ``X_test_filtered`` so the station ids of the
        test pairs remain available after prediction.

        Returns:
            Tuple ``(X_train, Y_train, X_test, Y_test)`` of NumPy arrays.
        """
        self.X_train_filtered, self.X_test_filtered = self.filter_training_data()

        X_train = self.X_train_filtered[self.features].values
        X_test = self.X_test_filtered[self.features].values

        Y_train = self.X_train_filtered[self.target_column].values
        Y_test = self.X_test_filtered[self.target_column].values

        return X_train, Y_train, X_test, Y_test

    def save_feature_importance_plot(self):
        """Record the fitted model's feature importances and return them.

        Despite the name, no figure is written; the scores of the current
        model are appended to ``self.feature_scores``.

        Returns:
            DataFrame with one row per feature and one column per call made
            so far.
        """
        # `run_model` stores the fitted estimator as `final_model`; nothing
        # in this class assigns `self.model`, so only honour it when a
        # caller has set it explicitly.
        model = getattr(self, "model", None)
        if model is None:
            model = self.final_model

        importances = model.get_booster().get_score(importance_type="weight")
        ranked = sorted(importances.items(), key=lambda item: item[1], reverse=True)

        # Booster keys look like "f12"; map them back to column names.
        # Assumes the default f<index> naming (model fitted on a bare
        # array) -- confirm if feature names are ever supplied.
        self.feature_labels = [
            self.features[int(key.split("f")[1])] for key, _ in ranked
        ]
        # (attribute name keeps its historical spelling for compatibility)
        self.feaure_score_dict = {
            label: score for label, (_, score) in zip(self.feature_labels, ranked)
        }
        self.feature_scores.append(self.feaure_score_dict)
        return pd.DataFrame(self.feature_scores).T

    def __call__(self, trial):
        """Sample hyperparameters from ``trial``, run the simulations, return mean AUC.

        Args:
            trial: Object exposing ``suggest_float(name, low, high)``.

        Returns:
            Mean hold-out AUC over the simulations run (NaN if none ran).
        """
        # Only these three are searched; max_depth is fixed and the
        # regularisation terms are left at the library defaults.
        learning_rate = trial.suggest_float("learning_rate", 0.04, 0.25)
        subsample = trial.suggest_float("subsamples", 0.6, 0.85)
        colsample = trial.suggest_float("colsample_bytree", 0.6, 0.85)

        self.common_params = {
            "objective": "binary:logistic",
            "eval_metric": "auc",
            "eta": learning_rate,
            "max_depth": 6,
            "min_child_weight": 1,
            "subsample": subsample,
            "colsample_bytree": colsample,
            "seed": 42,
            "device": "cuda",
        }
        prior = self.target_column.lower().split("_")[-1].split("r")[0]

        # Build a unique model id from the configuration. `concurrent_data`
        # must be read from the instance: it is not a local or global name.
        id_parts = [
            f"xgb_{self.bitrate}_b_{prior}_p_{len(self.features)}_features_{self.concurrent_data}"
        ]
        keys_left_out = frozenset(
            {
                "loss",
                "max_features",
                "tree_method",
                "random_state",
                "eval_metric",
                "device",
            }
        )
        for key, value in self.common_params.items():
            if key in keys_left_out:
                continue
            if value == "binary:logistic":
                value = "logistic"
                self.loss_code = "auc"
            if isinstance(value, float):
                value = f"{value:1.3e}"
            id_parts.append(f"{key}_{value}")

        self.model_id = "_".join(id_parts)
        mean_error, _ = self.run_model()
        return mean_error

    def run_model(self):
        """Run ``n_simulations`` independent train/evaluate repetitions.

        For each repetition that has no saved results file yet:

        1. Hold out ``self.K`` random stations to form the test pairs.
        2. Cross-validate (``self.cv_param`` folds, early stopping) on the
           training pairs to choose the number of boosting rounds.
        3. Fit a final classifier with that many rounds, score it on the
           hold-out pairs, and save the model and per-pair predictions.

        Hyperparameters (``self.common_params``) are held constant across
        repetitions.

        Returns:
            Tuple ``(mean_auc, stdev_auc)`` over the repetitions actually
            run; ``(nan, nan)`` when every repetition was skipped.
        """
        from time import perf_counter

        # NOTE(review): the original read `self.project_dir`, which this
        # class never sets. Fall back to `test_dir` unless a caller has
        # attached `project_dir` -- confirm which root is intended.
        base_dir = getattr(self, "project_dir", self.test_dir)
        model_save_folder = os.path.join(base_dir, "xgb_models", self.concurrent_data)
        results_save_folder = os.path.join(
            base_dir, "xgb_results", self.concurrent_data
        )
        for folder in (results_save_folder, model_save_folder):
            os.makedirs(folder, exist_ok=True)

        simulation_test_metrics = []
        # Start the clock before the loop so it is defined even when every
        # repetition is skipped, and so it covers all repetitions.
        t_start = perf_counter()
        for n in range(self.n_simulations):
            file_stem = f"{n}_{self.model_id}_{self.concurrent_data}"
            results_fpath = os.path.join(results_save_folder, f"{file_stem}.csv")
            model_fpath = os.path.join(model_save_folder, f"{file_stem}.json")
            if os.path.exists(results_fpath):
                print(f"    Model {n} already exists, skipping.")
                continue
            X_train, Y_train, X_test, Y_test = self.prepare_input_data()

            dtrain = xgb.DMatrix(X_train, label=Y_train)

            # cross validation with early stopping on the training pairs
            self.cv_results = xgb.cv(
                params=self.common_params,
                dtrain=dtrain,
                num_boost_round=2000,
                nfold=self.cv_param,
                stratified=True,
                metrics="auc",
                early_stopping_rounds=10,
                as_pandas=True,
                seed=42,
                verbose_eval=False,
            )

            # Rows of cv_results are indexed from 0, one per boosting
            # round, so the number of rounds is the best row index + 1.
            best_round_idx = int(self.cv_results["test-auc-mean"].idxmax())
            optimal_num_boost_round = best_round_idx + 1
            print(f"    Optimal number of boosting rounds: {optimal_num_boost_round}")

            # final model: same parameters, optimal number of rounds
            self.final_model = xgb.XGBClassifier(
                n_estimators=optimal_num_boost_round,
                learning_rate=self.common_params["eta"],
                max_depth=self.common_params["max_depth"],
                min_child_weight=self.common_params["min_child_weight"],
                subsample=self.common_params["subsample"],
                colsample_bytree=self.common_params["colsample_bytree"],
                seed=self.common_params["seed"],
                use_label_encoder=False,  # Set to False to comply with scikit-learn standards
            )
            self.final_model.fit(X_train, Y_train)

            # AUC needs the positive-class probability; `predict` would
            # return hard 0/1 labels. Labels are still used for accuracy.
            y_test_pred_prob = self.final_model.predict_proba(X_test)[:, 1]
            y_test_pred_label = self.final_model.predict(X_test)

            auc = roc_auc_score(Y_test, y_test_pred_prob)
            ascore = accuracy_score(Y_test, y_test_pred_label)
            print(f"AUC on the test set: {auc:.3f}, accuracy score = {ascore:.2f}")

            self.final_model.save_model(model_fpath)

            res_df = pd.DataFrame(
                {
                    "actual": Y_test,
                    "predicted_prob": y_test_pred_prob,
                    "proxy": self.X_test_filtered["proxy"].values,
                    "target": self.X_test_filtered["target"].values,
                }
            )

            # results_fpath already includes the folder; write to exactly
            # the path the skip check above looks for.
            res_df.to_csv(results_fpath, index=False)
            simulation_test_metrics.append(auc)

        elapsed = perf_counter() - t_start
        if not simulation_test_metrics:
            print("    No new simulations were run; nothing to summarise.")
            print(" ")
            return float("nan"), float("nan")

        mean_metric = np.mean(simulation_test_metrics)
        stdev_metric = np.std(simulation_test_metrics)

        print(
            f"    Completed {len(simulation_test_metrics)} of {self.n_simulations} simulations in {elapsed:.2f} seconds"
        )
        print(f"        -mean auc: {mean_metric:.2f} ± {stdev_metric:.3f}")
        print(" ")
        return mean_metric, stdev_metric

In [None]:
# define the amount of data to set aside for final testing
holdout_pct = 0.05

# NOTE(review): placeholder hyperparameters. The original cell used these
# names without defining them; the values below are untuned -- confirm or
# replace with a proper search before drawing conclusions.
learning_rate = 0.1
subsample = 0.8
colsample = 0.8

for bitrate in [4, 6, 8]:
    # set the target column
    target_column = f'H_{bitrate}_bits'
    test_attributes = []
    # add attribute groups successively, so each test sees every group
    # added before it plus one new group
    for attribute_set in [climate, terrain, land_cover, soil]:
        test_attributes += attribute_set

        input_data = df[test_attributes + [target_column]].copy()

        # reset the index to ensure the random selection is done properly
        input_data.reset_index(drop=True, inplace=True)

        # randomly hold out a fraction of the stations so that none of the
        # test data are seen in training; the function returns both
        # training arrays first, then both test arrays
        X_train, Y_train, X_test, Y_test = train_test_split(input_data, target_column, holdout_pct)
        params = {
            # the target is a continuous entropy in bits, so this is a
            # regression problem rather than binary classification
            "objective": "reg:squarederror",
            "eval_metric": "rmse",
            "eta": learning_rate,
            "max_depth": 6,
            "min_child_weight": 1,
            "subsample": subsample,
            "colsample_bytree": colsample,
            "seed": 42,
            "device": "cuda",
        }
        # TODO: fit and evaluate the model for this attribute set; the
        # original cell stopped at an incomplete assignment here.
        xgb_model = None
        
        
        

## Citations

```{bibliography}
:filter: docname in docnames
```