In [1]:
# Annual Water Use

# %%

import os, sys
import pandas as pd
from datetime import datetime

from iwateruse.featurize import MultiOneHotEncoder
from iwateruse import data_cleaning, report, splittors, pre_train_utils, make_dataset, figures
from iwateruse import denoise, model_diagnose

import matplotlib.pyplot as plt
from xgboost import plot_importance
import xgboost as xgb

#
import numpy as np
from flopy.plot import styles


import warnings

warnings.filterwarnings('ignore')
xgb.set_config(verbosity=0)

# %%
from iwateruse.model import Model
from iwateruse import targets, weights, pipelines, outliers_utils, estimators
from iwateruse import selection
from pycaret.regression import *


# %%
# =============================
# Setup Training
# =============================
figures_folder = "figs"
model = Model(name='annual_pc', log_file = 'train_log.log',  feature_status_file= r"..\features_status.xlsx")
model.raw_target = 'wu_rate'
model.target = 'per_capita'

datafile = r"C:\work\water_use\ml_experiments\annual_v_0_0\clean_train_db.csv"
make_dataset.make_ds_per_capita_basic(model, datafile=datafile)
model.df_train['pop_density']  = model.df_train['pop']/model.df_train['WSA_SQKM']
model.df_train.loc[model.df_train['WSA_SQKM']==0, 'pop_density'] = 0
# add water use
seed1 = 123
seed2 = 456

# %%
model.apply_func(func=targets.compute_per_capita, type='target_func', args=None)

opts = ['pop<=100', 'per_capita>=500', 'per_capita<=25']
model.apply_func(func=outliers_utils.drop_values, type='outliers_func', opts = opts )
model.apply_func(func = outliers_utils.drop_na_target, type='outliers_func')
model.apply_func(func=None, type='add_features_func', args=None)

# split
model.apply_func(func=splittors.random_split, args={'frac': 0.70, 'seed': seed1})


# =============================
# Prepare the initial estimator
# =============================

features = model.features
target = model.target
final_dataset = model.df_train
final_dataset = final_dataset.drop_duplicates(subset = ['sys_id', 'Year'], keep = 'first')
ignore_features = list(set(final_dataset.columns).difference(set(features + [target])))



  from pandas import MultiIndex, Int64Index




In [2]:
# =============================
# Prepare the initial estimator
# =============================

reg1 = setup(data = final_dataset, target = target, ignore_features = ignore_features,
             fold=5, fold_shuffle = True,
             train_size = 0.7)

# compare models
learning_algorithems = ['xgboost', 'rf', 'lightgbm', 'et'] # include = learning_algorithems
#best = compare_models(n_select = 3)

xx = 1



RuntimeError: This version of PyCaret requires scikit-learn==0.23.2, got 1.0.2. Support for newer scikit-learn versions will be added in a future release.

In [8]:
params = {
    'objective': "reg:squarederror",
    'tree_method': 'hist',
    'colsample_bytree': 0.8,
    'learning_rate': 0.20,
    'max_depth': 7,
    'alpha': 100,
    'n_estimators': 500,
    'rate_drop': 0.9,
    'skip_drop': 0.5,
    'subsample': 0.8,
    'reg_lambda': 10,
    'min_child_weight': 50,
    'gamma': 10,
    'max_delta_step': 0,
    'seed': 123
}
xgb = create_model('xgboost', **params)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,34.8555,2912.355,53.9662,0.6724,0.3343,0.2693
1,34.0206,2784.2732,52.7662,0.6691,0.3332,0.2713
2,34.0687,2766.6926,52.5994,0.6654,0.3451,0.2746
3,33.264,2527.7795,50.277,0.6902,0.3384,0.2771
4,34.7611,2862.3735,53.5012,0.6621,0.3458,0.2812
Mean,34.194,2770.6948,52.622,0.6718,0.3394,0.2747
Std,0.5779,132.4242,1.2728,0.0098,0.0053,0.0042


In [10]:
tuned_xgb = tune_model(xgb, fold = 3, search_library = 'scikit-optimize',
                      search_algorithm = 'bayesian', n_iter = 100)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33.7045,2809.4966,53.0047,0.6787,0.3251,0.2662
1,32.4234,2586.5684,50.8583,0.6874,0.3274,0.2685
2,32.2201,2510.6396,50.1063,0.6978,0.3264,0.2704
Mean,32.7826,2635.5682,51.3231,0.6879,0.3263,0.2684
Std,0.6571,126.8322,1.2281,0.0078,0.001,0.0017


In [12]:
tuned_xgb

XGBRegressor(alpha=100, base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1,
             colsample_bytree=0.5972129389586888, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, gamma=10, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.04371772054983907,
             max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=11,
             max_leaves=0, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=300, n_jobs=-1,
             num_parallel_tree=1, objective='reg:squarederror',
             predictor='auto', random_state=8108, ...)

In [17]:
tuned_xgb.get_params()['max_delta_step']


0