In [1]:
# Annual Water Use

# %%

import os, sys
import pandas as pd
from datetime import datetime

from iwateruse.featurize import MultiOneHotEncoder
from iwateruse import data_cleaning, report, splittors, pre_train_utils, make_dataset, figures
from iwateruse import denoise, model_diagnose

import matplotlib.pyplot as plt
from xgboost import plot_importance
import xgboost as xgb

#
import numpy as np
from flopy.plot import styles


import warnings

warnings.filterwarnings('ignore')
xgb.set_config(verbosity=0)

# %%
from iwateruse.model import Model
from iwateruse import targets, weights, pipelines, outliers_utils, estimators
from iwateruse import selection
from pycaret.regression import *


# %%
# =============================
# Setup Training
# =============================
# Folder name for figures; it is not referenced by the cells shown in this section.
figures_folder = "figs"

# The feature-status workbook lives one directory up. Building the path with
# os.path.join avoids a hard-coded Windows separator: on Windows this is still
# "..\features_status.xlsx", elsewhere it becomes "../features_status.xlsx".
model = Model(
    name='annual_pc',
    log_file='train_log.log',
    feature_status_file=os.path.join(os.pardir, "features_status.xlsx"),
)
model.raw_target = 'wu_rate'
model.target = 'per_capita'

# Training table. The original workstation path stays the default; set the
# IWATERUSE_TRAIN_DB environment variable to run the notebook on another machine.
datafile = os.environ.get(
    "IWATERUSE_TRAIN_DB",
    r"C:\work\water_use\ml_experiments\annual_v_0_0\clean_train_db.csv",
)
make_dataset.make_ds_per_capita_basic(model, datafile=datafile)

# Population density = pop / WSA_SQKM. Rows with a zero area would divide by
# zero, so their density is overwritten with 0.
model.df_train['pop_density'] = model.df_train['pop'] / model.df_train['WSA_SQKM']
model.df_train.loc[model.df_train['WSA_SQKM'] == 0, 'pop_density'] = 0

# Random seeds: seed1 drives the random train/test split below; seed2 is not
# used in the cells shown in this section.
seed1 = 123
seed2 = 456

# %%
# Step 1: derive the modelling target with the project helper.
model.apply_func(
    func=targets.compute_per_capita,
    type='target_func',
    args=None,
)

# Step 2: outlier handling.
# Each rule is a column/threshold expression handed to drop_values
# (presumably rows matching a rule are removed - confirm in outliers_utils).
opts = [
    'pop<=100',
    'per_capita>=500',
    'per_capita<=25',
]
model.apply_func(
    func=outliers_utils.drop_values,
    type='outliers_func',
    opts=opts,
)
model.apply_func(
    func=outliers_utils.drop_na_target,
    type='outliers_func',
)

# Step 3: feature-engineering hook, registered without a function.
model.apply_func(
    func=None,
    type='add_features_func',
    args=None,
)

# Step 4: random split with fraction 0.70, seeded with seed1.
model.apply_func(
    func=splittors.random_split,
    args={'frac': 0.70, 'seed': seed1},
)


# =============================
# Prepare the modelling dataset
# =============================

features = model.features
target = model.target

# Keep one record per (sys_id, Year) pair; the first occurrence wins.
final_dataset = model.df_train.drop_duplicates(subset=['sys_id', 'Year'], keep='first')

# Columns that are neither a feature nor the target are excluded from training.
# The list is built by walking the columns in order (dict.fromkeys removes any
# repeated labels), so it is identical on every run; a set difference would
# yield the same names in an arbitrary order.
keep_columns = set(features) | {target}
ignore_features = [
    col for col in dict.fromkeys(final_dataset.columns) if col not in keep_columns
]





In [2]:
# =============================
# Prepare the initial estimator
# =============================

# Initialise the PyCaret regression experiment: 70 % of the rows are used for
# training and models are scored with 5 shuffled folds.
# session_id fixes PyCaret's random state so the split, the folds and the fitted
# models are the same after Restart & Run All.
reg1 = setup(data = final_dataset, target = target, ignore_features = ignore_features,
             fold=5, fold_shuffle = True,
             train_size = 0.7,
             session_id = seed1)

# compare models
# Candidate shortlist; it is NOT passed to compare_models below, so every
# available model is evaluated. Pass include = learning_algorithems to restrict the run.
learning_algorithems = ['xgboost', 'rf', 'lightgbm', 'et'] # include = learning_algorithems
# n_select = 5 asks for the five top-ranked models.
best = compare_models(n_select = 5)





Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,28.0768,2182.3783,46.7101,0.7408,0.2931,0.2256,11.664
rf,Random Forest Regressor,30.7817,2385.7606,48.837,0.7167,0.3111,0.2517,24.426
xgboost,Extreme Gradient Boosting,40.2851,3382.5263,58.1529,0.5983,0.3761,0.3298,16.564
catboost,CatBoost Regressor,41.0028,3454.3304,58.7671,0.5898,0.3783,0.3367,6.622
knn,K Neighbors Regressor,36.4803,3594.1074,59.9431,0.5732,0.3648,0.2852,0.952
lightgbm,Light Gradient Boosting Machine,47.6496,4373.7793,66.1301,0.4806,0.4275,0.4012,1.092
dt,Decision Tree Regressor,39.3102,4702.3868,68.5608,0.4416,0.4082,0.2929,1.198
gbr,Gradient Boosting Regressor,56.8516,6008.3539,77.5086,0.2865,0.4891,0.4792,20.698
br,Bayesian Ridge,62.0373,7139.2011,84.4854,0.152,0.5272,0.5252,0.942
omp,Orthogonal Matching Pursuit,63.6348,7307.8319,85.4832,0.1321,0.5369,0.5412,0.642


In [None]:
# XGBoost-style hyper-parameters kept for reference. No active statement in this
# cell consumes them; they belong to the optional XGBoost model noted below.
params = dict(
    objective="reg:squarederror",
    tree_method='hist',
    colsample_bytree=0.8,
    learning_rate=0.20,
    max_depth=7,
    alpha=100,
    n_estimators=500,
    rate_drop=0.9,
    skip_drop=0.5,
    subsample=0.8,
    reg_lambda=10,
    min_child_weight=50,
    gamma=10,
    max_delta_step=0,
    seed=123,
)
# Optional XGBoost model using the settings above (disabled):
#   create_model('xgboost', **params)
# If re-enabled, do not bind the result to `xgb`: that name is the imported
# xgboost module.

# LightGBM model with PyCaret's default settings.
lgb = create_model('lightgbm')

In [None]:
# Bayesian hyper-parameter search (scikit-optimize) for the LightGBM model:
# 100 iterations, each scored on 3 folds.
# NOTE(review): `fit_params` is not a named tune_model argument, so it is forwarded
# as an extra keyword; 'objective' and 'alpha' look like estimator settings rather
# than fit-time arguments - confirm this call runs as intended.
tuned_lgb = tune_model(
    lgb,
    fold=3,
    n_iter=100,
    search_library='scikit-optimize',
    search_algorithm='bayesian',
    fit_params={'objective': 'quantile', 'alpha': 0.5},
)

In [None]:
# Display the estimator currently bound to tuned_lgb (the tune_model result).
tuned_lgb

In [None]:
# Show the parameters reported by the tuned estimator's get_params().
tuned_lgb.get_params()


In [3]:
# Create the Extra Trees regressor ('et'); it is the input to the tuning cell below.
et = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.7537,2097.7748,45.8015,0.7463,0.2852,0.2194
1,27.3681,2115.1423,45.9907,0.7439,0.2873,0.2183
2,28.7721,2263.346,47.5746,0.7379,0.3021,0.2351
3,28.2764,2255.4219,47.4913,0.7308,0.2957,0.2276
4,28.2135,2180.2063,46.6927,0.7453,0.2954,0.2276
Mean,28.0768,2182.3783,46.7101,0.7408,0.2931,0.2256
Std,0.4793,68.6648,0.735,0.0058,0.0062,0.0062


In [None]:
# Bayesian hyper-parameter search (scikit-optimize) for the Extra Trees model:
# 100 iterations, each scored on 3 folds.
# The result is stored as tuned_et so it does not overwrite the tuned LightGBM
# model held in tuned_lgb.
tuned_et = tune_model(et, fold = 3, search_library = 'scikit-optimize',
                      search_algorithm = 'bayesian', n_iter = 100)

IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,Fold,MAE,MSE,RMSE,R2,RMSLE,MAPE


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
