In [1]:
# Annual Water Use

# %%

import os, sys
import pandas as pd
from datetime import datetime

from iwateruse.featurize import MultiOneHotEncoder
from iwateruse import data_cleaning, report, splittors, pre_train_utils, make_dataset, figures
from iwateruse import denoise, model_diagnose

import matplotlib.pyplot as plt
from xgboost import plot_importance
import xgboost as xgb

#
import numpy as np
from flopy.plot import styles


import warnings

warnings.filterwarnings('ignore')
xgb.set_config(verbosity=0)

# %%
from iwateruse.model import Model
from iwateruse import targets, weights, pipelines, outliers_utils, estimators
from iwateruse import selection
from pycaret.regression import *


# %%
# =============================
# Setup Training
# =============================
figures_folder = "figs"
model = Model(name='annual_pc', log_file = 'train_log.log',  feature_status_file= r"..\features_status.xlsx")
model.raw_target = 'wu_rate'
model.target = 'per_capita'

datafile = r"C:\work\water_use\ml_experiments\annual_v_0_0\clean_train_db.csv"
df_train = pd.read_csv(datafile)
model.add_training_df( df_train = df_train)
#make_dataset.make_ds_per_capita_basic(model, datafile=datafile)
model.df_train['pop_density']  = model.df_train['pop']/model.df_train['WSA_SQKM']
model.df_train.loc[model.df_train['WSA_SQKM']==0, 'pop_density'] = 0
# add water use
seed1 = 123
seed2 = 456

# %%
model.apply_func(func=targets.compute_per_capita, type='target_func', args=None)

opts = ['pop<=2000', 'per_capita>=500', 'per_capita<=25']
model.apply_func(func=outliers_utils.drop_values, type='outliers_func', opts = opts )
model.apply_func(func = outliers_utils.drop_na_target, type='outliers_func')
model.apply_func(func=None, type='add_features_func', args=None)

# split
model.apply_func(func=splittors.random_split, args={'frac': 0.70, 'seed': seed1})


# =============================
# Prepare the initial estimator
# =============================

features = model.features
target = model.target
final_dataset = model.df_train
final_dataset = final_dataset.drop_duplicates(subset = ['sys_id', 'Year'], keep = 'first')
ignore_features = list(set(final_dataset.columns).difference(set(features + [target])))





In [2]:
# =============================
# Prepare the initial estimator
# =============================

reg1 = setup(data = final_dataset, target = target, ignore_features = ignore_features,
             fold=5, fold_shuffle = True,
             train_size = 0.7)

# compare models
learning_algorithems = ['xgboost', 'rf', 'lightgbm', 'et'] # include = learning_algorithems
#best = compare_models(n_select = 5)





Unnamed: 0,Description,Value
0,session_id,4079
1,Target,per_capita
2,Original Data,"(34338, 122)"
3,Missing Values,True
4,Numeric Features,115
5,Categorical Features,3
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(24036, 133)"


In [None]:
# xgb = create_model('xgboost')

In [None]:
evaluate_model(xgb)

In [3]:
params = {
    'objective': "reg:squarederror",
    'tree_method': 'hist',
    'colsample_bytree': 0.8,
    'learning_rate': 0.20,
    'max_depth': 7,
    'alpha': 100,
    'n_estimators': 500,
    'rate_drop': 0.9,
    'skip_drop': 0.5,
    'subsample': 0.8,
    'reg_lambda': 10,
    'min_child_weight': 50,
    'gamma': 10,
    'max_delta_step': 0,
    'seed': 123
}
xgb = create_model('xgboost', **params)


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,30.4867,2355.9417,48.538,0.6971,0.3189,0.2484
1,30.4984,2332.1287,48.2921,0.7178,0.3175,0.2449
2,29.2002,2094.4082,45.7647,0.7422,0.3159,0.2438
3,30.1912,2205.3201,46.9608,0.7294,0.3086,0.2422
4,29.8922,2215.3477,47.0675,0.7181,0.3106,0.2392
Mean,30.0537,2240.6292,47.3246,0.7209,0.3143,0.2437
Std,0.4814,94.8004,1.004,0.0149,0.004,0.003


In [None]:
tuned_xgb= tune_model(xgb, fold = 3, search_library = 'scikit-optimize',
                      search_algorithm = 'bayesian', n_iter = 100)

In [None]:
lgb = create_model('lightgbm')
tuned_lgb= tune_model(lgb, fold = 3, search_library = 'scikit-optimize',
                      search_algorithm = 'bayesian', n_iter = 100, fit_params = {'objective': 'quantile', 'alpha':0.5})

In [None]:
tuned_lgb

In [None]:
tuned_lgb.get_params()


In [None]:
# tune et
et = create_model('et')

In [None]:
tuned_lgb= tune_model(et, fold = 3, search_library = 'scikit-optimize',
                      search_algorithm = 'bayesian', n_iter = 100)