In [19]:
import numpy as np
import pandas as pd


from mymodels import data_engineer
from mymodels import MyModel

In [20]:
# Create the modelling workflow object with a fixed seed
# (presumably forwarded to the underlying estimator / splitter — verify in mymodels).
mymodel = MyModel(random_state=0)

In [None]:
# Preprocessing pipeline: median-impute the six columns listed below.
# No categorical features are declared, so no encoding method is given.
data_engineer_pipeline = data_engineer(
    missing_values_cols=["CRIM", "ZN", "INDUS", "CHAS", "AGE", "LSTAT"],
    impute_method=["median"] * 6,  # one entry per column in missing_values_cols
    cat_features=None,
    encode_method=None,
    # Optional feature scaling, currently disabled:
    # scale_cols=["CRIM", "ZN"],
    # scale_method=["standard", "minmax"],
    n_jobs=-1,
    verbose=False,
)

In [22]:
# Read the housing table (indexed by "ID") and draw a 300-row subsample.
# The sample is seeded so that Restart & Run All yields the same rows — and
# therefore the same train/test split and metrics — on every run. The seed
# matches the one passed to MyModel above.
data = pd.read_csv(
    "data/housing.zip",
    encoding="utf-8",
    na_values=np.nan,
    index_col=["ID"],
).sample(n=300, random_state=0)

# Register the data, target, feature list and preprocessing pipeline with the
# workflow. "rfr" is looked up by mymodels (presumably a random-forest
# regressor configured in model_configs.yml — verify against that file).
mymodel.load(
    model_name="rfr",
    input_data=data,
    y="MEDV",
    x_list=[
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM",
        "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ],
    test_ratio=0.3,
    stratify=False,
    data_engineer_pipeline=data_engineer_pipeline,
    model_configs_path="model_configs.yml",
)

In [23]:
# Output settings: where results go and how plots are rendered.
# Plots are not shown inline (show=False); they are requested as 500-dpi JPGs.
mymodel.format(
    results_dir="results/housing",
    show=False,
    plot_format="jpg",
    plot_dpi=500,
    save_optimal_model=True,
    save_raw_data=True,
)

In [24]:
# Print descriptive statistics of the features; per the printed notice this
# runs on the training split only. sample_k=None presumably means "use all
# rows" — verify in mymodels.
mymodel.diagnose(sample_k=None)

Data diagnosis will be performing on TRAINING DATASET ONLY!!!




Numerical Features Statistics:
Feature Name  Count  Null Count Null Ratio    Min    25% Median    75%    Max   Mean    Std Kurtosis Skewness
        CRIM    210           8      3.81%   0.01   0.07   0.24   2.22  67.92   3.15   7.61    30.88     4.79
          ZN    210          11      5.24%   0.00   0.00   0.00  20.00  95.00  11.44  22.16     4.10     2.19
       INDUS    210           9      4.29%   0.46   5.19   8.14  18.10  27.74  10.93   6.89    -1.26     0.37
        CHAS    210          10      4.76%   0.00   0.00   0.00   0.00   1.00   0.07   0.26     9.63     3.40
         NOX    210           0      0.00%   0.40   0.45   0.52   0.62   0.87   0.55   0.11    -0.09     0.74
          RM    210           0      0.00%   3.56   5.89   6.21   6.67   8.78   6.32   0.70     1.90     0.48
         AGE    210           7      3.33%   2.90  37.80  73.40  92.80 100.00  65.38  29.84    -1.17    -0.47
         DIS    210           0      0.00%   1.29   2.06   3.42   5.40  12.13   3.94   2

In [25]:
# Hyperparameter search: 10 trials with 5-fold cross-validation, maximizing
# the score. eval_function=None leaves the scoring choice to mymodels
# (presumably a default metric for the task — verify).
mymodel.optimize(
    strategy="tpe",
    cv=5,
    trials=10,
    n_jobs=5,
    direction="maximize",
    eval_function=None,
)

Best trial: 8. Best value: 0.802179: 100%|██████████| 10/10 [00:28<00:00,  2.89s/it]


In [26]:
# Score the tuned model. The report includes train-set metrics (show_train)
# and a dummy baseline (dummy) alongside the test-set metrics.
# eval_metric=None presumably selects the library's default metrics — verify.
mymodel.evaluate(
    show_train=True,
    dummy=True,
    eval_metric=None,
)




{
    "model": {
        "test": {
            "R2": 0.8538595686199106,
            "RMSE": 3.319752524703978,
            "MAE": 2.398369022934521
        },
        "train": {
            "R2": 0.9244567771756836,
            "RMSE": 2.5535573651507937,
            "MAE": 1.70578234511813
        }
    },
    "dummy": {
        "test": {
            "R2": -0.0011006040968644104,
            "RMSE": 8.688791565356667,
            "MAE": 6.28079365079365
        },
        "train": {
            "R2": 0.0,
            "RMSE": 9.29068775234453,
            "MAE": 6.748680272108844
        }
    }
}


In [None]:
# Model explanation: background data is taken from the training split and the
# explained rows from the test split, each subsampled to 50 rows.
# max_display=None presumably shows every feature — verify in mymodels.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=50,
    sample_shap_data_k=50,
    max_display=None,
)