In [1]:
import numpy as np
import pandas as pd


from mymodels import data_engineer
from mymodels import MyModel

In [2]:
# Single workflow object reused by every later cell (load, format, diagnose,
# optimize, evaluate, explain); seeded with a fixed random_state.
mymodel = MyModel(random_state=0)

In [3]:
# Preprocessing configuration that is later passed to mymodel.load() through
# its data_engineer_pipeline argument.
data_engineer_pipeline = data_engineer(
    missing_values_cols=["CRIM", "ZN", "INDUS", "CHAS", "AGE", "LSTAT"],
    # One imputation method per column listed above (six columns, all "median").
    impute_method=["median"] * 6,
    # No categorical features / encoders are configured for this dataset.
    cat_features=None,
    encode_method=None,
    # Scaling is intentionally left disabled; kept here as a usage example.
    # scale_cols=["CRIM", "ZN"],
    # scale_method=["standard", "minmax"],
    n_jobs=-1,
    verbose=False,
)

In [4]:
# Read the housing table (indexed by its "ID" column) and keep a 300-row
# subsample. The subsample is seeded so that Restart Kernel -> Run All always
# draws the same rows; without random_state every run would train and evaluate
# on different data, making the results below irreproducible. The seed value
# matches the random_state given to MyModel.
data = pd.read_csv(
    "data/housing.zip",
    encoding="utf-8",
    na_values=np.nan,
    index_col=["ID"],
).sample(n=300, random_state=0)

# Register the model, the target/feature columns and the preprocessing
# pipeline with the workflow object.
mymodel.load(
    model_name="rfr",  # presumably a random-forest regressor key - confirm in model_configs.yml
    input_data=data,
    y="MEDV",
    x_list=[
        "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM",
        "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
    ],
    test_ratio=0.3,
    stratify=False,
    data_engineer_pipeline=data_engineer_pipeline,
    model_configs_path="model_configs.yml",
)

In [5]:
# Output settings for the run: target directory, plot format/resolution, and
# whether the optimal model and raw data are saved.
# NOTE(review): exact effect of each flag is defined in mymodels - confirm there.
mymodel.format(
    results_dir="results/housing",
    show=False,
    plot_format="jpg",
    plot_dpi=500,
    save_optimal_model=True,
    save_raw_data=True,
)

In [6]:
# Print per-feature summary statistics; per the emitted banner this is run on
# the training dataset only. sample_k is left unset (None).
mymodel.diagnose(sample_k=None)

Data diagnosis will be performing on TRAINING DATASET ONLY!!!




Numerical Features Statistics:
Feature Name  Count  Null Count Null Ratio    Min    25% Median    75%    Max   Mean    Std Kurtosis Skewness
        CRIM    210          10      4.76%   0.01   0.08   0.33   4.30  88.98   3.92   8.87    45.15     5.63
          ZN    210          11      5.24%   0.00   0.00   0.00   0.00  95.00  10.25  22.98     4.31     2.32
       INDUS    210           7      3.33%   0.74   5.75  10.59  18.10  27.74  11.93   6.92    -1.40     0.04
        CHAS    210           7      3.33%   0.00   0.00   0.00   0.00   1.00   0.08   0.28     7.24     3.03
         NOX    210           0      0.00%   0.39   0.46   0.54   0.63   0.87   0.56   0.12    -0.15     0.68
          RM    210           0      0.00%   4.14   5.89   6.19   6.54   8.78   6.27   0.71     2.41     0.73
         AGE    210          11      5.24%   6.50  45.80  78.70  94.65 100.00  69.63  28.03    -0.91    -0.65
         DIS    210           0      0.00%   1.17   2.01   2.83   4.85  10.59   3.64   2

In [7]:
# Hyper-parameter search: 10 trials, cv=5, 5 parallel jobs.
# NOTE(review): "tpe" presumably selects a Tree-structured Parzen Estimator
# sampler and eval_function=None presumably falls back to the library's
# default score (which is then maximized) - confirm against mymodels.
mymodel.optimize(
    strategy="tpe",
    cv=5,
    trials=10,
    n_jobs=5,
    direction="maximize",
    eval_function=None,
)

  0%|          | 0/10 [00:00<?, ?it/s]

In [9]:
# Report metrics for the tuned model. With show_train=True and dummy=True the
# printed result contains train and test scores for both the model and a
# dummy baseline; eval_metric=None leaves the metric choice to the library.
mymodel.evaluate(
    show_train=True,
    dummy=True,
    eval_metric=None,
)




{
    "model": {
        "test": {
            "R2": 0.8351668740323488,
            "RMSE": 3.3792250544262363,
            "MAE": 2.4551365360927493
        },
        "train": {
            "R2": 0.915391474339811,
            "RMSE": 2.615832846297857,
            "MAE": 1.9876742774741172
        }
    },
    "dummy": {
        "test": {
            "R2": -0.007931638717779688,
            "RMSE": 8.356230439044612,
            "MAE": 6.342074074074073
        },
        "train": {
            "R2": 0.0,
            "RMSE": 8.992965454420421,
            "MAE": 6.378902494331065
        }
    }
}


In [10]:
# Model explanation step: background data taken from the train split and
# SHAP data from the test split, each sampled down to 50 rows.
# NOTE(review): max_display=None presumably means "no cap on displayed
# features" - confirm against mymodels.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=50,
    sample_shap_data_k=50,
    max_display=None,
)

  ax.set_ylim([min_val, max_val])
