In [None]:
from mymodels.data_engineer import data_engineer
from mymodels.pipeline import MyPipeline


# The optional snippets below are kept as `#` comments instead of bare
# triple-quoted strings: a string literal that ends a notebook cell is the
# cell's last expression, so Jupyter would echo it back as cell output.
# Uncomment a snippet to enable it.

# Global settings for font
# import matplotlib.pyplot as plt
# plt.rcParams['font.family'] = 'Times New Roman'

# User-defined pipeline
# (`steps` is a required argument of `sklearn.pipeline.Pipeline`; replace the
# `[...]` placeholder with a list of `(name, transformer)` tuples.)
# from sklearn.pipeline import Pipeline
# self_defined_data_engineer_pipeline = Pipeline(steps=[...])

# For debugging
# import logging
# logging.basicConfig(
#     level = logging.DEBUG,
#     format = "%(asctime)s - %(levelname)s - %(message)s"
# )

In [2]:
# Build the pipeline object that every later cell calls into.
# `random_state` is fixed at 0 and `show` is off; plots are requested as
# 500-dpi JPG files.
mymodel = MyPipeline(
    results_dir="results/obesity",
    random_state=0,
    show=False,
    plot_format="jpg",
    plot_dpi=500,
)

In [None]:
# Load the CSV, naming the target column (`y`), the feature columns
# (`x_list`) and the index column; 30% is passed as the test ratio.
# NOTE(review): "0be1dad" is an unusual column name — confirm it matches the
# header in data/obesity.csv exactly.
mymodel.load(
    file_path="data/obesity.csv",
    y="0be1dad",
    x_list=[
        "Gender", "Age", "Height", "Weight",
        "family_history_with_overweight",
        "FAVC", "FCVC", "NCP", "CAEC", "SMOKE",
        "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS",
    ],
    index_col="id",
    test_ratio=0.3,
    inspect=False,
)

In [4]:
# `data_engineer` returns a `sklearn.pipeline.Pipeline` instance.
# No outlier handling or imputation is requested here (those arguments are
# None). `encode_method` has one entry per `cat_features` entry and
# `scale_method` one per `scale_cols` entry — presumably matched by position;
# confirm against `data_engineer`.
data_engineer_pipeline = data_engineer(
    outlier_cols=None,
    missing_values_cols=None,
    impute_method=None,
    cat_features=["Gender", "CAEC", "CALC", "MTRANS"],
    encode_method=["onehot", "ordinal", "ordinal", "ordinal"],
    scale_cols=["Age", "Height", "Weight"],
    scale_method=["standard", "standard", "standard"],
    n_jobs=5,
    verbose=False,
)

In [None]:
# Run the optimisation step for the "knc" model, handing it the
# preprocessing pipeline built above; 5 CV folds, 10 trials, 5 jobs.
# The history, the optimal parameters and the optimal model are all
# requested to be saved.
mymodel.optimize(
    model_name="knc",
    data_engineer_pipeline=data_engineer_pipeline,
    cv=5,
    trials=10,
    n_jobs=5,
    # For CatBoost ONLY:
    # cat_features=["Gender", "CAEC", "CALC", "MTRANS"],
    optimize_history=True,
    save_optimal_params=True,
    save_optimal_model=True,
)

In [None]:
# Evaluate the pipeline, asking for the raw data to be saved as well.
mymodel.evaluate(save_raw_data=True)

In [None]:
# Explain the model: background data is selected from "train" and SHAP data
# from "test", with a sample size of 100 passed for each; raw data output is
# switched on.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=100,
    sample_shap_data_k=100,
    output_raw_data=True,
)