In [6]:
import numpy as np
import pandas as pd


from mymodels import data_engineer
from mymodels import MyModel

In [7]:
# Single model wrapper reused by every later cell. random_state is pinned to 0
# (presumably so that splitting and tuning are repeatable — confirm in mymodels).
mymodel = MyModel(random_state=0)

In [8]:
# Preprocessing pipeline that is handed to mymodel.load() in a later cell.
# Outlier handling and imputation are switched off (every option is None).
#
# The encoding / scaling options are kept commented out for reference.
# NOTE(review): presumably they are disabled because the categorical columns are
# passed to the model directly through `cat_features` in mymodel.load() — confirm.
data_engineer_pipeline = data_engineer(
    outlier_cols=None,
    missing_values_cols=None,
    impute_method=None,
    # cat_features=["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC", "CALC", "MTRANS"],
    # encode_method=["ordinal", "ordinal", "ordinal", "ordinal", "ordinal", "ordinal", "ordinal", "ordinal"],
    # scale_cols=["Age", "Height", "Weight"],
    # scale_method=["standard", "standard", "standard"],
    n_jobs=5,
    verbose=False,
)

In [9]:
# Predictor columns, in the order they are passed to mymodel.load().
obesity_features = [
    "Gender", "Age", "Height", "Weight",
    "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC", "SMOKE",
    "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS",
]

# Subset of the predictors that is declared categorical to the model.
obesity_cat_features = [
    "Gender", "family_history_with_overweight", "FAVC", "CAEC",
    "SMOKE", "SCC", "CALC", "MTRANS",
]

# Training table, read from the zipped CSV and indexed by its "id" column.
data = pd.read_csv(
    "data/obesity.zip",
    encoding="utf-8",
    na_values=np.nan,
    index_col=["id"],
)

# Register the data, target, predictors and preprocessing pipeline with the model.
mymodel.load(
    # NOTE(review): "catc" is presumably the CatBoost classifier entry in
    # model_configs.yml — confirm against that file.
    model_name="catc",
    input_data=data,
    y="NObeyesdad",
    x_list=obesity_features,
    test_ratio=0.3,
    stratify=False,
    data_engineer_pipeline=data_engineer_pipeline,
    cat_features=obesity_cat_features,
    model_configs_path="model_configs.yml",
)



In [10]:
# Output settings for everything the later cells produce: results go under
# results/obesity, plots are written as 300-dpi JPGs and not shown inline.
# The three save_* switches are all enabled.
# NOTE(review): exactly which files each save_* flag writes is defined in
# mymodels — confirm there.
mymodel.format(
    results_dir="results/obesity",
    show=False,
    plot_format="jpg",
    plot_dpi=300,
    save_optimal_model=True,
    save_raw_data=True,
    save_shap_values=True,
)

In [11]:
# Data diagnostics: the recorded output lists per-feature statistics for the
# categorical and the numerical columns.
# NOTE(review): sample_k=None presumably means "use all rows" — confirm.
mymodel.diagnose(sample_k=None)




Categorical Features Statistics:
                  Feature Name  Count  Null Count Null Ratio  Unique Count Unique Ratio
                        Gender  14530           0      0.00%             2        0.01%
family_history_with_overweight  14530           0      0.00%             2        0.01%
                          FAVC  14530           0      0.00%             2        0.01%
                          CAEC  14530           0      0.00%             4        0.03%
                         SMOKE  14530           0      0.00%             2        0.01%
                           SCC  14530           0      0.00%             2        0.01%
                          CALC  14530           0      0.00%             3        0.02%
                        MTRANS  14530           0      0.00%             5        0.03%

Numerical Features Statistics:
Feature Name  Count  Null Count Null Ratio   Min   25% Median    75%    Max  Mean   Std Kurtosis Skewness
         Age  14530           0     

In [12]:
# Hyper-parameter search: 100 trials, 5-fold CV, 5 parallel jobs, maximizing the
# score. The recorded run took about 22 minutes (see the progress bar below),
# so expect a full re-run of this cell to be slow.
# NOTE(review): "tpe" is presumably a Tree-structured Parzen Estimator sampler,
# and eval_function=None presumably falls back to a library default — confirm.
mymodel.optimize(
    strategy="tpe",
    cv=5,
    trials=100,
    n_jobs=5,
    direction="maximize",
    eval_function=None,
)

Best trial: 78. Best value: 0.909911: 100%|██████████| 100/100 [22:00<00:00, 13.20s/it]


In [13]:
# Score the tuned model. With these switches the recorded output reports both
# "test" and "train" metrics, for the model and for a "dummy" baseline.
# NOTE(review): eval_metric=None presumably selects the default metric set
# (accuracy, precision, recall, F1, kappa in the recorded output) — confirm.
mymodel.evaluate(
    show_train=True,
    dummy=True,
    eval_metric=None,
)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  plt.tight_layout()


{
    "model": {
        "test": {
            "Overall Accuracy": 0.9083172768143867,
            "Precision": 0.9083154356310408,
            "Recall": 0.9083172768143867,
            "F1": 0.9082729356235455,
            "Kappa": 0.8924475626415974
        },
        "train": {
            "Overall Accuracy": 0.9530626290433586,
            "Precision": 0.9529751820665949,
            "Recall": 0.9530626290433586,
            "F1": 0.9529726529980629,
            "Kappa": 0.9449447296078889
        }
    },
    "dummy": {
        "test": {
            "Overall Accuracy": 0.1920359666024406,
            "Precision": 0.03687781246893368,
            "Recall": 0.1920359666024406,
            "F1": 0.061873657342812215,
            "Kappa": 0.0
        },
        "train": {
            "Overall Accuracy": 0.1961459050240881,
            "Precision": 0.038473216057718584,
            "Recall": 0.1961459050240881,
            "F1": 0.0643286339837343,
            "Kappa": 0.0
        }
  

In [14]:
# Model explanation: background data is taken from the training split and the
# explained data from the test split, with k=50 for each.
# NOTE(review): the *_k arguments presumably cap the number of sampled rows —
# confirm in mymodels.
# The recorded output notes that PDP is not produced for multi-class classifiers.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=50,
    sample_shap_data_k=50,
)

  fig = pl.figure(figsize=figsize)
PDP is not supported for multi-class classifier currently.
Random forest and Decision tree in sklearn for binary classification
are not supported either.



In [15]:
# Columns the prediction input is restricted to, in this exact order.
pred_feature_cols = [
    "Gender", "Age", "Height", "Weight",
    "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC", "SMOKE",
    "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS",
]

# Read the unlabeled table (indexed by "id") and keep only the predictor columns.
data_pred = pd.read_csv(
    "data/obesity_test.csv",
    encoding="utf-8",
    na_values=np.nan,
    index_col=["id"],
).loc[:, pred_feature_cols]

# Predict, label the result with the target column name, and write it out
# together with its index.
# NOTE(review): assumes predict() returns a pandas Series-like object and that
# results/obesity already exists at this point — confirm.
y_pred = mymodel.predict(data=data_pred)
y_pred.name = "NObeyesdad"
y_pred.to_csv(
    "results/obesity/prediction.csv",
    encoding="utf-8",
    index=True,
)