In [1]:
from mymodels.data_engineer import data_engineer
from mymodels.pipeline import MyPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the pipeline object that every later cell calls into.
# random_state is pinned to 0; plots are requested as 500-dpi JPG files.
# show=False presumably suppresses inline figure display -- confirm in MyPipeline.
mymodel = MyPipeline(
    results_dir="results/obesity",
    random_state=0,
    show=False,
    plot_format="jpg",
    plot_dpi=500,
)

In [3]:
# Load the obesity CSV, naming the target column, the 16 feature columns and
# the index column; 30% of the rows are requested for the test side.
# NOTE(review): the target name "0be1dad" looks like "NObeyesdad" after a
# yes/no -> 1/0 text replacement was applied to the whole file (the label
# mapping printed by this cell also shows "0rmal_Weight"). It is kept as-is
# because it has to match the header in data/obesity.csv -- confirm there
# before renaming anything.
mymodel.load(
    file_path="data/obesity.csv",
    y="0be1dad",
    x_list=[
        "Gender", "Age", "Height", "Weight",
        "family_history_with_overweight",
        "FAVC", "FCVC", "NCP", "CAEC", "SMOKE",
        "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS",
    ],
    index_col="id",
    test_ratio=0.3,
    inspect=False,
)

Label Encoding Mapping:
  0rmal_Weight -> 0
  Insufficient_Weight -> 1
  Obesity_Type_I -> 2
  Obesity_Type_II -> 3
  Obesity_Type_III -> 4
  Overweight_Level_I -> 5
  Overweight_Level_II -> 6


In [4]:
# Print the data diagnosis report; the report itself states that it is meant
# for the training data only and describes X_train / Y_train.
# sample_k=None presumably means "no sub-sampling" -- confirm in diagnose().
mymodel.diagnose(
    sample_k=None,
)


Data diagnosis should be performed on TRAINING DATA ONLY.

DATA DESCRIPTION

X_train shape: (14530, 16)

Y_train shape: (14530,)
MISSING VALUES DIAGNOSIS

X_train info:
<class 'pandas.core.frame.DataFrame'>
Index: 14530 entries, 8462 to 2732
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          14530 non-null  object 
 1   Age                             14530 non-null  float64
 2   Height                          14530 non-null  float64
 3   Weight                          14530 non-null  float64
 4   family_history_with_overweight  14530 non-null  int64  
 5   FAVC                            14530 non-null  int64  
 6   FCVC                            14530 non-null  float64
 7   NCP                             14530 non-null  float64
 8   CAEC                            14530 non-null  object 
 9   SMOKE                           14530 non-null  

In [5]:
# data_engineer() hands back an `sklearn.pipeline.Pipeline` instance.
# No outlier handling or imputation is requested here (all three set to None).
# The *_method lists are presumably matched by position with the column lists
# next to them -- confirm in data_engineer().
data_engineer_pipeline = data_engineer(
    outlier_cols=None,
    missing_values_cols=None,
    impute_method=None,
    cat_features=[
        "Gender",
        "CAEC",
        "CALC",
        "MTRANS",
    ],
    encode_method=[
        "onehot",
        "ordinal",
        "ordinal",
        "ordinal",
    ],
    scale_cols=[
        "Age",
        "Height",
        "Weight",
    ],
    scale_method=[
        "standard",
        "standard",
        "standard",
    ],
    n_jobs=5,
    verbose=False,
)

In [6]:
# Hyper-parameter search for the "knc" model: 10 trials with cv=5, using the
# preprocessing pipeline built in the previous cell. The three trailing flags
# ask for the optimisation history, the best parameters and the best model to
# be kept.
mymodel.optimize(
    model_name="knc",
    data_engineer_pipeline=data_engineer_pipeline,
    cv=5,
    trials=10,
    n_jobs=5,
    # cat_features = ["Gender", "CAEC", "CALC", "MTRANS"],  # For CatBoost ONLY
    optimize_history=True,
    save_optimal_params=True,
    save_optimal_model=True,
)

Best trial: 2. Best value: 0.813998: 100%|██████████| 10/10 [00:10<00:00,  1.02s/it]


In [7]:
# Report train/test metrics for the optimised model (accuracy, precision,
# recall, F1 and kappa are printed). save_raw_data=True presumably also
# writes the underlying data to disk -- confirm in evaluate().
mymodel.evaluate(
    save_raw_data=True,
)

Accuracy of knc: 
 {
    "test_accuracy": 0.8265895953757225,
    "test_precision": 0.8231658620248044,
    "test_recall": 0.8265895953757225,
    "test_f1": 0.8234238446942336,
    "test_kappa": 0.7963138235930491,
    "train_accuracy": 1.0,
    "train_precision": 1.0,
    "train_recall": 1.0,
    "train_f1": 1.0,
    "train_kappa": 1.0
}


In [8]:
# Explain the fitted model: background data is drawn from the training split
# and the explained rows from the test split, 50 rows each.
# NOTE(review): the recorded run of this cell was interrupted by hand at 3/50
# rows (about 5.7 s per row), so the notebook holds no explanation output.
# Re-run to completion, or agree on smaller sample sizes, before sharing.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=50,
    sample_shap_data_k=50,
    output_raw_data=True,
)

  6%|▌         | 3/50 [00:16<04:25,  5.66s/it]


KeyboardInterrupt: 