In [1]:
from mymodels.data_engineer import data_engineer
from mymodels.pipeline import MyPipeline


# The optional snippets below are kept as `#` comments instead of bare
# triple-quoted strings. A string literal is an expression statement, and a
# notebook echoes the value of a cell's final expression, so the last string
# used to be printed back as this cell's output.
# Uncomment the snippet you need.

# --- Global settings for font ---
# import matplotlib.pyplot as plt
# plt.rcParams['font.family'] = 'Times New Roman'

# --- User-defined pipeline ---
# from sklearn.pipeline import Pipeline
# self_defined_data_engineer_pipeline = Pipeline(steps=[...])  # `steps` is required

# --- For debugging ---
# import logging
# logging.basicConfig(
#     level = logging.DEBUG,
#     format = "%(asctime)s - %(levelname)s - %(message)s"
# )

  from .autonotebook import tqdm as notebook_tqdm


'\n# For debugging\nimport logging\nlogging.basicConfig(\n    level = logging.DEBUG,\n    format = "%(asctime)s - %(levelname)s - %(message)s"\n)\n'

In [2]:
# Build the pipeline object that all of the following cells operate on.
# Every option is passed by keyword, so the order below is not significant.
mymodel = MyPipeline(
    results_dir="results/titanic",
    random_state=0,
    show=False,
    plot_format="jpg",
    plot_dpi=500,
)

In [3]:
# Point the pipeline at the Titanic CSV, naming the target column (`y`),
# the feature columns (`x_list`) and the index column.
# NOTE(review): `test_ratio` is presumably the fraction held out for testing;
# confirm against `MyPipeline.load`.
mymodel.load(
    file_path="data/titanic.csv",
    y="Survived",
    x_list=[
        "Pclass",
        "Sex",
        "Embarked",
        "Age",
        "SibSp",
        "Parch",
        "Fare",
    ],
    index_col=["PassengerId"],
    test_ratio=0.3,
    inspect=False,
)

In [4]:
# Print the diagnosis report for the loaded data.
# NOTE(review): the recorded run lists no categorical features even though
# `Sex` and `Embarked` are object-typed columns; confirm whether `diagnose`
# has to be told about them explicitly.
mymodel.diagnose(
    sample_k=None,
)


Data diagnosis should be performed on TRAINING DATA ONLY.

DATA DESCRIPTION

X_train shape: (623, 7)

Y_train shape: (623,)
MISSING VALUES DIAGNOSIS

X_train info:
<class 'pandas.core.frame.DataFrame'>
Index: 623 entries, 858 to 685
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    623 non-null    int64  
 1   Sex       623 non-null    object 
 2   Embarked  621 non-null    object 
 3   Age       502 non-null    float64
 4   SibSp     623 non-null    int64  
 5   Parch     623 non-null    int64  
 6   Fare      623 non-null    float64
dtypes: float64(2), int64(3), object(2)
memory usage: 38.9+ KB
None

Y_train info:
<class 'pandas.core.series.Series'>
Index: 623 entries, 858 to 685
Series name: Survived
Non-Null Count  Dtype
--------------  -----
623 non-null    int64
dtypes: int64(1)
memory usage: 9.7 KB
None

Categorical features: []

Numerical features: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

Creating d

In [5]:
# `data_engineer` returns an instance of `sklearn.pipeline.Pipeline`;
# users can define their own pipeline and use it in its place.
#
# The method lists are written in the same order as their column lists
# (presumably paired by position; confirm against `data_engineer`):
#   missing values: "Age" / "mean", "Embarked" / "most_frequent"
#   encoding:       "Sex" / "onehot", "Embarked" / "onehot"
#
# Scaling is left disabled. To enable it, also pass
#   scale_cols = ["Fare"] and scale_method = ["standard"].
data_engineer_pipeline = data_engineer(
    outlier_cols=None,
    missing_values_cols=["Age", "Embarked"],
    impute_method=["mean", "most_frequent"],
    cat_features=["Sex", "Embarked"],
    encode_method=["onehot", "onehot"],
    n_jobs=5,
    verbose=False,
)

In [6]:
# Tune the "xgbc" model using the data-engineering pipeline built above.
# `cat_features` is for CatBoost ONLY, so it stays None here.
mymodel.optimize(
    model_name="xgbc",
    data_engineer_pipeline=data_engineer_pipeline,
    cv=5,
    trials=10,
    n_jobs=5,
    cat_features=None,
    optimize_history=True,
    save_optimal_params=True,
    save_optimal_model=True,
)

Best trial: 3. Best value: 0.820874: 100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


In [7]:
# Report the train/test metrics of the optimized model.
# NOTE(review): `save_raw_data` presumably also writes the underlying data to
# the results directory; confirm against `MyPipeline.evaluate`.
mymodel.evaluate(
    save_raw_data=True,
)

Accuracy of xgbc: 
 {
    "test_accuracy": 0.8395522388059702,
    "test_precision": 0.8383645141313706,
    "test_recall": 0.8395522388059702,
    "test_f1": 0.8374262441964279,
    "test_kappa": 0.6492146596858639,
    "train_accuracy": 0.8860353130016051,
    "train_precision": 0.8865263169171381,
    "train_recall": 0.8860353130016051,
    "train_f1": 0.8847418120847637,
    "train_kappa": 0.7555228846674662
}


In [8]:
# Run the model explanation step: background data is taken from "train",
# SHAP data from "test", with both sample-size arguments set to 50.
# NOTE(review): the exact sampling semantics of the two `*_k` arguments are
# not visible here; confirm against `MyPipeline.explain`.
mymodel.explain(
    select_background_data="train",
    select_shap_data="test",
    sample_background_data_k=50,
    sample_shap_data_k=50,
    output_raw_data=True,
)