In [1]:
import numpy as np
import pandas as pd
from mymodels.data_engineer import data_engineer
from mymodels.pipeline import MyPipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
mymodel = MyPipeline(
    results_dir = "results/housing",
    random_state = 0,
    show = False,
    plot_format = "jpg",
    plot_dpi = 500
)

In [3]:
data = pd.read_csv("data/housing.csv", encoding = "utf-8", 
                   na_values = np.nan, index_col = ["ID"])
mymodel.load(
    input_data = data,
    y = "MEDV",
    x_list = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", \
              "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"],
    test_ratio = 0.3,
    inspect = True
)


Total samples: 506

Train X data info:
<class 'pandas.core.frame.DataFrame'>
Index: 354 entries, H-142 to H-173
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     342 non-null    float64
 1   ZN       340 non-null    float64
 2   INDUS    339 non-null    float64
 3   CHAS     340 non-null    float64
 4   NOX      354 non-null    float64
 5   RM       354 non-null    float64
 6   AGE      340 non-null    float64
 7   DIS      354 non-null    float64
 8   RAD      354 non-null    int64  
 9   TAX      354 non-null    int64  
 10  PTRATIO  354 non-null    float64
 11  B        354 non-null    float64
 12  LSTAT    337 non-null    float64
dtypes: float64(11), int64(2)
memory usage: 38.7+ KB
None

Train X data head:
           CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD  TAX  \
ID                                                                          
H-142   1.62864   0.0  21.89   0.0  0.624  5.019  10

In [4]:
mymodel.diagnose(sample_k=None)


Data diagnosis should be performed on TRAINING DATA ONLY.

DATA DESCRIPTION

X_train shape: (354, 13)

Y_train shape: (354,)
MISSING VALUES DIAGNOSIS

X_train info:
<class 'pandas.core.frame.DataFrame'>
Index: 354 entries, H-142 to H-173
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     342 non-null    float64
 1   ZN       340 non-null    float64
 2   INDUS    339 non-null    float64
 3   CHAS     340 non-null    float64
 4   NOX      354 non-null    float64
 5   RM       354 non-null    float64
 6   AGE      340 non-null    float64
 7   DIS      354 non-null    float64
 8   RAD      354 non-null    int64  
 9   TAX      354 non-null    int64  
 10  PTRATIO  354 non-null    float64
 11  B        354 non-null    float64
 12  LSTAT    337 non-null    float64
dtypes: float64(11), int64(2)
memory usage: 38.7+ KB
None

Y_train info:
<class 'pandas.core.series.Series'>
Index: 354 entries, H-142 to H-173
Series name: M



In [5]:
# Return an instance of `sklearn.pipeline.Pipeline` object
# User can define their own pipeline
data_engineer_pipeline = data_engineer(
    outlier_cols = None,
    missing_values_cols = ["CRIM", "ZN", "INDUS", "CHAS", "AGE", "LSTAT"],
    impute_method = ["median", "median", "median", "median", "median", "median"],
    cat_features = None,
    encode_method = None,
    # scale_cols = ["CRIM", "ZN"],
    # scale_method = ["standard", "minmax"],
    n_jobs = -1,
    verbose = False
)

In [6]:
mymodel.optimize(
    model_name = "mlpr",
    data_engineer_pipeline = data_engineer_pipeline,
    cv = 5,
    trials = 10,
    n_jobs = -1,
    # cat_features = None,
    optimize_history = True,
    save_optimal_params = True,
    save_optimal_model = True
)

The Scaler is recommended for:
    - LinearRegression
    - LogisticRegression
    - SVR
    - SVC
    - KNR
    - KNC
    - MLPRegressor
    - MLPClassifier

Best trial: 8. Best value: 0.626241: 100%|██████████| 10/10 [01:02<00:00,  6.28s/it]


In [7]:
mymodel.evaluate(
    dummy = True,
    save_raw_data = True
)

ValueError: Found input variables with inconsistent numbers of samples: [354, 152]

In [None]:
mymodel.explain(
    select_background_data = "train",
    select_shap_data = "test",
    sample_background_data_k = 50,
    sample_shap_data_k = 50,
    output_raw_data = True
)