## This example contains a sample script for launching models.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Seed reused for the synthetic data and for the train/test split, so reruns match.
RANDOM_STATE = 42
# Degree of parallelism, handed to AutoML as `n_jobs`.
N_JOBS = 5

### Data

In [3]:
# Synthetic binary-classification problem: 500 rows and 20 numeric features,
# of which 10 are informative and 5 are redundant.
features, y = make_classification(
    n_samples=500,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_classes=2,
    random_state=RANDOM_STATE,
)
# Wrap the raw array in a DataFrame so columns can be addressed by name later on.
column_names = [f"some_col_{i}" for i in range(features.shape[1])]
X = pd.DataFrame(features, columns=column_names)

In [4]:
# Treat the first two columns as categorical by ordinal-encoding them to int32 codes.
# X is updated in place, so these two columns no longer hold their original floats.
cat_columns = ["some_col_0", "some_col_1"]
ordinal_encoder = OrdinalEncoder(dtype=np.int32)
X[cat_columns] = ordinal_encoder.fit_transform(X[cat_columns])

In [5]:
# Summary statistics, rendered as a rich table.
display(X.describe())
# DataFrame.info() prints its report itself and returns None, so call it directly;
# wrapping it in display() only adds a stray "None" to the cell output.
X.info()

Unnamed: 0,some_col_0,some_col_1,some_col_2,some_col_3,some_col_4,some_col_5,some_col_6,some_col_7,some_col_8,some_col_9,some_col_10,some_col_11,some_col_12,some_col_13,some_col_14,some_col_15,some_col_16,some_col_17,some_col_18,some_col_19
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,249.5,249.5,0.436686,0.526321,-0.544381,0.462955,0.010047,0.579898,-0.683287,-0.075303,0.033441,-0.93933,-0.09288,0.029436,0.261796,-0.032766,0.001575,-0.416248,-0.345848,0.211322
std,144.481833,144.481833,4.116657,2.023436,1.976744,1.920844,1.050412,1.858028,2.447539,1.052274,2.175533,3.943875,2.032104,2.085403,3.780237,0.98491,1.040952,1.986714,1.994773,5.662579
min,0.0,0.0,-11.73767,-5.444744,-7.76214,-5.318966,-3.688365,-5.544137,-9.415962,-2.773815,-5.587234,-10.475633,-6.307616,-6.723926,-9.564439,-2.712613,-3.221016,-6.778668,-5.55729,-18.912843
25%,124.75,124.75,-2.436978,-0.828924,-1.835585,-0.795271,-0.689097,-0.524035,-2.31081,-0.776885,-1.609301,-3.553121,-1.504029,-1.376666,-2.312293,-0.688044,-0.700489,-1.883092,-1.774041,-4.206106
50%,249.5,249.5,0.44653,0.630676,-0.507281,0.544779,-0.02533,0.524006,-0.718819,-0.064861,0.012494,-1.106477,-0.004774,-0.185116,0.332309,-0.043297,0.023814,-0.510957,-0.38123,0.300956
75%,374.25,374.25,3.484881,1.958799,0.90656,1.775366,0.725498,1.785628,1.029727,0.59919,1.507359,1.577966,1.268629,1.315163,2.755105,0.690443,0.738597,0.912333,1.009416,4.462904
max,499.0,499.0,11.376024,6.277344,3.928934,5.614516,3.529055,5.641164,5.360797,3.117681,7.697813,10.775108,6.439078,7.712351,12.375833,2.721912,2.943048,5.16565,5.549083,18.151846


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   some_col_0   500 non-null    int32  
 1   some_col_1   500 non-null    int32  
 2   some_col_2   500 non-null    float64
 3   some_col_3   500 non-null    float64
 4   some_col_4   500 non-null    float64
 5   some_col_5   500 non-null    float64
 6   some_col_6   500 non-null    float64
 7   some_col_7   500 non-null    float64
 8   some_col_8   500 non-null    float64
 9   some_col_9   500 non-null    float64
 10  some_col_10  500 non-null    float64
 11  some_col_11  500 non-null    float64
 12  some_col_12  500 non-null    float64
 13  some_col_13  500 non-null    float64
 14  some_col_14  500 non-null    float64
 15  some_col_15  500 non-null    float64
 16  some_col_16  500 non-null    float64
 17  some_col_17  500 non-null    float64
 18  some_col_18  500 non-null    float64
 19  some_col

None

In [6]:
# Hold out 20% of the rows for evaluation; stratify on y so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

### Models

In [8]:
# add the parent directory to the import path so that the `src` package can be imported
import sys; sys.path.append("../")

# AutoML model
from src.automl.model import AutoML

# custom metric (optional)
# from src.automl.metrics import RocAuc

# for proper logging
from src.automl.loggers import enable_logging_to_dir

In [11]:
# to store logs in file
enable_logging_to_dir()

! ls -l ml_data/2024_11_29___08_05_59

total 0
-rw-r--r-- 1 vscode vscode 0 Nov 29 08:06 error.log
-rw-r--r-- 1 vscode vscode 0 Nov 29 08:06 logs.log


In [12]:
# constants for the run

# time budget for tuning each model
TUNING_TIMEOUT = 30
# one of ["regression", "classification"]
TASK = "classification"
# either an sklearn metric or a custom metric
METRIC = "roc_auc"
# whether to perform stacking
STACK = True
# whether to perform blending
BLEND = True

In [13]:
# Gather the run settings in one mapping, then build the AutoML instance from it.
automl_settings = {
    "task": TASK,
    "metric": METRIC,
    "n_jobs": N_JOBS,
    "tuning_timeout": TUNING_TIMEOUT,
    "stack": STACK,
    "blend": BLEND,
}
model = AutoML(**automl_settings)

In [14]:
# Options for the run: keep test predictions, out-of-fold predictions and fitted
# models, and tell AutoML which columns are categorical.
fit_options = {
    "save_test": True,
    "save_oof": True,
    "save_models": True,
    "categorical_features": cat_columns,
}
# NOTE(review): the hold-out split is passed to fit() as well — confirm it is only
# used for saving test predictions and not for tuning or blending.
model.fit(X_train, y_train, X_test, y_test, **fit_options)

[2024-11-29 08:06:25,892] - [   MODEL    ] - 1 out of 9. LogisticRegression
[2024-11-29 08:06:25,895] - [   START    ] - Working with LogisticRegression
[2024-11-29 08:06:25,897] - [   START    ] - Tuning LogisticRegression
[2024-11-29 08:06:27,615] - [   PARAMS   ] - C=0.046415888336127774, metric=0.8978427141963727
[2024-11-29 08:06:27,616] - [BEST PARAMS ] - {'C': 0.046415888336127774, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': 5, 'random_state': 42, 'time_series': False}
[2024-11-29 08:06:27,617] - [    END     ] - Tuning LogisticRegression
[2024-11-29 08:06:27,617] - [   START    ] - Fitting LogisticRegression
[2024-11-29 08:06:27,619] - [    FIT     ] - LogisticRegression fold 0
[2024-11-29 08:06:27,695] - [    FIT     ] - LogisticRegression fold 1
[2024-11-29 08:06:27,757] - [    FIT     ] - LogisticRegression fold 2
[2024-11-29 08:06:27,815] - [    FIT     ] - LogisticRegression fold 3
[2024-11-29 08:06:27,880] - [    FIT     ] - LogisticRegression fold 4
[2024-11-

<src.automl.model.main.AutoML at 0xffff26ea18b0>

In [16]:
# Score the hold-out set using column 1 of the 2-D prediction output
# (presumably the positive-class probability — TODO confirm).
test_predictions = model.predict(X_test)
roc_auc_score(y_test, test_predictions[:, 1])

0.9704000000000002

In [17]:
!tree ml_data/2024_11_29___08_05_59

[01;34mml_data/2024_11_29___08_05_59[00m
├── [01;34mBlender[00m
│   ├── Blender.joblib
│   ├── Blender.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mCatBoostClassification[00m
│   ├── CatBoostClassification.joblib
│   ├── CatBoostClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mExtraTreesClassification[00m
│   ├── ExtraTreesClassification.joblib
│   ├── ExtraTreesClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mLightGBMClassification[00m
│   ├── LightGBMClassification.joblib
│   ├── LightGBMClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mLogisticRegression[00m
│   ├── LogisticRegression.joblib
│   ├── LogisticRegression.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mRandomForestClassification[00m
│   ├── RandomForestClassification.joblib
│   ├── RandomForestClassification.yaml
│   ├── oof_preds.csv
│   └── test_preds.csv
├── [01;34mStacker[00m
│   ├── Stacker.j