## This example contains a sample script for feature processing and launching models

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

In [24]:
# Notebook-wide settings.
RANDOM_STATE = 42  # seed reused for row sampling and the train/test split
N_JOBS = 5  # forwarded to AutoML as `n_jobs`
TARGET = "satisfaction"  # label column separated from the features below

### Data

In [25]:
# Read the training CSV, discard the "Unnamed: 0" column (presumably a saved index),
# keep a reproducible 2,000-row sample and attach a constant `cnst` column.
df = (
    pd.read_csv("../data/airlines_train.csv")
    .drop(columns="Unnamed: 0")
    .sample(n=2_000, random_state=RANDOM_STATE)
    .assign(cnst=1)
)
df.info()

# Encode the target labels as integers; everything except the target is a feature.
y = LabelEncoder().fit_transform(df[TARGET])
X = df.drop(columns=TARGET)

# Stratified 80/20 split so both parts keep the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Replace the sampled row labels with a fresh 0..n-1 index on both feature frames.
X_train, X_test = (part.reset_index(drop=True) for part in (X_train, X_test))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 80638 to 54347
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 2000 non-null   int64  
 1   Gender                             2000 non-null   object 
 2   Customer Type                      2000 non-null   object 
 3   Age                                2000 non-null   int64  
 4   Type of Travel                     2000 non-null   object 
 5   Class                              2000 non-null   object 
 6   Flight Distance                    2000 non-null   int64  
 7   Inflight wifi service              2000 non-null   int64  
 8   Departure/Arrival time convenient  2000 non-null   int64  
 9   Ease of Online booking             2000 non-null   int64  
 10  Gate location                      2000 non-null   int64  
 11  Food and drink                     2000 non-null   

In [26]:
import sys

# Extend the module search path so the project's `src` package can be imported from this notebook.
sys.path.extend(["../", "../src/"])

from src.automl.feature_selection.pipe import PreprocessingPipeline, ValTestsPipeline

In [27]:
# Feature preprocessing pipeline; "ohe" / "oe" presumably select one-hot and ordinal
# encoders for object columns (TODO confirm against PreprocessingPipeline).
feat_pipe = PreprocessingPipeline(obj_encoders=["ohe", "oe"])
# Train-vs-test validation pipeline, created with its default settings.
val_test_pipe = ValTestsPipeline()

[2024-12-02 09:45:10,158] - [  PREPROC   ] - Успешно заданы шаги pipeline
[2024-12-02 09:45:10,160] - [ VAL_TESTS  ] - Успешно заданы шаги pipeline


#### Feature pipeline

In [28]:
# Fit the preprocessing steps on the training split only, then apply the fitted
# pipeline to the test split.
# NOTE: X_train / X_test are overwritten, so re-running this cell would feed
# already-transformed frames back into the pipeline.
print("Initial train data shape", X_train.shape)
X_train = feat_pipe.fit_transform(X_train, y_train)
print("Train data shape after pipeline", X_train.shape)
X_test = feat_pipe.transform(X_test)

Initial train data shape (1600, 24)
[2024-12-02 09:45:10,635] - [Pipeline] .. (step 1 of 6) Processing nan_cols_dropper, total=   0.0
[2024-12-02 09:45:10,658] - [Pipeline] ....... (step 2 of 6) Processing nan_imputer, total=   0.0
[2024-12-02 09:45:10,673] - [  PREPROC   ] - Corr features to drop: ['Arrival Delay in Minutes']
[2024-12-02 09:45:10,675] - [Pipeline] . (step 3 of 6) Processing corr_cols_dropper, total=   0.0
[2024-12-02 09:45:10,685] - [  PREPROC   ] - QConstant features to drop: ['cnst']
[2024-12-02 09:45:10,687] - [Pipeline] .... (step 4 of 6) Processing qconst_dropper, total=   0.0
[2024-12-02 09:45:10,703] - [Pipeline] .... (step 5 of 6) Processing outlier_capper, total=   0.0
[2024-12-02 09:45:10,712] - [Pipeline] ... (step 6 of 6) Processing feature_encoder, total=   0.0
Train data shape after pipeline (1600, 23)


In [29]:
# Summarise the transformed training frame (columns, non-null counts, dtypes).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 23 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   OneHotEncoder__Gender_Male                         1600 non-null   int16  
 1   OneHotEncoder__Customer Type_disloyal Customer     1600 non-null   int16  
 2   OneHotEncoder__Type of Travel_Personal Travel      1600 non-null   int16  
 3   OneHotEncoder__Class_Eco                           1600 non-null   int16  
 4   OneHotEncoder__Class_Eco Plus                      1600 non-null   int16  
 5   StandardScaler__id                                 1600 non-null   float64
 6   StandardScaler__Age                                1600 non-null   float64
 7   StandardScaler__Flight Distance                    1600 non-null   float64
 8   StandardScaler__Inflight wifi service              1600 non-null   float64
 9   Standard

#### ValTest pipeline

In [30]:
## Add a synthetic feature whose distribution is deliberately shifted between
## train (uniform on [0, 10)) and test (uniform on [5, 15)).
# A generator seeded with RANDOM_STATE keeps the column identical across kernel
# restarts; the unseeded global np.random state produced different values on every run.
rng = np.random.default_rng(RANDOM_STATE)
X_train["bad_feature"] = rng.uniform(0, 10, size=X_train.shape[0])
X_test["bad_feature"] = rng.uniform(5, 15, size=X_test.shape[0])

In [31]:
# The test frame is passed in the second (`y`) position of fit_transform so the
# pipeline can compare train against test.
# NOTE: X_train / X_test are overwritten, so this cell is not idempotent on re-run.
print("Initial train data shape", X_train.shape)
X_train = val_test_pipe.fit_transform(X_train, X_test)
print("Train data shape after pipeline", X_train.shape)
# Apply the fitted pipeline to the test frame as well.
X_test = val_test_pipe.transform(X_test)

Initial train data shape (1600, 24)
[2024-12-02 09:45:12,925] - [  PREPROC   ] - Features not passing psi test to drop: ['bad_feature']
[2024-12-02 09:45:12,926] - [Pipeline] .......... (step 1 of 2) Processing PSI_test, total=   0.2
[2024-12-02 09:45:12,946] - [Pipeline] .. (step 2 of 2) Processing Adversarial_test, total=   0.0
Train data shape after pipeline (1600, 23)


In [32]:
# Encoded categorical columns are recognised by the encoder name prefixed to the column.
encoder_prefixes = ("OneHotEncoder", "OrdinalEncoder")
cat_features = [name for name in X_train.columns if name.startswith(encoder_prefixes)]

In [33]:
# Show the detected categorical columns.
cat_features

['OneHotEncoder__Gender_Male',
 'OneHotEncoder__Customer Type_disloyal Customer',
 'OneHotEncoder__Type of Travel_Personal Travel',
 'OneHotEncoder__Class_Eco',
 'OneHotEncoder__Class_Eco Plus']

### Model

In [34]:
from src.automl.model import AutoML

In [35]:
# Settings forwarded to the AutoML constructor.
TUNING_TIMEOUT = 30 # time for tuning each model
TASK = "classification" # one of ["regression", "classification"]
METRIC = "roc_auc" # either an sklearn metric or a custom metric
STACK = True # whether to perform stacking
BLEND = True # whether to perform blending

In [36]:
# Gather the configured settings in one mapping and pass them to AutoML by keyword.
automl_settings = {
    "task": TASK,
    "metric": METRIC,
    "n_jobs": N_JOBS,
    "tuning_timeout": TUNING_TIMEOUT,
    "stack": STACK,
    "blend": BLEND,
}
model = AutoML(**automl_settings)

In [None]:
# All three save_* flags are disabled for this example run.
save_flags = dict(save_test=False, save_oof=False, save_models=False)

# The test split is handed to fit together with the training split.
model.fit(
    X_train,
    y_train,
    X_test,
    y_test,
    categorical_features=cat_features,
    **save_flags,
)

[2024-12-02 09:45:19,244] - [   MODEL    ] - 1 out of 9. LogisticRegression
[2024-12-02 09:45:19,247] - [   START    ] - Working with LogisticRegression
[2024-12-02 09:45:19,249] - [   START    ] - Tuning LogisticRegression
[2024-12-02 09:45:19,903] - [   PARAMS   ] - C=0.3593813663804626, metric=0.9108633449988233
[2024-12-02 09:45:19,904] - [BEST PARAMS ] - {'C': 0.3593813663804626, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': 5, 'random_state': 42, 'time_series': False}
[2024-12-02 09:45:19,904] - [    END     ] - Tuning LogisticRegression
[2024-12-02 09:45:19,904] - [   START    ] - Fitting LogisticRegression
[2024-12-02 09:45:19,905] - [    FIT     ] - LogisticRegression fold 0
[2024-12-02 09:45:19,913] - [    FIT     ] - LogisticRegression fold 1
[2024-12-02 09:45:19,928] - [    FIT     ] - LogisticRegression fold 2
[2024-12-02 09:45:19,933] - [    FIT     ] - LogisticRegression fold 3
[2024-12-02 09:45:19,939] - [    FIT     ] - LogisticRegression fold 4
[2024-12-02 0