## This example contains a sample script for feature processing and launching models

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

In [7]:
# Notebook-wide settings.
RANDOM_STATE: int = 42  # seed used for row sampling and the train/test split
N_JOBS: int = 5  # forwarded to AutoML as n_jobs
TARGET: str = "satisfaction"  # label column of the input CSV

### Data

In [8]:
# Load the training CSV, drop the stray "Unnamed: 0" column, keep a reproducible
# 5,000-row sample and add a constant column `cnst` (the recorded pipeline log
# shows the quasi-constant dropper removing it).
df = (
    pd.read_csv("../data/airlines_train.csv")
    .drop(columns="Unnamed: 0")
    .sample(n=5_000, random_state=RANDOM_STATE)
    .assign(cnst=1)
)
df.info()

# Features are everything except the target; the target is label-encoded to integers.
# NOTE(review): `id` stays in X and is later scaled like any other numeric
# feature — confirm that this is intended.
X = df.drop(columns=TARGET)
y = LabelEncoder().fit_transform(df[TARGET])

# Stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Replace the sampled row labels with a fresh 0..n-1 index on both splits.
X_train, X_test = (part.reset_index(drop=True) for part in (X_train, X_test))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 80638 to 6423
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   id                                 5000 non-null   int64  
 1   Gender                             5000 non-null   object 
 2   Customer Type                      5000 non-null   object 
 3   Age                                5000 non-null   int64  
 4   Type of Travel                     5000 non-null   object 
 5   Class                              5000 non-null   object 
 6   Flight Distance                    5000 non-null   int64  
 7   Inflight wifi service              5000 non-null   int64  
 8   Departure/Arrival time convenient  5000 non-null   int64  
 9   Ease of Online booking             5000 non-null   int64  
 10  Gate location                      5000 non-null   int64  
 11  Food and drink                     5000 non-null   i

In [9]:
import sys; sys.path.append("../"); sys.path.append("../src/")

from src.automl.feature_selection.pipe import PreprocessingPipeline, ValTestsPipeline

In [10]:
# Preprocessing pipeline; "ohe" and "oe" presumably select one-hot and ordinal
# encoding for object columns — confirm against PreprocessingPipeline.
feat_pipe = PreprocessingPipeline(obj_encoders=["ohe", "oe"])
# Validation-tests pipeline built with its default settings.
val_test_pipe = ValTestsPipeline()

[2024-12-02 11:47:19,556] - [  PREPROC   ] - Успешно заданы шаги pipeline
[2024-12-02 11:47:19,560] - [ VAL_TESTS  ] - Успешно заданы шаги pipeline


#### Feature pipeline

In [11]:
print("Initial train data shape", X_train.shape)
# Fit the preprocessing steps on the training split only, then reuse the fitted
# pipeline on the test split.
# NOTE(review): X_train and X_test are overwritten with their transformed
# versions, so re-running this cell without re-running the data-loading cell
# would feed already-transformed frames back into the pipeline.
X_train = feat_pipe.fit_transform(X_train, y_train)
print("Train data shape after pipeline", X_train.shape)
X_test = feat_pipe.transform(X_test)

Initial train data shape (4000, 24)
[2024-12-02 11:47:20,478] - [Pipeline] .. (step 1 of 6) Processing nan_cols_dropper, total=   0.0
[2024-12-02 11:47:20,509] - [Pipeline] ....... (step 2 of 6) Processing nan_imputer, total=   0.0
[2024-12-02 11:47:20,533] - [  PREPROC   ] - Corr features to drop: ['Departure Delay in Minutes']
[2024-12-02 11:47:20,534] - [Pipeline] . (step 3 of 6) Processing corr_cols_dropper, total=   0.0
[2024-12-02 11:47:20,542] - [  PREPROC   ] - QConstant features to drop: ['cnst']
[2024-12-02 11:47:20,546] - [Pipeline] .... (step 4 of 6) Processing qconst_dropper, total=   0.0
[2024-12-02 11:47:20,562] - [Pipeline] .... (step 5 of 6) Processing outlier_capper, total=   0.0
[2024-12-02 11:47:20,573] - [Pipeline] ... (step 6 of 6) Processing feature_encoder, total=   0.0
Train data shape after pipeline (4000, 23)


In [12]:
# Inspect the column names and dtypes produced by the preprocessing pipeline.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 23 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   OneHotEncoder__Gender_Male                         4000 non-null   int16  
 1   OneHotEncoder__Customer Type_disloyal Customer     4000 non-null   int16  
 2   OneHotEncoder__Type of Travel_Personal Travel      4000 non-null   int16  
 3   OneHotEncoder__Class_Eco                           4000 non-null   int16  
 4   OneHotEncoder__Class_Eco Plus                      4000 non-null   int16  
 5   StandardScaler__id                                 4000 non-null   float64
 6   StandardScaler__Age                                4000 non-null   float64
 7   StandardScaler__Flight Distance                    4000 non-null   float64
 8   StandardScaler__Inflight wifi service              4000 non-null   float64
 9   Standard

#### ValTest pipeline

In [13]:
# Add a synthetic feature whose distribution is deliberately shifted between
# train (uniform on [0, 10)) and test (uniform on [5, 15)), so the validation
# tests have a drifting column to detect.
# The values come from a generator seeded with RANDOM_STATE so that
# Restart & Run All reproduces exactly the same column.
rng = np.random.default_rng(RANDOM_STATE)
X_train["bad_feature"] = rng.uniform(0, 10, size=X_train.shape[0])
X_test["bad_feature"] = rng.uniform(5, 15, size=X_test.shape[0])

In [14]:
print("Initial train data shape", X_train.shape)
# X_test is passed as the second positional argument of fit_transform;
# presumably the tests compare train and test feature distributions (the
# recorded log lists a PSI test and an adversarial test) — confirm against
# ValTestsPipeline.
# NOTE(review): X_train and X_test are overwritten here, so this cell is not
# safe to re-run on its own.
X_train = val_test_pipe.fit_transform(X_train, X_test)
print("Train data shape after pipeline", X_train.shape)
X_test = val_test_pipe.transform(X_test)

Initial train data shape (4000, 24)
[2024-12-02 11:47:26,545] - [  PREPROC   ] - Features not passing psi test to drop: ['bad_feature']
[2024-12-02 11:47:26,546] - [Pipeline] .......... (step 1 of 2) Processing PSI_test, total=   0.2
[2024-12-02 11:47:26,646] - [Pipeline] .. (step 2 of 2) Processing Adversarial_test, total=   0.1
Train data shape after pipeline (4000, 23)


In [15]:
# Encoded categorical columns carry the encoder name as a prefix
# (e.g. "OneHotEncoder__Gender_Male"); collect every column with such a prefix.
encoder_prefixes = ("OneHotEncoder", "OrdinalEncoder")
cat_features = [col for col in X_train.columns if col.startswith(encoder_prefixes)]

In [16]:
# Display the encoded categorical columns that are later passed to model.fit.
cat_features

['OneHotEncoder__Gender_Male',
 'OneHotEncoder__Customer Type_disloyal Customer',
 'OneHotEncoder__Type of Travel_Personal Travel',
 'OneHotEncoder__Class_Eco',
 'OneHotEncoder__Class_Eco Plus']

### Model

In [17]:
from src.automl.model import AutoML

In [18]:
# Time budget for tuning each model (unit not shown here; presumably seconds).
TUNING_TIMEOUT = 60
# Task type: one of ["regression", "classification"].
TASK = "classification"
# Evaluation metric: either an sklearn metric or a custom metric.
METRIC = "roc_auc"
# Whether to perform stacking.
STACK = True
# Whether to perform blending.
BLEND = True

In [19]:
# Build the AutoML estimator from the notebook-level settings
# (TASK, METRIC, N_JOBS, TUNING_TIMEOUT, STACK, BLEND).
model = AutoML(
    task=TASK, metric=METRIC,
    n_jobs=N_JOBS, tuning_timeout=TUNING_TIMEOUT,
    stack=STACK, blend=BLEND,
)

In [20]:
# Train on the preprocessed training split without persisting test predictions,
# out-of-fold predictions or fitted models.
# NOTE(review): X_test / y_test are handed to fit as the third and fourth
# positional arguments, and the same X_test is scored in the next cell —
# confirm how AutoML uses them.
model.fit(
    X_train,
    y_train,
    X_test,
    y_test,
    save_test=False,
    save_oof=False,
    save_models=False,
    categorical_features=cat_features,
)

[2024-12-02 11:47:35,259] - [   MODEL    ] - 1 out of 9. LogisticRegression
[2024-12-02 11:47:35,262] - [   START    ] - Working with LogisticRegression
[2024-12-02 11:47:35,265] - [   START    ] - Tuning LogisticRegression
[2024-12-02 11:47:36,048] - [   PARAMS   ] - C=2.782559402207126, metric=0.9149724126682101
[2024-12-02 11:47:36,048] - [BEST PARAMS ] - {'C': 2.782559402207126, 'class_weight': 'balanced', 'max_iter': 1000, 'n_jobs': 5, 'random_state': 42, 'time_series': False}
[2024-12-02 11:47:36,049] - [    END     ] - Tuning LogisticRegression
[2024-12-02 11:47:36,049] - [   START    ] - Fitting LogisticRegression
[2024-12-02 11:47:36,050] - [    FIT     ] - LogisticRegression fold 0
[2024-12-02 11:47:36,059] - [    FIT     ] - LogisticRegression fold 1
[2024-12-02 11:47:36,079] - [    FIT     ] - LogisticRegression fold 2
[2024-12-02 11:47:36,090] - [    FIT     ] - LogisticRegression fold 3
[2024-12-02 11:47:36,101] - [    FIT     ] - LogisticRegression fold 4
[2024-12-02 11:

<src.automl.model.main.AutoML at 0xffff0b3905e0>

In [21]:
# Score the test split; in the recorded output each row holds two values that
# sum to 1, so these appear to be per-class probabilities rather than labels.
model.predict(X_test)

array([[0.99078044, 0.00921956],
       [0.01059638, 0.98940362],
       [0.90615082, 0.09384918],
       ...,
       [0.8982518 , 0.1017482 ],
       [0.01059638, 0.98940362],
       [0.99080655, 0.00919345]])