<a href="https://colab.research.google.com/github/dkmachinelearning/dkmachinelearning/blob/main/SKLearnPipeline/Pipeline_excercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Basecode from https://blog.prokulski.science/2020/10/10/pipeline-w-scikit-learn/
import pandas as pd

# być może coś narysujemy
import matplotlib.pyplot as plt
import seaborn as sns

import time

In [None]:
from sklearn.model_selection import train_test_split

# modele
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# preprocessing
## zmienne ciągłe
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
## zmienne kategoryczne
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# dodatkowe modele spoza sklearn
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# The raw file has no header row, so the column names are supplied here.
col_names = [
    'age', 'work_class', 'final_weight', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'year_income',
]

# Read the data and clean it in a single chain:
#   * fields are separated by a comma AND a space, hence sep=', '
#   * "?" is treated as a missing value
#   * 'final_weight' is discarded right away (per the original author this
#     follows from an EDA that is not part of this notebook)
#   * rows with missing values are removed to keep the example simple
adult_dataset = (
    pd.read_csv("data/adult.data",
                engine='python', sep=', ',
                header=None, names=col_names,
                na_values="?")
    .drop('final_weight', axis=1)
    .dropna()
)

In [None]:
# Display the dtype of every remaining column; this shows which features are
# numeric (int64) and which are categorical strings (object).
adult_dataset.dtypes

# Output recorded by the original author:
## age                int64
## work_class        object
## education         object
## education_num      int64
## marital_status    object
## occupation        object
## relationship      object
## race              object
## sex               object
## capital_gain       int64
## capital_loss       int64
## hours_per_week     int64
## native_country    object
## year_income       object
## dtype: object

In [None]:
# Split into train/test sets: 'year_income' is the target, every other column
# is a feature. 30% of the rows are held out; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    adult_dataset.drop(columns='year_income'),
    adult_dataset['year_income'],
    random_state=42,
    test_size=0.3,
)

In [None]:
# Names of the numeric feature columns, picked by dtype.
cols_numerical = X_train.select_dtypes(include=['int64', 'float64']).columns

# Sub-pipeline applied to the numeric columns. The step name 'num_trans' is
# the handle used later to swap the scaler through set_params.
transformer_numerical = Pipeline([('num_trans', StandardScaler())])

In [None]:
# Names of the categorical feature columns, listed explicitly.
cols_categorical = [
    'work_class', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country',
]

# Sub-pipeline applied to the categorical columns. The step name 'cat_trans'
# is the handle used later to swap the encoder through set_params.
transformer_categorical = Pipeline([('cat_trans', OneHotEncoder())])

In [None]:
# Data pre-processor: routes each sub-pipeline to its own subset of columns.
preprocessor = ColumnTransformer([
    ('numerical', transformer_numerical, cols_numerical),
    ('categorical', transformer_categorical, cols_categorical),
])

In [None]:
# Full model: pre-processing followed by a random-forest classifier
# with default hyperparameters.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

In [None]:
# Fit the whole pipeline (pre-processing + classifier) on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out test rows.
pipe.predict(X_test)

In [None]:
# Predicted class probabilities for the held-out test rows.
pipe.predict_proba(X_test)

In [None]:
# Score the fitted pipeline on the test split (for scikit-learn classifiers
# `score` reports mean accuracy).
pipe.score(X_test, y_test)

In [None]:
# NOTE: a stray bare `Python` token (copy-paste residue from the source blog's
# code widget) used to open this cell and raised NameError; it has been removed.

# Candidate classifiers to benchmark.
classifiers = [
    DummyClassifier(strategy='stratified'),
    LogisticRegression(max_iter=500),  # hyperparameters can be passed here
    # n_neighbors=2; the original author chose 2 "because there are two classes".
    # NOTE(review): k is not tied to the number of classes - consider tuning it.
    KNeighborsClassifier(2),
    ExtraTreeClassifier(),
    RandomForestClassifier(),
    SVC(),
    XGBClassifier(),
    CatBoostClassifier(silent=True),
    LGBMClassifier(verbose=-1)
]

# Candidate transformers for the numeric columns.
scalers = [StandardScaler(), MinMaxScaler(), Normalizer()]

# Candidate transformers for the categorical columns.
cat_transformers = [OrdinalEncoder(), OneHotEncoder()]

In [None]:
# Benchmark every (classifier, numeric scaler, categorical encoder) combination
# and collect the test score and fit time of each into `models_df`.

# Encode the string target as integer codes (0..n_classes-1). Scores are
# unaffected, but recent XGBoost releases reject non-numeric class labels.
# The test labels are mapped with the classes learned from the training split.
y_train_codes, y_classes = pd.factorize(y_train, sort=True)
y_test_codes = y_classes.get_indexer(y_test)

# Pipeline skeleton; the concrete steps are filled in inside the loop.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),  # the smaller, nested pipeline
    ('classifier', None)  # set per iteration below
])

# One record per combination. The DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# is quadratic anyway.
records = []

# For every model type, try each combination of column transformers.
for model in classifiers:
    for num_tr in scalers:
        for cat_tr in cat_transformers:
            # Swap the transformers and the classifier into the pipeline.
            pipe.set_params(**{
                'preprocessor__numerical__num_trans': num_tr,
                'preprocessor__categorical__cat_trans': cat_tr,
                'classifier': model
            })

            # Fit the whole pipeline and measure how long it takes;
            # perf_counter is a monotonic clock intended for interval timing.
            start_time = time.perf_counter()
            pipe.fit(X_train, y_train_codes)
            time_elapsed = time.perf_counter() - start_time

            # Evaluate on the held-out split.
            score = pipe.score(X_test, y_test_codes)

            # Record the configuration together with its results.
            records.append({
                'model': model.__class__.__name__,
                'num_trans': num_tr.__class__.__name__,
                'cat_trans': cat_tr.__class__.__name__,
                'score': score,
                'time_elapsed': time_elapsed
            })

# Built in one go, so the index is already a clean 0..n-1 range.
models_df = pd.DataFrame(records)

In [None]:
# All benchmarked combinations, best test score first.
models_df.sort_values('score', ascending=False)

In [None]:
# Per-model summary (mean / std / min / max) of the score and the fit time
# across all transformer combinations, ordered by mean score, best first.
(
    models_df[['model', 'score', 'time_elapsed']]
    .groupby('model')
    .agg({
        'score': ['mean', 'std', 'min', 'max'],
        'time_elapsed': ['mean', 'std', 'min', 'max'],
    })
    .reset_index()
    .sort_values(('score', 'mean'), ascending=False)
)

In [None]:
# Distribution of test scores per classifier.
sns.boxplot(data=models_df, x='score', y='model')

In [None]:
# Distribution of test scores per numeric-column transformer.
sns.boxplot(data=models_df, x='score', y='num_trans')

In [None]:
# Distribution of test scores per categorical-column transformer.
sns.boxplot(data=models_df, x='score', y='cat_trans')

In [None]:
# Distribution of fit times per classifier.
sns.boxplot(data=models_df, x='time_elapsed', y='model')