In [1]:
import hydra
from omegaconf import DictConfig
from sklearn.pipeline import Pipeline

# Example 1 – standard preprocessing pipeline. Steps are hardcoded in the code itself 

In [2]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def main():
    """Build the hardcoded numeric preprocessing pipeline of Example 1.

    The steps are listed in execution order: standard scaling, removal of
    low-variance features, then constant-value imputation. The pipeline is
    only constructed here; it is neither fitted nor returned.
    """
    # NOTE(review): fill_value is the string "-1" rather than the number -1,
    # while the imputer is placed after the scaler — confirm this is intended.
    preprocessing_steps = [
        ("scaler", StandardScaler()),
        ("VarianceThreshold", VarianceThreshold(threshold=0.1)),
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
    ]
    numeric_pipeline = Pipeline(steps=preprocessing_steps)


if __name__ == "__main__":
    main()

# Example 2 – different pipelines for different models. Code becomes messy as pipelines are added

In [3]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Logistic-regression pipeline. Step order: impute -> drop low-variance
# features -> scale -> PCA down to 2 components -> classifier.
pipeline_lr = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
        ("VarianceThreshold", VarianceThreshold(threshold=0.1)),
        ("scalar1", StandardScaler()),
        ("pca1", PCA(n_components=2)),
        ("lr_classifier", LogisticRegression(random_state=0)),
    ]
)


# Decision-tree pipeline. Step order: scale -> drop low-variance features ->
# impute -> PCA down to 2 components -> classifier.
pipeline_dt = Pipeline(
    steps=[
        ("scalar2", StandardScaler()),
        ("VarianceThreshold", VarianceThreshold(threshold=0.1)),
        ("imputer", SimpleImputer(strategy="constant", fill_value="-1")),
        ("pca2", PCA(n_components=2)),
        ("dt_classifier", DecisionTreeClassifier()),
    ]
)

# Random-forest pipeline. Step order: scale -> PCA down to 2 components ->
# classifier. Unlike the two pipelines above, it has no imputation or
# variance-threshold step.
pipeline_randomforest = Pipeline(
    steps=[
        ("scalar3", StandardScaler()),
        ("pca3", PCA(n_components=2)),
        ("rf_classifier", RandomForestClassifier()),
    ]
)

# Example 3 – Hydra configuration file, for creating the pipeline from Example 1.  

```yaml
# Hydra config file for the decision_tree preprocessing steps
_target_: hydra_sklearn_pipeline.make_pipeline

#StepName:
#    _target_: <class to instantiate for the step>
#    param1: <step's first parameter>
#    param2: <step's second parameter, etc.>

steps_config: # use YAML list syntax to preserve the order
    - StandardScaler:
        _target_: sklearn.preprocessing.StandardScaler

    - VarianceThreshold:
        _target_: sklearn.feature_selection.VarianceThreshold
        threshold: 0.1

    - SimpleImputer:
        _target_: sklearn.impute.SimpleImputer
        strategy: 'constant'
        fill_value: '-1'
```

# Example 4 – Function to create a sklearn-hydra pipeline

In [2]:
import hydra
from omegaconf import DictConfig
from sklearn.pipeline import Pipeline


def make_pipeline(steps_config: DictConfig) -> Pipeline:
    """Build a scikit-learn ``Pipeline`` from an ordered Hydra step configuration.

    Args:
        steps_config (DictConfig): an ordered sequence of mappings, one per
            pipeline step. In each mapping the first key is used as the step
            name and its value is the config passed to
            ``hydra.utils.instantiate`` to create the step object.

    Returns:
        sklearn.pipeline.Pipeline: a pipeline whose steps appear in the same
            order as the entries of ``steps_config``.

    Raises:
        ValueError: if a step entry contains no name/config pair.
    """
    steps = []

    for step_config in steps_config:
        # Materialise the items before indexing: ``items()`` may return a view
        # object, which does not support ``[0]`` subscripting.
        step_items = list(step_config.items())
        if not step_items:
            raise ValueError(
                "Each pipeline step must map a step name to its configuration."
            )

        # Only the first name/config pair of the entry is used.
        step_name, step_params = step_items[0]

        # Instantiate the step and record it under its configured name.
        steps.append((step_name, hydra.utils.instantiate(step_params)))

    return Pipeline(steps)

# Example 5 – Configs hierarchy 

```bash
tree configs/ # from WSL
```

# Example 6 – config.yaml, which points to “decision_tree.yaml” when creating the preprocessing_pipeline

```yaml
# @package _global_

# specify here default preprocessing configuration
defaults:
    # can be any config file in the preprocessing_pipeline folder
    - preprocessing_pipeline: decision_tree.yaml 
```

# Example 7 – Driver code

In [None]:
import hydra
from omegaconf import DictConfig


@hydra.main(config_path="configs/", config_name="config.yaml")
def main(config: DictConfig):
    """Instantiate the preprocessing pipeline described by the Hydra config.

    Args:
        config (DictConfig): the config passed in by the ``hydra.main``
            decorator; its ``preprocessing_pipeline`` node is instantiated.
    """
    pipeline_config = config.preprocessing_pipeline

    # NOTE(review): _recursive_=False presumably keeps the nested step configs
    # un-instantiated so the target can build them itself — confirm.
    preprocessing_pipeline = hydra.utils.instantiate(
        pipeline_config, _recursive_=False
    )


if __name__ == "__main__":
    main()