## Build pipeline

In [1]:
import yaml

from pprint import pprint

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest

# The pipeline we decided is 'the best' for our problem, assembled from the
# inside out so each branch of the feature union has a readable name.

# Branch 1: min-max scaling followed by a 2-component PCA.
scaled_pca_branch = Pipeline([
    ('min_max', MinMaxScaler()),
    ('pca', PCA(n_components=2)),
])

# Branch 2: quantile transform onto a uniform distribution.
quantile_branch = Pipeline([
    ('qt', QuantileTransformer(output_distribution='uniform')),
])

# Full model: 10-component PCA -> both branches side by side -> IsolationForest.
pipe = Pipeline([
    ('pca', PCA(n_components=10)),
    ('feature_union', FeatureUnion([
        ('feature_union_pipe1', scaled_pca_branch),
        ('feature_union_pipe2', quantile_branch),
    ])),
    ('iso', IsolationForest(contamination='auto')),
])

## Write this model architecture to a `Gordo Model Definition`

### This is a valid mapping to the Gordo config `model` key.

In [2]:
from gordo.serializer import into_definition

# Turn the pipeline object into a definition, then render that definition as YAML text.
pipeline_definition = into_definition(pipe)
yaml_definition_of_pipeline = yaml.dump(pipeline_definition)

# Separator line first, then the YAML itself.
print('-' * 30, yaml_definition_of_pipeline, sep='\n')



------------------------------
sklearn.pipeline.Pipeline:
  memory: null
  steps:
  - sklearn.decomposition._pca.PCA:
      copy: true
      iterated_power: auto
      n_components: 10
      random_state: null
      svd_solver: auto
      tol: 0.0
      whiten: false
  - sklearn.pipeline.FeatureUnion:
      n_jobs: null
      transformer_list:
      - sklearn.pipeline.Pipeline:
          memory: null
          steps:
          - sklearn.preprocessing._data.MinMaxScaler:
              clip: false
              copy: true
              feature_range: !!python/tuple
              - 0
              - 1
          - sklearn.decomposition._pca.PCA:
              copy: true
              iterated_power: auto
              n_components: 2
              random_state: null
              svd_solver: auto
              tol: 0.0
              whiten: false
          verbose: false
      - sklearn.pipeline.Pipeline:
          memory: null
          steps:
          - sklearn.preprocessing._data.Quant

## Load definition back into a pipeline

### You probably won't need to do this, but it's how we get a replica of your defined model in Gordo

In [3]:
from gordo.serializer import from_definition


# Parse the YAML text back into a definition, then rebuild the pipeline from it.
loaded_definition = yaml.load(yaml_definition_of_pipeline, Loader=yaml.FullLoader)
pipe = from_definition(loaded_definition)

# The printed steps show that step names are regenerated ('step_0', 'step_1', ...).
pprint(pipe.steps)

[('step_0', PCA(n_components=10)),
 ('step_1',
  FeatureUnion(transformer_list=[('step_0',
                                Pipeline(steps=[('step_0', MinMaxScaler()),
                                                ('step_1',
                                                 PCA(n_components=2))])),
                               ('step_1',
                                Pipeline(steps=[('step_0',
                                                 QuantileTransformer())]))])),
 ('step_2', IsolationForest())]


  after removing the cwd from sys.path.


## Let us train the pipeline.

In [4]:
import numpy as np

# Seeded generator so the synthetic data is identical on every Restart & Run All.
rng = np.random.default_rng(42)

# 5000 samples x 20 features, drawn uniformly from [0, 1).
X = rng.random((5000, 20))

# Independent copy of X passed as the target when fitting.
y = X.copy()
X.shape

(5000, 20)

In [5]:
# Fit the whole pipeline on the synthetic data; the cell echoes the return value of `fit`.
pipe.fit(X, y)

Pipeline(steps=[('step_0', PCA(n_components=10)),
                ('step_1',
                 FeatureUnion(transformer_list=[('step_0',
                                                 Pipeline(steps=[('step_0',
                                                                  MinMaxScaler()),
                                                                 ('step_1',
                                                                  PCA(n_components=2))])),
                                                ('step_1',
                                                 Pipeline(steps=[('step_0',
                                                                  QuantileTransformer())]))])),
                ('step_2', IsolationForest())])

## Predict as normal

In [6]:
# Predict on the same data the pipeline was fitted on.
predicted_anomolies = pipe.predict(X)
# Echo only the shape rather than the whole array.
predicted_anomolies.shape

(5000,)

## Now we want to serialize it, for some reason.

In [7]:
from gordo.serializer import dumps, loads

# Serialize the fitted pipeline into an in-memory bytes object.
serialized_pipe_bytes = dumps(pipe)
# Show only the first 20 bytes rather than the whole payload.
serialized_pipe_bytes[:20]

b'\x80\x03csklearn.pipeline\n'

## Load it back, ensuring the state is kept.

In [8]:
# Rebuild the pipeline from the serialized bytes.
# NOTE(review): the payload prefix looks like a pickle stream, so presumably
# `loads` should only be given bytes from a trusted source; confirm.
pipe_clone = loads(serialized_pipe_bytes)
predictions = pipe_clone.predict(X)
# The clone must reproduce the predictions made before serialization.
assert np.allclose(predicted_anomolies, predictions)

## Optionally, you can save it to a directory 

In [9]:
from tempfile import TemporaryDirectory
from gordo.serializer import dump, load

# The temporary directory is cleaned up automatically when the block exits.
with TemporaryDirectory() as model_dir:
    # Write the fitted pipeline into the directory, then read it straight back.
    dump(pipe, model_dir)
    pipe_clone = load(model_dir)

    # The on-disk round trip must leave the predictions unchanged.
    assert np.allclose(pipe_clone.predict(X), predicted_anomolies)
    