In [1]:
%%HTML
<style>

/* Scale the font of every element carrying the rendered_html class to 0.8em. */
.rendered_html {
  font-size:0.8em;
}
/* Tables, header cells, rows and data cells inside rendered_html use 100% of
   their inherited font size. */
.rendered_html table, .rendered_html th, .rendered_html tr, .rendered_html td {
     font-size: 100%;
}

</style>

# The Machine Learning Bazaar

## MLBlocks and MLPrimitives

### by Carles Sala

# Carles Sala - Who am I?

### CEO @ Pythia
<img style="width: 600px" src="imgs/pythia.png">

### Researcher @ MIT - DAI-Lab
<img style="width: 400px" src="imgs/dai-lab.png">

# The Problem

## Basic Model
<img src="imgs/estimator.png"/>

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn (or another regression dataset).
dataset = load_boston()

# Seed the shuffle so the train/test split -- and therefore the predictions
# displayed by this cell -- are identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

model = LinearRegression()
model.fit(X_train, y_train)

# Display the first ten predictions for the test split.
model.predict(X_test)[0:10]

array([19.73434269, 18.83833357, 35.04896329, 19.56917954, 28.30151468,
       10.76531108, 24.02095191, 19.99488184,  7.86085591, 17.03753753])

## Problem: Multiple steps required

In [39]:
from mit_d3m import load_dataset

dataset = load_dataset('196_autoMpg')

# Seed the shuffle so the split is the same on every run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.X, dataset.y, random_state=0
)

# The failure is the point of this cell: fitting directly on the raw data is
# expected to be rejected. Only the ValueError raised by scikit-learn's input
# validation is caught and shown; any unrelated error still surfaces normally.
try:
    model = LinearRegression()
    model.fit(X_train, y_train)
except ValueError as e:
    print(e)

Input contains NaN, infinity or a value too large for dtype('float64').


In [26]:
from sklearn.impute import SimpleImputer

# Fit the imputer on the training split only, then apply the same fitted
# imputer to both splits.
imputer = SimpleImputer()
X_train_clean = imputer.fit_transform(X_train, y_train)
X_test_clean = imputer.transform(X_test)

# Train the regression on the imputed training data and preview the first ten
# predictions for the imputed test data.
model = LinearRegression().fit(X_train_clean, y_train)
model.predict(X_test_clean)[:10]

array([16.78450851,  5.38115268, 27.73914297, 25.50780677, 29.35791647,
       27.55297534, 25.68492923, 33.5739275 , 28.99014042, 23.69385865])

# Solution: Pipelines

<img src="imgs/pipeline.png"/>

<img src="imgs/pipeline-steps.png"/>

# Basic Solution: scikit-learn Pipeline

In [28]:
from sklearn.pipeline import Pipeline

# Bundle the imputation and regression steps, in that order, into one object.
pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("regression", LinearRegression()),
])

In [29]:
# Fit the whole pipeline on the raw training split, then display the first ten
# predictions for the raw test split (`fit` returns the fitted pipeline).
pipeline.fit(X_train, y_train).predict(X_test)[:10]

array([16.78450851,  5.38115268, 27.73914297, 25.50780677, 29.35791647,
       27.55297534, 25.68492923, 33.5739275 , 28.99014042, 23.69385865])

# Problems

* Multiple Libraries

* Hyperparameters

* Input/Output Dependencies

# Example: Text classification with Keras LSTM model

* Figure out the number of classes

* Compute the vocabulary length

* Tokenize the text

* Build the Keras Model

* Compile the Keras Model

# Solution: https://github.com/HDI-Project/MLBlocks

<center><img style="width: 400px;" src="imgs/mlblocks-logo.png"/></center>

* Supports combining any possible library

* Based on JSON specifications

* Easy hyperparameter tuning

* Complex pipeline configurations

# MLBlocks: Concepts

* Primitives
    * Classes or functions
    * Initialization arguments
    * Tunable hyperparameters
    

* Pipelines
    * List of Primitives
    * Initialization arguments
    * Hyperparameter values
    

* Templates
    * List of Primitives
    * Initialization arguments
    * Tunable hyperparameter ranges

# MLBlocks Primitive Example - scikit-learn

In [8]:
import json
import mlblocks

# Load the primitive specification by its name and pretty-print it as JSON
# with a four-space indent.
primitive = mlblocks.load_primitive('sklearn.preprocessing.StandardScaler')
print(json.dumps(primitive, indent=4))

{
    "name": "sklearn.preprocessing.StandardScaler",
    "author": "Carles Sala <carles@pythiac.com>",
    "documentation": "http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html",
    "description": "Standardize features by removing the mean and scaling to unit variance",
    "classifiers": {
        "type": "preprocessor",
        "subtype": "transformer"
    },
    "modalities": [],
    "primitive": "sklearn.preprocessing.StandardScaler",
    "validation_dataset": "wine",
    "fit": {
        "method": "fit",
        "args": [
            {
                "name": "X",
                "type": "ndarray"
            }
        ]
    },
    "produce": {
        "method": "transform",
        "args": [
            {
                "name": "X",
                "type": "ndarray"
            }
        ],
        "output": [
            {
                "name": "X",
                "type": "ndarray"
            }
        ]
    },
    "hyperparameters":

# MLBlocks Primitive Example - Simple Function

In [9]:
# Same lookup for a primitive named after a plain function rather than a class;
# the loaded specification is pretty-printed as JSON with a four-space indent.
primitive = mlblocks.load_primitive('keras.applications.mobilenet.preprocess_input')
print(json.dumps(primitive, indent=4))

{
    "name": "keras.applications.mobilenet.preprocess_input",
    "author": "Carles Sala <carles@pythiac.com>",
    "documentation": "https://www.tensorflow.org/api_docs/python/tf/keras/applications/mobilenet/preprocess_input",
    "description": "Preprocesses a tensor or Numpy array encoding a batch of images.",
    "classifiers": {
        "type": "preprocessor",
        "subtype": "transformer"
    },
    "modalities": [
        "image"
    ],
    "primitive": "keras.applications.mobilenet.preprocess_input",
    "produce": {
        "args": [
            {
                "name": "X",
                "keyword": "x",
                "type": "ndarray"
            }
        ],
        "output": [
            {
                "name": "X",
                "type": "ndarray"
            }
        ]
    },
    "hyperparameters": {
        "fixed": {},
        "tunable": {}
    }
}


# MLBlocks Primitive Example - Full Keras Model

In [10]:
# Load the specification of the LSTM text classifier primitive and pretty-print
# it as JSON with a four-space indent.
primitive = mlblocks.load_primitive('keras.Sequential.LSTMTextClassifier')
print(json.dumps(primitive, indent=4))

{
    "name": "keras.Sequential.LSTMTextClassifier",
    "author": "Carles Sala <carles@pythiac.com>",
    "description": "keras.Sequential.LSTMTextClassifier",
    "classifiers": {
        "type": "estimator",
        "subtype": "regressor"
    },
    "modalities": [
        "text"
    ],
    "primitive": "mlprimitives.adapters.keras.Sequential",
    "fit": {
        "method": "fit",
        "args": [
            {
                "name": "X",
                "type": "ndarray"
            },
            {
                "name": "y",
                "type": "array"
            },
            {
                "name": "classes",
                "type": "int",
                "description": "Number of classes"
            },
            {
                "name": "vocabulary_size",
                "type": "int",
                "description": "vocabulary size + 1: number of known words in input."
            }
        ]
    },
    "produce": {
        "method": "predict",
        "args":

# MLBlocks Example: 20 News Groups

In [51]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups()

# Seed the shuffle so the same documents land in the train and test splits on
# every run, which keeps the sample shown by this cell stable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, random_state=0
)

# Display the first raw training document.
X_train[0]

'From: whaley@sigma.kpc.com (Ken Whaley)\nSubject: Re: Animation with XPutImage()?\nIn-Reply-To: dcr@mail.ast.cam.ac.uk\'s message of Thu, 22 Apr 1993 09:28:30 GMT\nOrganization: Kubota Pacific Computer Inc.\n\t<1993Apr22.092830.2190@infodev.cam.ac.uk>\nLines: 38\n\n| \n| Shared memory PutImage (also mentioned by nkissebe@delphi.beckman.uiuc.edu,\n| Nick Kisseberth) looks interesting, but I need someone to point me to some\n| documentation. Is this method likely to give better results than server-\n| resident pixmaps? I\'d also be interested in looking at the XView code\n| mentioned above...\n\nThere is no easy answer to this question: it depends on whether the display\ndevice can hold pixmaps in off-screen memory, and if so, how efficiently\nthe server manages these resources (having to deal with limited off-screen\nmemory is the bane of the server implementor\'s existence!).  \n\nI have worked with graphics devices where the off-screen memory to \nmain display copy rate eclipses that

# MLPipeline from JSON

In [54]:
# Declarative description of the text-classification pipeline, later handed to
# MLPipeline.from_dict. Keys of the nested mappings are "<primitive name>#<n>".
pipeline_dict = dict(
    # Primitives listed in the order they appear in the pipeline.
    primitives=[
        'mlprimitives.counters.UniqueCounter',
        'mlprimitives.text.TextCleaner',
        'mlprimitives.counters.VocabularyCounter',
        'keras.preprocessing.text.Tokenizer',
        'keras.preprocessing.sequence.pad_sequences',
        'keras.Sequential.LSTMTextClassifier',
    ],
    # Input renaming: the unique counter's "X" argument is mapped to "y".
    input_names={
        'mlprimitives.counters.UniqueCounter#1': {'X': 'y'},
    },
    # Output renaming: each counter's "counts" output gets a distinct name
    # (presumably so a later primitive can consume it -- confirm in MLBlocks docs).
    output_names={
        'mlprimitives.counters.UniqueCounter#1': {'counts': 'classes'},
        'mlprimitives.counters.VocabularyCounter#1': {'counts': 'vocabulary_size'},
    },
    # Initialization arguments for individual primitives; note that the padding
    # length and the classifier's input length are both 100.
    init_params={
        'mlprimitives.counters.VocabularyCounter#1': {'add': 1},
        'mlprimitives.text.TextCleaner#1': {'language': 'en'},
        'keras.preprocessing.sequence.pad_sequences#1': {'maxlen': 100},
        'keras.Sequential.LSTMTextClassifier#1': {'input_length': 100},
    },
)

# Create an MLPipeline Instance

In [55]:
from mlblocks import MLPipeline

# Build the MLPipeline from the dictionary specification. This rebinds the name
# `pipeline`, which earlier held the scikit-learn Pipeline.
pipeline = MLPipeline.from_dict(pipeline_dict)

# Fit

In [56]:
# Fit the MLPipeline on the raw training texts and their labels.
pipeline.fit(X_train, y_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predict

In [59]:
# Predict on the test texts with the fitted pipeline and display the first
# five predictions.
predictions = pipeline.predict(X_test)
predictions[:5]

array([ 6,  4, 14, 11, 19])

# Tunable Hyperparameters

In [61]:
import json

# Pretty-print, as JSON with a four-space indent, the tunable hyperparameters
# that the pipeline reports.
print(json.dumps(pipeline.get_tunable_hyperparameters(), indent=4))

{
    "mlprimitives.counters.UniqueCounter#1": {},
    "mlprimitives.text.TextCleaner#1": {
        "lower": {
            "type": "bool",
            "default": true
        },
        "accents": {
            "type": "bool",
            "default": true
        },
        "stopwrods": {
            "type": "bool",
            "default": true
        },
        "non_alpha": {
            "type": "bool",
            "default": true
        },
        "single_chars": {
            "type": "bool",
            "default": true
        }
    },
    "mlprimitives.counters.VocabularyCounter#1": {},
    "keras.preprocessing.text.Tokenizer#1": {
        "num_words": {
            "type": "int",
            "default": null,
            "range": [
                1,
                10000
            ]
        },
        "lower": {
            "type": "bool",
            "default": true
        },
        "char_level": {
            "type": "bool",
            "default": false
        }
    },
    

# But, where are the primitives?

## MLPrimitives: https://github.com/HDI-Project/MLPrimitives

## We need contributors!

# References

* MLBlocks: https://github.com/HDI-Project/MLBlocks
* MLPrimitives: https://github.com/HDI-Project/MLPrimitives
* BTB: https://github.com/HDI-Project/BTB
* Featuretools: https://www.featuretools.com

* Pythia Consulting: https://www.pythiac.com
* DAI-Lab: https://dai.lids.mit.edu

# Questions?

# Contact

* e-mail:
    * carles@pythiac.com
    * csala@csail.mit.edu
* Twitter:
    * [twitter.com/_xals](https://twitter.com/_xals)

# Thank You!