# sklearn model conversion

* http://onnx.ai/sklearn-onnx/
* https://github.com/onnx/sklearn-onnx

In [1]:
# example from http://onnx.ai/sklearn-onnx/
# Train a model.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
clr = RandomForestClassifier()
clr.fit(X_train, y_train)

# Convert into ONNX format
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_sklearn(clr, initial_types=initial_type)
with open("models/rf_iris.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# Compute the prediction with ONNX Runtime
import onnxruntime as rt
import numpy
sess = rt.InferenceSession("models/rf_iris.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]

In [2]:
import pandas as pd
import sklearn.metrics

In [3]:
print("iris data")
display(pd.DataFrame(X_train))

print("onnx predictions")
results = pd.DataFrame([y_test, pred_onx]).T
results.columns = ["y_test", "pred_onx"]
display(results.head(10))

print("Accuracy: {}".format(sklearn.metrics.accuracy_score(y_test, pred_onx)))
print("Bal. Acc: {}".format(sklearn.metrics.balanced_accuracy_score(y_test, pred_onx)))


iris data


Unnamed: 0,0,1,2,3
0,6.1,3.0,4.6,1.4
1,5.1,3.8,1.6,0.2
2,4.9,2.4,3.3,1.0
3,5.6,2.7,4.2,1.3
4,7.0,3.2,4.7,1.4
...,...,...,...,...
107,5.5,4.2,1.4,0.2
108,6.4,2.8,5.6,2.2
109,6.3,2.7,4.9,1.8
110,5.8,2.6,4.0,1.2


onnx predictions


Unnamed: 0,y_test,pred_onx
0,0,0
1,1,1
2,2,2
3,2,2
4,0,0
5,2,2
6,0,0
7,2,1
8,0,0
9,1,1


Accuracy: 0.9473684210526315
Bal. Acc: 0.9583333333333334


In [4]:
sklearn.metrics.confusion_matrix(y_test, pred_onx)

array([[15,  0,  0],
       [ 0,  7,  0],
       [ 0,  2, 14]])

## sklearn pipeline

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

scaler = StandardScaler()
pca = PCA(2)
lm = LogisticRegression()

pipe = Pipeline([
    ('scaler', scaler),
    ('pca', pca),
    ('clf', lm)
])

pipe.fit(X_train, y_train)


# save onnx
initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_sklearn(pipe, initial_types=initial_type)
with open("pipe_iris.onnx", "wb") as f:
    f.write(onx.SerializeToString())

# transform
sess = rt.InferenceSession("models/pipe_iris.onnx")
input_name = sess.get_inputs()[0].name
label_name = sess.get_outputs()[0].name
pred_pipe_onx = sess.run([label_name], {input_name: X_test.astype(numpy.float32)})[0]

In [9]:
print("sklearn score: {}".format(sklearn.metrics.accuracy_score(y_test, pipe.predict(X_test))))
print("onnx score:    {}".format(sklearn.metrics.accuracy_score(y_test, pred_pipe_onx)))

sklearn score: 0.8947368421052632
onnx score:    0.8947368421052632


In [10]:
# sklearn model / onnx model prediction similarity
sklearn.metrics.confusion_matrix(pipe.predict(X_test), pred_pipe_onx)

array([[15,  0,  0],
       [ 0,  9,  0],
       [ 0,  0, 14]])