## SkLearn2PMML
### Python package for converting Scikit-Learn pipelines to PMML
### https://github.com/jpmml/sklearn2pmml

In [1]:
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain, MultiDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer, LookupTransformer

In [2]:
# Read the audit dataset from the relative data directory.
df = pd.read_csv("../../data/audit.csv")

# Predictor columns, grouped into categorical and continuous features.
cat_columns = [
    "Education",
    "Employment",
    "Gender",
    "Marital",
    "Occupation",
]
cont_columns = [
    "Age",
    "Hours",
    "Income",
]

# Feature frame (categorical columns first, then continuous) and target series.
df_X = df[[*cat_columns, *cont_columns]]
df_y = df["Adjusted"]

In [3]:
# Preview the first ten rows of the feature frame.
df_X.head(n = 10)

Unnamed: 0,Education,Employment,Gender,Marital,Occupation,Age,Hours,Income
0,College,Private,Female,Unmarried,Service,38,72,81838.0
1,Associate,Private,Male,Absent,Transport,35,30,72099.0
2,HSgrad,Private,Male,Divorced,Clerical,32,40,154676.74
3,Bachelor,Private,Male,Married,Repair,45,55,27743.82
4,College,Private,Male,Married,Executive,60,40,7568.23
5,HSgrad,Private,Male,Married,Service,74,30,33144.4
6,Bachelor,Private,Male,Married,Executive,43,50,43391.17
7,Yr12,Private,Male,Married,Machinist,35,40,59906.65
8,Associate,Private,Female,Divorced,Clerical,25,40,126888.91
9,HSgrad,Private,Female,Absent,Sales,22,37,52466.49


In [4]:
# Preview the first ten values of the target series.
df_y.head(n = 10)

0    0
1    0
2    0
3    1
4    1
5    0
6    1
7    0
8    0
9    0
Name: Adjusted, dtype: int64

In [5]:
# Coarser employment grouping: every listed "Employment" level is relabelled
# as one of "Private", "Public" or "Other".
employment_mapping = {
    "Consultant": "Private",
    "Private": "Private",
    "PSFederal": "Public",
    "PSLocal": "Public",
    "PSState": "Public",
    "SelfEmp": "Private",
    "Volunteer": "Other",
}

# Column-wise preprocessing: each entry pairs the input column(s) with the
# ordered list of steps applied to them.
mapper = DataFrameMapper([
    # Income: declare as continuous, then take the natural logarithm.
    (
        ["Income"],
        [
            ContinuousDomain(),
            ExpressionTransformer("numpy.log(X[0])", dtype = np.float64),
        ],
    ),
    # Employment: declare as categorical, collapse levels through the lookup
    # table above, then one-hot encode with the first level dropped.
    (
        ["Employment"],
        [
            CategoricalDomain(),
            LookupTransformer(employment_mapping, default_value = None),
            OneHotEncoder(drop = "first"),
        ],
    ),
    # Gender and Marital: one-hot encode both columns together, then add
    # degree-2 interaction-only terms (no bias column).
    (
        ["Gender", "Marital"],
        [
            MultiDomain([CategoricalDomain(), CategoricalDomain()]),
            OneHotEncoder(),
            PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False),
        ],
    ),
    # Age and Hours: declare as continuous, then standardize.
    (
        ["Age", "Hours"],
        [
            ContinuousDomain(),
            StandardScaler(),
        ],
    ),
    # Education: categorical, one-hot encoded with the first level dropped.
    (
        ["Education"],
        [
            CategoricalDomain(),
            OneHotEncoder(drop = "first"),
        ],
    ),
    # Occupation: categorical, one-hot encoded with the first level dropped.
    (
        ["Occupation"],
        [
            CategoricalDomain(),
            OneHotEncoder(drop = "first"),
        ],
    ),
])

In [6]:
# Chain the column mapper and a logistic-regression classifier into a single
# PMMLPipeline.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm it is still accepted by the installed version before upgrading.
pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("classifier", LogisticRegression(multi_class = "ovr", max_iter = 1000))
])

pipeline.fit(df_X, df_y)

# Pass a ten-row sample of the inputs to `pipeline.verify`. The sample is drawn
# with a fixed seed so that re-running the notebook selects the same rows.
pipeline.verify(df_X.sample(n = 10, random_state = 42))

In [7]:
# Convert the fitted pipeline and write it to a PMML file in the data directory.
sklearn2pmml(pipeline, "../../data/SkLearnAudit.pmml")

### Model REST Scoring

In [16]:
from os import environ

from openscoring import Openscoring
from openscoring import EvaluationRequest

# Create the Openscoring client.
# An Openscoring server has to be running at the base URL.
# The URL and token can be overridden through the OPENSCORING_URL and
# OPENSCORING_TOKEN environment variables, so real credentials never have to
# be written into the notebook; the fallbacks are the original demo values.
# NOTE: the variable name `os` is kept because the following cells call
# `os.deployFile` / `os.evaluateCsv`; it is the Openscoring client, not the
# standard-library `os` module.
os = Openscoring(
    base_url = environ.get("OPENSCORING_URL", "http://localhost:8080"),
    token = environ.get("OPENSCORING_TOKEN", "secret"),
)

In [17]:
# Deploy the exported PMML file to the Openscoring server under the
# identifier "SkLearnAudit".
os.deployFile("SkLearnAudit", "../../data/SkLearnAudit.pmml")

<openscoring.common.ModelResponse at 0x13b4f3190>

In [18]:
# Score the feature records in the DataFrame against the deployed model.
dfResponse = os.evaluateCsv("SkLearnAudit", df_X)

# Show the first five results as the cell's value (rich display) rather than
# printing them, consistent with the earlier preview cells.
dfResponse.head(5)

   Adjusted  probability(0)  probability(1)
0         0        0.945702        0.054298
1         0        0.974710        0.025290
2         0        0.947192        0.052808
3         1        0.362751        0.637249
4         1        0.376199        0.623801
