https://openscoring.io/blog/2020/02/23/sklearn_feature_specification_pmml

In [6]:
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

from sklearn2pmml.decoration import CategoricalDomain
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.preprocessing import ExpressionTransformer
from sklearn2pmml.preprocessing import LookupTransformer
from sklearn2pmml.preprocessing import PMMLLabelBinarizer

from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

In [11]:
# Read the audit CSV (path is relative to the notebook's working directory)
# and show the resulting frame as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/audit.csv")
df

Unnamed: 0,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,Adjusted
0,38,Private,College,Unmarried,Service,81838.00,Female,False,72,0
1,35,Private,Associate,Absent,Transport,72099.00,Male,False,30,0
2,32,Private,HSgrad,Divorced,Clerical,154676.74,Male,False,40,0
3,45,Private,Bachelor,Married,Repair,27743.82,Male,False,55,1
4,60,Private,College,Married,Executive,7568.23,Male,False,40,1
...,...,...,...,...,...,...,...,...,...,...
1894,62,Private,HSgrad,Married,Repair,24080.59,Male,False,40,0
1895,35,Consultant,Associate,Married,Repair,57497.30,Male,False,40,0
1896,32,Private,Bachelor,Married,Sales,30538.18,Male,False,44,0
1897,34,Private,College,Unmarried,Sales,113425.67,Male,False,45,0


In [14]:
def fit_convert(mapper,X,y):
    """Fit a one-step PMMLPipeline around ``mapper`` and return the transformed data.

    Args:
        mapper: Transformer installed as the pipeline's only step (named "mapper").
        X: Input data handed to ``fit_transform``.
        y: Target values handed to ``fit_transform``.

    Returns:
        The value produced by ``PMMLPipeline.fit_transform(X, y)``.
    """
    steps = [("mapper", mapper)]
    return PMMLPipeline(steps).fit_transform(X, y)

In [16]:
# Two feature specs: "Income" gets no transformer (None), while
# "Employment" is one-hot encoded.
mapper = DataFrameMapper(
    features=[
        (["Income"], None),
        (["Employment"], OneHotEncoder()),
    ]
)

# The whole frame is passed as X; the mapper only picks the columns listed above.
res = fit_convert(mapper, X=df, y=df["Adjusted"])
res

array([[8.1838000e+04, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [7.2099000e+04, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.5467674e+05, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [3.0538180e+04, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.1342567e+05, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.3998458e+05, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])

In [25]:
# Lookup table from "Employment" category to a 0/1 flag: the three "PS*"
# values map to 1, everything else to 0.
# NOTE(review): "PS" presumably stands for public sector — confirm against the data dictionary.
employment_sector = {
    "Consultant": 0,
    "PSFederal": 1,
    "PSLocal": 1,
    "PSState": 1,
    "Private": 0,
    "SelfEmp": 0,
}

# Numeric columns only get a ContinuousDomain decorator; "Employment" gets a
# CategoricalDomain decorator followed by the dictionary lookup, with 0 as the
# configured default_value.
mapper = DataFrameMapper(
    features=[
        (["Income"], [ContinuousDomain()]),
        (["Hours"], [ContinuousDomain()]),
        (
            ["Employment"],
            [
                CategoricalDomain(),
                LookupTransformer(employment_sector, default_value=0),
            ],
        ),
    ]
)

In [26]:
# Fit the current mapper on the full frame (target: "Adjusted") and display
# the transformed result.
res = fit_convert(mapper, X=df, y=df["Adjusted"])
res

array([[81838.0, 72, 0],
       [72099.0, 30, 0],
       [154676.74, 40, 0],
       ...,
       [30538.18, 44, 0],
       [113425.67, 45, 0],
       [139984.58, 35, 0]], dtype=object)

In [32]:
# PMMLLabelBinarizer is imported in the notebook's top import cell so that all
# dependencies are declared in one place.
#
# Same numeric features as before; "Employment" is now decorated with a
# CategoricalDomain and then passed through PMMLLabelBinarizer instead of the
# dictionary lookup.
mapper = DataFrameMapper([
    (["Income"],        [ContinuousDomain()]),
    (["Hours"],         [ContinuousDomain()]),
    (["Employment"],    [CategoricalDomain(), PMMLLabelBinarizer()])
])

In [33]:
# Fit the current mapper on the full frame (target: "Adjusted") and display
# the transformed result.
res = fit_convert(mapper, X=df, y=df["Adjusted"])
res

array([[8.1838000e+04, 7.2000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [7.2099000e+04, 3.0000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.5467674e+05, 4.0000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [3.0538180e+04, 4.4000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.1342567e+05, 4.5000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.3998458e+05, 3.5000000e+01, 0.0000000e+00, ..., 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00]])