## JPMML-Evaluator-Python
### Python wrapper classes and functions for the JPMML-Evaluator library
### https://github.com/jpmml/jpmml-evaluator-python

In [2]:
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain, MultiDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml.preprocessing import ExpressionTransformer, LookupTransformer

In [4]:
df = pd.read_csv("../../data/audit.csv")

cat_columns = ["Education", "Employment", "Gender", "Marital", "Occupation"]
cont_columns = ["Age", "Hours", "Income"]

df_X = df[cat_columns + cont_columns]
df_y = df["Adjusted"]

In [5]:
df_X.head(10)

Unnamed: 0,Education,Employment,Gender,Marital,Occupation,Age,Hours,Income
0,College,Private,Female,Unmarried,Service,38,72,81838.0
1,Associate,Private,Male,Absent,Transport,35,30,72099.0
2,HSgrad,Private,Male,Divorced,Clerical,32,40,154676.74
3,Bachelor,Private,Male,Married,Repair,45,55,27743.82
4,College,Private,Male,Married,Executive,60,40,7568.23
5,HSgrad,Private,Male,Married,Service,74,30,33144.4
6,Bachelor,Private,Male,Married,Executive,43,50,43391.17
7,Yr12,Private,Male,Married,Machinist,35,40,59906.65
8,Associate,Private,Female,Divorced,Clerical,25,40,126888.91
9,HSgrad,Private,Female,Absent,Sales,22,37,52466.49


In [6]:
df_y.head(10)

0    0
1    0
2    0
3    1
4    1
5    0
6    1
7    0
8    0
9    0
Name: Adjusted, dtype: int64

In [7]:
employment_mapping = {
    "Consultant" : "Private",
    "Private" : "Private",
    "PSFederal" : "Public",
    "PSLocal" : "Public",
    "PSState" : "Public",
    "SelfEmp" : "Private",
    "Volunteer" : "Other"
}

mapper = DataFrameMapper([
    (["Income"], [ContinuousDomain(), ExpressionTransformer("numpy.log(X[0])", dtype = np.float64)]),
    (["Employment"], [CategoricalDomain(), LookupTransformer(employment_mapping, default_value = None), OneHotEncoder(drop = "first")]),
    (["Gender", "Marital"], [MultiDomain([CategoricalDomain(), CategoricalDomain()]), OneHotEncoder(), PolynomialFeatures(degree = 2, interaction_only = True, include_bias = False)]),
    (["Age", "Hours"], [ContinuousDomain(), StandardScaler()]),
    (["Education"], [CategoricalDomain(), OneHotEncoder(drop = "first")]),
    (["Occupation"], [CategoricalDomain(), OneHotEncoder(drop = "first")])
])

In [8]:
pipeline = PMMLPipeline([
    ("mapper", mapper),
    ("classifier", LogisticRegression(multi_class = "ovr", max_iter = 1000))
])

pipeline.fit(df_X, df_y)
pipeline.verify(df_X.sample(n = 10))

In [10]:
sklearn2pmml(pipeline, "../../data/SkLearnAudit.pmml")

### Model Scoring

In [11]:
from jpmml_evaluator import LoadingModelEvaluatorBuilder
from jpmml_evaluator.py4j import Py4JBackend

# Load PMML model
py4jbackend  = Py4JBackend()
evaluator = LoadingModelEvaluatorBuilder(backend = py4jbackend ) \
    .loadFile("../../data/SkLearnAudit.pmml") \
    .build()

In [12]:
# Perform automated QA
evaluator.verify()

<jpmml_evaluator.Evaluator at 0x145b96200>

In [13]:
# Model Scoring
records = df_X.to_dict(orient = "records")
for record in records:
   result = evaluator.evaluate(record)
   print(result)

{'Adjusted': 0, 'probability(0)': 0.9457018719093554, 'probability(1)': 0.05429812809064459}
{'Adjusted': 0, 'probability(0)': 0.974710174044074, 'probability(1)': 0.025289825955926035}
{'Adjusted': 0, 'probability(0)': 0.9471923059668924, 'probability(1)': 0.05280769403310764}
{'Adjusted': 1, 'probability(0)': 0.36275095408147495, 'probability(1)': 0.637249045918525}
{'Adjusted': 1, 'probability(0)': 0.3761986905209832, 'probability(1)': 0.6238013094790168}
{'Adjusted': 0, 'probability(0)': 0.8408305784806632, 'probability(1)': 0.15916942151933675}
{'Adjusted': 1, 'probability(0)': 0.2189258390026365, 'probability(1)': 0.7810741609973635}
{'Adjusted': 0, 'probability(0)': 0.836472709096348, 'probability(1)': 0.163527290903652}
{'Adjusted': 0, 'probability(0)': 0.9704864919939612, 'probability(1)': 0.029513508006038778}
{'Adjusted': 0, 'probability(0)': 0.9904688283851482, 'probability(1)': 0.00953117161485185}
{'Adjusted': 0, 'probability(0)': 0.9923984744656431, 'probability(1)': 0.0

In [14]:
dfresult = evaluator.evaluateAll(df_X)
dfresult

Unnamed: 0,Adjusted,probability(0),probability(1)
0,0,0.945702,0.054298
1,0,0.974710,0.025290
2,0,0.947192,0.052808
3,1,0.362751,0.637249
4,1,0.376199,0.623801
...,...,...,...
1894,0,0.635403,0.364597
1895,0,0.673191,0.326809
1896,1,0.474302,0.525698
1897,0,0.914106,0.085894
