## Converting Scikit-Learn hyperparameter-tuned pipelines to PMML
### Reference: https://openscoring.io/blog/2019/12/25/converting_sklearn_gridsearchcv_pipeline_pmml/

In [7]:
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

In [8]:
# Column groups: the categorical and the continuous input features.
cont_columns = ["Age", "Hours", "Income"]
cat_columns = ["Education", "Employment", "Marital", "Occupation"]

# Read the audit dataset from the path relative to this notebook.
df = pd.read_csv("../data/audit.csv")

# Target column first, then the feature frame (categorical columns precede continuous ones).
df_y = df["Adjusted"]
df_X = df[[*cat_columns, *cont_columns]]

# Preview the first ten rows of the full frame.
df.head(n = 10)

Unnamed: 0,Age,Employment,Education,Marital,Occupation,Income,Gender,Deductions,Hours,Adjusted
0,38,Private,College,Unmarried,Service,81838.0,Female,False,72,0
1,35,Private,Associate,Absent,Transport,72099.0,Male,False,30,0
2,32,Private,HSgrad,Divorced,Clerical,154676.74,Male,False,40,0
3,45,Private,Bachelor,Married,Repair,27743.82,Male,False,55,1
4,60,Private,College,Married,Executive,7568.23,Male,False,40,1
5,74,Private,HSgrad,Married,Service,33144.4,Male,False,30,0
6,43,Private,Bachelor,Married,Executive,43391.17,Male,False,50,1
7,35,Private,Yr12,Married,Machinist,59906.65,Male,False,40,0
8,25,Private,Associate,Divorced,Clerical,126888.91,Female,False,40,0
9,22,Private,HSgrad,Absent,Sales,52466.49,Female,False,37,0


In [9]:
# Per-column preprocessing:
#   - each categorical column is wrapped in a CategoricalDomain and binarized with LabelBinarizer;
#   - each continuous column is wrapped in a ContinuousDomain and standardized with StandardScaler.
# Both domain decorators are configured with invalid_value_treatment = "as_is".
mapper = DataFrameMapper(
    [(cat_column, [CategoricalDomain(invalid_value_treatment = "as_is"), LabelBinarizer()]) for cat_column in cat_columns] +
    [([cont_column], [ContinuousDomain(invalid_value_treatment = "as_is"), StandardScaler()]) for cont_column in cont_columns]
)

# k is not set here; the candidate values are listed in param_grid below.
selector = SelectKBest()
# The "saga" solver shuffles the data using random_state, so it is pinned here
# to make repeated runs of the notebook produce the same fitted model.
# NOTE(review): newer scikit-learn releases deprecate `multi_class` - confirm against the installed version.
classifier = LogisticRegression(multi_class = "ovr", penalty = "elasticnet", solver = "saga", max_iter = 1000, random_state = 42)
# Search space; keys follow the "<step name>__<parameter name>" convention.
# NOTE(review): the prefixes must match the step names of the pipeline being tuned.
param_grid = {
    "selector__k" : [10, 20, 30],
    "classifier__l1_ratio" : [0.7, 0.8, 0.9]
}

In [10]:
# Chain the three stages (column mapping -> feature selection -> classification)
# into a single PMML-aware pipeline.
pipeline = PMMLPipeline(steps = [("mapper", mapper), ("selector", selector), ("classifier", classifier)])

# Search every combination in param_grid, fitting on the full feature frame and target.
searcher = GridSearchCV(pipeline, param_grid)
searcher.fit(X = df_X, y = df_y)

# Report the winning parameter combination.
print(searcher.best_params_)

{'classifier__l1_ratio': 0.9, 'selector__k': 20}


In [11]:
# Take the pipeline exposed by the grid search as its best estimator.
best_pipeline = searcher.best_estimator_
# Attach verification data drawn from a small sample of the inputs. The sample is
# seeded so that the same five rows are selected on every run of the notebook.
best_pipeline.verify(df_X.sample(n = 5, random_state = 42))

# Export the verified pipeline as a PMML document.
sklearn2pmml(best_pipeline, "../data/GridSearchAudit.pmml")