In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

base_link ="./"

# Read data

In [4]:
df  = pd.read_csv(base_link + 'Audit.csv')
df_raw = pd.read_csv(base_link + 'Audit.csv')
print(df.shape)

(1899, 10)


In [7]:
### re-order columns, put categorical cols first

df = df[['Employment', 'Education', 'Marital', 'Occupation','Gender', 'Deductions', 
         'Age',  'Income','Hours', 
         'Adjusted']]
df.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours,Adjusted
0,Private,College,Unmarried,Service,Female,False,38,81838.0,72,0
1,Private,Associate,Absent,Transport,Male,False,35,72099.0,30,0
2,Private,HSgrad,Divorced,Clerical,Male,False,32,154676.74,40,0
3,Private,Bachelor,Married,Repair,Male,False,45,27743.82,55,1
4,Private,College,Married,Executive,Male,False,60,7568.23,40,1


# Prepare data for modeling

In [9]:
# feature cols
feature_cols = 'Employment', 'Education', 'Marital', 'Occupation','Gender', 'Deductions',\
         'Age',  'Income','Hours'

# categorical cols in the input data
cate_cols = ['Employment', 'Education', 'Marital', 'Occupation', 'Gender', 'Deductions']


# label encoder:
for c in cate_cols:
    le = preprocessing.LabelEncoder()
    df[c] = le.fit_transform(df[c])

df.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours,Adjusted
0,4,2,4,11,0,0,38,81838.0,72,0
1,4,0,0,13,1,0,35,72099.0,30,0
2,4,4,1,1,1,0,32,154676.74,40,0
3,4,1,2,9,1,0,45,27743.82,55,1
4,4,2,2,2,1,0,60,7568.23,40,1


In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(df.iloc[:, :-1], df.iloc[:,-1], test_size=0.33, random_state=42)

# Model

## Option 1: Create txt file model, result,... for testing java version

In [11]:
# pred = lgb_sklearn_model.predict_proba(X_train)
# names = ["_target", "probability(0)", "probability(1)"]

# _target = pd.DataFrame((pred[:,1] >  pred[:,0]).astype(int))
# p0 = pd.DataFrame(pred[:,0])
# p1 = pd.DataFrame(pred[:,1])

# res = pd.concat([_target, p0, p1], 1)
# res.columns=names
# X_train.to_csv(base_link + "C.csv", index=False)
# lgb_sklearn_model.booster_.save_model(base_link + 'ClassificationC.txt')
# res.to_csv(base_link + "ClassificationC.csv", index=False)

# print("done")

## Option 2: Using sklearn2pmml directly 

In [15]:
X_train.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours
1572,4,2,2,9,1,0,31,43796.16,40
327,4,4,0,9,1,0,24,73809.11,40
948,5,4,2,1,1,0,75,26317.48,26
704,4,1,0,7,1,0,32,74199.12,40
1026,2,2,5,1,0,0,56,91953.88,40


In [16]:
cate_cols

['Employment', 'Education', 'Marital', 'Occupation', 'Gender', 'Deductions']

In [13]:
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain
from sklearn_pandas import DataFrameMapper
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline

params = {'classifier__categorical_feature':[0, 1, 2, 3, 4, 5]}
mapper = DataFrameMapper(
    [([column], CategoricalDomain()) for column in cate_cols] +
    [(['Age',  'Income','Hours'], ContinuousDomain(with_data = False))])

classifier = lgb.LGBMClassifier(n_estimators=5, learning_rate=0.1, num_leaves=10, max_depth=2)

pipeline = PMMLPipeline([
 ("mapper", mapper),
 ("classifier", classifier)
])

pipeline.fit(X = X_train, y = y_train, **params)

PMMLPipeline(steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['Employment'], CategoricalDomain(invalid_value_replacement=None,
         invalid_value_treatment='return_invalid',
         missing_value_replacement=None, missing_value_treatment='as_is',
         missing_values=None, with_data=True, with_statistics=True)), (['Education'], CategoricalD...   missing_values=None, outlier_treatment='as_is', with_data=False,
         with_statistics=True))],
        input_df=False, sparse=False)),
       ('classifier', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=5, n_jobs=-1, num_leaves=10, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])

In [14]:
sklearn2pmml(pipeline, base_link + "/pipeline_2.pmml")

Standard output is empty
Standard error:
Jan 31, 2019 10:32:50 PM org.jpmml.sklearn.Main run
INFO: Parsing PKL..
Jan 31, 2019 10:32:50 PM org.jpmml.sklearn.Main run
INFO: Parsed PKL in 43 ms.
Jan 31, 2019 10:32:50 PM org.jpmml.sklearn.Main run
INFO: Converting..
Jan 31, 2019 10:32:50 PM org.jpmml.sklearn.Main run
SEVERE: Failed to convert
java.lang.IndexOutOfBoundsException: Index: 13, Size: 13
	at java.util.ArrayList.rangeCheck(ArrayList.java:653)
	at java.util.ArrayList.get(ArrayList.java:429)
	at org.jpmml.lightgbm.Tree.selectValues(Tree.java:240)
	at org.jpmml.lightgbm.Tree.encodeNode(Tree.java:151)
	at org.jpmml.lightgbm.Tree.encodeNode(Tree.java:186)
	at org.jpmml.lightgbm.Tree.encodeTreeModel(Tree.java:94)
	at org.jpmml.lightgbm.ObjectiveFunction.createMiningModel(ObjectiveFunction.java:66)
	at org.jpmml.lightgbm.BinomialLogisticRegression.encodeMiningModel(BinomialLogisticRegression.java:49)
	at org.jpmml.lightgbm.GBDT.encodeMiningModel(GBDT.java:287)
	at lightgbm.sklearn.Boost

RuntimeError: The JPMML-SkLearn conversion application has failed. The Java executable should have printed more information about the failure into its standard output and/or standard error streams