In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import lightgbm as lgb
import warnings
# Silence every warning category for the remainder of the notebook session.
warnings.simplefilter("ignore")

# Directory prefix (ends with "/") that file names are appended to when
# reading the input data and writing exported artefacts.
base_link = "./"

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Read data

In [2]:
# Parse the audit dataset from disk only once. `df_raw` keeps the values exactly
# as read; `df` is an independent deep copy that the following cells reorder and
# label-encode, so the raw frame stays available for reference.
df_raw = pd.read_csv(base_link + 'Audit.csv')
df = df_raw.copy()
print(df.shape)

(1899, 10)


In [3]:
### re-order columns, put categorical cols first
# The target column ('Adjusted') is kept last so that features and label can be
# separated by position later on.
# The explicit .copy() makes `df` an independent frame rather than a selection of
# the loaded one, so the per-column assignments done during label encoding do
# not trigger pandas' SettingWithCopyWarning.

df = df[['Employment', 'Education', 'Marital', 'Occupation', 'Gender', 'Deductions',
         'Age', 'Income', 'Hours',
         'Adjusted']].copy()
df.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours,Adjusted
0,Private,College,Unmarried,Service,Female,False,38,81838.0,72,0
1,Private,Associate,Absent,Transport,Male,False,35,72099.0,30,0
2,Private,HSgrad,Divorced,Clerical,Male,False,32,154676.74,40,0
3,Private,Bachelor,Married,Repair,Male,False,45,27743.82,55,1
4,Private,College,Married,Executive,Male,False,60,7568.23,40,1


# Prepare data for modeling

In [4]:
# feature cols (a real list; the target column 'Adjusted' is not included)
feature_cols = ['Employment', 'Education', 'Marital', 'Occupation', 'Gender', 'Deductions',
                'Age', 'Income', 'Hours']

# categorical cols in the input data
cate_cols = ['Employment', 'Education', 'Marital', 'Occupation', 'Gender', 'Deductions']


# label encoder:
# One encoder is fitted per categorical column and kept in `label_encoders`
# (column name -> fitted LabelEncoder) so the integer codes can later be mapped
# back to the original labels with `inverse_transform`, or applied to new data.
# NOTE: the encoders are fitted on the full frame, i.e. before the
# train/validation split performed further down.
label_encoders = {}
for c in cate_cols:
    le = preprocessing.LabelEncoder()
    df[c] = le.fit_transform(df[c])
    label_encoders[c] = le

df.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours,Adjusted
0,4,2,4,11,0,0,38,81838.0,72,0
1,4,0,0,13,1,0,35,72099.0,30,0
2,4,4,1,1,1,0,32,154676.74,40,0
3,4,1,2,9,1,0,45,27743.82,55,1
4,4,2,2,2,1,0,60,7568.23,40,1


In [29]:
# Every column except the last one is a feature; the last column is the label.
X_all = df.iloc[:, :-1]
y_all = df.iloc[:, -1]

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    random_state=42,
)

# Model

## Option 1: Export the model as a text file, together with its inputs and predictions, for testing the Java version

In [30]:
# Disabled export cell, kept for reference. When enabled it would write the
# training inputs (C.csv), the model's booster as a text file
# (ClassificationC.txt) and a table holding the predicted class together with
# both class probabilities (ClassificationC.csv).
# NOTE(review): `lgb_sklearn_model` is not defined in any cell visible here;
# a fitted classifier must be bound to that name before re-enabling this code.
# pred = lgb_sklearn_model.predict_proba(X_train)
# names = ["_target", "probability(0)", "probability(1)"]

# _target = pd.DataFrame((pred[:,1] >  pred[:,0]).astype(int))
# p0 = pd.DataFrame(pred[:,0])
# p1 = pd.DataFrame(pred[:,1])

# res = pd.concat([_target, p0, p1], 1)
# res.columns=names
# X_train.to_csv(base_link + "C.csv", index=False)
# lgb_sklearn_model.booster_.save_model(base_link + 'ClassificationC.txt')
# res.to_csv(base_link + "ClassificationC.csv", index=False)

# print("done")

## Option 2: Using sklearn2pmml directly 

In [31]:
# Preview the first rows of the training features; the categorical columns
# already hold the integer codes produced by the label-encoding step.
X_train.head()

Unnamed: 0,Employment,Education,Marital,Occupation,Gender,Deductions,Age,Income,Hours
1005,2,9,2,0,1,0,42,29954.32,40
1793,4,5,2,7,1,0,55,11144.16,50
1774,1,10,0,1,0,1,17,53034.17,40
339,4,2,2,1,1,0,39,15275.18,25
824,4,4,0,10,0,0,19,184671.16,20


In [None]:
# Display the list of categorical column names used by the mapper below.
cate_cols

In [38]:
from sklearn2pmml import make_pmml_pipeline, sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain
from sklearn_pandas import DataFrameMapper
from sklearn2pmml.decoration import ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.preprocessing import LabelEncoder

# Continuous inputs are the feature columns that are not categorical; deriving
# them keeps the mapper in sync with `feature_cols` / `cate_cols`.
continuous_cols = [col for col in feature_cols if col not in cate_cols]

# The mapper emits one output column per categorical feature, in `cate_cols`
# order, followed by the continuous block. The categorical features therefore
# occupy positions 0 .. len(cate_cols) - 1 of the matrix handed to the
# classifier, which is what `categorical_feature` has to reference.
# The 'classifier__' prefix routes the argument to the "classifier" step's fit().
params = {'classifier__categorical_feature': list(range(len(cate_cols)))}
mapper = DataFrameMapper(
    [([column], [CategoricalDomain(), LabelEncoder()]) for column in cate_cols] +
    [(continuous_cols, ContinuousDomain(with_data = False))])

classifier = lgb.LGBMClassifier(n_estimators=5, learning_rate=0.1, num_leaves=10, max_depth=2)

# Two-step pipeline: column-wise preprocessing, then the gradient-boosted model.
pipeline = PMMLPipeline([
 ("mapper", mapper),
 ("classifier", classifier)
])

pipeline.fit(X = X_train, y = y_train, **params)

PMMLPipeline(steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['Employment'], [CategoricalDomain(invalid_value_replacement=None,
         invalid_value_treatment='return_invalid',
         missing_value_replacement=None, missing_value_treatment='as_is',
         missing_values=None, with_data=True, with_statistics=True), LabelEncoder()]), (['Educati...   missing_values=None, outlier_treatment='as_is', with_data=False,
         with_statistics=True))],
        input_df=False, sparse=False)),
       ('classifier', LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=5, n_jobs=-1, num_leaves=10, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0))])

In [39]:
# Write the fitted pipeline to a PMML file next to the input data.
# `base_link` already ends with a path separator (file names are appended to it
# directly when the CSV is read), so no extra "/" is inserted here; the previous
# form produced ".//pipeline_2.pmml".
sklearn2pmml(pipeline, base_link + "pipeline_2.pmml")

In [40]:
# Class probabilities predicted by the fitted pipeline for the training rows
# (one row per sample, one column per class).
pipeline.predict_proba(X_train)

array([[0.75875668, 0.24124332],
       [0.61199313, 0.38800687],
       [0.84401451, 0.15598549],
       ...,
       [0.75875668, 0.24124332],
       [0.8238318 , 0.1761682 ],
       [0.61199313, 0.38800687]])

In [42]:
# Score the pipeline on both splits: the training loss is printed, the
# validation loss is left as the cell's displayed value.
pipeline_train_proba = pipeline.predict_proba(X_train)
pipeline_valid_proba = pipeline.predict_proba(X_valid)

print(log_loss(y_train, pipeline_train_proba))
log_loss(y_valid, pipeline_valid_proba)

0.4588835269629688


0.44756399589310475

In [34]:
# Reference model: same hyper-parameters as the pipeline's classifier, fitted
# directly on the already label-encoded frame so that its predictions can be
# compared with the pipeline's.
lgb_model = lgb.LGBMClassifier(n_estimators=5, learning_rate=0.1, num_leaves=10, max_depth=2)
# LightGBM documents `feature_name` as a list of str, so convert explicitly in
# case `feature_cols` is some other sequence type (e.g. a tuple).
lgb_model.fit(X_train, y_train, feature_name=list(feature_cols), categorical_feature=cate_cols)
lgb_model.predict_proba(X_train)

array([[0.75875668, 0.24124332],
       [0.61199313, 0.38800687],
       [0.84401451, 0.15598549],
       ...,
       [0.75875668, 0.24124332],
       [0.8238318 , 0.1761682 ],
       [0.61199313, 0.38800687]])

In [43]:
# Score the standalone LightGBM model on both splits: the training loss is
# printed, the validation loss is left as the cell's displayed value.
lgb_train_proba = lgb_model.predict_proba(X_train)
lgb_valid_proba = lgb_model.predict_proba(X_valid)

print(log_loss(y_train, lgb_train_proba))
log_loss(y_valid, lgb_valid_proba)

0.4588835269629688


0.44756399589310475