In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, balanced_accuracy_score
from aix360.algorithms.rule_induction.rbm.boolean_rule_cg import BooleanRuleCG as BRCG
from aix360.algorithms.rbm import FeatureBinarizer
import time

# Rule Induction using BRCG

## Binary classification with a random 20% test set

We read the adult dataset from the UCI repository. The goal is to learn a rule describing people who earn more than 50K.

In [3]:
# Column names, in the order they are passed to the header-less reader below.
col_names = ['age', 'workclass', 'fnlwgt', 'education',
             'education-num', 'marital-status', 'occupation',
             'relationship', 'race', 'sex',
             'capital-gain', 'capital-loss', 'hours-per-week',
             'native-country', 'label']

# Numeric columns are read as float; every other column (label included) as str.
numeric_cols = {'age', 'fnlwgt', 'education-num',
                'capital-gain', 'capital-loss', 'hours-per-week'}
data_type = {name: (float if name in numeric_cols else str) for name in col_names}

# The file has no header row and separates fields with the two-character
# string ', '; the python parsing engine is requested explicitly.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    names=col_names,
    header=None,
    sep=', ',
    engine='python',
    dtype=data_type,
)

### Column names must not contain whitespace or arithmetic operators (+, -, *, /)
We eventually output the rule set in TRXF format, where compound features are supported by parsing an expression string. So simple features like column names of a data frame must not contain these so that they are parsed as a single variable rather than an expression.

In [4]:
# Swap every hyphen in a column name for an underscore so that each name
# is later parsed as a single variable rather than as an expression.
df.columns = [name.replace('-', '_') for name in df.columns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  object 
 2   fnlwgt          32561 non-null  float64
 3   education       32561 non-null  object 
 4   education_num   32561 non-null  float64
 5   marital_status  32561 non-null  object 
 6   occupation      32561 non-null  object 
 7   relationship    32561 non-null  object 
 8   race            32561 non-null  object 
 9   sex             32561 non-null  object 
 10  capital_gain    32561 non-null  float64
 11  capital_loss    32561 non-null  float64
 12  hours_per_week  32561 non-null  float64
 13  native_country  32561 non-null  object 
 14  label           32561 non-null  object 
dtypes: float64(6), object(9)
memory usage: 3.7+ MB


In [5]:
# Name of the column that holds the class label.
TARGET_COLUMN = 'label'
# Preview the first five rows of the data frame.
print(df.head(n=5))

    age         workclass    fnlwgt  education  education_num  \
0  39.0         State-gov   77516.0  Bachelors           13.0   
1  50.0  Self-emp-not-inc   83311.0  Bachelors           13.0   
2  38.0           Private  215646.0    HS-grad            9.0   
3  53.0           Private  234721.0       11th            7.0   
4  28.0           Private  338409.0  Bachelors           13.0   

       marital_status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital_gain  capital_loss  hours_per_week native_country  label  
0        2174.0           0.0            40.0  United-States  <=50K  
1           0.0         

### The rule induction trains for a specific 'foreground' (aka 'positive') value of the target label, which we set to '>50K' below. This means that the rule set will characterize the set of adults who earn more than 50K.

In [6]:
# Value of the label treated as the positive class during training.
POS_VALUE = '>50K'
# Distribution of the two values of the target label.
values_dist = df[TARGET_COLUMN].value_counts()
print(f'Positive value {POS_VALUE} occurs {values_dist[POS_VALUE]} times.')
print(values_dist)

Positive value >50K occurs 7841 times.
<=50K    24720
>50K      7841
Name: label, dtype: int64


### Train-test split and encode labels as integers

In [31]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split repeats.
train, test = train_test_split(df, random_state=42, test_size=0.2)

# Report the label distribution of each partition.
for heading, part in (('Training set:', train), ('Test set:', test)):
    print(heading)
    print(part[TARGET_COLUMN].value_counts())

# Separate features from the label; the label becomes 1 for POS_VALUE, 0 otherwise.
x_train = train.drop(columns=[TARGET_COLUMN])
y_train = (train[TARGET_COLUMN] == POS_VALUE).astype('int64')

x_test = test.drop(columns=[TARGET_COLUMN])
y_test = (test[TARGET_COLUMN] == POS_VALUE).astype('int64')

Training set:
<=50K    19778
>50K      6270
Name: label, dtype: int64
Test set:
<=50K    4942
>50K     1571
Name: label, dtype: int64


### Instantiate the BRCG explainer and train it using default parameters

In [8]:
# Fit the binarizer on the training features only, then reuse the fitted
# binarizer to transform the test features.
# NOTE(review): negations=True presumably adds negated conditions (e.g. `!=`)
# to the candidate features — confirm against the FeatureBinarizer docs.
fb = FeatureBinarizer(negations=True)
X_train_fb = fb.fit_transform(x_train)
x_test_fb = fb.transform(x_test)

explainer = BRCG(silent=True)
# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
explainer.fit(X_train_fb, y_train)
end_time = time.perf_counter()
print('Training time (sec): ' + str(end_time - start_time))

# compute performance metrics on test set
y_pred = explainer.predict(x_test_fb)

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Balanced accuracy:', balanced_accuracy_score(y_test, y_pred))
# Precision and recall are reported for the positive class (encoded as 1).
print('Precision:', precision_score(y_test, y_pred, pos_label=1))
print('Recall:', recall_score(y_test, y_pred, pos_label=1))

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 1 times so far.

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``multiply`` for elementwise multiplication.
This code path has been hit 2 times so far.

This use of ``*`` has resulted in matrix multiplication.
Using ``*`` for matrix multiplication has been deprecated since CVXPY 1.1.
    Use ``*`` for matrix-scalar and vector-scalar multiplication.
    Use ``@`` for matrix-matrix and matrix-vector multiplication.
    Use ``mu

Training time (sec): 58.75145697593689
Accuracy: 0.8211269768155995
Balanced accuracy: 0.7253901463211316
Precision: 0.6571207430340558
Recall: 0.5404201145767027


### Extract the rule set

In [9]:
# Retrieve the learned rule set from the fitted explainer and print its text form.
trxf_ruleset = explainer.explain()
print(trxf_ruleset)

if
([age > 26.0] ^ [education_num > 9.0] ^ [marital_status == Married-civ-spouse] ^ [occupation != Craft-repair] ^ [occupation != Farming-fishing] ^ [occupation != Handlers-cleaners] ^ [occupation != Other-service])
then
1


## Export the resulting ruleset to a PMML file
### Construct a RuleSetClassifier object
A rule set by itself is merely a description of the given concept/target. Therefore, to use rule sets for a binary classification task, we must specify how to deal with potential overlaps between rule sets. For example, we could have learned 2 rule sets: one for >50K and another for <=50K. For instances where both rule sets are triggered, how do we classify that instance? There are 3 rule selection methods supported in PMML: First Hit, Weighted Sum, and Weighted Max. See here for more info: https://dmg.org/pmml/v4-4/RuleSet.html#xsdElement_RuleSelectionMethod. If we only learn a rule set for a single label, we can set a default label to which instances will be classified when the learned rule set does not trigger. 

In our case, since we only learn a rule set for a single label and use the default label for the rest, all 3 rule selection methods will have the same effect. However, if a rule selection method other than FirstHit is chosen, we need to compute the weights and confidence values for each rule.

In [11]:
import aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier as trxf_classifier
import aix360.algorithms.rule_induction.trxf.pmml_export as pmml

# The labels were encoded as integers before training (1 for POS_VALUE, 0
# otherwise) and the learned rule set concludes `1`, so the default label
# uses the same encoding (0) rather than the raw string '<=50K'. Otherwise
# predictions would mix the integer 1 with a string and could not be
# compared against y_test.
classifier = trxf_classifier.RuleSetClassifier([trxf_ruleset],
                                               rule_selection_method=trxf_classifier.RuleSelectionMethod.WEIGHTED_MAX,
                                               confidence_metric=trxf_classifier.ConfidenceMetric.LAPLACE,
                                               weight_metric=trxf_classifier.WeightMetric.CONFIDENCE,
                                               default_label=0)
# A selection method other than first-hit needs per-rule weights and
# confidence values; they are computed here from the held-out test split.
classifier.update_rules_with_metrics(x_test, y_test)

### Export the TRXF classifier to a PMML document

In [12]:
# Wire a TRXF reader (given the feature frame as its data dictionary) and a
# Nyoka serializer into the PMML exporter.
reader = pmml.TrxfReader()
reader.load_data_dictionary(x_test)
serializer = pmml.NyokaSerializer()
exporter = pmml.PmmlExporter(reader, serializer)

# Build the document before opening the output file, so that a failing
# export does not leave an empty file behind.
pmml_document = exporter.export(classifier)

# PMML is an XML format; write it with an explicit encoding instead of
# relying on the platform's default text encoding.
with open("adult_weighted_max_brcg.pmml", "w", encoding="utf-8") as text_file:
    text_file.write(pmml_document)

In [41]:
# Select two rows of x_test (positions 100 and 101) and display them
ele = x_test.iloc[100:102]
ele

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
28197,40.0,Private,287008.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024.0,0.0,55.0,Germany
13925,33.0,Private,93056.0,7th-8th,4.0,Divorced,Handlers-cleaners,Own-child,White,Male,0.0,0.0,40.0,United-States


In [42]:
# NOTE(review): `ele` holds two rows, yet a single label is displayed as the
# result — confirm whether predict expects one row at a time.
classifier.predict(ele)

1