# Visual Auditor Package Demo (Adult Dataset)
Author: David Munechika (david.munechika@gatech.edu)

In [4]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
import visual_auditor
from visual_auditor import SliceFinder

In [8]:
# Load Adult dataset
adult_data = pd.read_csv(
    "data/adult.data",
    names=[
        "Age", "Workclass", "Final Weight", "Education", "Education-Num", "Marital Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours Per Week", "Country", "Target"],
        sep=r'\s*,\s*',
        engine='python',
        na_values="?")

# Drop NA values
adult_data = adult_data.dropna()

# Drop irrelevant fields
adult_data = adult_data.drop(columns=['Final Weight', 'Education-Num'])

# Bin numerical features
adult_data["Age"] = pd.cut(adult_data["Age"], 3, labels=["0", "1", "2"])
adult_data["Capital Gain"] = pd.cut(adult_data["Capital Gain"], 3, labels=["0", "1", "2"])
adult_data["Capital Loss"] = pd.cut(adult_data["Capital Loss"], 3, labels=["0", "1", "2"])
adult_data["Hours Per Week"] = pd.cut(adult_data["Hours Per Week"], 3, labels=["0", "1", "2"])

# Encode categorical features
encoders = {}
for column in adult_data.columns:
    if adult_data.dtypes[column] == np.object:
        le = LabelEncoder()
        adult_data[column] = le.fit_transform(adult_data[column])
        encoders[column] = le
        print(column, le.classes_, le.transform(le.classes_))

# Separate Target values
X, y = adult_data[adult_data.columns.difference(["Target"])], adult_data["Target"]

# Train a classifier
rfc = RandomForestClassifier(max_depth=5, n_estimators=10)
rfc.fit(X, y)

       Age         Workclass   Education      Marital Status  \
0       39         State-gov   Bachelors       Never-married   
1       50  Self-emp-not-inc   Bachelors  Married-civ-spouse   
2       38           Private     HS-grad            Divorced   
3       53           Private        11th  Married-civ-spouse   
4       28           Private   Bachelors  Married-civ-spouse   
...    ...               ...         ...                 ...   
32556   27           Private  Assoc-acdm  Married-civ-spouse   
32557   40           Private     HS-grad  Married-civ-spouse   
32558   58           Private     HS-grad             Widowed   
32559   22           Private     HS-grad       Never-married   
32560   52      Self-emp-inc     HS-grad  Married-civ-spouse   

              Occupation   Relationship   Race     Sex  Capital Gain  \
0           Adm-clerical  Not-in-family  White    Male          2174   
1        Exec-managerial        Husband  White    Male             0   
2      Handlers

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [6]:
# Generate slices
sf = SliceFinder(rfc, (X, y))
recommendations = sf.find_slice(k=20, epsilon=0.2, degree=2, max_workers=4)

for s in recommendations:
    print ('\n=====================\nSlice description:')
    for k, v in list(s.filters.items()):
        values = ''
        if k in encoders:
            le = encoders[k]
            for v_ in v:
                values += '%s '%(le.inverse_transform(v_)[0])
        else:
            for v_ in sorted(v, key=lambda x: x[0]):
                if len(v_) > 1:
                    values += '%s ~ %s'%(v_[0], v_[1])
                else:
                    values += '%s '%(v_[0])
        print ('%s:%s'%(k, values))
    print ('---------------------\neffect_size: %s'%(s.effect_size))
    print ('---------------------\nmetric: %s'%(s.metric))
    print ('size: %s'%(s.size))

degree 1
crossing
effect size filtering
sorting

Slice description:
Hours Per Week:1 
---------------------
effect_size: 0.30892987496765645
---------------------
metric: 0.424845806404716
size: 24747

Slice description:
Sex:Male 
---------------------
effect_size: 0.3290339785203574
---------------------
metric: 0.459876374342245
size: 20380

Slice description:
Marital Status:Married-civ-spouse 
---------------------
effect_size: 0.6073802141099277
---------------------
metric: 0.5954922479173419
size: 14065

Slice description:
Relationship:Husband 
---------------------
effect_size: 0.5513918172312393
---------------------
metric: 0.5920502688142034
size: 12463

Slice description:
Age:1 
---------------------
effect_size: 0.27663865964119166
---------------------
metric: 0.5047562488558341
size: 10794

Slice description:
Education:Bachelors 
---------------------
effect_size: 0.20688573995955764
---------------------
metric: 0.5025664503124488
size: 5044

Slice description:
Education

In [7]:
visual_auditor.find_slices_and_visualize(rfc, (X, y), k=100, epsilon=0.2, degree=2, max_workers=4)

In [None]:
# Alternative use - separate slice-finding and visualization functions
sf = SliceFinder(rfc, (X, y))
sf.find_slices_and_visualize(k=20, epsilon=0.2, degree=1, max_workers=4)

visual_auditor.visualize()