# Slice Finding on UCI Adult

In [1]:
!pip install pandas
!pip install xgboost
!pip install scikit-learn



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost
import slice_finding as sf

In [3]:
# Read the adult dataset and create a simple XGBoost model. The task is to predict
# whether someone makes over 50K in income.

# Fixed seed so that the train/test split and the fitted model (and therefore the
# error mask used for slicing in the cells below) are reproducible when the
# notebook is restarted and re-run from the top.
SEED = 42

df = pd.read_csv("adult.csv")

# Drop the columns that are not used as model features.
df_prepped = df.drop(columns=['fnlwgt', 'educational-num'])

X = df_prepped.drop(columns=['income'])
# Boolean target: True when the income label is '>50K'.
y = df_prepped['income'] == '>50K'

# Numeric columns are passed to the model unchanged.
# (The variable name's spelling is kept as-is in case other cells refer to it.)
X_continous  = X[['age', 'capital-gain', 'capital-loss', 'hours-per-week']]

# Categorical columns are one-hot encoded before training.
X_categorical = X[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
                   'gender', 'native-country']]

X_encoded = pd.get_dummies(X_categorical)
X = pd.concat([X_continous, X_encoded], axis=1)

# Stratify on the label so both splits keep the same positive rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

model = xgboost.XGBClassifier(random_state=SEED)
model.fit(X_train, y_train)

# Our outcomes will be y (true values), model_preds (predicted values),
# model_probs (per-class model probabilities; column 1 is the positive label),
# and is_error (if each prediction is incorrect).
# Predictions are made on the full dataset (train and test rows together).
model_preds = model.predict(X)
model_probs = model.predict_proba(X)
is_error = model_preds != y
print(f"Train + test error rate: {is_error.mean():.2%}")

Train + test error rate: 11.61%


In [4]:
# Turn every column into a discrete feature so that slices can be defined on it.
# Each column is given its own discretization rule in the spec below.

# Columns that all share the "unique" rule.
unique_value_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender',
]

# Key order matches the original column order of the spec.
discretization_spec = {
    # "bin" rules list their cut points in "bins".
    'age': {"method": "bin", "bins": [25, 45, 65]},
    **{column: {"method": "unique"} for column in unique_value_columns},
    # NOTE(review): a single cut point at 1 presumably separates zero from
    # non-zero amounts -- confirm against the bin edge convention of the library.
    'capital-gain': {"method": "bin", "bins": [1]},
    'capital-loss': {"method": "bin", "bins": [1]},
    'hours-per-week': {"method": "bin", "bins": [40]},
    # Custom rule: returns a (codes, labels) pair where the codes flag values
    # other than 'United-States' and the labels name code 0 'US' and code 1 'Non-US'.
    'native-country': {
        "method": lambda x, c: (x != 'United-States', {0: 'US', 1: 'Non-US'})
    },
}

discrete_df = sf.discretization.discretize_data(df, discretization_spec)

In [5]:
# Scoring can also run on MPS (Apple silicon) or CUDA, but moving the data to
# the device costs extra time, so it is not always worth it.
device = 'cpu'

# Score functions describing which slices should be ranked higher. For instance,
# OutcomeRateScore favors slices where the rate of the given binary outcome
# is higher inside the slice than outside.
score_functions = {
    "positive_label": sf.OutcomeRateScore(y.values).to(device),
    "error": sf.OutcomeRateScore(is_error.values).to(device),
    "error_interaction": sf.InteractionEffectScore(is_error.values).to(device),
    "slice_size": sf.SliceSizeScore(0.25).to(device),
    "complexity": sf.NumFeaturesScore().to(device),
}

# A slice must contain at least this many rows (1% of the dataset) to be scored.
min_slice_rows = int(len(discrete_df.df) * 0.01)

# The slice finder itself. Calling .sample(n) on it directly finds slices
# programmatically, without going through the widget.
slice_finder = sf.sampling.SamplingSliceFinder(
    discrete_df,
    score_functions,
    source_mask=is_error > 0,   # Rows to sample from: only the errors
    min_items=min_slice_rows,
    holdout_fraction=0.5,       # Proportion of rows held out for slice evaluation
    max_features=3,             # Max number of features in a slice
    n_workers=8,                # For parallelism
    similarity_threshold=0.5,   # Jaccard similarity above which slices are redundant
    device=device,
)

# Per-slice metrics shown by the widget.
widget_metrics = {
    "> 50K": y.values,
    "Model Prob.": model_probs[:,1],
    "Error": is_error.values,
}

# Weights given to the score functions when the widget is created.
widget_score_weights = {
    "error": 1.0,
    "error_interaction": 1.2,
    "slice_size": 0.3,
    "complexity": 0.2,
}

# Widget that controls the slice finder and displays the metrics for each slice.
w = sf.widget.SliceFinderWidget(
    slice_finder,
    metrics=widget_metrics,
    score_weights=widget_score_weights,
    dev=True,
)
w

SliceFinderWidget(base_slice={'scoreValues': {'positive_label': 1.0, 'error': 1.0, 'error_interaction': 1.0, '…

In [6]:
# Inspect the widget's slice intersection counts. The captured output below is an
# empty list. NOTE(review): presumably this is only filled in after slices have
# been selected in the widget UI -- confirm.
w.slice_intersection_counts 
# Related widget attribute that can be inspected instead: w.selected_intersection_index

[]

In [7]:
# Inspect the widget's slice intersection labels (an empty list in the captured output).
w.slice_intersection_labels

[]

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:05<00:00,  8.55it/s]
