# Slice Finding on UCI Adult

In [None]:
!pip install pandas
!pip install xgboost
!pip install scikit-learn

In [None]:
import numpy as np
import pandas as pd
import tqdm
import torch
from sklearn.model_selection import train_test_split
import xgboost
import slice_finding as sf

In [None]:
# Load the raw table; the relative path means "adult.csv" must sit in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="adult.csv")

In [None]:
def _us_vs_non_us(x, c):
    # Custom discretization "method": compares x against 'United-States' and
    # returns that comparison together with a code -> label mapping.
    # `c` is accepted but not used.
    # NOTE(review): presumably x holds the column's values — confirm against
    # the slice_finding discretization API.
    return x != 'United-States', {0: 'US', 1: 'Non-US'}


# Columns discretized with the "unique" method.
_unique_value_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
]

# Per-column discretization spec. Key order matches the original literal:
# age, the "unique" columns, the remaining binned columns, then native-country.
discretization_spec = {
    'age': {"method": "bin", "bins": [25, 45, 65]},
    **{column: {"method": "unique"} for column in _unique_value_columns},
    'capital-gain': {"method": "bin", "bins": [1]},
    'capital-loss': {"method": "bin", "bins": [1]},
    'hours-per-week': {"method": "bin", "bins": [40]},
    'native-country': {"method": _us_vs_non_us},
}

discrete_df = sf.discretization.discretize_data(df, discretization_spec)

In [None]:
# Train an XGBoost classifier for the `income == '>50K'` label and record, for
# every row of the full table, the prediction, the class probabilities and
# whether the prediction was wrong. `y`, `model_probs` and `is_error` are
# consumed by the slice-finder cell below.

# Fixed seed so the train/test split — and therefore the fitted model and the
# error mask derived from it — is the same on every run of the notebook.
RANDOM_SEED = 0

# Drop the two columns that are not used as model inputs.
df_prepped = df.drop(columns=['fnlwgt', 'educational-num'])

X = df_prepped.drop(columns=['income'])
y = df_prepped['income'] == '>50K'  # boolean target

# Numeric columns are passed through unchanged.
X_continous  = X[['age', 'capital-gain', 'capital-loss', 'hours-per-week']]

# Categorical columns are one-hot encoded.
X_categorical = X[['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race',
                   'gender', 'native-country']]

X_encoded = pd.get_dummies(X_categorical)
X = pd.concat([X_continous, X_encoded], axis=1)

# Stratify on the label so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=RANDOM_SEED
)

model = xgboost.XGBClassifier(random_state=RANDOM_SEED)
model.fit(X_train, y_train)

# Predictions are made on ALL rows (train and test together), so the rate
# printed below is not a held-out estimate.
model_preds = model.predict(X)
model_probs = model.predict_proba(X)
is_error = model_preds != y
print(f"Train + test error rate: {is_error.mean():.2%}")

In [None]:
# Configure the sampling slice finder over the discretized data and display
# it in the interactive widget.
device = 'cpu'

# Named score functions, each moved to `device`.
score_functions = {
    "positive_label": sf.OutcomeRateScore(y.values).to(device),
    "error": sf.OutcomeRateScore(is_error.values).to(device),
    "error_interaction": sf.InteractionEffectScore(is_error.values).to(device),
    "slice_size": sf.SliceSizeScore(0.25).to(device),
    "complexity": sf.NumFeaturesScore().to(device),
}

# Sample from only errors.
misclassified_mask = is_error > 0

# min_items is set to 1% of the number of discretized rows.
one_percent_of_rows = int(len(discrete_df.df) * 0.01)

slice_finder = sf.sampling.SamplingSliceFinder(
    discrete_df,
    score_functions,
    source_mask=misclassified_mask,
    min_items=one_percent_of_rows,
    holdout_fraction=0.5,
    max_features=3,
    n_workers=8,
    similarity_threshold=0.5,
    device=device,
)

# Per-row metrics handed to the widget.
widget_metrics = {
    "> 50K": y.values,
    "Model Prob.": model_probs[:,1],
    "Error": is_error.values,
}

# Weights for the score functions; "positive_label" is given no weight here.
widget_score_weights = {
    "error": 1.0,
    "error_interaction": 1.2,
    "slice_size": 0.3,
    "complexity": 0.2,
}

w = sf.widget.SliceFinderWidget(
    slice_finder,
    metrics=widget_metrics,
    score_weights=widget_score_weights,
)
# A bare final expression makes the notebook render the widget.
w