In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm
import os

import xgboost
from sklearn.model_selection import train_test_split

import slice_finding as sf
from importlib import reload
reload(sf);

In [None]:
# Load the Adult dataset from the local example_data directory. Later cells
# read its 'income' column as the label and the remaining columns as features.
df = pd.read_csv("example_data/adult.csv")
df.head()  # preview the first rows as the cell output

In [None]:
def _assign_bin_values(col_data, bin_spec):
    """Code each value as below (0), within (1), or at/above (2) a range.

    Args:
        col_data: Array-like of numeric values to bin.
        bin_spec: Mapping with optional "min" and "max" keys. A missing
            "min" falls back to the 25th percentile of ``col_data`` and a
            missing "max" to the 75th percentile.

    Returns:
        The ``np.digitize`` codes for the edges ``[lower, upper]``: 0 for
        values below ``lower``, 1 for values in ``[lower, upper)``, and 2 for
        values greater than or equal to ``upper``.

    Raises:
        AssertionError: If the lower bound is not strictly below the upper
            bound.
    """
    # Only compute a quantile when its bound is actually missing; passing it
    # as the default of dict.get() would evaluate it on every call.
    lower = bin_spec["min"] if "min" in bin_spec else np.quantile(col_data, 0.25)
    upper = bin_spec["max"] if "max" in bin_spec else np.quantile(col_data, 0.75)
    assert lower < upper, (lower, upper)
    return np.digitize(col_data, [lower, upper])

def extreme_value_binning(col_name, col_data, gender_data):
    """Bin a column into three codes using a reference range when available.

    Columns without an entry in ``NORMAL_RANGES`` are cut at their own 25th
    and 75th percentiles. Columns with an entry are binned by
    ``_assign_bin_values`` using that entry; when the entry contains a
    "female" key, rows where ``gender_data`` is truthy use the "female" spec
    and all other rows use the "male" spec.

    NOTE(review): ``NORMAL_RANGES`` is not defined in the cells shown here;
    it must exist in the kernel before this function is called.

    Args:
        col_name: Key used to look up the column in ``NORMAL_RANGES``.
        col_data: Array-like of numeric values to bin.
        gender_data: Boolean mask passed to ``np.where`` as the condition.

    Returns:
        Array of bin codes as produced by ``np.digitize``.
    """
    if col_name in NORMAL_RANGES:
        range_spec = NORMAL_RANGES[col_name]
        if "female" not in range_spec:
            # One range shared by every row.
            return _assign_bin_values(col_data, range_spec)
        # Separate ranges: bin the whole column both ways, then pick per row.
        female_codes = _assign_bin_values(col_data, range_spec["female"])
        male_codes = _assign_bin_values(col_data, range_spec["male"])
        return np.where(gender_data, female_codes, male_codes)

    # No reference range: fall back to the column's own quartiles.
    quartile_edges = np.quantile(col_data, [0.25, 0.75])
    return np.digitize(col_data, quartile_edges)

def discretize_data(df, spec):
    """Convert selected columns of ``df`` into discrete codes.

    Args:
        df: Source DataFrame.
        spec: Mapping of column name -> options dict. ``options["method"]``
            selects how the column is discretized:

            * a callable: invoked as ``method(df[col], col)``; its return
              value becomes the output column;
            * ``"keep"``: the column's values are copied through unchanged;
            * ``"bin"``: ``np.digitize`` against ``options["bins"]`` minus 1,
              so values in the first bin get 0 and values below ``bins[0]``
              get -1;
            * ``"unique"``: each value is replaced by its position among the
              column's sorted unique values.

    Returns:
        A DataFrame with the same index as ``df`` and one column per entry of
        ``spec``, in ``spec`` order.

    Raises:
        ValueError: If a method is neither callable nor one of the strings
            listed above (previously such columns were silently dropped).
    """
    discrete_columns = {}
    for col, col_spec in spec.items():
        method = col_spec["method"]
        if callable(method):
            discrete_columns[col] = method(df[col], col)
        elif method == "keep":
            discrete_columns[col] = df[col].values
        elif method == "bin":
            discrete_columns[col] = np.digitize(df[col], col_spec["bins"]) - 1
        elif method == "unique":
            # Build the value -> code table once; a list.index() call per row
            # would rescan the unique values for every element.
            unique_vals = sorted(df[col].unique().tolist())
            code_of = {val: code for code, val in enumerate(unique_vals)}
            discrete_columns[col] = df[col].map(code_of)
        else:
            raise ValueError(
                f"Unknown discretization method {method!r} for column {col!r}"
            )
    return pd.DataFrame(discrete_columns, index=df.index)

# Discretization plan, listed in output-column order: numeric columns are cut
# at fixed bin edges, categorical columns are replaced by the rank of each
# value among the column's sorted unique values, and native-country collapses
# to a boolean "not United-States" flag.
discrete_df = discretize_data(df, {
    'age': dict(method="bin", bins=[0, 25, 45, 65]),
    **{
        categorical: dict(method="unique")
        for categorical in (
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'gender',
        )
    },
    'capital-gain': dict(method="bin", bins=[0, 1]),
    'capital-loss': dict(method="bin", bins=[0, 1]),
    'hours-per-week': dict(method="bin", bins=[0, 39]),
    'native-country': dict(method=lambda x, c: x != 'United-States'),
})

In [None]:
# Build the model inputs: drop the two columns that are not used as features,
# split off the binary target, and one-hot encode the categorical features.
df_prepped = df.drop(columns=['fnlwgt', 'educational-num'])

# Target is True for rows whose income label is '>50K'.
y = df_prepped['income'].eq('>50K')
X = df_prepped.drop(columns=['income'])

X_continous = X.loc[:, ['age', 'capital-gain', 'capital-loss', 'hours-per-week']]
X_categorical = X.loc[:, [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country',
]]

# Numeric columns pass through as-is; categorical columns become dummies.
X_encoded = pd.get_dummies(X_categorical)
X = pd.concat((X_continous, X_encoded), axis=1)

In [None]:
# Train the classifier once and cache it on disk; when the cache file already
# exists the model is loaded from it and no train/test split is created.
model_path = "example_data/adult_model.json"
model = xgboost.XGBClassifier()

if os.path.exists(model_path):
    model.load_model(model_path)
else:
    # Fixed seed so the held-out split, the printed accuracy, and the saved
    # model are the same on every fresh run of the notebook.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=0
    )

    model.fit(X_train, y_train)

    # Accuracy on the 30% held-out rows.
    print(model.score(X_test, y_test))

    model.save_model(model_path)

In [None]:
outcome_bins = 10  # NOTE(review): not referenced by any of the cells shown here
# Predict on the full feature matrix X (which includes any rows the model was
# trained on) and flag the rows where the prediction matches the label.
model_preds = model.predict(X)
discrete_outcomes = model_preds == y  # True where the model is correct
print(discrete_outcomes.mean())  # fraction of rows predicted correctly

# Set up score functions
score_functions = {
    # Receives 1 for misclassified rows and 0 for correctly classified rows.
    "error_rate": sf.OutcomeRateScore(1 - discrete_outcomes.values),
    # NOTE(review): the meaning of 0.5 and spread=0.25 is defined by
    # slice_finding.SliceSizeScore -- confirm against that module.
    "group_size": sf.SliceSizeScore(0.5, spread=0.25),
}

# Minimum slice size as a row count: 0.1% of the rows in discrete_df.
# A higher value means that fewer slices are eligible, and also that
# sampling is more likely to work well.
min_items = int(len(discrete_df) * 0.001)

# Number of slices requested from each search.
num_slices_to_return = 50

# Passed to find_slices as max_features.
# NOTE(review): presumably the maximum number of features combined in one
# slice -- confirm against slice_finding.
max_features = 3

## Run recursive implementation

In [None]:
# Search for slices using slice_finding's 'recursive' algorithm.
# astype(int) turns boolean columns (e.g. the 'native-country' flag) into 0/1
# so that every column holds integer codes.
results = sf.find_slices(
    discrete_df.astype(int),
    score_functions,
    min_items=min_items, 
    n_slices=num_slices_to_return,
    algorithm='recursive',
    max_features=max_features,
)

In [None]:
# Display the output of the recursive search.
results

## Run sampling implementation

In [None]:
# Run the same search with the 'sampling' algorithm. source_mask is True for
# the rows the model misclassified (discrete_outcomes == 0).
# NOTE(review): presumably samples are seeded from the source_mask rows, and
# num_candidates=None means "no limit" -- confirm against slice_finding.
results_sampling_unranked = sf.find_slices(
    discrete_df.astype(int),
    score_functions,
    algorithm='sampling',
    source_mask=(discrete_outcomes == 0),
    min_items=min_items,
    num_samples=100,
    num_candidates=None,
    max_features=max_features
)
# Rank the sampled slices with a weight of 1.0 on every score function and
# request num_slices_to_return of them.
results_sampling = results_sampling_unranked.rank({fn_name: 1.0 for fn_name in score_functions},
                                                  n_slices=num_slices_to_return, 
                                                  similarity_threshold=1.0)

In [None]:
# Recall of the sampling search: the fraction of the recursive-search results
# that also appear in the ranked sampling results.
# NOTE(review): assumes both result objects iterate over hashable slices and
# that `results` is non-empty (otherwise this divides by zero).
len(set(results_sampling) & set(results)) / len(results)

In [None]:
# Number of entries in the unranked sampling output's `.results` collection.
len(results_sampling_unranked.results)