# Diverse Counterfactual Explanations（DiCE）の利用例 

Rと同様のシミュレーションデータに対して、dice-mlパッケージを使ってcounterfactual exampleを生成する

In [1]:
import warnings

import numpy as np
import pandas as pd

from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import dice_ml

# Seed NumPy's global RNG so that steps below which are not given an explicit
# random_state (presumably the train/test splits) are repeatable across runs.
np.random.seed(42)
# Suppress all warnings for the rest of the session to keep cell output clean.
warnings.simplefilter("ignore")

In [2]:
# Load the simulation data that was generated in R.
# (Re-generating it here with the same settings would work just as well.)
df = pd.read_csv("output/simulation.csv")

# Explanatory variables; `income` is the regression target.
features = ["ds", "de", "biz"]
X = df[features]
y = df["income"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
# Fit an Elastic Net on degree-4 polynomial features.
# Because of how PolynomialFeatures() works, the preprocessing differs
# slightly from the R version, but reproducing the exact same model is not
# the goal, so this is good enough.
en_steps = [
    ("pf", PolynomialFeatures(degree=4, include_bias=False)),
    ("ss", StandardScaler()),
    ("en", ElasticNetCV()),
]
en = Pipeline(en_steps)
en.fit(X_train, y_train)

# Report R^2 on the held-out split.
y_test_pred = en.predict(X_test)
print(r2_score(y_test, y_test_pred))

0.9698791221021554


In [4]:
# Set up the explainer used to generate counterfactual examples.

# Data description: DiCE distinguishes continuous from categorical features.
# Every feature in the simulation data is continuous.
sim_data_interface = dice_ml.Data(
    dataframe=df,
    continuous_features=features,
    outcome_name="income",
)

# Model description: state explicitly that this is a scikit-learn model and
# that the task is regression (not classification).
sim_model_interface = dice_ml.Model(
    model=en,
    backend="sklearn",
    model_type="regressor",
)

# `method` is one of "random", "genetic" or "kdtree"; see the DiCE
# documentation for details.
dice = dice_ml.Dice(
    data_interface=sim_data_interface,
    model_interface=sim_model_interface,
    method="genetic",
)

In [5]:
# Pick an arbitrary instance to explain.
i = 0
# The list selector keeps the result a one-row DataFrame rather than a Series.
X_i = X.loc[[i], :]
# Show the full record (features and observed income) for reference.
display(df.loc[[i], :])

Unnamed: 0,ds,de,biz,income
0,71.58107,49.545286,47.890238,814.251992


In [6]:
# Generate counterfactual examples for the selected instance.
# Each feature may only move between its current value and 100.
# `.iloc[0]` extracts the scalar explicitly: calling float() on a one-row
# Series is deprecated in recent pandas and is slated to raise TypeError.
# NOTE: the variable name is misspelled ("expamples") but is kept as-is
# because the following cell reads it under this name.
counterfactual_expamples = dice.generate_counterfactuals(
    X_i,
    total_CFs=10,  # how many counterfactuals to generate
    desired_range=[1000, 1010],  # range the predicted y should fall in
    permitted_range={j: [float(X_i[j].iloc[0]), 100] for j in features},  # feature constraints
    features_to_vary="all"  # which features are allowed to change
)

100%|█████████████████████████████████████████████| 1/1 [01:12<00:00, 72.93s/it]


In [7]:
# Inspect the generated counterfactual examples: shows the query instance
# followed by the set of counterfactuals as data frames.
counterfactual_expamples.visualize_as_dataframe()

Query instance (original outcome : 807)


Unnamed: 0,ds,de,biz,income
0,71.58107,49.545286,47.890238,806.682341



Diverse Counterfactual set (new outcome: [1000, 1010])


Unnamed: 0,ds,de,biz,income
0,71.58107,61.551464,52.186575,1000.867792
0,74.712516,52.041833,55.955788,1001.318694
0,71.862401,50.731882,59.664843,1002.270765
0,72.804135,49.910173,59.472875,1003.481214
0,72.73776,52.219396,57.828721,1004.36974
0,72.533728,55.548157,55.461199,1002.124285
0,77.475946,50.004981,55.140692,1004.248909
0,73.073796,51.754719,57.908745,1005.112334
0,76.821571,50.471081,55.433376,1005.078864
0,80.641592,50.418549,51.90691,1000.304899


# adultデータセットでDiCEを使う
- 年収5万ドル以上かどうかを予測するデータセット
- 分類問題なことと、カテゴリカル変数があることがシミュレーションデータとの違い

In [8]:
# Prepare the data: load the adult income dataset bundled with dice_ml and
# separate the `income` label from the features.
df = dice_ml.utils.helpers.load_adult_income_dataset()
X = df.drop(columns="income")
y = df["income"]

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
# Preprocess continuous and categorical variables separately.
num_cols = ["age", "hours_per_week"]
cat_cols = X.columns.difference(num_cols)

# Standardize the continuous variables and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes a category that was absent from the training
# split as all zeros instead of raising at prediction time.
num_pipe = Pipeline(steps=[('ss', StandardScaler())])
cat_pipe = Pipeline(steps=[('le', OneHotEncoder(handle_unknown='ignore'))])

# Combine the continuous and categorical preprocessing.
transformer = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols),
        ('cat', cat_pipe, cat_cols)
    ]
)

# The hand-rolled implementation used Elastic Net because the optimization
# was otherwise difficult, but with DiCE any model can be used, e.g. a
# Random Forest.
rf = Pipeline(steps=[
    ('transformer', transformer),
    ('rf', RandomForestClassifier(n_estimators=500, min_samples_leaf=2, n_jobs=-1, random_state=42))
])

rf.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['age', 'hours_per_week']),
                                                 ('cat',
                                                  Pipeline(steps=[('le',
                                                                   OneHotEncoder())]),
                                                  Index(['education', 'gender', 'marital_status', 'occupation', 'race',
       'workclass'],
      dtype='object'))])),
                ('rf',
                 RandomForestClassifier(min_samples_leaf=2, n_estimators=500,
                                        n_jobs=-1, random_state=42))])

In [10]:
# Almost identical to the setup used for the simulation data.

# Only the numeric columns are declared continuous.
data_interface = dice_ml.Data(
    dataframe=df, continuous_features=num_cols, outcome_name="income"
)

# model_type marks this as a classification problem.
model_interface = dice_ml.Model(
    model=rf, backend="sklearn", model_type="classifier"
)

dice = dice_ml.Dice(data_interface, model_interface, method="genetic")

In [11]:
# A predicted probability of 0.5 is the decision threshold.
# Look for people just below it, so that the counterfactual examples do not
# have to be unreasonably far from the original.
y_pred_proba = rf.predict_proba(X_test)[:, 1]
just_below_threshold = (y_pred_proba > 0.45) & (y_pred_proba < 0.5)
X_test.iloc[just_below_threshold]

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week
11917,54,Self-Employed,Some-college,Married,Sales,White,Male,40
14736,50,Private,Bachelors,Divorced,White-Collar,White,Female,50
21376,54,Self-Employed,Bachelors,Divorced,White-Collar,White,Male,50
21542,37,Private,Assoc,Married,Service,White,Male,40
1482,38,Self-Employed,HS-grad,Married,White-Collar,White,Male,99
...,...,...,...,...,...,...,...,...
7010,36,Self-Employed,Prof-school,Married,Sales,White,Male,30
6187,49,Self-Employed,Some-college,Married,Blue-Collar,White,Male,50
21081,43,Government,HS-grad,Married,Blue-Collar,White,Male,40
7274,28,Private,Bachelors,Single,White-Collar,White,Male,50


In [13]:
# Generate counterfactual examples for a single test-set row (index label
# 7010), asking for the class opposite to the current prediction.
query_instance = X_test.loc[[7010]]
counterfactual_expamples = dice.generate_counterfactuals(
    query_instance,
    features_to_vary="all",
    desired_class="opposite",
    total_CFs=10,
)

# Show the query instance and the generated counterfactuals.
counterfactual_expamples.visualize_as_dataframe()

100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/it]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,36,Self-Employed,Prof-school,Married,Sales,White,Male,30,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,43.0,Self-Employed,Prof-school,Married,Professional,White,Male,30.0,1
0,36.0,Self-Employed,Doctorate,Married,Professional,White,Male,35.0,1
0,36.0,Private,Bachelors,Married,Sales,White,Male,35.0,1
0,34.0,Self-Employed,Prof-school,Married,Professional,White,Male,35.0,1
0,36.0,Private,Assoc,Married,Professional,White,Female,30.0,1
0,35.0,Private,Bachelors,Married,Sales,White,Female,30.0,1
0,39.0,Private,Bachelors,Married,Sales,White,Female,30.0,1
0,40.0,Private,Prof-school,Married,Professional,White,Female,30.0,1
0,40.0,Self-Employed,Masters,Married,White-Collar,White,Female,30.0,1
0,34.0,Self-Employed,Doctorate,Married,Professional,White,Male,32.0,1
