<a href="https://colab.research.google.com/github/danielsaggau/AI-labour/blob/master/XAI/Counterfactual_getstarted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install dice-ml

Collecting dice-ml
  Downloading dice_ml-0.7.2-py3-none-any.whl (242 kB)
[?25l[K     |█▍                              | 10 kB 26.4 MB/s eta 0:00:01[K     |██▊                             | 20 kB 30.8 MB/s eta 0:00:01[K     |████                            | 30 kB 35.2 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 36.7 MB/s eta 0:00:01[K     |██████▊                         | 51 kB 39.3 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 33.0 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 28.5 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 28.8 MB/s eta 0:00:01[K     |████████████▏                   | 92 kB 30.5 MB/s eta 0:00:01[K     |█████████████▌                  | 102 kB 30.1 MB/s eta 0:00:01[K     |██████████████▉                 | 112 kB 30.1 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 30.1 MB/s eta 0:00:01[K     |█████████████████▋              | 133 kB 30.1 MB/s eta 0

In [10]:
!git clone https://github.com/interpretml/DiCE.git

Cloning into 'DiCE'...
remote: Enumerating objects: 4002, done.[K
remote: Counting objects: 100% (1492/1492), done.[K
remote: Compressing objects: 100% (751/751), done.[K
remote: Total 4002 (delta 1026), reused 1063 (delta 700), pack-reused 2510[K
Receiving objects: 100% (4002/4002), 9.27 MiB | 12.07 MiB/s, done.
Resolving deltas: 100% (2791/2791), done.


In [13]:
%cd '/content/DiCE'
!pip install -e .

/content/DiCE
Obtaining file:///content/DiCE
Installing collected packages: dice-ml
  Attempting uninstall: dice-ml
    Found existing installation: dice-ml 0.7.2
    Uninstalling dice-ml-0.7.2:
      Successfully uninstalled dice-ml-0.7.2
  Running setup.py develop for dice-ml
Successfully installed dice-ml-0.7.2


In [1]:
# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Tensorflow import
import tensorflow as tf

# DiCE imports
import dice_ml
from dice_ml.utils import helpers  # helper functions

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
dataset = helpers.load_adult_income_dataset()

In [21]:
# description of transformed features
adult_info = helpers.get_adult_data_info()
adult_info

{'age': 'age',
 'education': 'education level (Assoc, Bachelors, Doctorate, HS-grad, Masters, Prof-school, School, Some-college)',
 'gender': 'male or female?',
 'hours_per_week': 'total work hours per week',
 'income': '0 (<=50K) vs 1 (>50K)',
 'marital_status': 'marital status (Divorced, Married, Separated, Single, Widowed)',
 'occupation': 'occupation (Blue-Collar, Other/Unknown, Professional, Sales, Service, White-Collar)',
 'race': 'white or other race?',
 'workclass': 'type of industry (Government, Other/Unknown, Private, Self-Employed)'}

In [22]:
target = dataset["income"]
train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)
x_train = train_dataset.drop('income', axis=1)
x_test = test_dataset.drop('income', axis=1)

In [23]:
# Step 1: dice_ml.Data
d = dice_ml.Data(dataframe=train_dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income')

In [24]:
numerical = ["age", "hours_per_week"]
categorical = x_train.columns.difference(numerical)

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier())])
model = clf.fit(x_train, y_train)

In [25]:
# Using sklearn backend
m = dice_ml.Model(model=model, backend="sklearn")
# Using method=random for generating CFs
exp = dice_ml.Dice(d, m, method="random")

In [26]:
e1 = exp.generate_counterfactuals(x_test[0:1], total_CFs=2, desired_class="opposite")
e1.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  2.16it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Doctorate,-,-,-,Male,-,1
1,-,Government,Doctorate,-,-,-,-,-,1


In [27]:
e1.visualize_as_dataframe(show_only_changes=False)

Query instance (original outcome : 0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,Doctorate,Married,Blue-Collar,White,Male,38,1
1,29,Government,Doctorate,Married,Blue-Collar,White,Female,38,1


In [28]:
# Changing only age and education
e2 = exp.generate_counterfactuals(x_test[0:1],
                                  total_CFs=2,
                                  desired_class="opposite",
                                  features_to_vary=["education", "occupation"]
                                  )
e2.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  2.48it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Assoc,-,Service,-,-,-,1
1,-,-,Prof-school,-,White-Collar,-,-,-,1


In [29]:
# Restricting age to be between [20,30] and Education to be either {'Doctorate', 'Prof-school'}.
e3 = exp.generate_counterfactuals(x_test[0:1],
                                  total_CFs=2,
                                  desired_class="opposite",
                                  permitted_range={'age': [20, 30], 'education': ['Doctorate', 'Prof-school']})
e3.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  2.53it/s]

Query instance (original outcome : 0)





Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,29,Private,HS-grad,Married,Blue-Collar,White,Female,38,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,-,-,Prof-school,-,White-Collar,-,-,-,1
1,-,Government,Doctorate,-,-,-,-,-,1


In [30]:
query_instance = x_test[0:1]
imp = exp.local_feature_importance(query_instance, total_CFs=10)
print(imp.local_importance)

100%|██████████| 1/1 [00:01<00:00,  1.00s/it]

[{'education': 0.7, 'occupation': 0.6, 'workclass': 0.5, 'marital_status': 0.1, 'gender': 0.1, 'race': 0.0, 'age': 0.0, 'hours_per_week': 0.0}]





In [31]:
query_instances = x_test[0:20]
imp = exp.global_feature_importance(query_instances)
print(imp.summary_importance)

100%|██████████| 20/20 [00:13<00:00,  1.49it/s]

{'education': 0.645, 'occupation': 0.31, 'marital_status': 0.295, 'age': 0.22, 'workclass': 0.205, 'hours_per_week': 0.195, 'race': 0.095, 'gender': 0.08}





In [32]:
# supress deprecation warnings from TF
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

backend = 'TF'+tf.__version__[0]  # TF1
ML_modelpath = helpers.get_adult_income_modelpath(backend=backend)
# Step 2: dice_ml.Model
m = dice_ml.Model(model_path=ML_modelpath, backend=backend)

In [33]:
# Step 3: initiate DiCE
exp = dice_ml.Dice(d, m)

In [34]:
# query instance in the form of a dictionary or a dataframe; keys: feature name, values: feature value
query_instance = {'age': 22,
                  'workclass': 'Private',
                  'education': 'HS-grad',
                  'marital_status': 'Single',
                  'occupation': 'Service',
                  'race': 'White',
                  'gender': 'Female',
                  'hours_per_week': 45}

In [35]:
# generate counterfactuals
dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=4, desired_class="opposite")

Diverse Counterfactuals found! total time taken: 00 min 49 sec


In [36]:
# visualize the result, highlight only the changes
dice_exp.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome : 0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,22.0,Private,HS-grad,Single,Service,White,Female,45.0,0.019



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,70.0,-,Masters,-,White-Collar,-,-,51.0,0
1,-,Self-Employed,Doctorate,Married,-,-,-,-,0
2,47.0,-,-,Married,-,-,-,-,0
3,36.0,-,Prof-school,Married,-,-,-,62.0,0


# Variational Autoencoder


In [4]:
d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'],
                 outcome_name='income', data_name='adult', test_size=0.1)

In [5]:
backend = {'model': 'pytorch_model.PyTorchModel',
           'explainer': 'feasible_base_vae.FeasibleBaseVAE'}
ML_modelpath = helpers.get_adult_income_modelpath(backend='PYT')
ML_modelpath = ML_modelpath[:-4] + '_2nodes.pth'
m = dice_ml.Model(model_path=ML_modelpath, backend=backend)
m.load_model()
print('ML Model', m.model)

  backend, ','.join(BackEndTypes.ALL))


ML Model Sequential(
  (0): Linear(in_features=29, out_features=20, bias=True)
  (1): ReLU()
  (2): Linear(in_features=20, out_features=2, bias=True)
  (3): Softmax(dim=None)
)




In [9]:
# query instance in the form of a dictionary; keys: feature name, values: feature value
query_instance = {'age': 41,
                  'workclass': 'Private',
                  'education': 'HS-grad',
                  'marital_status': 'Single',
                  'occupation': 'Service',
                  'race': 'White',
                  'gender': 'Female',
                  'hours_per_week': 45}

In [7]:
# initiate DiCE
exp = dice_ml.Dice(d, m, encoded_size=10, lr=1e-2,
                   batch_size=2048, validity_reg=42.0, margin=0.165, epochs=25,
                   wm1=1e-2, wm2=1e-2, wm3=1e-2)
#exp.train(pre_trained=1)

Dataset Shape: (26048, 30)
Datasets Columns: Index(['age', 'hours_per_week', 'workclass_Government',
       'workclass_Other/Unknown', 'workclass_Private',
       'workclass_Self-Employed', 'education_Assoc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Prof-school', 'education_School', 'education_Some-college',
       'marital_status_Divorced', 'marital_status_Married',
       'marital_status_Separated', 'marital_status_Single',
       'marital_status_Widowed', 'occupation_Blue-Collar',
       'occupation_Other/Unknown', 'occupation_Professional',
       'occupation_Sales', 'occupation_Service', 'occupation_White-Collar',
       'race_Other', 'race_White', 'gender_Female', 'gender_Male', 'income'],
      dtype='object')


In [10]:
# generate counterfactuals
dice_exp = exp.generate_counterfactuals(query_instance, total_CFs=5, desired_class="opposite")
# visualize the results
dice_exp.visualize_as_dataframe(show_only_changes=True)

FileNotFoundError: ignored

# Advancements