In [261]:
import dice_ml
import numpy
import pandas
import sklearn
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing

In [262]:
# Read the raw stroke CSV; the path is relative to the notebook's working directory.
path = "healthcare-dataset-stroke-data.csv"
healthcare_dataset = pandas.read_csv(filepath_or_buffer=path)

In [263]:
# Display the raw frame exactly as loaded from the CSV (the notebook renders a truncated view).
healthcare_dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [264]:
# Keep every row but only the columns used for modelling:
# six predictor columns plus the 'stroke' label.
simplified_dataset = healthcare_dataset.loc[
    :,
    [
        'age',
        'hypertension',
        'heart_disease',
        'avg_glucose_level',
        'bmi',
        'smoking_status',
        'stroke',
    ]
]



In [265]:
# Discard rows whose 'bmi' is missing.
# Rebinding the name (instead of inplace=True) keeps the cell a plain
# "input -> output" step: nothing is mutated behind the reader's back and
# re-running the cell yields the same frame.
simplified_dataset = simplified_dataset.dropna(subset=['bmi'])

In [266]:
# Collapse smoking_status into a binary flag:
#   1 = 'formerly smoked' or 'smokes', 0 = 'never smoked' or 'Unknown'.
# NOTE(review): 'Unknown' is folded into the same class as 'never smoked';
# confirm that treating missing smoking information as 0 is intended.
mapping = {'formerly smoked':1,
            'smokes':1,
            'never smoked': 0,
            'Unknown':0  }
# Series.map + astype replaces DataFrame.replace(..., inplace=True):
#  * no reliance on replace() implicitly downcasting the object column to
#    integers (deprecated behaviour in recent pandas releases);
#  * values that are not keys of `mapping` are passed through unchanged, so
#    re-running the cell on the already-encoded 0/1 column is a no-op;
#  * astype('int64') fails loudly if an unexpected category (or NaN) is
#    present instead of silently leaving strings in a model feature.
simplified_dataset = simplified_dataset.assign(
    smoking_status=simplified_dataset['smoking_status']
    .map(lambda status: mapping.get(status, status))
    .astype('int64')
)

In [267]:
# Sanity check: list the distinct values left in smoking_status after the recoding.
simplified_dataset['smoking_status'].unique()

array([1, 0], dtype=int64)

In [268]:
# Display the cleaned modelling frame (selected columns, bmi NaNs dropped, smoking_status recoded).
simplified_dataset

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,67.0,0,1,228.69,36.6,1,1
2,80.0,0,1,105.92,32.5,0,1
3,49.0,0,0,171.23,34.4,1,1
4,79.0,1,0,174.12,24.0,0,1
5,81.0,0,0,186.21,29.0,1,1
...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0,0
5106,81.0,0,0,125.20,40.0,0,0
5107,35.0,0,0,82.99,30.6,0,0
5108,51.0,0,0,166.29,25.6,1,0


In [269]:
# Split the modelling frame into the feature matrix and the label vector.
dataset_X = simplified_dataset.drop(columns=['stroke'])
target = simplified_dataset['stroke']

In [270]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so that it is the same on every run.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    dataset_X,
    target,
    stratify=target,
    random_state=0,
    test_size=0.2,
)

In [271]:
# Names of the continuous features. Keep this a list (not a tuple or a
# single string): it is handed to dice_ml.Data as `continuous_features`.
continuous_feat = ['age', 'avg_glucose_level', 'bmi']

# One-step pipeline that standardises whatever columns it is applied to.
numeric_transformer = sklearn.pipeline.Pipeline(
    steps=[
        ('scaler', sklearn.preprocessing.StandardScaler()),
    ]
)


In [272]:
# Disabled: column-wise preprocessing that would apply numeric_transformer to the continuous features only.
# NOTE(review): no `remainder` argument is given, so if re-enabled as written the columns not listed in
# continuous_feat would presumably be dropped (ColumnTransformer's documented default is remainder='drop') -- confirm.
#transformation = sklearn.compose.ColumnTransformer(transformers=[('num',numeric_transformer,continuous_feat)])

In [273]:
#transformation

In [274]:
# Standardise every feature column, then fit a random forest on the
# training split.
# random_state=0 (the same seed used for train_test_split) pins the forest's
# bootstrap / feature sampling, so the score and the counterfactuals computed
# from this model are reproducible on Restart & Run All. Without it every
# run trains a slightly different forest.
clf = sklearn.pipeline.Pipeline(steps=[
    ('preprocessor', sklearn.preprocessing.StandardScaler()),
    ('classifier', sklearn.ensemble.RandomForestClassifier(random_state=0)),
])
model = clf.fit(x_train, y_train)

In [277]:
# Predict the class label for every row of the held-out split.
y_pred = model.predict(X=x_test)


In [278]:
# Fraction of held-out rows whose predicted label equals the true label.
# NOTE(review): if 'stroke' is a rare class, plain accuracy can look high for
# a model that almost never predicts it -- consider also checking recall.
score = sklearn.metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

In [279]:
# Show the hold-out accuracy computed in the previous cell.
score

0.955193482688391

In [291]:
 import dice_ml

In [293]:
# Describe the dataset to DiCE: the full cleaned frame (features + label),
# which columns are continuous, and which column is the outcome.
data_dice = dice_ml.Data(
    dataframe=simplified_dataset,
    continuous_features=continuous_feat,
    outcome_name='stroke',
)

In [294]:
# Wrap the fitted scikit-learn pipeline so DiCE can query it.
model_dice = dice_ml.Model(
    backend='sklearn',
    model=model,
)

In [295]:
# Build the counterfactual explainer from the data and model wrappers,
# using DiCE's 'random' generation method.
explainer = dice_ml.Dice(
    data_dice,
    model_dice,
    method='random',
)

In [296]:
# Query instance: the first row of the held-out features, kept as a
# one-row DataFrame (not a Series).
input_datapoint = x_test.iloc[:1]

In [297]:
# Ask for 10 counterfactuals that flip the model's prediction for the query
# instance. The explainer was built with method='random', i.e. candidates are
# drawn by random sampling; random_seed pins that sampling so the same
# counterfactual set is produced on every run.
counterfactual = explainer.generate_counterfactuals(
    input_datapoint,
    total_CFs=10,
    desired_class='opposite',
    random_seed=0,
)

100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


In [298]:
# Render the query instance followed by the generated counterfactuals,
# showing every column value in full.
# NOTE(review): the recorded output of this cell lists some rows whose
# 'stroke' value is still 0 -- check that each returned row really flips
# the prediction before interpreting it.
counterfactual.visualize_as_dataframe(show_only_changes=False)

Query instance (original outcome : 0)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,78.0,0,0,55.32,29.6,1,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,78.0,0.0,1.0,270.17,30.0,1.0,1
1,78.0,1.0,0.0,56.01,26.6,1.0,1
2,78.0,0.0,1.0,244.77,30.0,1.0,1
3,78.0,1.0,0.0,56.01,21.7,1.0,1
4,78.0,0.0,1.0,270.73,30.0,1.0,1
5,78.0,1.0,0.0,56.01,16.4,1.0,0
6,78.0,0.0,1.0,262.01,30.0,1.0,1
7,78.0,1.0,0.0,56.01,15.9,1.0,0
8,78.0,0.0,1.0,248.74,30.0,1.0,1
9,78.0,1.0,0.0,56.01,21.0,1.0,1


In [301]:
# Create a range of feasible counterfactuals:
#  * only the features listed in features_to_vary may change
#    (age and smoking_status stay fixed);
#  * avg_glucose_level and bmi are additionally restricted to the
#    [low, high] ranges given in permited_ranges.

features_to_vary = ['avg_glucose_level','bmi','heart_disease','hypertension']
permited_ranges = {'avg_glucose_level':[50,250],
                   'bmi':[18,35]}
# random_seed pins the explainer's random sampling (method='random') so the
# constrained counterfactual set is reproducible across runs.
counterfactual_2 = explainer.generate_counterfactuals(input_datapoint, total_CFs=10, desired_class = 'opposite',
                                                     permitted_range = permited_ranges, features_to_vary = features_to_vary,
                                                     random_seed = 0)

100%|██████████| 1/1 [00:04<00:00,  4.46s/it]


In [302]:
# Render the constrained counterfactuals; with show_only_changes=True a
# value identical to the query instance is printed as '-'.
# NOTE(review): a '-' in the 'stroke' column therefore means the outcome did
# not change for that row -- the recorded output contains such rows.
counterfactual_2.visualize_as_dataframe(
    display_sparse_df=True,
    show_only_changes=True,
)

Query instance (original outcome : 0)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,78.0,0,0,55.32,29.6,1,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status,stroke
0,-,1,-,56.00999999999986,21.8,-,1
1,-,1,-,56.00999999999986,21.1,-,1
2,-,1,-,56.00999999999986,29.0,-,-
3,-,1,-,56.00999999999986,22.3,-,1
4,-,1,-,56.00999999999986,21.2,-,1
5,-,1,-,56.00999999999986,19.0,-,1
6,-,1,-,56.00999999999986,29.0,-,-
7,-,1,-,56.00999999999986,25.5,-,1
8,-,1,-,56.00999999999986,29.0,-,-
9,-,1,-,56.00999999999986,20.9,-,1


In [290]:
# Display the feature matrix (all modelling columns except 'stroke').
# NOTE(review): this cell carries execution count 290 but sits after cell 302,
# i.e. it was run out of order -- re-check it under Restart & Run All.
dataset_X

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,smoking_status
0,67.0,0,1,228.69,36.6,1
2,80.0,0,1,105.92,32.5,0
3,49.0,0,0,171.23,34.4,1
4,79.0,1,0,174.12,24.0,0
5,81.0,0,0,186.21,29.0,1
...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,0
5106,81.0,0,0,125.20,40.0,0
5107,35.0,0,0,82.99,30.6,0
5108,51.0,0,0,166.29,25.6,1
