In [1]:
import dice_ml
from dice_ml.utils import helpers  # helper functions

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

In [11]:
#preprocessing
dataset =  pd.read_csv("data/train.csv")
#enrolle_id is just database numbering. city is basically same as city_development index. gender can'T be changed so shouldnt be included in the model
dataset.drop(["employee_id","region","KPIs_met >80%"],axis = 1, inplace=True)
#dice and sklearn can't handle missing data. easiest way to circumvent is to drop all missing data. 
dataset.dropna(inplace=True)
def awards_won(x):
    if x == 0:
        return "no"
    else:
        return "yes"
def education(x):
    if x.startswith("Be"):
        return 0
    if x.startswith("Ba"):
        return 2
    if x.startswith("M"):
        return 3

dataset["awards_won?"]=dataset["awards_won?"].apply(awards_won)
#dataset["education"]=dataset["education"].apply(education)

dataset.dtypes
target = dataset["is_promoted"]
#split data into train and test
datasetX = dataset.drop("is_promoted", axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    shuffle=False)

x_train.head()

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score
0,Sales & Marketing,Master's & above,f,sourcing,1,35,5.0,8,no,49
1,Operations,Bachelor's,m,other,1,30,5.0,4,no,60
2,Sales & Marketing,Bachelor's,m,sourcing,1,34,3.0,7,no,50
3,Sales & Marketing,Bachelor's,m,other,2,39,1.0,10,no,50
4,Technology,Bachelor's,m,other,1,45,3.0,2,no,73


In [3]:
#create a pipeline which converts 

numerical = ['no_of_trainings','age','avg_training_score',
            'previous_year_rating','length_of_service']
categorical = x_train.columns.difference(numerical)

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

In [4]:
#model 
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier())])
model = clf.fit(x_train, y_train)
print("Training Accuracy :", model.score(x_test, y_test))

Training Accuracy : 0.9317714755445952


In [12]:
#data for dice. all continuous features need to be listed. rest is assumed to be categorical
d = dice_ml.Data(dataframe=dataset, continuous_features=['no_of_trainings', 
                                              'age','avg_training_score',
                                              'previous_year_rating','length_of_service'], outcome_name='is_promoted')
backend = 'sklearn'

#dice machine learning model instance
m = dice_ml.Model(model=model, backend=backend)

#cf instance
query_instances = x_train[0:1]
query_instances

Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score
0,Sales & Marketing,Master's & above,f,sourcing,1,35,5.0,8,no,49


In [13]:
from GeneticExtension import GeneticExtension
exp_genetic = GeneticExtension(d, m)


feature_weights = {'education': 1}
dice_exp_genetic = exp_genetic.generate_counterfactuals(query_instances, total_CFs=3, desired_class="opposite",proximity_weight=1.5, diversity_weight=1.0,feature_weights=feature_weights)


dice_exp_genetic.visualize_as_dataframe(show_only_changes=True)

100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]

Query instance (original outcome : 0)





Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,Master's & above,f,sourcing,1.0,35.0,5.0,8.0,no,49.0,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,department,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,-,Bachelor's,m,-,-,-,4.0,-,-,-,1
0,-,-,m,other,-,-,-,-,-,48.0,1
0,-,-,-,other,-,37.0,-,-,-,50.0,1


In [14]:
mads = d.get_mads(normalized=True)
#print(mads)
feature_weights = {}
for feature in mads:
    feature_weights[feature] = round(1/mads[feature], 2)
#feature_weights["training_hours"] = 5
feature_weights

  feature_weights[feature] = round(1/mads[feature], 2)


{'no_of_trainings': inf,
 'age': 10.0,
 'avg_training_score': 6.0,
 'previous_year_rating': 4.0,
 'length_of_service': 18.0}