In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
%load_ext autoreload
%autoreload 2

In [2]:
# Load the training CSV, discard three columns, drop every row that still has a
# missing value, and store the label column as text.
# Row labels from the CSV are kept (no index reset); a later cell looks a row up
# by label with dataset.loc[1].
dataset = (
    pd.read_csv("../../ExampleData/aug_train.csv")
    .drop(columns=["gender", "enrollee_id", "city"])
    .dropna()
    .assign(target=lambda frame: frame["target"].astype(str))
)

In [3]:
# Separate the label from the features and hold out 20% of the rows for evaluation.
target = dataset["target"]
datasetX = dataset.drop("target", axis=1)
x_train, x_test, y_train, y_test = train_test_split(datasetX,
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)

# Numeric columns are standardised; every remaining column is one-hot encoded.
numerical = list(x_train.select_dtypes(include=[np.number]))
categorical = x_train.columns.difference(numerical)

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])

# The forest is seeded (like the split above) so the fitted model and the score
# printed below are the same on every Restart & Run All.
clf = Pipeline(steps=[('preprocessor', transformations),
                      ('classifier', RandomForestClassifier(random_state=0))])
model = clf.fit(x_train, y_train)

# The score is computed on the held-out split, so it is test accuracy,
# not training accuracy.
print("Test Accuracy :", model.score(x_test, y_test))

Training Accuracy : 0.8376565295169947


In [4]:
# Display the row whose index label is 1 (label-based lookup, not position),
# with all of its columns.
dataset.loc[1, :]

city_development_index                     0.776
relevent_experience       No relevent experience
enrolled_university                no_enrollment
education_level                         Graduate
major_discipline                            STEM
experience                                    15
company_size                               50-99
company_type                             Pvt Ltd
last_new_job                                  >4
training_hours                                47
target                                       0.0
Name: 1, dtype: object

In [57]:
from CFApi import CFGenerator

# Build the generator from the cleaned frame, the name of the label column and
# the fitted pipeline's predict function.
cf_generator = CFGenerator(dataset, "target", clf.predict)

# Per-feature settings.
# NOTE(review): feature_weight and is_excluded are defined in CFApi; presumably
# a higher weight makes changes to that feature less attractive and an excluded
# feature is never modified -- confirm against the CFApi implementation.
cf_generator.metadata["enrolled_university"].feature_weight = 10
cf_generator.metadata["experience"].is_excluded = True

# Generator-wide option; presumably an upper bound on search iterations -- confirm.
cf_generator.generator_options["max_iterations"] = 150

In [61]:
# Generate counterfactuals for the row labelled 1.
# NOTE(review): [0.9, 1.1] and 15 are passed positionally; presumably the
# desired target range and the number of counterfactuals requested -- confirm
# against the CFApi signature.
df = cf_generator.generate_counterfactuals(
    dataset.loc[1],
    [0.9, 1.1],
    15,
    multithreaded=True,
)
print(cf_generator.cf_found_percentage)
df

100.0


Unnamed: 0,city_development_index,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,0.776,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47.0,0.0
1,0.8,-,Full time course,-,-,-,10/49,Other,2,-,1.0
2,0.8,-,-,-,-,-,10/49,Other,1,-,1.0
3,-,-,-,-,Humanities,-,10/49,Early Stage Startup,-,-,1.0
4,0.7,-,-,-,Arts,-,10/49,-,1,-,1.0
5,0.8,-,-,-,-,-,10/49,-,-,-,1.0
6,-,-,-,-,Humanities,-,10/49,-,-,-,1.0
7,-,-,-,-,Humanities,-,10/49,Other,-,-,1.0
8,0.8,-,-,-,Humanities,-,10/49,-,-,-,1.0
9,0.7,-,-,-,Humanities,-,10/49,Other,1,-,1.0
