# Counterfactual Explanation for the Feature Engineering Data-Domain (FEDD)

In [None]:
# import libraries
from collections import namedtuple
from lightgbm import LGBMRanker
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn
from findhr.preprocess.example_mappings import RelevantExperienceForRole, ExtractMonthDurationJob, MatchOrdinal, ExtractListOfProperty, MatchFeatureAtLeastInList, MatchFeatureSet, MatchBinary

from findhr.xai.counterfactual import dice_ml
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

np.random.seed(42)

# suppress warnings
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

### Define Helper Classes and Functions

In [None]:
%run ./Example_InputDataSources.ipynb

In [None]:
# Preview the first rows of the joined DataFrame `df_all`.
# NOTE(review): `df_all` is not defined in this notebook; presumably it is
# created by the `%run ./Example_InputDataSources.ipynb` cell - confirm.
df_all.head()

In [None]:
# Display the joined metadata `md_all`, which is later attached to the data
# by the `AttachMetadata` step of the fitness pipeline.
# NOTE(review): `md_all` is not defined in this notebook; presumably it is
# created by the `%run ./Example_InputDataSources.ipynb` cell - confirm.
md_all

In [None]:
# Define helper function
def rank2relevance(df, top_k, col_rank):
    """Turn rank positions into "higher is better" relevance labels.

    The relevance of a row is ``top_k + 1 - rank``: rank 1 maps to ``top_k``
    and rank ``top_k + 1`` maps to 0.

    Args:
        df: DataFrame holding the rank column.
        top_k: Number of top positions; sets the relevance scale.
        col_rank: Label (or list of labels) of the rank column in ``df``.

    Returns:
        A flat NumPy array with one relevance value per selected rank value.
    """
    flat_ranks = df[col_rank].values.ravel()
    best_relevance_plus_one = top_k + 1
    return best_relevance_plus_one - flat_ranks

### Define the Preprocessing pipeline

In [None]:
# Column subsets of the joined candidate/job dataset (HUDD).

# Identifier columns (candidate id, job id) and the target column.
cols_id = ['id_c', 'id_j']
col_target = ['score']

# Full list of columns of the joined dataset, kept for reference.
all_cols = [
    'id_c', 'education_background_c', 'professional_experience_c',
    'skills_c', 'gender_c', 'agg_perceived_foreign_c',
    'id_j', 'education_reqs_j', 'experience_reqs_role_j',
    'experience_reqs_duration_j', 'skills_j', 'gender_j',
    'agg_perceived_foreign_j',
    'ranking', 'shortlisted', 'score',
]

# Columns describing the candidate (suffix `_c`) and the job (suffix `_j`);
# together they are the input of the preprocessing + prediction pipeline.
cols_c = [
    'education_background_c',
    'professional_experience_c',
    'skills_c',
    'gender_c',
    'agg_perceived_foreign_c',
]
cols_j = [
    'education_reqs_j',
    'experience_reqs_role_j',
    'experience_reqs_duration_j',
    'skills_j',
    'gender_j',
    'agg_perceived_foreign_j',
]
cols_pred_preprocess = [*cols_c, *cols_j]

# Columns that must not be used as predictors, and the sensitive attributes.
cols_not_for_pred = ['id_c', 'id_j', 'ranking', 'shortlisted']
cols_sensitive = ['gender_c', 'agg_perceived_foreign_c']

# Reorder the joined data: ids first, target last, everything else in between
# in its original order.
df_CDS_JDS = df_all[[
    *cols_id,
    *(col for col in df_all if col not in cols_id + col_target),
    *col_target,
]]

# Bundle of the column subsets handed to build_fitness_matrix().
cols_dict_HUDD = dict(
    cols_pred_preprocess=cols_pred_preprocess,
    cols_sensitive=cols_sensitive,
    cols_id=cols_id,
    cols_not_for_pred=cols_not_for_pred,
    col_target=col_target,
)



In [None]:
def build_fitness_matrix(df_CDS_JDS, cols_dict):
    """Compute the candidate/job fitness features and the capped target rank.

    A scikit-learn ``Pipeline`` attaches the metadata ``md_all``, derives
    intermediate columns, computes one ``fitness_*`` column per matching rule
    and detaches the metadata again. The pipeline is fitted and applied on
    ``df_CDS_JDS`` itself.

    Reads the notebook-level globals ``md_all`` and ``TOP_K``.

    Args:
        df_CDS_JDS: Joined candidate/job DataFrame with the raw ``*_c`` and
            ``*_j`` columns, the ids and the ``score`` column.
        cols_dict: Dict providing the lists ``'cols_id'``, ``'cols_sensitive'``
            and ``'col_target'`` used to select the output columns.

    Returns:
        Tuple ``(pipeline_fitness, df_fitness_mat)``: the fitted pipeline and
        a DataFrame with the id, ``fitness_*``, sensitive and target columns
        plus a ``'rank'`` column (dense rank of ``score`` within each job,
        where every rank beyond ``TOP_K`` is set to ``TOP_K + 1``).
    """
    # Stage 1: candidate experience that is relevant for the requested role.
    role_experience_maps = {
        (('professional_experience_c', 'experience_reqs_role_j',), ('relevant_exp_role_c',)): RelevantExperienceForRole(),
    }

    # Stage 2: features extracted from the stage-1 output and from the
    # education background.
    # NOTE(review): 'degree_list_c' is derived here but none of the matching
    # rules below reads it - confirm whether 'fitness_education' should use it.
    extracted_feature_maps = {
        (('relevant_exp_role_c',), ('role_duration_months_c',)): ExtractMonthDurationJob(duration_key='duration_months'),
        (('education_background_c',), ('degree_list_c',)): ExtractListOfProperty(property_key='degree'),
    }

    # Stage 3: fitness of the candidate's features w.r.t. the job requirements.
    matching_maps = {
        (('experience_reqs_duration_j', 'role_duration_months_c'), ('fitness_experience',)): MatchOrdinal(),
        (('education_reqs_j', 'education_background_c'), ('fitness_education',)): MatchFeatureAtLeastInList(),
        (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
        (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
        (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'), ('fitness_foreign',)): MatchBinary(),
    }

    # The computation is expressed as a fit/transform preprocessing pipeline.
    pipeline_steps = [
        ("init", AttachMetadata(md_all)),
        ('mapping_1', DerivedColumn(role_experience_maps)),
        ('mapping_2', DerivedColumn(extracted_feature_maps)),
        ("matching", DerivedColumn(matching_maps)),
        ("end", DetachMetadata()),
    ]
    pipeline_fitness = Pipeline(steps=pipeline_steps)
    pipeline_fitness.fit(X=df_CDS_JDS)
    fitness_matrix = pipeline_fitness.transform(X=df_CDS_JDS)

    # Keep ids, fitness features, sensitive attributes and the target only.
    fitness_cols = [col for col in fitness_matrix if col.startswith('fitness_')]
    columns_keep = (cols_dict['cols_id'] + fitness_cols
                    + cols_dict['cols_sensitive'] + cols_dict['col_target'])
    df_fitness_mat = fitness_matrix.copy(deep=True)[columns_keep]

    # Scores can feed a regressor; here they are turned into per-job ranks for
    # a ranking model. Positions beyond TOP_K collapse into TOP_K + 1.
    dense_rank = df_fitness_mat.groupby("id_j")['score'].rank('dense', ascending=False)
    df_fitness_mat['rank'] = dense_rank.apply(lambda pos: pos if pos <= TOP_K else TOP_K + 1)

    return pipeline_fitness, df_fitness_mat

### Define the ranking model
To better understand the LambdaRank framework, we refer the reader to the relevant literature (e.g., [Burges, 2010, From RankNet to LambdaRank to LambdaMART: An Overview](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf)).

In [None]:
def data_split_FEDD(df_fitness_mat):
    """Split the fitness matrix into train, validation and test sets by job.

    For this example all three sets contain every job, i.e. they hold the
    same rows. A real split would partition the job ids instead (e.g. with
    scikit-learn's ``train_test_split`` on the unique ``id_j`` values).

    Args:
        df_fitness_mat: DataFrame with at least the ``id_j`` and ``id_c``
            columns.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of three separate DataFrames,
        each sorted by ``id_j`` and then ``id_c``.
    """
    all_jobs = df_fitness_mat['id_j'].unique()
    job_ids_per_split = {'train': all_jobs, 'val': all_jobs, 'test': all_jobs}

    def _rows_for(job_ids):
        # Select the rows of the given jobs, ordered by job and candidate.
        in_split = df_fitness_mat['id_j'].isin(job_ids)
        return df_fitness_mat[in_split].sort_values(["id_j", "id_c"])

    # Each call builds its own DataFrame, so the three sets never share an
    # object even though they currently hold the same rows.
    return tuple(_rows_for(job_ids_per_split[name]) for name in ('train', 'val', 'test'))


def init(df_fitness_mat):
    """Prepare the data splits, the column dictionary and an unfitted ranker.

    Args:
        df_fitness_mat: Fitness matrix to be split with ``data_split_FEDD``.

    Returns:
        Tuple ``(ranker, df_train, df_val, df_test, cols_dict_FEDD)`` where
        ``ranker`` is an unfitted ``LGBMRanker`` with the LambdaRank objective
        and ``cols_dict_FEDD`` names the id, predictive, sensitive, target and
        rank columns of the fitness matrix.
    """
    df_train, df_val, df_test = data_split_FEDD(df_fitness_mat)

    # Column roles in the feature-engineering data domain (FEDD).
    cols_dict_FEDD = dict(
        cols_id=['id_j', 'id_c'],  # ids
        cols_pred=sorted((  # predictive features, in alphabetical order
            'fitness_experience',
            'fitness_education',
            'fitness_skills',
            'fitness_gender',
            'fitness_foreign',
        )),
        cols_sensitive=['gender_c'],  # sensitive attribute(s)
        col_target='score',  # target value for ranking
        col_rank='rank',  # rank value for ranking
    )

    # Hyper-parameters of the ranking model.
    ranker_params = dict(
        objective="lambdarank",
        class_weight="balanced",
        boosting_type="gbdt",
        importance_type="gain",
        learning_rate=0.01,
        n_estimators=10,
        force_row_wise=True,
        n_jobs=-1,  # max parallelism
        verbose=-1,  # no verbosity
    )
    ranker = LGBMRanker(**ranker_params)

    return ranker, df_train, df_val, df_test, cols_dict_FEDD


def train(ranker, df_train, df_val, cols_dict):
    """Fit the ranker on the training set, evaluating on the validation set.

    Reads the notebook-level global ``TOP_K``.

    Args:
        ranker: Ranking model exposing a LightGBM-style ``fit``.
        df_train: Training rows; must contain ``id_j``, the predictive
            columns and the rank column.
        df_val: Validation rows with the same columns.
        cols_dict: Dict providing ``'cols_pred'`` and ``'col_rank'``.

    Returns:
        The same ``ranker`` object after fitting.
    """
    feature_cols = cols_dict['cols_pred']
    rank_col = cols_dict['col_rank']

    def _rows_per_job(frame):
        # Number of rows per job, in the sorted order of the job ids.
        # NOTE(review): this matches the row layout only if `frame` is ordered
        # by id_j, as data_split_FEDD() produces - confirm for other callers.
        return frame.groupby("id_j")["id_j"].count().to_numpy()

    train_group_sizes = _rows_per_job(df_train)
    val_group_sizes = _rows_per_job(df_val)

    # LightGBM expects relevance labels where higher is better, so the ranks
    # are converted before being used as targets.
    train_relevance = rank2relevance(df_train, TOP_K, rank_col)
    val_relevance = rank2relevance(df_val, TOP_K, rank_col)

    ranker.fit(
        X=df_train[feature_cols],
        y=train_relevance,
        group=train_group_sizes,
        eval_at=[TOP_K],
        eval_set=[(df_val[feature_cols], val_relevance)],
        eval_group=[val_group_sizes],
    )

    return ranker


def evaluate(ranker, df_test, cols_dict):
    """Score the test rows and derive their capped predicted rank.

    Two columns are written to ``df_test`` **in place**:

    * ``'lambda'``: the raw score returned by ``ranker.predict``;
    * ``'pred_rank'``: dense rank of ``'lambda'`` within each job (highest
      score first), where every rank beyond ``TOP_K`` is set to ``TOP_K + 1``.

    Reads the notebook-level global ``TOP_K``.

    Args:
        ranker: Fitted model exposing ``predict``.
        df_test: Test rows; must contain ``id_j`` and the predictive columns.
        cols_dict: Dict providing ``'cols_pred'``.

    Returns:
        The same ``df_test`` object, with the two columns added.
    """
    df_test['lambda'] = ranker.predict(df_test[cols_dict['cols_pred']])

    dense_rank = df_test.groupby("id_j")['lambda'].rank('dense', ascending=False)
    # Keep ranks up to TOP_K; everything else (including missing ranks)
    # collapses into the single "outside the top k" position TOP_K + 1.
    df_test['pred_rank'] = dense_rank.where(dense_rank <= TOP_K, TOP_K + 1)

    return df_test


def ranking_pipeline(df_fitness_mat):
    """Run the full ranking workflow: initialise, train and evaluate.

    Args:
        df_fitness_mat: Fitness matrix passed to ``init``.

    Returns:
        Tuple ``(ranker, df_test, cols_dict_FEDD)``: the fitted ranker, the
        test set as returned by ``evaluate`` and the FEDD column dictionary.
    """
    unfitted_ranker, df_train, df_val, df_holdout, cols_dict_FEDD = init(df_fitness_mat)
    fitted_ranker = train(unfitted_ranker, df_train, df_val, cols_dict_FEDD)
    scored_test = evaluate(fitted_ranker, df_holdout, cols_dict_FEDD)
    return fitted_ranker, scored_test, cols_dict_FEDD

### Prepare data and functions for the counterfactual explanation

In [None]:
def define_cols_dict():
    """Return the column roles used by the counterfactual explainer.

    Returns:
        Dict with the keys ``'outcome_name_col'`` (the ranker score column
        ``'lambda'``), ``'continuous_features'``, ``'categorical_features'``
        and ``'cols_pred'`` (all features); every list is sorted
        alphabetically.
    """
    continuous_features = sorted(('fitness_skills', 'fitness_gender', 'fitness_foreign'))
    categorical_features = sorted(('fitness_education', 'fitness_experience'))

    return dict(
        outcome_name_col='lambda',
        continuous_features=continuous_features,
        categorical_features=categorical_features,
        cols_pred=sorted(continuous_features + categorical_features),
    )


def extract_explicand_data_cf(id_j, candidate_position, df_test, ranker, cols_dict_FEDD, df_CDS_JDS):
    """Collect the data of one job and of the candidate to be explained.

    Args:
        id_j: Id of the job whose ranking is explained.
        candidate_position: Predicted rank (dense, uncapped) of the candidate
            to explain, i.e. the explicand.
        df_test: Fitness-feature rows; must contain ``id_j``, ``id_c`` and
            the predictive columns.
        ranker: Fitted model exposing ``predict``.
        cols_dict_FEDD: Dict providing ``'cols_pred'``.
        df_CDS_JDS: Joined candidate/job data with ``id_j`` and ``id_c``.

    Returns:
        Tuple ``(df_id_j_FEDD, df_id_j_HUDD, exp_c, cols_dict)``:
        the rows of ``df_test`` for the job with fresh ``'lambda'`` and
        ``'pred_rank'`` columns, the rows of ``df_CDS_JDS`` for the job, a
        dict ``{'id_c': ..., 'profile': ...}`` describing the explicand, and
        the column roles from ``define_cols_dict``.

    Raises:
        IndexError: If no candidate of the job has predicted rank
            ``candidate_position``.
    """
    # Rows of the requested job. The explicit copy guarantees that the
    # columns added below are written to an independent frame rather than to
    # a slice of df_test.
    df_id_j_FEDD = df_test[df_test['id_j'] == id_j].copy()

    # Ranker score and (uncapped) dense rank for all candidates of the job.
    df_id_j_FEDD['lambda'] = ranker.predict(df_id_j_FEDD[cols_dict_FEDD['cols_pred']])
    df_id_j_FEDD['pred_rank'] = df_id_j_FEDD.groupby("id_j")['lambda'].rank('dense', ascending=False)

    # Id of the explicand: the first candidate at the requested position.
    ids_at_position = df_id_j_FEDD.loc[df_id_j_FEDD['pred_rank'] == candidate_position, 'id_c']
    if ids_at_position.empty:
        raise IndexError(
            f'No candidate with predicted rank {candidate_position} for job {id_j}')
    exp_c_id_c = ids_at_position.iloc[0]

    # Profiles of the candidates applying for the job.
    df_id_j_HUDD = df_CDS_JDS[df_CDS_JDS['id_j'] == id_j]

    # Profile of the explicand for THIS job only: a candidate id may appear
    # under several jobs, and rows of other jobs must not be explained.
    exp_c_profile = df_id_j_HUDD[df_id_j_HUDD['id_c'] == exp_c_id_c]

    exp_c = {'id_c': exp_c_id_c, 'profile': exp_c_profile}
    cols_dict = define_cols_dict()

    return df_id_j_FEDD, df_id_j_HUDD, exp_c, cols_dict


def prepare_data_cf(df_id_j_FEDD, cols_dict):
    """Build the typed feature table consumed by the counterfactual explainer.

    Args:
        df_id_j_FEDD: Rows of one job with the feature and outcome columns.
        cols_dict: Dict providing ``'categorical_features'``,
            ``'continuous_features'``, ``'outcome_name_col'`` and
            ``'cols_pred'``.

    Returns:
        Tuple ``(df_id_j_FEDD_pre, feature_dtypes)``: a new DataFrame holding
        the categorical features cast to ``int``, the continuous features cast
        to ``float`` and the outcome column (in that column order), and a dict
        mapping each column of ``cols_dict['cols_pred']`` to its dtype in that
        DataFrame.
    """
    categorical = cols_dict['categorical_features']
    continuous = cols_dict['continuous_features']
    outcome = cols_dict['outcome_name_col']

    # Start from the integer-cast categorical features, then append the rest.
    typed = df_id_j_FEDD[categorical].astype('int')
    typed[continuous] = df_id_j_FEDD[continuous].astype('float')
    typed[outcome] = df_id_j_FEDD[outcome].copy(deep=True)

    feature_dtypes = {}
    for feature in typed[cols_dict['cols_pred']].columns:
        feature_dtypes[feature] = typed[feature].dtype

    return typed, feature_dtypes


def define_target(args, df_id_j_FEDD, target_rank=TOP_K):
    """Resolve the target of the counterfactual explanation.

    The target is given either as a rank (``args.target_rank``) or, when no
    rank is provided, as a score (``args.target_score``).

    Args:
        args: Object with a ``candidate_position`` attribute and, optionally,
            ``target_rank`` and ``target_score`` attributes. A missing
            attribute is treated like ``None``.
        df_id_j_FEDD: Rows of one job with ``'pred_rank'`` and ``'score'``.
        target_rank: Last position that still counts as "in the top k".

    Returns:
        Tuple ``(explicand_class, tgt_cf_rank, tgt_cf_score, tgt_cf_candidate)``.
        ``explicand_class`` is ``'in_top_k'`` or ``'out_top_k'``. With a target
        rank, the score is that of the first row at that rank and
        ``tgt_cf_candidate`` holds all rows at that rank; with a target score,
        rank and candidate are ``None``.

    Raises:
        IndexError: If a target rank is given but no row has that rank.
        ValueError: If neither a target rank nor a target score is given.
    """
    # 'in_top_k' or 'out_top_k' depending on the candidate position
    explicand_class = 'in_top_k' if args.candidate_position <= target_rank else 'out_top_k'

    # Tolerate argument holders that do not define every optional field.
    requested_rank = getattr(args, 'target_rank', None)
    requested_score = getattr(args, 'target_score', None)

    # Rank positions start at 1, so a falsy rank means "not provided".
    if requested_rank:
        tgt_cf_rank = requested_rank
        tgt_cf_candidate = df_id_j_FEDD[df_id_j_FEDD['pred_rank'] == tgt_cf_rank]
        if tgt_cf_candidate.empty:
            raise IndexError(f'No candidate with predicted rank {tgt_cf_rank}')
        tgt_cf_score = tgt_cf_candidate['score'].iloc[0]
    # A score of 0 is a legitimate target, hence the explicit None check.
    elif requested_score is not None:
        tgt_cf_rank = None
        tgt_cf_score = requested_score
        tgt_cf_candidate = None
    else:
        raise ValueError('Either target rank or target score must be provided')

    return explicand_class, tgt_cf_rank, tgt_cf_score, tgt_cf_candidate


def define_explainer_FEDD(ranker, df_id_j_FEDD_pre, cols_dict_cf, feature_dtypes, explanation_method, target_rank=TOP_K):
    """Build the DiCE data, model and explainer objects for the ranker.

    Args:
        ranker: Fitted ranking model wrapped by the DiCE model.
        df_id_j_FEDD_pre: Typed feature table including the outcome column.
        cols_dict_cf: Dict providing ``'cols_pred'``, ``'continuous_features'``,
            ``'categorical_features'`` and ``'outcome_name_col'``.
        feature_dtypes: Mapping from feature name to dtype, forwarded to the
            model backend as ``'features_dtype'``.
        explanation_method: Counterfactual generation method passed to
            ``dice_ml.Dice``.
        target_rank: Forwarded to the model backend as ``'top_k'``.

    Returns:
        Tuple ``(explainer, data_dice, model_dice)``.
    """
    outcome_col = cols_dict_cf['outcome_name_col']
    dice_columns = cols_dict_cf['cols_pred'] + [outcome_col]

    # Data interface: features plus outcome, with their declared roles.
    data_dice = dice_ml.Data(
        dataframe=df_id_j_FEDD_pre[dice_columns],
        continuous_features=cols_dict_cf['continuous_features'],
        categorical_features=cols_dict_cf['categorical_features'],
        outcome_name=outcome_col,
    )

    # Model interface: the ranker is exposed to DiCE as a regressor through
    # the custom LGBMRanker backend.
    backend = {
        'explainer': 'dice_xgboost.DiceGenetic',
        'model': "lgbmranker_model.LGBMRankerModel",
    }
    backend_kwargs = {'top_k': target_rank, 'features_dtype': feature_dtypes}
    model_dice = dice_ml.Model(
        model=ranker,
        backend=backend,
        model_type="regressor",
        kw_args=backend_kwargs,
    )

    explainer = dice_ml.Dice(data_dice, model_dice, method=explanation_method)

    return explainer, data_dice, model_dice


def get_explanations_FEDD(df_id_j_FEDD, exp_c, cols_dict_cf, explainer, target_rank=TOP_K):
    """Generate counterfactuals that lift the explicand to the target rank.

    The score (``'lambda'``) of the first row at ``target_rank`` is used as
    the lower bound of the desired outcome range; four counterfactuals are
    requested.

    Args:
        df_id_j_FEDD: Rows of one job with ``'pred_rank'`` and ``'lambda'``.
        exp_c: Dict whose ``'profile'`` entry holds the explicand's row(s).
        cols_dict_cf: Dict providing ``'cols_pred'``.
        explainer: Object exposing ``generate_counterfactuals``.
        target_rank: Rank position whose score must be reached.

    Returns:
        The object returned by ``explainer.generate_counterfactuals``.
    """
    # Predicted lambda of the candidate sitting at the target position.
    rows_at_target = df_id_j_FEDD[df_id_j_FEDD['pred_rank'] == target_rank]
    c_th_lambda = rows_at_target.iloc[0]['lambda']
    print(f'Predicted lambda for candidate in rank {target_rank}: {c_th_lambda}')

    query_instance = exp_c['profile'][cols_dict_cf['cols_pred']]
    # Look for alternative profiles whose lambda is at least c_th_lambda.
    lambda_range = [c_th_lambda, 100]

    return explainer.generate_counterfactuals(
        query_instance,
        total_CFs=4,
        desired_range=lambda_range,
        verbose=True,
    )

### Compute counterfactual explanation for an instance

In [None]:
# Id of the job for which the counterfactual explanation is generated.
# Valid values: 1-5.
id_j = 1

# Predicted rank position of the candidate whose prediction is explained.
candidate_position = 12

# Rank the candidate should reach (must be less than or equal to TOP_K).
# The target can alternatively be given as a score, see define_target().
target_rank = TOP_K

# Algorithm used to generate the counterfactual explanation:
# valid values are 'genetic' or 'random'.
explanation_method = 'genetic'


In [None]:
# Helper structure holding the request for the explanation: the rank to
# reach, the current position of the candidate to explain and, optionally,
# a target score that is used only when no target rank is given.
# The field names match the attributes read by define_target().
Args = namedtuple('Args', ['target_rank', 'candidate_position', 'target_score'])

args = Args(target_rank=target_rank,
            candidate_position=candidate_position,
            target_score=None)

In [None]:
# Run the two-stage pipeline:
# - first the fitness pipeline (raw candidate/job data -> fitness features),
# - then the ranking pipeline (fitness features -> fitted ranker and scored test set)
pipeline_fitness, df_fitness_mat = build_fitness_matrix(df_CDS_JDS, cols_dict_HUDD)
ranker, df_test, cols_dict_FEDD = ranking_pipeline(df_fitness_mat)

# Extract the data of the job and of the candidate to be explained (the explicand)
df_id_j_FEDD, df_id_j_HUDD, exp_c, cols_dict_cf = extract_explicand_data_cf(id_j, candidate_position, df_test, ranker, cols_dict_FEDD, df_CDS_JDS)

# Prepare the data for the explanations: cast the feature columns and record their data types
df_id_j_FEDD_pre, feature_dtypes = prepare_data_cf(df_id_j_FEDD, cols_dict_cf)

# Extract the target for the counterfactual explanation (i.e., the data of the candidate in the target position)
explicand_class, tgt_cf_rank, tgt_cf_score, tgt_cf_candidate = define_target(args, df_id_j_FEDD, target_rank=target_rank)

# Define the counterfactual explainer for the ranker
explainer, data_dice, model_dice = define_explainer_FEDD(ranker, df_id_j_FEDD_pre, cols_dict_cf, feature_dtypes, explanation_method, target_rank=target_rank)

# Generate the counterfactual explanations for the explicand
explanations_FEDD = get_explanations_FEDD(df_id_j_FEDD, exp_c, cols_dict_cf, explainer, target_rank=target_rank)


### Compare original instance with the obtained counterfactuals

In [None]:
# Show the query instance together with the generated counterfactuals.
# NOTE(review): `show_only_changes=True` presumably restricts the output to
# the feature values that differ from the query instance - confirm.
# Alternative access path, kept for reference:
# print(explanations_FEDD.cf_examples_list.visualize_as_dataframe(show_only_changes=True))
print(explanations_FEDD.visualize_as_dataframe(show_only_changes=True))


The query instance has an original predicted outcome (lambda) of about -0.11. The obtained counterfactuals have a greater lambda of 0.13.
The counterfactual explanations show that the candidate should improve their skills (from fitness_skills=0.2 to 1.0) to reach the selected top 10 candidates.