# Factual Explanation of LambdaMART
This notebook shows an example of how to use the APIs with the `LGBMRanker` (LambdaMART) model from the [LightGBM](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRanker.html) package and how to get a factual explanation of the model's rankings in the form of feature importance computed with RankingSHAP.

In [None]:
# Load and join raw data sources and their metadata.
# NOTE(review): the following cells use `df_all` and `md_all` without defining them in this notebook,
# so they are presumably created by the notebook executed here — confirm against Example_InputDataSources.ipynb.
%run Example_InputDataSources.ipynb

In [None]:
# Preview the first rows of the joined DataFrame.
df_all.head()

In [None]:
# Joined metadata. It is iterated below with `.items()`, and each entry exposes an `attr_type` attribute.
md_all

In [None]:
# Import libraries
from findhr.preprocess.example_mappings import RelevantExperienceForRole, ExtractMonthDurationJob, MatchOrdinal, ExtractListOfProperty, MatchFeatureAtLeastInList, MatchFeatureSet, MatchBinary
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn
from lightgbm import LGBMRanker
from sklearn.metrics import ndcg_score
import matplotlib.pyplot as plt
import numpy as np
# Importing libraries to avoid warnings at running time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Cast every column that the metadata declares as categorical to the pandas 'category' dtype.
cat_cols = []
for col_name, col_meta in md_all.items():
    if col_meta.attr_type == 'category':
        cat_cols.append(col_name)
df_all[cat_cols] = df_all[cat_cols].astype('category')
# Summary of the resulting DataFrame columns and dtypes.
df_all.info()

In [None]:
# Define ids, target feature(s), and predictive features.
# Both identifiers are excluded from the predictors: 'id_c' (candidate) and 'id_j' (job offer).
# The list previously named 'id_c' twice, which left 'id_j' among the predictive columns.
id_cols = ['id_c', 'id_j']
target_cols = ['score', 'ranking', 'shortlisted']
# Every remaining column of the joined DataFrame is a predictive feature.
pred_cols = df_all.columns.difference(target_cols + id_cols)

In [None]:
# For this example we assume that the training, validation and test sets coincide.
# Note that the three names are aliases of the same DataFrame object, not copies.
df_train = df_val = df_test = df_all


def query_group_sizes(df):
    """Return the number of rows per job offer (`id_j`), in order of first appearance.

    LightGBM reads the `group` array positionally: the first `group[0]` rows form the first
    query, the next `group[1]` rows the second one, and so on. The sizes must therefore follow
    the row order of `df` rather than the sorted order of the keys, hence `sort=False`.
    `observed=True` avoids zero-sized groups when `id_j` is categorical.

    The rows of each job offer are still required to be contiguous in `df`.
    """
    return df.groupby("id_j", sort=False, observed=True)["id_j"].count().to_numpy()


df_train_counts = query_group_sizes(df_train)
df_val_counts = query_group_sizes(df_val)
df_train_counts, df_val_counts



In [None]:
### Build the preprocessing and prediction pipelines

In [None]:
# Calculated features.
# Each mapping key appears to be a pair (input columns, output columns) and each value the
# transformation producing the outputs from the inputs — confirm against findhr's DerivedColumn.
# The derivation is split in two dictionaries because the second stage reads 'relevant_exp_role_c',
# which is the output named in the first stage; they are applied as two consecutive pipeline steps.
maps_derived_1 = {
    (('professional_experience_c', 'experience_reqs_role_j',), ('relevant_exp_role_c',)): RelevantExperienceForRole(),
}

maps_derived_2 = {
        (('relevant_exp_role_c',), ('role_duration_months_c',)): ExtractMonthDurationJob(duration_key='duration_months'),
        # NOTE(review): 'degree_list_c' is not referenced by any other mapping in this notebook;
        # 'fitness_education' below reads 'education_background_c' directly — confirm this is intended.
        (('education_background_c',), ('degree_list_c',)): ExtractListOfProperty(property_key='degree')
}

# Fitness features about the matching between candidate's features and job's requirements.
maps_matching = {
    (('experience_reqs_duration_j', 'role_duration_months_c'), ('fitness_experience',)): MatchOrdinal(),
    (('education_reqs_j', 'education_background_c'), ('fitness_education',)): MatchFeatureAtLeastInList(),
    (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
    (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
    (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'), ('fitness_foreign',)): MatchBinary()
}

# Helper variable for the fitness features.
# The order matters: the list is used as the numeric feature list of the column preprocessor and is
# later indexed by position to name the features in the explanation.
list_cols_fitness = ['fitness_experience', 'fitness_education', 'fitness_skills', 'fitness_gender', 'fitness_foreign']
maps_matching

In [None]:
# Scikit-learn transformations for numeric and categorical features.

numeric_features = list_cols_fitness
categorical_features = ['gender_c', 'agg_perceived_foreign_c']

# Numeric branch: median imputation followed by standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),  # Not needed for the used dataset.
    ("scaler", StandardScaler()),  # Not needed for a tree-based ranker; kept for the sake of generality.
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Not needed for the used dataset; kept for generality.
    ("encoder", OneHotEncoder()),  # Convert to one-hot encoding.
])

# Only the numeric branch is wired into the preprocessor: the categorical branch above is defined
# but deliberately left out, so the model sees the fitness features only.
column_preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
])

In [None]:

# The overall pipeline is composed of two phases:
# 1. Preprocessing with metadata (using the findhr package): attach the metadata, compute the
#    derived columns in two stages, compute the fitness (matching) columns, detach the metadata.
derived_steps = [
    ("init", AttachMetadata(md_all)),
    ("mapping_1", DerivedColumn(maps_derived_1)),
    ("mapping_2", DerivedColumn(maps_derived_2)),
    ("matching", DerivedColumn(maps_matching)),
    ("end", DetachMetadata()),
]
pipeline_derived = Pipeline(steps=derived_steps)
# 2. Standard scikit-learn preprocessing to prepare the data for the model, covered by the column preprocessor.


In [None]:
# Preprocessing pipeline feeding the ranking model.
# Note that LGBMRanker is not fully compatible with sklearn Pipeline, so the ranker is kept outside
# the pipeline and fitted separately on the pipeline's output:
# https://github.com/microsoft/LightGBM/issues/5041#issuecomment-1054827692

pipeline_rank = Pipeline(
    steps=[
        # First phase: preprocessing with metadata.
        ("fitness_value", pipeline_derived),
        # Second phase: preprocessing without metadata (standard scikit-learn).
        ("column_preprocessor", column_preprocessor),
    ]
)

# Hyper-parameters of the ranking model.
ranker_params = {
    "objective": "lambdarank",
    "class_weight": "balanced",
    "boosting_type": "gbdt",
    "importance_type": "gain",
    "learning_rate": 0.1,
    "min_data_in_leaf": 10,
    "n_estimators": 10,
    "force_row_wise": True,
    "verbose": -1,  # no verbosity during training, no warnings
}
ranker = LGBMRanker(**ranker_params)


In [None]:
# Define the top_K number for each job_id
TOP_K = 10


def rank2relevance(df, top_k=None):
    """Convert the 'ranking' column of `df` into LightGBM relevance labels.

    LightGBM relevance is "the higher the better", whereas a ranking is "the lower the better".
    A ranking of 1 maps to `top_k`, a ranking of `top_k` maps to 1, and any ranking beyond
    `top_k` maps to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'ranking' column.
    top_k : int, optional
        Cut-off of the relevance scale. Defaults to the notebook-level `TOP_K`.

    Returns
    -------
    numpy.ndarray
        One relevance label per row of `df`, in the range [0, top_k] for rankings >= 1.
    """
    if top_k is None:
        top_k = TOP_K
    rankings = df['ranking'].to_numpy().ravel()
    return np.maximum(top_k + 1 - rankings, 0)

# The first call fits the whole preprocessing pipeline (metadata handling and transformations)
# on the training rows; the validation rows are then only transformed with the fitted pipeline.
train_features = df_train.loc[:, pred_cols]
transformed_data = pipeline_rank.fit_transform(train_features)
val_features = df_val.loc[:, pred_cols]
transformed_val_data = pipeline_rank.transform(val_features)


In [None]:
# Keyword arguments for fitting the ranker: preprocessed features, relevance labels and
# per-job group sizes for training, plus the same triple for the evaluation set.
fitting_params = {
    "X": transformed_data,
    "y": rank2relevance(df_train),
    "group": df_train_counts,
    "eval_at": [TOP_K],
    "eval_set": [(transformed_val_data, rank2relevance(df_val))],
    "eval_group": [df_val_counts],
}

In [None]:
# Train the ranker on the preprocessed features; the preprocessing pipeline itself has
# already been fitted when the training data was transformed.
ranker.fit(**fitting_params)

In [None]:
# Model prediction: preprocess the test rows with the fitted pipeline, then score them with the ranker.
test_features = df_test.loc[:, pred_cols]
transformed_test_data = pipeline_rank.transform(test_features)
lambda_pred = ranker.predict(transformed_test_data)
lambda_pred

In [None]:
# Show the non-zero relevance labels of the test dataset, i.e. those of the candidates
# whose ranking is within TOP_K.
test_relevance = rank2relevance(df_test).reshape(1, -1)
test_relevance[test_relevance != 0]

### Model validation
In this example, we do not have enough jobs to obtain proper train, validation and test data splits.
Therefore, we validate the model by checking the relation between the "lambdas" predicted by the LGBMRanker model, and the score, separately for each job offer.
To better understand the LambdaRank framework, we point to the relevant literature (e.g. [Burges, 2010, From RankNet to LambdaRank to LambdaMART: An Overview](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/MSR-TR-2010-82.pdf))

In [None]:
# Store the predictions next to the original columns so they can be plotted per job offer.
# Note: df_train, df_val and df_test are the same object as df_all, so they gain this column too;
# pred_cols was computed before this assignment and therefore does not include it.
df_all['lambda_pred'] = lambda_pred

In [None]:
# We now show that there is a clear trend of correlation between the score and the predicted lambda
# from the model for each job_id.
# Scatter plot of score against prediction, one colour per job offer, with a least-squares trendline.

# Set plotting variables
cmap = plt.get_cmap('Set1')
fig, ax = plt.subplots(figsize=(12, 8))

# Unique ids of the job offers (computed once and reused by the loop and the colour scale).
unique_ids = df_all['id_j'].unique()

# Draw the points and the trendline of each job offer.
for i, uid in enumerate(unique_ids):
    subset = df_all[df_all['id_j'] == uid]
    color = cmap(i / len(unique_ids))

    ax.scatter(subset['lambda_pred'], subset['score'],
               color=color,
               label=str(uid), edgecolor='black', alpha=0.7)

    # A line fit needs at least two distinct x values: with a single point, or with identical
    # predictions for every candidate, np.polyfit is ill-conditioned and the line is meaningless.
    if subset['lambda_pred'].nunique() > 1:
        coeffs = np.polyfit(subset['lambda_pred'], subset['score'], 1)
        trend = np.poly1d(coeffs)
        x_vals = np.linspace(subset['lambda_pred'].min(), subset['lambda_pred'].max(), 100)

        ax.plot(x_vals, trend(x_vals), color=color, linestyle='--', linewidth=2)

# Finalize plotting
ax.set_xlabel('lambda_pred')
ax.set_ylabel('score')
ax.set_title('Score vs. predicted ranker output per job offer')
ax.legend(title='id_j')
plt.show()


In [None]:
# NDCG@TOP_K is a per-query metric: the ranker scores are only comparable among candidates of the
# same job offer. Pooling all jobs into a single list (as a single-row input to ndcg_score does)
# mixes unrelated queries, so each job is evaluated on its own and the values are averaged.
test_scores = ranker.predict(transformed_test_data)
test_labels = rank2relevance(df_test)
test_job_ids = df_test['id_j'].to_numpy()

ndcg_per_job = {}
for uid in df_test['id_j'].unique():
    in_job = test_job_ids == uid
    if in_job.sum() < 2:
        # NDCG is not meaningful for a single document (recent scikit-learn versions raise on it).
        continue
    ndcg_per_job[uid] = ndcg_score(
        test_labels[in_job].reshape(1, -1),
        test_scores[in_job].reshape(1, -1),
        k=TOP_K,
    )

# Mean NDCG@TOP_K across job offers.
np.mean(list(ndcg_per_job.values()))

### Example Model Explanation with RankingSHAP

In [None]:
from findhr.xai.factual.ranking_shap import RankingShap
from scipy.stats import kendalltau

In [None]:
# Background data for the explainer: the preprocessed training rows of the candidates
# applying for the job with id_j = 3.
is_background_job = df_train['id_j'] == 3
background_data = pipeline_rank.transform(df_train.loc[is_background_job, pred_cols])

# Rank similarity coefficient used to compare predicted rankings when computing explanations.
def rank_similarity_coefficient(x, y):
    """Return Kendall's tau correlation between the two rankings `x` and `y`."""
    tau, _p_value = kendalltau(x, y)
    return tau
# Alternative based on NDCG:
# rank_similarity_coefficient = lambda x, y: ndcg_score(x.reshape(1, -1), y.reshape(1, -1), k=TOP_K)


In [None]:
# Build the RankingSHAP explainer around the fitted ranker's scoring function.
ranking_shap_explainer = RankingShap(
    name="rankingshap",
    original_model=ranker.predict,  # scoring function being explained
    background_data=background_data,  # preprocessed rows of the reference job offer
    rank_similarity_coefficient=rank_similarity_coefficient,
    permutation_sampler="kernel",
)

In [None]:
# Select the explicand data: the candidates applying for the job with id_j = 3.
# (A positional slice such as df_train[pred_cols].iloc[200:210, :] could be used instead.)
is_explicand_job = df_train['id_j'] == 3
explicand_data = df_train.loc[is_explicand_job, pred_cols]
# Transform the explicand data with the fitted preprocessing pipeline.
transformed_explicand_data = pipeline_rank.transform(explicand_data)


In [None]:
# Get the explanation for the explicand query.
out_exp = ranking_shap_explainer.get_query_explanation(transformed_explicand_data)

# Rename the explained features with the fitness feature names. The explanation keys are used
# as 1-based positions into list_cols_fitness.
# NOTE(review): this assumes the explainer numbers features from 1 in column order — confirm.
out_exp_renamed = {}
for feature_idx, importance in dict(out_exp).items():
    out_exp_renamed[list_cols_fitness[feature_idx - 1]] = importance
out_exp_renamed

In [None]:
# The most important features are skills and experience. The others have low importance. In particular, the small values of importance are due to approximation errors

In [None]:
# Plot the feature importance as a horizontal bar chart: one bar per fitness feature,
# with the bar length given by the importance value from the explanation.
plt.figure(figsize=(10, 5))
plt.barh(list(out_exp_renamed.keys()), list(out_exp_renamed.values()))
plt.xlabel('Feature Importance')