# Explainable Boosting Machine (EBM) Model
This notebook shows how to combine the findhr preprocessing APIs with an Explainable Boosting Machine (here, the `ExplainableBoostingRegressor`) from the [interpret](https://interpret.ml/docs/ebm.html) package, and how to obtain factual explanations of the fitted model in the form of global and local feature importance.

In [None]:
# Load and join raw data sources and their metadata by executing the companion notebook
# in this kernel's namespace.
# NOTE(review): presumably this is what defines `df_all` and `md_all`, which the
# following cells rely on — confirm against Example_InputDataSources.ipynb.
%run Example_InputDataSources.ipynb

In [None]:
# Preview the first rows of the joined DataFrame.
df_all.head()

In [None]:
# Display the joined metadata; its entries carry the `attr_type` that is later used
# to select the categorical columns.
md_all

In [None]:
# Importing libraries to avoid warnings at running time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays,
# so that column names are carried through the preprocessing steps.
from sklearn import set_config
set_config(transform_output="pandas")

In [None]:
# Cast to the pandas 'category' dtype every column whose metadata entry declares
# attr_type == 'category'.
cat_cols = [name for name, meta in md_all.items() if meta.attr_type == 'category']
df_all[cat_cols] = df_all[cat_cols].astype('category')
# Print the DataFrame summary to check the resulting dtypes.
df_all.info()

In [None]:
# Column roles: identifiers, target feature(s), and predictive features.
id_cols = ["id_j", "id_c"]
target_cols = ["score", "ranking", "shortlisted"]
# Every column that is neither an identifier nor a target is a predictor.
pred_cols = df_all.columns.difference(id_cols + target_cols)

In [None]:
from findhr.preprocess.example_mappings import RelevantExperienceForRole, ExtractMonthDurationJob, MatchOrdinal, \
    ExtractListOfProperty, MatchFeatureAtLeastInList, MatchFeatureSet, MatchBinary

# Mapping dictionaries for the derived columns.
# NOTE(review): each key looks like a pair (input columns, output columns) and each
# value is the mapping object that computes the outputs — confirm against findhr docs.

# Calculated features, first pass.
maps_derived_1 = {
    (
        ('professional_experience_c', 'experience_reqs_role_j'),
        ('relevant_exp_role_c',),
    ): RelevantExperienceForRole(),
}

# Calculated features, second pass: takes 'relevant_exp_role_c' from the first pass as input.
maps_derived_2 = {
    (
        ('relevant_exp_role_c',),
        ('role_duration_months_c',),
    ): ExtractMonthDurationJob(duration_key='duration_months'),
    (
        ('education_background_c',),
        ('degree_list_c',),
    ): ExtractListOfProperty(property_key='degree'),
}

# Fitness features about the matching between candidate's features and job's requirements.
maps_matching = {
    (
        ('experience_reqs_duration_j', 'role_duration_months_c'),
        ('fitness_experience',),
    ): MatchOrdinal(),
    (
        ('education_reqs_j', 'education_background_c'),
        ('fitness_education',),
    ): MatchFeatureAtLeastInList(),
    (
        ('skills_j', 'skills_c'),
        ('fitness_skills',),
    ): MatchFeatureSet(),
    (
        ('gender_j', 'gender_c'),
        ('fitness_gender',),
    ): MatchBinary(),
    (
        ('agg_perceived_foreign_j', 'agg_perceived_foreign_c'),
        ('fitness_foreign',),
    ): MatchBinary(),
}

# Helper variable for the fitness features: the output column names of maps_matching,
# in declaration order.
list_cols_fitness = [col for _inputs, outputs in maps_matching for col in outputs]
maps_matching

In [None]:
# Scikit-learn transformation for numeric and categorical features
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer

# Column groups handled by the scikit-learn preprocessing stage.
numeric_features = list_cols_fitness
categorical_features = ['gender_c', 'agg_perceived_foreign_c']

# Numeric branch: impute missing values, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # Not needed for the used dataset.
        # Not needed for tree-based learners such as the EBM used in this notebook;
        # kept for the sake of generality.
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: impute missing values, then one-hot encode.
# sparse_output=False is required because pandas output is enabled through
# set_config(transform_output="pandas"): the encoder's default sparse matrix cannot be
# wrapped in a DataFrame, so the default would raise as soon as this branch is enabled.
categorical_transformer = Pipeline(
    steps=[
        # Not needed for the used dataset, again for the sake of generality.
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(sparse_output=False)),  # Convert to one-hot encoding
    ]
)

# Combine the branches. Only the numeric (fitness) branch is active; columns that are
# not listed in any branch are dropped (ColumnTransformer's default remainder).
column_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # ("cat", categorical_transformer, categorical_features)
    ],
)

In [None]:
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn

# The overall pipeline is composed of two phases:
# 1. Preprocessing with metadata (using the findhr package), defined here.
# 2. Standard scikit-learn preprocessing that prepares the data for the model;
#    that phase is covered by `column_preprocessor`.
steps_derived = [
    ("init", AttachMetadata(md_all)),
    ("mapping_1", DerivedColumn(maps_derived_1)),
    ("mapping_2", DerivedColumn(maps_derived_2)),
    ("matching", DerivedColumn(maps_matching)),
    # ("fitness", GroundTruthLinearWeightedScorer(gt_weights_fair)),
    ("end", DetachMetadata()),
]
pipeline_derived = Pipeline(steps=steps_derived)


In [None]:
## Pipeline including the ExplainableBoostingRegressor (regression on the "score" target)

In [None]:
# Pipeline definition for regression model on the target feature "score".
from findhr.preprocess.mapping import AttachMetadata, DerivedColumn, DetachMetadata
from interpret.glassbox import ExplainableBoostingRegressor
# End-to-end regression pipeline, listed in execution order.
steps_regr = [
    # first phase: preprocessing with metadata
    ("fitness_value", pipeline_derived),
    # second phase: preprocessing without metadata (standard scikit-learn)
    ("column_preprocessor", column_preprocessor),
    # model inference
    ("regressor", ExplainableBoostingRegressor()),
]
pipeline_regr = Pipeline(steps=steps_regr)

In [None]:
# Model fit: predictive columns as input, the 'score' column as regression target.
features_all = df_all[pred_cols]
target_score = df_all['score']
pipeline_regr.fit(features_all, target_score)

In [None]:
# Example model prediction on the first 10 rows.
# Rows are selected by position (.iloc): a label-based slice such as `.loc[:10]`
# includes the end label (11 rows on a default RangeIndex) and misbehaves when the
# index of the joined frame is not a sorted integer index.
pipeline_regr.predict(df_all[pred_cols].iloc[:10])

### Example Model Explanation

In [None]:
# Get a global explanation from the fitted ExplainableBoostingRegressor,
# i.e. the last step of the pipeline.
ebm_regressor = pipeline_regr.named_steps['regressor']
explanation_global = ebm_regressor.explain_global()


In [None]:
# Visualize the global explanation by plotting the overall feature importance.
explanation_global.visualize()

In [None]:
# Pick one sample to explain, kept as a one-row DataFrame.
idx_explicand_sample = 0
row_window = slice(idx_explicand_sample, idx_explicand_sample + 1)
explicand_sample = df_all[pred_cols].iloc[row_window]

# Run every pipeline step except the final regressor, to obtain the data exactly as
# the model sees it.
preprocessing_only = pipeline_regr[:-1]
transformed_data = preprocessing_only.transform(explicand_sample)

# Ask the fitted regressor for the local explanation of the transformed sample.
ebm_fitted = pipeline_regr.named_steps['regressor']
explanation_local = ebm_fitted.explain_local(transformed_data)


In [None]:
# Visualize the local explanation for the first sample explained (index 0 among the
# rows passed to explain_local; here a single row was passed).
# See documentation at https://interpret.ml/docs/ebm.html for further details.
explanation_local.visualize(0)