# Selecting robust features by permutation importance across multiple seeds
- Author: senkin.zhan@datarobot.com
- demo data for regression or binary classification: https://s3.amazonaws.com/datarobot_public_datasets/ai_accelerators/500_Lending_Club_Loans.csv
- demo data for regression or multiclass classification: 
https://s3.amazonaws.com/datarobot_public_datasets/ai_accelerators/multiclass_lending_club.csv

## Summary

Machine learning models trained on small datasets tend to be biased, and some industries, such as health care and manufacturing, lack labeled data. In light of this, a good approach is to select robust features to build models.

This accelerator introduces an approach to select robust features: run cross-validation with multiple seeds, add dummy (random) features, compute the median permutation importance of every feature across the seeds, and then keep only the features whose importance is higher than that of the dummy features.

This notebook outlines how to:

1. Connect to DataRobot
2. Create multiple projects with different seeds and added dummy features
3. Create blend models of top-performing models
4. Retrieve modeling permutation importance from the top-performing blend models
5. Remove features whose permutation importance is lower than that of the dummy features


## Setup

### Install and import libraries

In [None]:
!pip install pandas numpy datarobot boto3 plotly dask awswrangler

In [None]:
import datetime as dt
import json
import os
import pickle
import random
import re
import time
import zipfile

import awswrangler as wr  # For aws s3 bucket read
import boto3  # For aws credentials
from dask import compute, delayed  # For parallelization
import datarobot as dr
from datarobot.models.dataset import Dataset
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objects as go

### Bind variables

In [None]:
# 1: Where this notebook runs: on a local machine or in DataRobot Notebooks
ENV = "local"  # ("local" or "dr_notebooks")

# 2: DataRobot API token and endpoint, plus optional AI Catalog / AWS S3 settings
TOKEN = "<INSERT YOUR DataRobot API Token>"  # only passed to the client when ENV == "local"
END_POINT = "https://app.datarobot.com/api/v2"
AI_CATALOG_DATA_ID = "<INSERT YOUR AI CATALOG DATASET ID>"
AWS_KEY = "<INSERT YOUR AWS ACCESS KEY>"
AWS_SECRET = "<INSERT YOUR AWS SECRET>"
AWS_S3_INPUT_PATH = "<INSERT YOUR AWS S3 INPUT PATH>"
AWS_S3_OUTPUT_PATH = "<INSERT YOUR AWS S3 OUTPUT PATH>"

# 3: Input and output paths (the input file should be a csv with UTF-8 encoding)
input_folder = "./input/"
output_folder = "./output/"
os.makedirs(output_folder, exist_ok=True)
os.makedirs(input_folder, exist_ok=True)
file_name = "500_Lending_Club_Loans.csv"  # input file name
input_file = input_folder + file_name

data_source = (
    "local"  # ("local", "ai_catalog" or "aws_s3"; dr_notebooks cannot use "local")
)
data_output = ["ai_catalog", "aws_s3"]  # ("ai_catalog" and/or "aws_s3"; multiple choices allowed)

# 4: Target column. For multiclass, encode the classes as 0, 1, 2, ...
target = "is_bad"

# 5: Whether this is a regression task
regression = False

# 6: Number of classes: 1 for regression, 2 for binary classification,
#    the number of classes for multiclass
class_num = 2

# 7: Number of cross-validation folds (10 is recommended)
cv = 10

# 8: Number of runs with different random seeds (10 is recommended)
iteration = 10
# The number of iterations is capped at 10
if iteration > 10:
    iteration = 10

# 9: Metric used to report the best performance, e.g. "AUC", "MAE", "R Squared".
#    Use 0 to fall back to the project's default metric.
metrics = "LogLoss"

# 10: Aggregation method for feature importance across seeds: "median" or "min"
#     (median is recommended)
how = "median"

# 11: ID columns to keep out of the feature set, e.g. ["ID1", "ID2"]; use [] for none
keep_column = ["ID"]

# 12: True uses the unnormalized (absolute) feature impact, False uses the
#     normalized (relative) one. Absolute is recommended.
impact_abs = True

# 13: Whether to compute partial dependence plots for the top features.
#     Enabling this makes the run take longer.
pdp = True

# 14: Group column for group k-fold, e.g. ["Group_ID"]; use False to disable
group_col = False

# 15: Whether to add error (dummy) features. When True, three random numeric
#     features (error_1, error_2, error_3) are generated with np.random.randn.
error_feature = True

### Connect to DataRobot

You can read more about different options for [connecting to DataRobot from the client](https://docs.datarobot.com/en/docs/api/api-quickstart/api-qs.html).

In [None]:
def __init_datarobot_env(end_point, token, env):
    """Create the DataRobot client and print basic connection details.

    Args:
        end_point: DataRobot API endpoint, used only when ``env == "local"``.
        token: DataRobot API token, used only when ``env == "local"``.
        env: "local" to connect with explicit credentials; any other value
            builds the client with no arguments.
    """
    # Explicit credentials are only supplied for local runs.
    client_kwargs = (
        {"endpoint": end_point, "token": token, "connect_timeout": 60}
        if env == "local"
        else {}
    )
    client = dr.Client(**client_kwargs)

    print("DataRobot API Version:", dr.__version__)
    print("Endpoint:", client.endpoint)
    print("Connection:", client.verify)


# Connect once with the settings bound above.
__init_datarobot_env(END_POINT, TOKEN, ENV)

## Import data

In [None]:
def __read_inputs(input_path, ext, error_feature, target, keep_column, group_col):
    """Load the input data and split it into ID, feature and target parts.

    For zip input the archive is extracted into the module-level
    ``zip_folder``, the first csv found (searching up to two directory levels
    below it) is loaded, every csv in that folder is overwritten with the
    processed frame, and a new ``<zip_folder>.zip`` archive is written.

    Args:
        input_path: Path of the input ``.csv`` or ``.zip`` file.
        ext: Extension of ``input_path``; ".csv" or ".zip".
        error_feature: When ``True``, three random columns (``error_1`` to
            ``error_3``) drawn with ``np.random.randn`` (seed 1) are added.
        target: Name of the target column.
        keep_column: List of ID columns; they are removed from the features
            and returned in ``df_id``. Use ``[]`` for none.
        group_col: Group column(s) copied into ``df_id``, or ``False``.

    Returns:
        Tuple ``(df_id, df_x, df_y)``: the ID/group columns, the feature
        frame (still containing the target, with sanitized column names) and
        the target series.

    Raises:
        ValueError: If ``ext`` is not ".csv"/".zip", or if no csv file is
            found inside the zip archive.
    """

    def check_files(folder):
        """Return (frame, folder, sub_folder) for the first csv in ``folder``.

        ``frame`` is "" and ``folder`` is False when no csv is found;
        ``sub_folder`` is the last extension-less entry seen ("" if none).
        """
        tmp_folder = ""
        for f in os.listdir(folder):
            file, ext2 = os.path.splitext(f)
            if ext2 == ".csv":
                result = pd.read_csv(
                    folder + "/" + file + ".csv", encoding="utf-8-sig"
                )
                return result, folder, tmp_folder
            if (ext2 == "") and (file != ".DS_Store"):
                print(file)
                tmp_folder = folder + "/" + file + "/"
        return "", False, tmp_folder

    def read_file_autohandle(input_path, ext):
        """Read a csv directly, or locate and read the csv inside a zip."""
        if ext == ".csv":
            return pd.read_csv(input_path, encoding="utf-8-sig"), False
        if ext == ".zip":
            with zipfile.ZipFile(input_path) as existing_zip:
                existing_zip.extractall(zip_folder)

            result, found_csv, tmp_folder = check_files(zip_folder)
            # Descend at most two more levels, and only while there is a
            # sub-folder to look into (os.listdir("") would fail).
            for _ in range(2):
                if found_csv is not False or not tmp_folder:
                    break
                result, found_csv, tmp_folder = check_files(tmp_folder)
            if found_csv is False:
                raise ValueError("No csv file was found inside " + str(input_path))
            return result, found_csv

        # Previously this only printed a hint and returned None, which crashed
        # on tuple unpacking in the caller with an unrelated TypeError.
        raise ValueError(
            "Please input csv file. Unsupported extension: {!r}".format(ext)
        )

    df_x, found_csv = read_file_autohandle(input_path, ext)

    # Optionally add random dummy features with a fixed seed.
    if error_feature == True:
        np.random.seed(seed=1)
        df_x["error_1"] = np.random.randn(len(df_x))
        df_x["error_2"] = np.random.randn(len(df_x))
        df_x["error_3"] = np.random.randn(len(df_x))

    # Split off the target and the ID / group columns.
    df_y = df_x[target]
    df_id = pd.DataFrame()
    if len(keep_column) != 0:
        df_id = pd.DataFrame(df_x[keep_column])
        df_x.drop(columns=keep_column, inplace=True)
        if group_col != False:
            df_id[group_col] = df_x[group_col]
    elif group_col != False:
        df_id = df_x[group_col]

    # Replace the special characters in one pass so the local column names
    # contain none of them (the original comment says this mirrors DataRobot's naming).
    special_chars = str.maketrans({char: "_" for char in '{-$.}\n"'})
    df_x.rename(columns=lambda col: col.translate(special_chars), inplace=True)

    # For zip input, overwrite the extracted csv file(s) with the processed
    # frame and build a new archive from the extraction folder.
    if ext == ".zip":
        for f in os.listdir(found_csv):
            _, ext2 = os.path.splitext(f)
            if ext2 == ".csv":
                df_x.to_csv(found_csv + f, index=False, encoding="utf-8-sig")

        with zipfile.ZipFile(
            zip_folder[:-1] + ".zip", "w", compression=zipfile.ZIP_STORED
        ) as new_zip:
            for folder, subfolders, files in os.walk(zip_folder):
                new_zip.write(folder)
                for file in files:
                    new_zip.write(os.path.join(folder, file))

    return (df_id, df_x, df_y)


# Folder used to extract a zip input; built from today's date and the input
# path without its extension.
# NOTE(review): f still contains input_folder ("./input/..."), so zip_folder
# becomes a nested relative path such as "<date>_temp_./input/<name>/" - confirm
# this is intended.
f, _ = os.path.splitext(input_folder + file_name)
zip_folder = str(dt.date.today()) + "_temp_{0}/".format(f)

# Extension of the input file (".csv" or ".zip"), passed to __read_inputs
_, ext = os.path.splitext(input_folder + file_name)

# Stage the input csv in the input folder, depending on the data source
if data_source == "local":
    # Nothing is downloaded; the user is expected to place the file manually.
    print("Put the csv file to " + input_folder)
if data_source == "ai_catalog":
    dataset_id = AI_CATALOG_DATA_ID  # AI Catalog dataset id, e.g. "64a2520bf822342ecfb8fcc"
    dataset = Dataset.get(dataset_id)
    dataset.get_as_dataframe().to_csv(input_file, index=False)  # save to the input folder
if data_source == "aws_s3":
    my_session = boto3.Session(
        aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET
    )  # AWS access key and secret access key
    dataset = wr.s3.read_csv(
        path=AWS_S3_INPUT_PATH,
        dataset=True,
        boto3_session=my_session,
        encoding="ISO-8859-1",
    )  # path is a bucket path like 's3://senkin-demo/'
    dataset.to_csv(input_file, index=False)  # save to the input folder

# Load the staged file and split it into ID columns, features and target
df_id, df_x, df_y = __read_inputs(
    input_file, ext, error_feature, target, keep_column, group_col
)

## Modeling

### Create multiple projects across multiple seeds

In [None]:
%%time


def __run_autopilot(
    i, target, zip_folder, input_file, ext, df_x, group_col, regression, cv
):
    """Create one DataRobot project for seed ``i`` and start quick Autopilot.

    The partitioning is group CV when ``group_col`` is set, stratified CV for
    binary classification, and random CV otherwise. The number of classes is
    read from the module-level ``class_num``.

    Args:
        i: Seed used for the project name, advanced options and partitioning.
        target: Name of the target column.
        zip_folder: Extraction folder; ``zip_folder[:-1] + ".zip"`` is
            uploaded when ``ext == ".zip"``.
        input_file: Input file path, used in the project name.
        ext: Extension of the input file.
        df_x: Feature frame uploaded when the input is not a zip file.
        group_col: Group column name, list of names, or ``False``.
        regression: ``True`` for regression tasks.
        cv: Number of cross-validation folds.

    Returns:
        The created project. It is returned even when starting Autopilot
        failed; the failure is only printed.
    """
    # Upload the rebuilt zip archive for zip input, otherwise the dataframe.
    source = zip_folder[:-1] + ".zip" if ext == ".zip" else df_x
    project = dr.Project.create(
        source,
        project_name=str(dt.date.today()) + "_" + input_file + "_Seed" + str(i),
    )

    ao = dr.AdvancedOptions(seed=i)
    if group_col != False:
        # group_col is documented as a list (["Group_ID"]); wrapping it in
        # another list would produce a nested list, so normalize both forms.
        if isinstance(group_col, (list, tuple)):
            key_cols = list(group_col)
        else:
            key_cols = [group_col]
        pm = dr.GroupCV(holdout_pct=0, partition_key_cols=key_cols, reps=cv, seed=i)
    elif regression == False and class_num == 2:
        pm = dr.StratifiedCV(holdout_pct=0, reps=cv, seed=i)
    else:
        # Regression and multiclass both use random CV.
        pm = dr.RandomCV(holdout_pct=0, reps=cv, seed=i)

    model_kwargs = dict(
        worker_count=-1,
        target=target,
        mode="quick",
        partitioning_method=pm,
        advanced_options=ao,
        max_wait=10000,
    )
    if class_num > 2:
        model_kwargs["target_type"] = "Multiclass"

    try:
        project.analyze_and_model(**model_kwargs)
        print("The new project has ID for seed {} is".format(i), project.id)
    except Exception as exc:
        # Best effort: keep going with the other seeds, but report the cause.
        print(
            "The seed {} has occured error".format(i),
            "({}: {})".format(type(exc).__name__, exc),
        )

    return project


# Build one lazy Autopilot task per seed (seeds run from 1 to `iteration`).
delayed_dr_projects = []

for seed in range(1, iteration + 1):
    temp = delayed(__run_autopilot)(
        seed, target, zip_folder, input_file, ext, df_x, group_col, regression, cv
    )
    delayed_dr_projects.append(temp)

# Run all tasks; the first element of the result holds the list of projects.
projects = compute(delayed_dr_projects)[0]
# Keep the project ids for the later reporting cells.
project_id_list = []
for p in projects:
    project_id_list.append(p.id)

### Wait for Autopilot to complete and run cross-validation

In [None]:
%%time


def __get_model_scores(project):
    """Return the project's leaderboard as a DataFrame sorted by score.

    Args:
        project: Object exposing ``metric`` and
            ``get_model_records(with_metric=...)``.

    Returns:
        DataFrame with one row per model and the columns ``cv``, ``v``,
        ``type``, ``model_id``, ``sample_pct``, ``category``, ``model`` and
        ``blueprint_id``, sorted ascending by ``cv`` then ``v`` with missing
        scores last.
    """
    column_names = [
        "cv",
        "v",
        "type",
        "model_id",
        "sample_pct",
        "category",
        "model",
        "blueprint_id",
    ]

    rows = []
    for model in project.get_model_records(with_metric=project.metric):
        scores = model.metrics[project.metric]
        rows.append(
            [
                scores["crossValidation"],
                scores["validation"],
                model.model_type,
                model.id,
                model.sample_pct,
                model.model_category,
                model,
                model.blueprint_id,
            ]
        )

    leaderboard = pd.DataFrame(rows, columns=column_names)
    return leaderboard.sort_values(["cv", "v"], na_position="last")


def __run_cross_validation(projects, i):
    """Wait for Autopilot of seed ``i`` and start missing cross-validations.

    After Autopilot and all queued jobs finish, and only if the leaderboard
    contains a model trained on 100% of the data, cross-validation is
    requested for every model of category "model" at the second-largest
    sample size that has no cross-validation score yet.

    Args:
        projects: List of projects; seed ``i`` uses ``projects[i - 1]``.
        i: 1-based seed number.

    Errors are reported on stdout and do not propagate, so the remaining
    seeds keep running.
    """
    try:
        project = dr.Project.get(project_id=projects[i - 1].id)
        project.wait_for_autopilot(check_interval=60.0, timeout=800000, verbosity=1)
        # i is already the 1-based seed number (it was printed as i + 1 before).
        print("Autopilot of the project for seed " + str(i) + " is completed")
        print("Confirming CV status....")
        for job in project.get_all_jobs():
            job.wait_for_completion(max_wait=60000)

        df_model = __get_model_scores(project)

        if max(df_model["sample_pct"]) == 100:
            # Models at the second-largest sample size are the CV candidates.
            second_largest = sorted(set(df_model["sample_pct"]))[-2]
            df_model = df_model[df_model["sample_pct"] == second_largest]
            df_model = df_model[df_model["category"] == "model"]
            for model in df_model[df_model["cv"].isnull()]["model"]:
                print("Seed" + str(i) + " " + model.model_type + " started CV")
                model.cross_validate()
        else:
            print("Seed " + str(i) + " looks error occured. Ignored")
    except Exception as exc:
        # Exception (not a bare except) keeps Ctrl-C working during the waits.
        print(
            "Seed {} has occured error".format(i),
            "({}: {})".format(type(exc).__name__, exc),
        )


# Build one lazy cross-validation task per seed and run them all.
delayed_dr_projects = []

for seed in range(1, iteration + 1):
    temp = delayed(__run_cross_validation)(projects, seed)
    delayed_dr_projects.append(temp)

compute(delayed_dr_projects)

### Wait for cross-validation to complete and create blend models

In [None]:
%%time


def __wait_for_cv(projects, iteration):
    """Block until every queued job of every seed's project has finished.

    Args:
        projects: List of projects; seed ``i`` uses ``projects[i - 1]``.
        iteration: Number of seeds to wait for.

    Errors are reported per seed on stdout and do not propagate.
    """
    for i in range(1, iteration + 1):
        try:
            project = dr.Project.get(project_id=projects[i - 1].id)
            for job in project.get_all_jobs():
                job.wait_for_completion(max_wait=60000)
            print("completed CV of in seed" + str(i))
        except Exception as exc:
            # A bare except here also swallowed KeyboardInterrupt, which made
            # these long waits impossible to cancel.
            print(
                "Seed {} has occured error".format(i),
                "({}: {})".format(type(exc).__name__, exc),
            )


def __get_blender_detail(project):
    """Return the project's existing blenders as a DataFrame.

    Args:
        project: Object exposing ``get_blenders()``.

    Returns:
        DataFrame with one row per blender and the columns ``type``,
        ``model_id``, ``sample_pct``, ``category``, ``model``,
        ``blueprint_method`` (the blender method) and ``model_ids``.
    """
    column_names = [
        "type",
        "model_id",
        "sample_pct",
        "category",
        "model",
        "blueprint_method",
        "model_ids",
    ]

    rows = []
    for blender in project.get_blenders():
        rows.append(
            [
                blender.model_type,
                blender.id,
                blender.sample_pct,
                blender.model_category,
                blender,
                blender.blender_method,
                blender.model_ids,
            ]
        )

    return pd.DataFrame(rows, columns=column_names)


def __create_blender(projects, i):
    """Request GLM/ENET/AVG/MED blenders of the top models for seed ``i``.

    Parent candidates are the models of category "model" at the
    second-largest sample size, in leaderboard order. For the top 3, 4, 5 and
    6 candidates each supported blender method is requested unless a blender
    with the same method and the same parent models already exists.

    Args:
        projects: List of projects; seed ``i`` uses ``projects[i - 1]``.
        i: 1-based seed number.

    Errors are reported on stdout and do not propagate.
    """

    def _sort_leader_board(project):
        """Return the ids of the candidate parent models, best score first."""
        df_model = __get_model_scores(project)
        df_model = df_model[df_model["category"] == "model"]
        second_largest = sorted(set(df_model["sample_pct"]))[-2]
        df_model = df_model[df_model["sample_pct"] == second_largest]
        return [model.id for model in df_model["model"]]

    def _run_blender(project, parent_models):
        """Request every missing, supported blender for 3 to 6 parents."""
        blender_method = ["GLM", "ENET", "AVG", "MED"]
        df_blender = __get_blender_detail(project)
        for j in range(3, 7):
            if len(parent_models) < j:
                continue
            candidates = parent_models[:j]
            for b in blender_method:
                if not project.check_blendable(candidates, b).supported:
                    print(b + " was not supported")
                    continue

                existing = df_blender[df_blender.blueprint_method == b].model_ids
                already_created = any(
                    set(model_ids) == set(candidates) for model_ids in existing
                )
                # i is already the 1-based seed (it was printed as i + 1 before).
                label = b + " blender " + str(j) + " models for project " + str(i)
                if already_created:
                    print(label + " was created already")
                else:
                    project.blend(candidates, b)
                    print(label + " started")

    try:
        project = dr.Project.get(project_id=projects[i - 1].id)
        parent_models = _sort_leader_board(project)
        _run_blender(project, parent_models)
        print("started creating all blenders for project " + str(i))
    except Exception as exc:
        print(
            "Seed {} has occured error".format(i),
            "({}: {})".format(type(exc).__name__, exc),
        )


# Make sure all cross-validation jobs are finished before blending.
__wait_for_cv(projects, iteration)
delayed_dr_projects = []

# Build one lazy blender-creation task per seed and run them all.
for seed in range(1, iteration + 1):
    temp = delayed(__create_blender)(projects, seed)
    delayed_dr_projects.append(temp)

compute(delayed_dr_projects)

### Wait for blending to complete and retrieve permutation importance

In [None]:
%%time


def __get_impact(projects, i, impact_abs):
    """Return the feature impact of the best blender of seed ``i``.

    Waits for the project's queued jobs, takes the first blender in
    leaderboard order and requests (or fetches) its feature impact.

    Args:
        projects: List of projects; seed ``i`` uses ``projects[i - 1]``.
        i: 1-based seed number.
        impact_abs: ``False`` selects ``impactNormalized``; anything else
            selects ``impactUnnormalized``.

    Returns:
        A one-row DataFrame indexed by ``"Seed_<i>"`` with one column per
        feature, or an empty DataFrame when the seed failed (the failure is
        printed). Previously a failure raised ``UnboundLocalError`` on return.
    """
    df_impact = pd.DataFrame()
    try:
        project = dr.Project.get(project_id=projects[i - 1].id)
        print("Waiting blenders finished in seed" + str(i))
        for job in project.get_all_jobs():
            job.wait_for_completion(max_wait=60000)
        print("Completed seed" + str(i))

        df_model = __get_model_scores(project)
        df_model = df_model[df_model["category"] == "blend"]

        # Column 3 of the leaderboard frame is the model id.
        model = dr.Model.get(projects[i - 1].id, df_model.iloc[0, 3])
        print("Request and get feature impact for seed" + str(i))
        feature_impact = model.get_or_request_feature_impact(max_wait=60000)

        df_tmp2 = pd.DataFrame(
            feature_impact,
            columns=[
                "redundantWith",
                "featureName",
                "impactNormalized",
                "impactUnnormalized",
            ],
        )
        impact_col = "impactNormalized" if impact_abs == False else "impactUnnormalized"
        df_tmp2 = pd.DataFrame(df_tmp2[["featureName", impact_col]])
        df_tmp2 = df_tmp2.rename(columns={impact_col: "Seed_" + str(i)})

        # Pivot to a single row: features become columns, the seed the index.
        fn_list = list(df_tmp2["featureName"])
        df_tmp2 = df_tmp2.T
        df_tmp2.columns = fn_list
        df_tmp2 = df_tmp2.drop(df_tmp2.index[0])

        # Only publish the frame once every step succeeded.
        df_impact = df_tmp2
    except Exception as exc:
        print(
            "Seed {} has occured error".format(i),
            "({}: {})".format(type(exc).__name__, exc),
        )

    return df_impact


# Build one lazy feature-impact task per seed.
delayed_dr_projects = []

for seed in range(1, iteration + 1):
    temp = delayed(__get_impact)(projects, seed, impact_abs)
    delayed_dr_projects.append(temp)

# Run the tasks and stack the per-seed rows into one frame
# (rows are seeds, columns are features).
df_tmp = compute(delayed_dr_projects)[0]
df_tmp = pd.concat(df_tmp)

# Features missing for a seed count as zero impact.
df_tmp = df_tmp.fillna(0)
# Per-feature statistics across the seeds.
df_stat = pd.DataFrame()
df_stat["max"] = df_tmp.max()
df_stat["median"] = df_tmp.median()
df_stat["min"] = df_tmp.min()
# Append the statistics as extra rows below the per-seed rows.
df_output = pd.concat([df_tmp, df_stat.T])
# Order the features (columns) from most to least important by the chosen statistic.
if how == "min":
    df_output = df_output.T.sort_values(by=["min", "median", "max"], ascending=False)
else:
    df_output = df_output.T.sort_values(by=["median", "max"], ascending=False)
df_output = df_output.T

In [None]:
df_output


### Create Log file

In [None]:
def __create_log(
    project_id_list, iteration, metrics, input_file, target, regression, cv, group_col
):
    """Build a per-seed log of the best blender and its top parent models.

    Each seed is collected into its own record, so a failing seed yields a
    row with empty summary fields and "error" ranks instead of breaking the
    whole log (the former parallel lists ended up with mismatched lengths).

    Args:
        project_id_list: Project ids, one per seed, in seed order.
        iteration: Number of seeds.
        metrics: Metric name to report, or ``0`` for the project's metric.
        input_file: Input file path recorded in the log.
        target: Target column name recorded in the log.
        regression: Regression flag recorded in the log.
        cv: Number of CV folds recorded in the log.
        group_col: Group column setting recorded in the log; a list is
            stored as a comma-joined string.

    Returns:
        DataFrame indexed by ``Seed_1`` ... ``Seed_<iteration>`` with the
        columns ``project_id``, ``model_name``, ``metric_name``, ``CV_score``,
        ``model_id``, ``sample_pct``, ``input_file``, ``target``,
        ``regression``, ``cv_number``, ``group_cv`` and ``Model_rank_1`` ...
        ``Model_rank_5``.
    """
    top_n = 5
    summary_columns = ["model_name", "metric_name", "CV_score", "model_id", "sample_pct"]
    rank_columns = ["Model_rank_" + str(i) for i in range(1, top_n + 1)]

    # A list cannot be broadcast into a column, so store it as text.
    if isinstance(group_col, (list, tuple)):
        group_label = ",".join(str(col) for col in group_col)
    else:
        group_label = group_col

    print("Preparting export files...")
    records = []
    for i in range(0, iteration):
        project_id = project_id_list[i]
        summary = dict.fromkeys(summary_columns)
        ranks = ["error"] * top_n
        try:
            project = dr.Project.get(project_id=project_id)
            df_model = __get_model_scores(project)

            # Best blender: first "blend" row; column 3 is the model id.
            df_blend = df_model[df_model["category"] == "blend"]
            model = dr.Model.get(project_id, df_blend.iloc[0, 3])
            met = project.metric if metrics == 0 else metrics
            cv_score = model.metrics[met]["crossValidation"]

            # Parent candidates: plain models at the second-largest sample size.
            df_parent = df_model[df_model["category"] == "model"]
            second_largest = sorted(set(df_parent["sample_pct"]))[-2]
            df_parent = df_parent[df_parent["sample_pct"] == second_largest]
            parent_types = list(df_parent["type"])[:top_n]

            # Assign only after every lookup succeeded, so a row is never
            # half-filled.
            summary = {
                "model_name": model.model_type,
                "metric_name": met,
                "CV_score": cv_score,
                "model_id": model.id,
                "sample_pct": model.sample_pct,
            }
            # Pad when fewer than top_n parent models exist.
            ranks = parent_types + [None] * (top_n - len(parent_types))
        except Exception as exc:
            print(
                "Seed {} has occured error".format(i + 1),
                "({}: {})".format(type(exc).__name__, exc),
            )

        record = {"project_id": project_id}
        record.update(summary)
        record.update(
            input_file=input_file,
            target=target,
            regression=regression,
            cv_number=cv,
            group_cv=group_label,
        )
        record.update(zip(rank_columns, ranks))
        records.append(record)

    columns = (
        ["project_id"]
        + summary_columns
        + ["input_file", "target", "regression", "cv_number", "group_cv"]
        + rank_columns
    )
    df_log = pd.DataFrame(
        records,
        index=["Seed_" + str(i) for i in range(1, iteration + 1)],
        columns=columns,
    )
    return df_log


# Summarize the best blender and top models of every seed.
df_log = __create_log(
    project_id_list, iteration, metrics, input_file, target, regression, cv, group_col
)

In [None]:
df_log

In [None]:
def get_associations_df(df_output, project_id_list, output_folder):
    """Combine pairwise feature associations with per-feature impact stats.

    The mutual-information association strengths are fetched from the first
    project whose id is not "error". Each feature pair is joined (inner join)
    with the ``median`` and ``min`` rows of ``df_output`` and ordered so that
    ``feature1`` is the feature with the larger median impact.

    Args:
        df_output: Frame whose rows include ``median`` and ``min`` and whose
            columns are feature names.
        project_id_list: Project ids; entries equal to "error" are skipped.
        output_folder: Unused; kept for interface compatibility.

    Returns:
        DataFrame sorted by descending ``mutual_information`` with the columns
        ``feature1``, ``feature1_median_imp``, ``feature1_min_imp``,
        ``feature2``, ``feature2_median_imp``, ``feature2_min_imp`` and
        ``mutual_information``.
    """
    project = dr.Project.get(project_id=[x for x in project_id_list if x != "error"][0])
    association_data = project.get_associations(
        assoc_type="association", metric="mutualInfo"
    )
    df = pd.DataFrame(association_data["strengths"])
    # Drop self-associations.
    df = df[~(df["feature1"] == df["feature2"])]
    df = df.rename(columns={"statistic": "mutual_information"})

    # Attach the median and min impact of both features of every pair.
    stats = df_output.T
    for stat in ("median", "min"):
        df_stat = stats[stat].reset_index()
        for side in ("feature1", "feature2"):
            df = pd.merge(df, df_stat, left_on=side, right_on="index")
            df = df.drop(columns="index")
            df = df.rename(columns={stat: side + "_" + stat + "_imp"})

    df = df[
        [
            "feature1",
            "feature1_median_imp",
            "feature1_min_imp",
            "feature2",
            "feature2_median_imp",
            "feature2_min_imp",
            "mutual_information",
        ]
    ]
    df = df.sort_values("mutual_information", ascending=False)
    df = df.reset_index(drop=True)

    # Swap the two sides wherever feature2 has the larger median impact.
    # This replaces a row-wise apply that stored temporaries as ad-hoc
    # attributes on each row Series.
    swap = df["feature1_median_imp"] < df["feature2_median_imp"]
    if swap.any():
        column_pairs = [
            ("feature1", "feature2"),
            ("feature1_median_imp", "feature2_median_imp"),
            ("feature1_min_imp", "feature2_min_imp"),
        ]
        for left, right in column_pairs:
            # to_numpy() makes the assignment positional instead of aligning
            # on column labels, which would undo the swap.
            df.loc[swap, [left, right]] = df.loc[swap, [right, left]].to_numpy()
    return df


# Check feature relevance: combine the pairwise feature associations with the
# feature impact (median and min across all seeds) of both features.
# In each pair, feature1 is the feature with the larger median impact.


df_association = get_associations_df(df_output, project_id_list, output_folder)

In [None]:
df_association

## Plotting permutation importance and feature effects

In [None]:
%%time
def __get_effects(project_id_list, iteration, topx):
    """Request and collect feature effects of each seed's best blender.

    A first pass requests the feature effects for every seed; a second pass
    waits for each project's queued jobs and fetches the "validation"
    results. The module-level ``class_num`` selects between the regular and
    the multiclass API.

    Args:
        project_id_list: Project ids, one per seed, in seed order.
        iteration: Number of seeds.
        topx: ``top_n_features`` passed to the multiclass request.

    Returns:
        Dict mapping the 0-based seed position to the fetched feature
        effects. Seeds that failed are missing from the dict; failures are
        printed with the 1-based seed number.
    """

    def _best_blender(project, project_id):
        """Return the first blender in leaderboard order."""
        df_model = __get_model_scores(project)
        df_model = df_model[df_model["category"] == "blend"].reset_index(drop=True)
        # Column 3 of the leaderboard frame is the model id.
        return dr.Model.get(project_id, df_model.iloc[0, 3])

    def _report(i, exc):
        print(
            "Seed {} has occured error".format(i + 1),
            "({}: {})".format(type(exc).__name__, exc),
        )

    feature_effects = {}

    # Pass 1: submit the feature-effect requests for every seed.
    for i in range(0, iteration):
        try:
            project = dr.Project.get(project_id=project_id_list[i])
            model = _best_blender(project, project_id_list[i])
            # Result unused; presumably kept as an availability check - TODO confirm.
            model.get_feature_effect_metadata()
            if class_num <= 2:
                model.request_feature_effect()
            else:
                model.request_feature_effects_multiclass(top_n_features=topx)
        except Exception as exc:
            _report(i, exc)

    # Pass 2: wait for the jobs and fetch the results.
    for i in range(0, iteration):
        try:
            project = dr.Project.get(project_id=project_id_list[i])
            print("Waiting feature effect seed " + str(i + 1))
            for job in project.get_all_jobs():
                job.wait_for_completion(max_wait=60000)
            model = _best_blender(project, project_id_list[i])
            if class_num <= 2:
                feature_effects[i] = model.get_feature_effect("validation")
            else:
                feature_effects[i] = model.get_feature_effects_multiclass("validation")
        except Exception as exc:
            # Exception (not a bare except) keeps Ctrl-C working during the waits.
            _report(i, exc)

    return feature_effects


# show top x features effects
# Collect feature effects only when partial dependence plots are enabled;
# topx is the number of top features requested (used by the multiclass request).
if pdp:
    topx = 100
    feature_effects = __get_effects(project_id_list, iteration, topx)

In [None]:
def __create_graph(df_output2, output_folder, how, topx, exp=True):
    """Plot feature impact across seeds as bar, box and violin charts.

    Args:
        df_output2: Frame with one column per feature, one row per seed and
            the extra rows ``max``, ``median`` and ``min``.
        output_folder: Base folder; charts are written to its ``graph/``
            sub-folder, which is created if needed.
        how: "min" ranks features by min, median, max; anything else ranks
            them by median, max. Also used as the file name prefix.
        topx: Number of top-ranked features to plot.
        exp: When ``True`` each chart is also exported as an html file.
    """
    graph_dir = output_folder + "graph/"
    os.makedirs(graph_dir, exist_ok=True)

    rank_keys = ["min", "median", "max"] if how == "min" else ["median", "max"]
    export_enabled = exp == True

    def _show_and_export(figure, suffix):
        """Display the figure and, if enabled, write it next to the others."""
        figure.show()
        if export_enabled:
            plotly.offline.plot(
                figure,
                filename=graph_dir + str(how) + suffix,
                auto_open=False,
            )

    def _distribution_figure(frame, trace_cls, trace_style, title):
        """Build one trace per feature column (the first column is skipped)."""
        figure = go.Figure()
        for col in frame.columns[1:]:
            figure.add_trace(trace_cls(y=frame[col], name=col))
        figure.update_xaxes(type="category", dtick=1)
        figure.update_traces(**trace_style)
        figure.update_layout(
            height=800,
            width=800,
            title=title,
            yaxis_title="Feature impact",
        )
        return figure

    # Bar chart: ascending order so the most important feature is drawn last.
    df_graph = df_output2.T.copy()
    df_graph = df_graph.sort_values(by=rank_keys, ascending=True).tail(topx)

    medians = df_graph["median"]
    bar = go.Bar(
        x=medians.values,
        y=df_graph.index.values,
        orientation="h",
        error_x=dict(
            type="data",
            symmetric=False,
            array=df_graph["max"] - medians,
            arrayminus=medians - df_graph["min"],
        ),
    )
    fig = go.Figure(data=bar)
    fig.update_yaxes(type="category", dtick=1)
    fig.update_layout(
        height=800,
        width=800,
        title="Feature impact among seeds - Bar plot",
        xaxis_title="Feature impact",
        yaxis_title="Feature",
    )
    _show_and_export(fig, "_bar.html")

    # Box and violin charts use the per-seed rows only, best feature first.
    df_graph = df_graph.sort_values(by=rank_keys, ascending=False).T
    df_graph = df_graph.drop(["max", "median", "min"], axis=0)
    df_graph = df_graph.reset_index()

    fig = _distribution_figure(
        df_graph,
        go.Box,
        {"boxpoints": "all", "jitter": 0.5},
        "Feature impact among seeds - Box plot",
    )
    _show_and_export(fig, "_box.html")

    fig = _distribution_figure(
        df_graph,
        go.Violin,
        {"points": "all", "jitter": 0.5, "box_visible": True},
        "Feature impact among seeds - Violine plot",
    )
    _show_and_export(fig, "_violine.html")
    if export_enabled:
        print("All files are exported")


# Plot the permutation importance of the top x features across all seeds
topx = 100
__create_graph(df_output, output_folder, how, topx)

In [None]:
def extract_pdp(feature_effects, i):
    """Convert the multiclass feature effects of one seed into a DataFrame.

    Each entry's string form is rewritten into JSON with a fixed sequence of
    text replacements, parsed, and flattened with ``pd.json_normalize``.

    Args:
        feature_effects: Mapping from seed position to a sequence of
            feature-effect objects.
        i: Seed position to extract.

    Returns:
        DataFrame with one row per entry (every row has index 0); an empty
        DataFrame when the seed has no entries.
    """
    # Order matters: later replacements act on the output of earlier ones.
    replacements = (
        ("class=", "'class':"),
        ("feature_name=", "'feature_name':'"),
        (", feature_type=", "', 'feature_type':'"),
        (", feature_impact_score=", "', 'feature_impact_score':"),
        ("weight_label=", "'weight_label':"),
        ("partial_dependence=", "'partial_dependence':"),
        ("predicted_vs_actual=", "'predicted_vs_actual':"),
        ("FeatureEffectsMulticlass(", "{"),
        ("}]})", "}]}}"),
        ("False", "0"),
        ("True", "1"),
        ("'", '"'),
        ("None", '"None"'),
    )

    entries = feature_effects[i]
    frames = [pd.DataFrame()]
    for position in range(len(entries)):
        text = str(entries[position])
        for old, new in replacements:
            text = text.replace(old, new)
        parsed = json.loads(text)
        frames.append(pd.json_normalize(parsed).reset_index(drop=True))

    return pd.concat(frames)


def __create_effect_plot(
    df_output2, output_folder, feature_effects, iteration, topx, project_id_list, df_x
):
    """Export and plot the partial dependence of the top features across seeds.

    First pass: for every class and each of the first ``topx`` columns of
    ``df_output2``, gather the ``dependence`` values found in
    ``feature_effects`` for every seed, add a row-wise ``mean`` column and
    write the table to ``<output_folder>pdp/<feature>_class<k>_pdp.csv``.

    Second pass: read those CSVs back, draw a partial dependence plot and a
    histogram of the feature's values in ``df_x``, save both as HTML next to
    the CSV and display them with ``fig.show()``.

    Reads the module-level ``class_num``: when it is <= 2 a single class ``0``
    is used, otherwise one set of outputs is produced per class index.

    Args:
        df_output2: DataFrame whose first ``topx`` column names are the
            features to process.
        output_folder: Output path prefix; it is joined by plain string
            concatenation, so it presumably ends with a path separator
            (verify against caller).
        feature_effects: Indexable by seed. Each entry is passed to
            ``pd.json_normalize`` when ``class_num <= 2`` and to
            ``extract_pdp`` otherwise.
        iteration: Number of seeds to loop over.
        topx: Number of leading columns of ``df_output2`` to process.
        project_id_list: Per-seed entries; a seed whose entry equals
            ``"error"`` is skipped.
        df_x: Feature DataFrame that must contain a ``class`` column.

    Returns:
        None.
    """
    # NOTE(review): these running extrema are updated below but never read
    # anywhere else in this function.
    max_dependence = 0
    min_dependence = 100
    os.makedirs(output_folder + "pdp/", exist_ok=True)

    # Number of per-class output sets to produce.
    if class_num <= 2:
        file_num = 1
    else:
        file_num = class_num

    # ---- Pass 1: build one CSV per (class, feature) ----
    # NOTE(review): classes are matched by the integer loop index, so class
    # labels are assumed to be 0..class_num-1 — confirm for multiclass targets.
    for class_no in range(file_num):
        for x in df_output2.columns[:topx]:
            effect = pd.DataFrame()
            for i in range(0, iteration):
                if project_id_list[i] == "error":
                    continue
                if class_num <= 2:
                    df_feature_effects = pd.json_normalize(feature_effects[i])
                    df_feature_effects["class"] = 0
                else:
                    df_feature_effects = extract_pdp(feature_effects, i)
                df = df_feature_effects[df_feature_effects["class"] == class_no]
                df_x_class = df_x[df_x["class"] == class_no]
                if x in list(df["feature_name"]):
                    z = pd.DataFrame()
                    z = list(df[df["feature_name"] == x]["partial_dependence.data"])

                    # Flatten the list of per-row point lists into one stream
                    # of points.
                    def flatten_2d(data):
                        for block in data:
                            for elem in block:
                                yield elem

                    z = list(flatten_2d(z))
                    z = pd.DataFrame(z)
                    # "label" is overwritten by every seed; each seed adds its
                    # own "seed<N>" column.
                    effect[["label", "seed" + str(i + 1)]] = z[["label", "dependence"]]
                    if max_dependence < z["dependence"].max():
                        max_dependence = z["dependence"].max()
                    if min_dependence > z["dependence"].min():
                        min_dependence = z["dependence"].min()
                    if list(df[df["feature_name"] == x]["feature_type"]) == ["numeric"]:
                        effect["label"] = effect["label"].astype("float")
                    else:

                        # Sort rows by how frequent each label is in df_x_class
                        # (value_counts order); labels absent from the data get
                        # the last rank.
                        def order(effect, df_x_class, x):
                            l_order = list(df_x_class[x].value_counts().index)
                            j = 0
                            l_order_dic = {}
                            for x in l_order:
                                l_order_dic[x] = j
                                j += 1
                            effect["order"] = effect["label"].map(l_order_dic)
                            effect["order"] = effect["order"].fillna(j)
                            effect = effect.sort_values("order")
                            effect.drop(columns="order", inplace=True)
                            return effect

                        effect = order(effect, df_x_class, x)

            # NOTE(review): `df` here is whatever the last non-skipped seed left
            # behind; it is unbound if every seed was skipped.
            if x in list(df["feature_name"]):
                # Average over all seed columns (everything after "label").
                effect["mean"] = effect.iloc[:, 1:].mean(axis=1)
                effect.to_csv(
                    output_folder + "pdp/" + x + "_class" + str(class_no) + "_pdp.csv",
                    index=False,
                    encoding="utf-8-sig",
                )

    # ---- Pass 2: read the CSVs back and draw the plots ----
    # NOTE(review): `df` below is still the frame left over from the final
    # iteration of pass 1 (last class, last feature, last non-skipped seed).
    for x in df_output2.columns[:topx]:
        if x in list(df["feature_name"]):
            for class_no in range(file_num):
                df_x_class = df_x[df_x["class"] == class_no]
                effect = pd.read_csv(
                    output_folder + "pdp/" + x + "_class" + str(class_no) + "_pdp.csv",
                    encoding="utf-8-sig",
                )
                # Nothing is plotted when the feature has a single distinct label.
                if effect["label"].nunique() != 1:
                    effect.dropna(inplace=True)
                    fig = go.Figure()
                    if list(df[df["feature_name"] == x]["feature_type"]) == ["numeric"]:
                        # Numeric feature: one line per column; the last column
                        # is drawn thick and solid, the others thin and dashed.
                        for y in effect.columns[1:]:
                            if y == effect.columns[-1]:
                                fig.add_trace(
                                    go.Scatter(
                                        x=effect["label"],
                                        y=effect[y],
                                        mode="lines",
                                        line=dict(width=5),
                                        name=y,
                                    )
                                )
                            else:
                                fig.add_trace(
                                    go.Scatter(
                                        x=effect["label"],
                                        y=effect[y],
                                        mode="lines",
                                        line=dict(width=2, dash="dash"),
                                        name=y,
                                    )
                                )

                    else:
                        # Non-numeric feature: markers instead of lines; the
                        # last column gets the larger marker.
                        effect.sort_values(by="label", inplace=True)
                        # NOTE(review): this concatenates the frame with a
                        # Series (its first row) and then drops the first row —
                        # presumably meant to move the first row to the end;
                        # confirm the resulting frame is what is intended.
                        effect = pd.concat([effect, effect.iloc[0, :]]).iloc[1:, :]
                        for y in effect.columns[1:]:
                            if y == effect.columns[-1]:
                                fig.add_trace(
                                    go.Scatter(
                                        x=effect["label"],
                                        y=effect[y],
                                        mode="markers",
                                        marker=dict(size=10),
                                        name=y,
                                    )
                                )
                            else:
                                fig.add_trace(
                                    go.Scatter(
                                        x=effect["label"],
                                        y=effect[y],
                                        mode="markers",
                                        marker=dict(size=5),
                                        name=y,
                                    )
                                )
                    # NOTE(review): the y-axis title ends with "for " and
                    # nothing after it — looks truncated.
                    fig.update_layout(
                        title="Partial dependence plot for "
                        + x
                        + "_class"
                        + str(class_no),
                        xaxis_title=x,
                        yaxis_title="Feature effect for ",
                    )
                    plotly.offline.plot(
                        fig,
                        filename=output_folder
                        + "pdp/"
                        + x
                        + "_class"
                        + str(class_no)
                        + "_pdp.html",
                        auto_open=False,
                    )
                    fig.show()

                    # Histogram of the raw feature values for this class.
                    # Numeric features are restricted to the label range
                    # covered by the partial dependence table.
                    if (
                        list(df[df["feature_name"] == x]["feature_type"]) == ["numeric"]
                        and df_x_class[x].dtype.kind in "iufc"
                    ):
                        fig = go.Figure()
                        fig.add_trace(
                            go.Histogram(
                                x=df_x_class[
                                    (df_x_class[x] >= effect.label.min())
                                    & (df_x_class[x] <= effect.label.max())
                                ][x]
                            )
                        )
                    else:
                        fig = go.Figure()
                        # Sorted copy is used so df_x_class itself is not reordered.
                        fig.add_trace(go.Histogram(x=df_x_class.sort_values(x)[x]))
                    fig.update_layout(
                        title="Original distribution of "
                        + x
                        + "_class"
                        + str(class_no),
                        xaxis_title=x,
                        yaxis_title="Count",
                    )
                    plotly.offline.plot(
                        fig,
                        filename=output_folder
                        + "pdp/"
                        + x
                        + "_class"
                        + str(class_no)
                        + "_hist.html",
                        auto_open=False,
                    )
                    fig.show()


# Partial dependence plots for the top `topx` features; skipped unless `pdp`
# is truthy.
topx = 5
if pdp:
    # Multiclass runs use the target value as the class; otherwise every row
    # belongs to class 0.
    df_x["class"] = df_x[target] if class_num > 2 else 0
    __create_effect_plot(
        df_output2=df_output,
        output_folder=output_folder,
        feature_effects=feature_effects,
        iteration=iteration,
        topx=topx,
        project_id_list=project_id_list,
        df_x=df_x,
    )

## Sort and output the top impact features based on topX

In [None]:
def export_new_data(
    df_output2,
    name,
    df_id,
    df_x,
    df_y,
    error_feature,
    keep_column,
    output_folder,
    file_name,
):
    """Assemble the dataset restricted to the selected features.

    Keeps only the columns of ``df_x`` that also appear in ``df_output2``
    (in ``df_x`` order), optionally strips the dummy ``error_1``..``error_3``
    columns, and joins the result column-wise with ``df_y`` and, when
    requested, ``df_id``.

    Reads the module-level ``group_col`` when ``keep_column`` is empty.

    Args:
        df_output2: DataFrame whose column names are the selected features.
        name: Not used by this function.
        df_id: Columns placed first when ``keep_column`` is non-empty or
            ``group_col`` is not ``False``.
        df_x: Full feature DataFrame.
        df_y: Target column(s), placed last.
        error_feature: When equal to ``True``, the dummy error columns are
            removed from the output.
        keep_column: Sized collection; non-empty means ``df_id`` is included.
        output_folder: Not used by this function.
        file_name: Not used by this function.

    Returns:
        The concatenated DataFrame.
    """
    dummy_features = ("error_1", "error_2", "error_3")

    # Drop every feature column that is not among the selected ones.
    not_selected = set(df_x.columns).difference(df_output2.columns)
    selected = df_x.drop(columns=list(not_selected))

    # Equality test kept on purpose so non-bool flags behave as before.
    if error_feature == True:  # noqa: E712
        selected = selected[
            [col for col in selected.columns if col not in dummy_features]
        ]

    parts = [selected, df_y]
    if len(keep_column) != 0 or group_col != False:  # noqa: E712
        parts.insert(0, df_id)
    df_topx = pd.concat(parts, axis=1)

    print("{} columns will be exported".format(len(selected.columns)))
    print(selected.columns)
    return df_topx


def change_topX(
    top,
    how,
    output_folder,
    input_file,
    df_output,
    remove_list,
    keep_list,
    df_id,
    df_x,
    df_y,
    error_feature,
    keep_column,
    file_name,
):
    """Rank features by impact and build the dataset of the selected ones.

    The columns of ``df_output`` (one per feature) are sorted in descending
    order of the ``median``/``max`` rows (or ``min``/``median``/``max`` when
    ``how == "min"``). Features in ``remove_list`` are dropped, the leading
    part of the ranking is selected according to ``top``, features in
    ``keep_list`` are added back, and the result is handed to
    ``export_new_data``.

    Args:
        top: Selection rule.
            ``> 0``: keep that many top-ranked features.
            ``0``: select nothing; an empty DataFrame is returned.
            ``-1``/``-2``/``-3``: keep features ranked down to and including
            ``error_1``/``error_2``/``error_3``.
            ``-4``: keep features ranked down to and including the
            lowest-ranked of the three error features.
        how: ``"min"`` ranks by min, median, max; anything else ranks by
            median, max.
        output_folder: Forwarded to ``export_new_data``.
        input_file: Input file name; its base name (extension and any
            ``_<N>columns`` suffix removed) is forwarded to
            ``export_new_data``.
        df_output: DataFrame with one column per feature and the statistic
            names used for ranking as row labels.
        remove_list: Feature names to exclude from the ranking.
        keep_list: Feature names to include regardless of rank; each must be
            a column of ``df_output``.
        df_id, df_x, df_y, keep_column, file_name: Forwarded to
            ``export_new_data``.
        error_feature: Whether dummy error features are present; required to
            be truthy for negative ``top`` values.

    Returns:
        The DataFrame produced by ``export_new_data``, or an empty DataFrame
        when ``top == 0``.

    Raises:
        AssertionError: If ``top`` is not a supported value, or a negative
            ``top`` is used while ``error_feature`` is falsy.
        ValueError: If a required error feature is not among the ranked
            columns (for example because it is listed in ``remove_list``).
    """
    # Base name of the input file without extension and "_<N>columns" suffix.
    name = re.split("_[0-9]+columns", os.path.splitext(input_file)[0])[0]

    rank_rows = ["min", "median", "max"] if how == "min" else ["median", "max"]
    df_output = df_output.sort_values(by=rank_rows, ascending=False, axis=1)

    # Remove features which are in remove_list.
    if len(remove_list) != 0:
        df_output = df_output[[c for c in df_output.columns if c not in remove_list]]

    ranked = list(df_output.columns)

    # Work out how many leading columns of the ranking to keep.
    # AssertionError is kept as the exception type for backward compatibility,
    # but raised explicitly so the validation is not stripped by `python -O`.
    if top > 0:
        cutoff = int(top)
    elif top == 0:
        return pd.DataFrame()
    elif top in (-1, -2, -3, -4):
        if not error_feature:
            raise AssertionError("Please enable error_features")
        if top == -4:
            # Cut below the lowest-ranked of the three dummy features.
            cutoff = 1 + max(
                ranked.index(dummy) for dummy in ("error_1", "error_2", "error_3")
            )
        else:
            cutoff = 1 + ranked.index("error_{}".format(abs(top)))
    else:
        raise AssertionError("Please enter a valid value")

    selected = ranked[:cutoff]
    if len(keep_list) != 0:
        selected.extend(keep_list)
    # De-duplicate while keeping a deterministic (ranking) order.
    df_output2 = df_output[list(dict.fromkeys(selected))]

    return export_new_data(
        df_output2,
        name,
        df_id,
        df_x,
        df_y,
        error_feature,
        keep_column,
        output_folder,
        file_name,
    )


# `top` controls which features end up in the exported table:
#   > 0 : keep that many top-ranked features
#   0   : export nothing (change_topX returns an empty DataFrame)
# The negative settings require error_feature to be enabled:
#   -4  : remove features ranked lower than the lowest of error_1, error_2, error_3
#   -3  : remove features ranked lower than error_3
#   -2  : remove features ranked lower than error_2
#   -1  : remove features ranked lower than error_1
top = -4
# How feature impact is ranked: "min" ranks by min, then median, then max;
# any other value ranks by median, then max. "median" is recommended.
how = "median"
# Names of features to remove even if they are in the top list (leave empty for none)
remove_list = []
# Names of features to keep even if they are not in the top list (leave empty for none)
keep_list = []

df_topx = change_topX(
    top,
    how,
    output_folder,
    input_file,
    df_output,
    remove_list,
    keep_list,
    df_id,
    df_x,
    df_y,
    error_feature,
    keep_column,
    file_name,
)

In [None]:
# Preview the selected-feature table returned by change_topX.
df_topx

## Save output files

In [None]:
%%time
# Each entry: (DataFrame to write, file name inside output_folder, write the index?).
_exports = [
    (df_output, "seeds_check.csv", True),
    (df_log, "log_file.csv", True),
    (df_association, "association.csv", False),
    (df_topx, "robust_feature_" + file_name, False),
]
_export_names = [entry[1] for entry in _exports]

# Always write the result files to the local output folder first.
for _frame, _fname, _with_index in _exports:
    _frame.to_csv(output_folder + _fname, index=_with_index, encoding="utf-8-sig")
print("save to local output folder finished!")

# Then push the same files to every additional destination that was requested.
for d in data_output:
    if d == "ai_catalog":
        for _fname in _export_names:
            dr.Dataset.upload(output_folder + _fname)
        print("save to ai_catalog finished!")
    if d == "aws_s3":
        # Session built from the AWS access key and secret access key.
        my_session = boto3.Session(
            aws_access_key_id=AWS_KEY, aws_secret_access_key=AWS_SECRET
        )
        for _fname in _export_names:
            wr.s3.upload(
                local_file=output_folder + _fname,
                path=AWS_S3_OUTPUT_PATH + _fname,
                boto3_session=my_session,
            )
        print("save to aws_s3 finished!")