In [1]:
# Step 1: obtain a sample model to run the explainer against.
# `get_fifa_model` is a project-local helper, so this cell needs the `Models`
# package to be importable from the notebook's working directory.

from Models.fifa_model_generation import get_fifa_model

# The helper returns a trained FIFA model plus the data it is paired with.
# Judging by the recorded outputs of this notebook, these are a
# GradientBoostingRegressor, a pandas DataFrame of features and a pandas Series
# holding the target -- TODO confirm against the helper's source.
model_obj, x_dataset, y_dataset  = get_fifa_model()
# Bare last expression: the notebook displays the estimator's repr.
model_obj



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
"""Interpretation Class"""

import pandas as pd

class DevExplainer:
    """
    Convenience wrapper that drives skater's interpretation tools for one model.

    The constructor validates the model, the training data and the model type,
    derives feature/target names from the data when they are not supplied, and
    lazily builds the skater ``Interpretation`` and ``InMemoryModel`` objects the
    first time an analysis method is called.

    All state is taken from the constructor arguments; the class does not read
    any notebook-level globals.

    Examples
    --------
        >>> explainer = DevExplainer(model_obj, x_train, y_train, model_type="regression")
        >>> importances = explainer.feature_importance_values()
        >>> surrogate = explainer.surrogate_tree_values()
    """

    # Number of rows handed to skater as ``n_samples`` when sampling is enabled.
    DEFAULT_SAMPLE_SIZE = 1000

    def __init__(self, model_obj, x_train, y_train, model_type="classification", log_level=50, features=None, selective_features=None, target=None, unique_values = None, prediction_fn=None, sample=False):
        """
        Validate the inputs and store them on the instance.

        Parameters
        ----------
        model_obj : object
            Fitted model. Anything except a ``str`` is accepted.
        x_train : pandas.DataFrame or pandas.Series
            Training features.
        y_train : pandas.DataFrame or pandas.Series
            Training target.
        model_type : str
            ``"classification"`` or ``"regression"`` (case-insensitive).
        log_level : int
            Forwarded as ``log_level`` to the skater objects.
        features : iterable of str, optional
            Feature names. Defaults to ``list(x_train)``.
        selective_features : iterable of str, optional
            Features iterated by the partial-dependence methods. Defaults to
            all feature names.
        target : str or iterable of str, optional
            Target name(s). Defaults to the name of ``y_train``.
        unique_values : iterable, optional
            Class labels. For classification they default to the unique values
            of ``y_train``; for regression they default to ``None``.
        prediction_fn : callable, optional
            Prediction function to use instead of the model's own
            ``predict_proba`` / ``predict``.
        sample : bool
            When falsy, ``n_samples`` is the full row count of ``x_train``;
            otherwise it is ``DEFAULT_SAMPLE_SIZE``.

        Raises
        ------
        TypeError
            If the model, the datasets or the model type fail validation.
        """
        # The order matters: later validators read attributes set by earlier ones.
        self.model = self.verify_model(model_obj)
        self.x_train = self.verify_data(x_train)
        self.y_train = self.verify_data(y_train)
        self.model_type = self.verify_model_type(model_type)
        self.feature_names = self.verify_features(features)
        self.target_names = self.verify_target(target)
        self.unique_values = self.verify_unqiue(unique_values)
        self.prediction_fn = prediction_fn
        self.log_level = log_level
        self.interpreter = None
        self.inmemory_model = None
        self.sampling = sample
        self.selective_features = self.verify_selective_features(selective_features)
        # Use the instance's own training data, not a notebook global.
        self.n_samples = self.x_train.shape[0] if not self.sampling else self.DEFAULT_SAMPLE_SIZE

    def verify_selective_features(self, selective_features):
        """Return the caller's feature subset, or all feature names when none is given."""
        if selective_features is not None:
            print("\nSelective Features : ", selective_features)
            return selective_features
        return self.feature_names

    def verify_model(self, model):
        """
        Reject obviously wrong model arguments.

        Raises
        ------
        TypeError
            If ``model`` is a string (e.g. a path or a model name rather than
            the model object itself).
        """
        print("\nModel Object Type : ", type(model))
        if isinstance(model, str):
            raise TypeError('Please provide Proper Model Object')
        print("Model Object Validated")
        return model

    def verify_data(self, dataset):
        """
        Return ``dataset`` unchanged if it is a pandas DataFrame or Series.

        Raises
        ------
        TypeError
            For any other type.
        """
        print("\nValidating dataset")
        print("Dataset Type : ", type(dataset))
        if isinstance(dataset, (pd.DataFrame, pd.Series)):
            return dataset
        raise TypeError('Please provide dataset of type: \nPandas Dataframe\nPandas series')

    def verify_model_type(self, model_type):
        """
        Normalise ``model_type`` to lower case and check it is supported.

        Raises
        ------
        TypeError
            If ``model_type`` is not a string, or is neither "classification"
            nor "regression".
        """
        print("\nValidating model type")
        if not isinstance(model_type, str):
            # Avoid an opaque AttributeError from calling .lower() on a non-string.
            raise TypeError('Please provide model_type: \nClassification \nRegression')
        normalized = model_type.lower()
        print("Type entered :", normalized)
        if normalized in ("classification", "regression"):
            print("Model Type Validated")
            return normalized
        raise TypeError('Please provide model_type: \nClassification \nRegression')

    def verify_features(self, features):
        """
        Return the feature names as a list.

        When ``features`` is None the names are taken from ``self.x_train``
        (for a DataFrame, its column labels). Explicitly supplied names are
        kept instead of being discarded.
        """
        print("\nValidating feature names")
        if features is None:
            detected = list(self.x_train)
            print("List of features detected :", detected)
            return detected
        # A single name passed as a string must not be split into characters.
        provided = [features] if isinstance(features, str) else list(features)
        print("List of features provided :", provided)
        return provided

    def verify_target(self, target):
        """
        Return the target names as a list.

        When ``target`` is None the name is derived from ``self.y_train``:
        the Series name, or the column labels if a DataFrame was supplied.
        """
        print("\nValidating target names")
        if target is None:
            if isinstance(self.y_train, pd.DataFrame):
                # A DataFrame has no ``.name``; use its column labels instead.
                names = list(self.y_train.columns)
                print("Target Column :", names)
                return names
            print("Target Column :", self.y_train.name)
            return [self.y_train.name]
        provided = [target] if isinstance(target, str) else list(target)
        print("Target Column :", provided)
        return provided

    def verify_unqiue(self, unqiue):
        """
        Return the class labels used as ``filter_classes`` in the skater calls.

        Supplied values are returned as a list. Otherwise the unique values of
        ``self.y_train`` are used for classification and ``None`` for
        regression.
        """
        if unqiue is not None:
            return list(unqiue)
        print("\nValidating Unique Values")
        if self.model_type == "classification":
            labels = list(self.y_train.unique())
            print("List of Target Values :", labels)
            return labels
        return None

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    verify_unique = verify_unqiue

    def create_interpretation(self):
        """Build the skater ``Interpretation`` on first use and return the cached instance."""
        # Imported lazily so the class can be defined and validated without skater.
        from skater.core.explanations import Interpretation
        if self.interpreter is None:
            print("Creating Interpretation")
            self.interpreter = Interpretation(
                training_data=self.x_train,
                training_labels=self.y_train,
                feature_names=self.feature_names,
                class_names=self.target_names,
                index=None,
                log_level=self.log_level
            )
        return self.interpreter

    def create_inmemory_model(self):
        """Build the skater ``InMemoryModel`` on first use and return the cached instance."""
        # Imported lazily so the class can be defined and validated without skater.
        from skater.model import InMemoryModel
        print(self.inmemory_model)
        if self.inmemory_model is None:
            print("Creating In Memory Model")
            is_classifier = self.model_type == "classification"
            self.inmemory_model = InMemoryModel(
                prediction_fn = self.model_prediction_fn(),
                input_formatter=None,
                output_formatter=None,
                target_names=self.target_names,
                feature_names=self.feature_names,
                unique_values=self.find_unique(),
                examples=self.x_train[0:10],
                model_type="classifier" if is_classifier else "regressor",
                probability=is_classifier,
                log_level=self.log_level
            )
        return self.inmemory_model

    def feature_importance_values(self):
        """Return the result of skater's ``feature_importance.feature_importance`` for the model."""
        self.interpreter = self.create_interpretation()
        self.inmemory_model = self.create_inmemory_model()
        return self.interpreter.feature_importance.feature_importance(
                model_instance = self.inmemory_model,
                ascending=False,
                filter_classes=self.unique_values,
                n_jobs=-1,
                progressbar=True,
                n_samples=self.n_samples,
                method='prediction-variance',
                scorer_type='default',
                use_scaling=False
            )

    def feature_importance_plot(self):
        """Return the result of skater's ``feature_importance.plot_feature_importance`` for the model."""
        self.interpreter = self.create_interpretation()
        self.inmemory_model = self.create_inmemory_model()
        # NOTE(review): this call uses the keyword `modelinstance` while
        # feature_importance_values uses `model_instance`; both are kept as
        # written -- confirm against the installed skater version.
        return self.interpreter.feature_importance.plot_feature_importance(
                ax=None,
                n_features=1000,
                importance_threshold=0.0,
                figure_size=(50, 50),
                modelinstance = self.inmemory_model,
                ascending=False,
                filter_classes=self.unique_values,
                n_jobs=-1,
                progressbar=True,
                n_samples=self.n_samples,
                method='prediction-variance',
                scorer_type='default',
                use_scaling=False
            )

    def partial_dependence_values(self):
        """
        Compute partial dependence separately for each selected feature.

        Returns
        -------
        list
            One skater ``partial_dependence`` result per entry of
            ``self.selective_features``, in the same order.
        """
        self.interpreter = self.create_interpretation()
        self.inmemory_model = self.create_inmemory_model()
        print(self.inmemory_model)
        return [
            self.interpreter.partial_dependence.partial_dependence(
                feature_ids = [feature],
                modelinstance = self.inmemory_model,
                filter_classes=self.unique_values,
                grid=None,
                grid_resolution=1000,
                n_jobs=-1,
                grid_range=None,
                sample=self.sampling,
                sampling_strategy='random-choice',
                n_samples=self.n_samples,
                bin_count=50,
                return_metadata=True,
                progressbar=True,
            )
            for feature in self.selective_features
        ]

    def partial_dependence_plot(self):
        """
        Plot partial dependence separately for each selected feature.

        Returns
        -------
        list
            One skater ``plot_partial_dependence`` result per entry of
            ``self.selective_features``, in the same order.
        """
        self.interpreter = self.create_interpretation()
        self.inmemory_model = self.create_inmemory_model()
        print(self.selective_features)
        # NOTE(review): `sample=False` is hard-coded here whereas
        # partial_dependence_values forwards `self.sampling`, and the plot title
        # is a placeholder -- confirm both are intentional.
        return [
            self.interpreter.partial_dependence.plot_partial_dependence(
                [feature],
                modelinstance = self.inmemory_model,
                filter_classes=self.unique_values,
                grid=None,
                grid_resolution=30,
                grid_range=None,
                n_jobs=-1,
                sample=False,
                sampling_strategy='random-choice',
                n_samples=self.n_samples,
                bin_count=50,
                with_variance=False,
                figsize=(50, 50),
                progressbar=True,
                variance_type='estimate',
                plot_title="Hello!!"
            )
            for feature in self.selective_features
        ]

    def surrogate_tree_values(self):
        """
        Create a skater tree surrogate for the model and fit it on the training data.

        Returns
        -------
        object
            The surrogate explainer returned by ``Interpretation.tree_surrogate``
            after ``fit`` has been called on it.
        """
        self.interpreter = self.create_interpretation()
        self.inmemory_model = self.create_inmemory_model()
        surrogate_explainer = self.interpreter.tree_surrogate(
            oracle=self.inmemory_model,
            seed=5,
            splitter='best',
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features=None,
            max_leaf_nodes=100,
            min_impurity_decrease=0.0,
            min_impurity_split=None,
            class_weight="balanced",
            presort=True,
            impurity_threshold=0.01
        )
        surrogate_explainer.fit(
            X=self.x_train,
            Y=self.y_train,
            use_oracle=True,
            prune='post',
            cv=5,
            n_iter_search=10,
            scorer_type='default',
            n_jobs=1,
            param_grid=None,
            impurity_threshold=0.01,
            verbose=True
        )
        return surrogate_explainer

    def model_prediction_fn(self):
        """
        Return the callable used to obtain predictions.

        A user-supplied ``prediction_fn`` wins. Otherwise the model's
        ``predict_proba`` is preferred, falling back to ``predict``.

        Raises
        ------
        TypeError
            If no prediction function was supplied and the model exposes
            neither a callable ``predict_proba`` nor a callable ``predict``.
        """
        if self.prediction_fn is not None:
            return self.prediction_fn
        predict_proba = getattr(self.model, "predict_proba", None)
        if callable(predict_proba):
            print("Checking if predict proba exists :", True)
            return predict_proba
        predict = getattr(self.model, "predict", None)
        if callable(predict):
            print("Checking if predict exists :", True)
            return predict
        raise TypeError("Please pass appropriate prediction function")

    def find_unique(self):
        """
        Return the class labels for classification models, ``None`` otherwise.

        Labels validated in the constructor (including caller-supplied ones)
        are reused; they are recomputed from ``self.y_train`` only when absent.
        """
        if self.model_type != "classification":
            return None
        known = getattr(self, "unique_values", None)
        if known is not None:
            return known
        return list(self.y_train.unique())

    def wait(self, wait_time):
        """Block the current thread for ``wait_time`` seconds."""
        import time
        time.sleep(wait_time)

In [10]:
# Wrap the trained FIFA model in the explainer. The model is a regressor, and
# with `sample=False` the explainer uses every row of `x_dataset`.
explainer_obj = DevExplainer(
    model_obj,
    x_dataset,
    y_dataset,
    model_type="Regression",
    log_level=50,
    selective_features=['age', 'height_cm', 'weight_kg'],
    sample=False,
)

# Other analyses exposed by the explainer, disabled for this run:
# explainer_obj.partial_dependence_plot()
# explainer_obj.partial_dependence_values()
# explainer_obj.feature_importance_values()
# explainer_obj.feature_importance_plot()

# Fit the surrogate decision tree and keep the fitted explainer.
abc = explainer_obj.surrogate_tree_values()

# NOTE: DevExplainer defines no `surrogate_tree_plot` method in this notebook,
# so the call below cannot be enabled as-is.
# explainer_obj.surrogate_tree_plot()

2021-07-21 12:15:19,740 - skater.core.global_interpretation.tree_surrogate - INFO - post pruning applied ...
2021-07-21 12:15:19,768 - skater.core.global_interpretation.tree_surrogate - DEBUG - Unique Labels in ground truth provided [-1060339.3457989   -630110.21910172  -547676.86422611 ...
 82898700.09983645 90632444.74975468 93770775.16196345]
2021-07-21 12:15:19,771 - skater.core.global_interpretation.tree_surrogate - DEBUG - Unique Labels in predictions generated [ 1335405.05779897  2556689.19410935  2850394.88508878  3230903.97768439
  4134270.8439583   4274953.17114527  4834267.16912808  5160300.0995562
  5398174.0794991   5633962.50956993  5837304.15559774  5905794.77412427
  6130966.84268969  6258197.41854456  6502971.20422269  7158286.45596959
  7670120.76340605  7672640.942963    7674544.55784866  7680983.49733494
  7739218.20864096  7899125.51020557  7939033.98019274  8435655.86735781
  8509391.59887244  8839151.11388542  9030043.33030275  9605870.69430728
 10322518.70286221


Model Object Type :  <class 'sklearn.ensemble.gradient_boosting.GradientBoostingRegressor'>
Model Object Validated

Validating dataset
Dataset Type :  <class 'pandas.core.frame.DataFrame'>

Validating dataset
Dataset Type :  <class 'pandas.core.series.Series'>

Validating model type
Type entered : regression
Model Type Validated

Validating feature names
List of features detected : ['age', 'height_cm', 'weight_kg', 'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy', 'attacking_short_passing', 'attacking_volleys', 'skill_dribbling', 'skill_curve']

Validating target names
Target Column : value_eur

Validating Unique Values

Selective Features :  ['age', 'height_cm', 'weight_kg']
Creating Interpretation
None
Creating In Memory Model
Checking if predict exists : True


2021-07-21 12:15:19,938 - skater.core.global_interpretation.tree_surrogate - DEBUG - Added index 61 back
2021-07-21 12:15:19,941 - skater.core.global_interpretation.tree_surrogate - DEBUG - new score generate 1127217.4815458946
2021-07-21 12:15:19,943 - skater.core.global_interpretation.tree_surrogate - DEBUG - Added index 62 back
2021-07-21 12:15:19,947 - skater.core.global_interpretation.tree_surrogate - DEBUG - new score generate 1115607.7981677535
2021-07-21 12:15:19,948 - skater.core.global_interpretation.tree_surrogate - DEBUG - Added index 63 back
2021-07-21 12:15:19,950 - skater.core.global_interpretation.tree_surrogate - DEBUG - new score generate 1114087.4493929532
2021-07-21 12:15:19,951 - skater.core.global_interpretation.tree_surrogate - DEBUG - Added index 64 back
2021-07-21 12:15:19,953 - skater.core.global_interpretation.tree_surrogate - DEBUG - new score generate 1113089.5031010446
2021-07-21 12:15:19,955 - skater.core.global_interpretation.tree_surrogate - DEBUG - Add

In [34]:
# NOTE(review): `fig` is not assigned in any cell visible in this notebook, and
# this cell's execution count (34) is far ahead of the previous one (10), so it
# presumably relies on state from deleted or out-of-order cells -- it will raise
# NameError on a fresh kernel unless `fig` is defined above. TODO confirm.
# Saves the first element of `fig` to 'dummy.png' in the current working directory.
fig[0].savefig('dummy.png')