In [6]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.metrics import check_scoring, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score

In [36]:
class Evaluator:
    """Score an estimator with a single hold-out split or with cross-validation.

    Parameters
    ----------
    estimator : estimator object
        Unfitted scikit-learn style estimator. It is cloned before every
        evaluation, so the instance passed in is never fitted here.
    cv : int, CV splitter, iterable or None, default=None
        Forwarded to ``cross_val_score``. When ``None``, a single train/test
        split is used instead of cross-validation.
    groups : array-like or None, default=None
        Forwarded to ``cross_val_score``. Not used by the hold-out split.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split`` in the hold-out branch.
    random_state : int or None, default=42
        Forwarded to ``train_test_split`` in the hold-out branch.
    """

    def __init__(self, estimator, cv=None, groups=None, test_size=0.2, random_state=42):
        self.estimator = estimator
        self.cv = cv
        self.groups = groups
        self.test_size = test_size
        self.random_state = random_state

    def evaluate(self, X, y, scoring):
        """Fit clone(s) of the estimator on ``X``/``y`` and score them.

        Parameters
        ----------
        X, y : array-like
            Features and targets.
        scoring : str, callable or None
            Anything ``cross_val_score`` accepts. The hold-out branch resolves
            it with ``check_scoring`` so both branches accept the same values.

        Returns
        -------
        dict
            ``{'mean_score': ..., 'std_dev': ...}``. For the hold-out split
            there is only one score, so ``std_dev`` is ``0.0``.
        """
        if self.cv is None:
            # Simple train-test split evaluation
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=self.test_size, random_state=self.random_state
            )
            model = clone(self.estimator).fit(X_train, y_train)
            # Calling `scoring` directly would fail for scorer names such as
            # 'accuracy' (or None), which the cross-validation branch accepts.
            scorer = check_scoring(model, scoring=scoring)
            score = scorer(model, X_test, y_test)
            return {'mean_score': score, 'std_dev': 0.0}

        # Cross-validation
        scores = cross_val_score(
            clone(self.estimator), X, y,
            cv=self.cv, scoring=scoring, groups=self.groups,
        )
        return {'mean_score': np.mean(scores), 'std_dev': np.std(scores)}

In [37]:
class FeatureImportanceWrapper:
    """Score individual features, or named groups of features, in isolation.

    Parameters
    ----------
    evaluator : object
        Anything with ``evaluate(X, y, scoring)`` returning a mapping with
        ``'mean_score'`` and ``'std_dev'`` keys.
    features : iterable
        Each entry is either a column name (``str``) or a
        ``(label, member_columns)`` tuple describing a group of columns that
        is scored together and reported under ``label``.
    """

    def __init__(self, evaluator, features):
        self.evaluator = evaluator
        self.features = features

    @staticmethod
    def _resolve_feature_set(feature_set, column_names):
        """Translate one ``features`` entry into ``(label, column positions)``."""
        if isinstance(feature_set, str):
            # Single feature
            label, members = feature_set, [feature_set]
        elif isinstance(feature_set, tuple) and len(feature_set) >= 2:
            # A group of features
            label, members = feature_set[0], feature_set[1]
            # A bare string would otherwise be iterated character by character.
            members = [members] if isinstance(members, str) else list(members)
        else:
            # Without this branch an unsupported entry would silently reuse the
            # indices and label left over from the previous loop iteration.
            raise TypeError(
                "Each feature set must be a column name (str) or a "
                f"(label, columns) tuple, got {feature_set!r}"
            )

        if not members:
            raise ValueError(f"Feature group {label!r} contains no columns")

        positions = [np.where(column_names == name)[0] for name in members]
        missing = [name for name, pos in zip(members, positions) if len(pos) == 0]
        if missing:
            raise ValueError(f"Columns not found in X for {label!r}: {missing}")
        return label, np.concatenate(positions)

    def evaluate_feature_importance(self, X, y, scoring):
        """Evaluate every configured feature set on its own columns of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Full feature table; columns are selected by name.
        y : array-like
            Targets, passed through to the evaluator unchanged.
        scoring : object
            Passed through to ``evaluator.evaluate`` unchanged.

        Returns
        -------
        pandas.DataFrame
            One row per feature set with columns ``'Feature'``,
            ``'Mean Score'`` and ``'Std Dev'``, in the order of ``features``.

        Raises
        ------
        TypeError
            If an entry is neither a ``str`` nor a ``(label, columns)`` tuple.
        ValueError
            If a group is empty or names a column that is not in ``X``.
        """
        # Loop-invariant: build the comparable column array once.
        column_names = np.array(X.columns)

        results = []
        for feature_set in self.features:
            feature_key, feature_indices = self._resolve_feature_set(feature_set, column_names)
            X_subset = X.iloc[:, feature_indices]
            eval_results = self.evaluator.evaluate(X_subset, y, scoring)
            results.append({
                'Feature': feature_key,
                'Mean Score': eval_results['mean_score'],
                'Std Dev': eval_results['std_dev']
            })

        return pd.DataFrame(results)

In [45]:
import plotly.graph_objects as go

def plot_feature_importance(df, include_std=False):
    """Render a bar chart with one bar per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``'Feature'`` (bar labels) and
        ``'Mean Score'`` (bar heights); ``'Std Dev'`` is read only when
        ``include_std`` is true.
    include_std : bool, default=False
        When true, draw ``'Std Dev'`` as vertical error bars.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    # Error bars are optional; None leaves them out of the trace entirely.
    error_bars = dict(type='data', array=df['Std Dev'], visible=True) if include_std else None

    bars = go.Bar(
        x=df['Feature'],
        y=df['Mean Score'],
        error_y=error_bars,
        marker_color='blue',
    )

    layout_options = dict(
        title='Feature Importance',
        xaxis_title='Features',
        yaxis_title='Mean Score',
        xaxis=dict(tickangle=-45),  # slanted tick labels
        template='plotly_white',
    )

    fig = go.Figure(data=[bars])
    fig.update_layout(**layout_options)
    fig.show()

In [38]:
# Example usage
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [39]:
# Synthetic classification data: 1000 samples, 20 features, of which 2 are informative.
X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, random_state=42)
# Wrap the array in a DataFrame so columns can be selected by name (Feature_0 ... Feature_19).
X = pd.DataFrame(X, columns=[f'Feature_{i}' for i in range(20)])

In [48]:
# Feature sets to score: a plain string is a single column, a
# (label, member columns) tuple is a group reported under its label.
feature_sets = [
    'Feature_0',
    'Feature_1',
    ('Group_01', ['Feature_2', 'Feature_3', 'Feature_4']),
    'Feature_5',
    'Feature_6',
]

In [49]:
# Base model; Evaluator clones it before fitting rather than fitting `rf` itself.
rf = RandomForestClassifier(random_state=42)
# cv=5 selects the cross-validation branch of Evaluator.evaluate.
evaluator = Evaluator(estimator=rf, cv=5)

In [50]:
# Bind the evaluator to the feature sets that should be scored.
wrapper = FeatureImportanceWrapper(evaluator, feature_sets)

In [51]:
# Score every feature set using only its own columns; make_scorer wraps
# accuracy_score into the scorer object passed on as `scoring`.
importance_scores = wrapper.evaluate_feature_importance(X, y, make_scorer(accuracy_score))

In [52]:
# Display the result table: one row per feature set.
importance_scores

Unnamed: 0,Feature,Mean Score,Std Dev
0,Feature_0,0.487,0.038549
1,Feature_1,0.638,0.015684
2,Group_01,0.488,0.016613
3,Feature_5,0.814,0.023958
4,Feature_6,0.506,0.018276


In [53]:
# Bar chart of the mean scores; error bars stay off because include_std defaults to False.
plot_feature_importance(importance_scores)