In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier as RFC
from cuml.ensemble import RandomForestClassifier as CURFC
import numpy as np
from json import loads
import matplotlib.pyplot as plt

def calculate_importances(nodes, n_features):
    """Average the normalized, gain-based feature importances of a forest.

    For every tree, each split node contributes ``gain * instance_count`` to
    the feature it splits on; the per-tree totals are normalized to sum to 1
    and the normalized vectors are then averaged across trees.

    Parameters
    ----------
    nodes : sequence of dict
        One root node per tree. A node that has a ``"gain"`` key is treated
        as a split node and must also provide ``"instance_count"``,
        ``"split_feature"`` (an index into the feature axis) and
        ``"children"`` (an iterable of child nodes). A node without
        ``"gain"`` is treated as a leaf and ignored.
    n_features : int
        Length of the importance vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_features,)``. Trees whose total weighted gain is
        zero (for example a tree that is a single leaf) carry no importance
        information and are left out of the average instead of turning the
        whole result into NaN. If no tree contributes, a zero vector is
        returned.
    """
    per_tree = []

    for root in nodes:
        feature_gains = np.zeros(n_features)

        # Explicit stack instead of recursion; children are pushed in reverse
        # so nodes are visited in the same pre-order as a recursive walk,
        # which keeps the floating-point accumulation order unchanged.
        pending = [root]
        while pending:
            node = pending.pop()
            if "gain" not in node:
                continue  # leaf: nothing to accumulate
            feature_gains[node["split_feature"]] += (
                node["gain"] * node["instance_count"]
            )
            pending.extend(reversed(node["children"]))

        total = feature_gains.sum()
        # Guard the 0/0 case: a tree without any effective split would
        # otherwise produce a NaN row and poison the mean for all features.
        if total != 0:
            per_tree.append(feature_gains / total)

    if not per_tree:
        return np.zeros(n_features)

    return np.mean(per_tree, axis=0)

In [None]:
# Print arrays with three decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
# Number of differently seeded forests fitted per library.
random_times = 100

data = load_iris()
x = data.data
y = data.target
# Seed the split: every forest below is seeded explicitly, so an unseeded
# split would be the only source of run-to-run variation and would make the
# reports and box plots irreproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

### sklearn classification result

In [None]:
# Row i holds the feature importances of the forest seeded with i.
fis = np.zeros((random_times, len(data.feature_names)), dtype=np.float64)
for i in range(random_times):
    rfc = RFC(random_state=i).fit(x_train, y_train)
    fis[i, :] = rfc.feature_importances_

# The report covers only the forest from the final iteration.
print(
    'sklearn classification report',
    classification_report(y_test, rfc.predict(x_test)),
    sep='\n',
)

# Horizontal box plot: one box per feature, spread taken over the seeds.
fig, ax = plt.subplots()
ax.boxplot(fis, labels=data.feature_names, vert=False, showmeans=True)
ax.set_title(f'sklearn feature importance result with {random_times} times random')
plt.show()

### cuml classification result

In [None]:
# Cast once, outside the loop. cuML's random forest is documented to take
# float32 features and int32 class labels, so the labels are cast to int32
# rather than float32, and the test features get the same dtype as the
# training features instead of being passed to predict() as float64.
x_train_f32 = x_train.astype(np.float32)
x_test_f32 = x_test.astype(np.float32)
y_train_i32 = y_train.astype(np.int32)

# Row i holds the feature importances of the forest seeded with i.
cufis = np.zeros((random_times, len(data.feature_names)), dtype=np.float64)
for i in range(random_times):
    curfc = CURFC(random_state=i)
    curfc.fit(x_train_f32, y_train_i32)
    # get_json() output is parsed and handed over as the per-tree root nodes.
    tree_nodes = loads(curfc.get_json())
    cufis[i, :] = calculate_importances(tree_nodes, len(data.feature_names))

# The report covers only the forest from the final iteration.
print('cuml classification report')
print(classification_report(y_test, curfc.predict(x_test_f32)))

# Horizontal box plot: one box per feature, spread taken over the seeds.
plt.figure()
plt.boxplot(cufis, labels=data.feature_names, vert=False, showmeans=True)
plt.title(f'cuml feature importance result with {random_times} times random')
plt.show()