In [None]:
from itertools import product
import numpy as np
import warnings
from derive_conceptualspace.load_data.load_semanticspaces import get_all, display_svm
from derive_conceptualspace.util.base_changer import ThreeDPlane
from derive_conceptualspace.util.threedfigure import ThreeDFigure, make_meshgrid
def argmax(l):
    """Return the index of the largest element of ``l`` (the first one on ties)."""
    best_index, _ = max(enumerate(l), key=lambda pair: pair[1])
    return best_index


def unique(iterable):
    """Return the distinct items of ``iterable`` as a list, in first-seen order."""
    # dict keys keep insertion order, so this de-duplicates without reordering
    return list(dict.fromkeys(iterable))

## Get the 3 most important dimensions from the Decision Tree and display them, colored by label class

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML
import pyperclip

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles, cluster_loader
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs, display_metrics, show_lambda_elements, highlight_nonzero_max
from derive_conceptualspace.settings import DEFAULT_N_CPUS
from derive_conceptualspace.util.threadworker import WorkerPool
from derive_conceptualspace.cli.args_from_filename import get_filename, print_envvars
from derive_conceptualspace.util.desc_object import DescriptionList
from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree
from derive_conceptualspace.evaluate.shallow_trees import CATNAMES

# Set matplotlib's default figure size for every plot in this notebook.
plt.rcParams['figure.figsize'] = [16, 10]

In [None]:
# Configure logging before anything else runs.
setup_logging()
# NOTE(review): presumably selects the env files of the "siddata" dataset - confirm.
load_envfiles("siddata")
configs, print_cnf = getfiles_allconfigs(
    "clusters",
    verbose=False,
    parse_all=True,
)

In [None]:
# One loader callable per artifact requested from the first configuration below.
loaders = {
    "clusters": cluster_loader,
    # keep only the `embedding_` attribute of the loaded embedding object
    "embedding": lambda **args: args["embedding"].embedding_,
    "pp_descriptions": DescriptionList.from_json,
}
clusters, embedding, descriptions = (
    SnakeContext.loader_context(config=configs[0])
    .load("clusters", "embedding", "pp_descriptions", loaders=loaders)
)

In [None]:
# Display the names of the additional attributes carried by the loaded descriptions
# (NOTE(review): presumably the valid values for `classes=` in the calls below - confirm).
descriptions.additionals_names

In [None]:
# Single multi-class run (one_vs_rest=False) with no depth limit and no cross-validation
# split, using "veranstaltungsnummer" as the class; the returned value is the cell output.
classify_shallowtree(
    clusters,
    embedding,
    descriptions,
    one_vs_rest=False,
    dt_depth=None,
    test_percentage_crossval=0,
    classes="veranstaltungsnummer",
    do_plot=False,
    verbose=True,
    return_features=False,
)

In [None]:
# Same setup as above, but with "Geonames" as the class.
res = classify_shallowtree(
    clusters,
    embedding,
    descriptions,
    one_vs_rest=False,
    dt_depth=None,
    test_percentage_crossval=0,
    classes="Geonames",
    do_plot=False,
    verbose=True,
    return_features=False,
)
# NOTE(review): presumably `res` is the score of the unrestricted tree evaluated on its own
# training data, which is why a perfect 1 is expected - confirm.
assert res == 1

In [None]:
# One-vs-rest run with trees limited to depth 3; return_features=True makes the call hand
# back the classifiers together with their inputs, targets, scores and category names.
(clfs, inputs, targets, scores, catnames) = classify_shallowtree(
    clusters,
    embedding,
    descriptions,
    one_vs_rest=True,
    dt_depth=3,
    test_percentage_crossval=0,
    classes="Geonames",
    do_plot=False,
    verbose=False,
    return_features=True,
)

### Get most important features

Now I want to get the most important features: find the nonzero elements of `clf.feature_importances_` (and their dimension names), 
but also the actual tree decisions, as seen when plotting the tree with graphviz. Maybe see https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html#sphx-glr-auto-examples-tree-plot-unveil-tree-structure-py ?

In [None]:
def get_decision_path(X_test, clf, catnames, axnames):
    """Print the structure of a fitted decision tree, one line per node.

    Args:
        X_test: Not used by this function; kept so existing calls keep working.
        clf: Fitted classifier exposing ``tree_`` (node arrays) and ``classes_``.
        catnames: Lookup from an entry of ``clf.classes_`` to a readable category name.
        axnames: Readable name per feature, indexed by feature number.

    Returns:
        None. The description is written to stdout, indented by one tab per depth level.
    """
    tree = clf.tree_
    n_nodes = tree.node_count
    left_child, right_child = tree.children_left, tree.children_right
    # per node: the class with the largest entry in the node's value array, as readable name
    classes = [catnames[clf.classes_[np.argmax(value)]] for value in tree.value]

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    # depth-first walk from the root (id 0, depth 0); every node is popped exactly once
    pending = [(0, 0)]
    while pending:
        node_id, depth = pending.pop()
        node_depth[node_id] = depth
        # identical left and right child ids mark a leaf
        if left_child[node_id] == right_child[node_id]:
            is_leaves[node_id] = True
            continue
        pending.extend([(left_child[node_id], depth + 1), (right_child[node_id], depth + 1)])

    print("The binary tree structure has {n} nodes and has the following tree structure:\n".format(n=n_nodes))
    for node in range(n_nodes):
        indent = node_depth[node] * "\t"
        if is_leaves[node]:
            line = "{space}node={node} is a leaf node - category {cat}".format(space=indent, node=node, cat=classes[node])
        else:
            line = "{space}node={node} is a split node: go to node {left} if {feature} <= {threshold} else to node {right}. Cat={cat}".format(
                space=indent,
                node=node,
                left=left_child[node],
                feature=axnames[tree.feature[node]],
                threshold=tree.threshold[node],
                right=right_child[node],
                cat=classes[node],
            )
        print(line)

In [None]:
# Readable label per feature column: the column name followed by the first two entries
# stored for it in clusters["clusters"].
axnames = [",".join([i] + clusters["clusters"][i][:2]) for i in inputs.columns]
for clf, catname in zip(clfs, catnames):
    get_decision_path(inputs, clf, catname, axnames)
    # Key the dict by axis name, not by importance value: two features with the same
    # importance would otherwise overwrite each other and one would silently disappear.
    print("Feature Importances:\n", {axnames[i]: elem for i, elem in enumerate(clf.feature_importances_) if elem > 0})
    print("=="*50+"\n"+"=="*50)

#### Use feature_importance

In [None]:
axnames = [",".join([col] + clusters["clusters"][col][:2]) for col in inputs.columns]
# width of the longest category name, so that the printed rows line up
label_width = max((len(names[1]) for names in catnames), default=0)
for clf, catname in zip(clfs, catnames):
    # features the tree actually used (importance > 0), strongest first, top three kept
    nonzero = {axnames[idx]: importance for idx, importance in enumerate(clf.feature_importances_) if importance > 0}
    ranked = sorted(nonzero.items(), key=lambda item: item[1], reverse=True)
    feats = [(name, round(importance, 3)) for name, importance in ranked][:3]
    print(catname[1].ljust(label_width), feats)

#### Use early decisions

In [None]:
def get_decisions(X_test, clf, catnames, axnames):
    """Return the split rules of the root node and of its direct children.

    Args:
        X_test: Not used by this function; kept so existing calls keep working.
        clf: Fitted classifier exposing ``tree_`` with ``node_count``, ``children_left``,
            ``children_right``, ``feature`` and ``threshold``.
        catnames: Not used by this function; kept so existing calls keep working.
        axnames: Readable name per feature, indexed by feature number.

    Returns:
        A list of ``(axis name, threshold)`` tuples: the root split first, followed by the
        split nodes at depth 1 in ascending node-id order. Deeper splits are ignored.
        The list is empty when the root itself is a leaf (the tree never split).
    """
    tree = clf.tree_
    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right

    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)
    stack = [(0, 0)]  # start with the root node id (0) and its depth (0)
    while stack:
        # `pop` ensures each node is only visited once
        node_id, depth = stack.pop()
        node_depth[node_id] = depth
        # a node whose left and right child ids differ is a split node
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    # group the split rules by depth, visiting nodes in ascending id order
    splits_by_depth = {}
    for node in range(n_nodes):
        if not is_leaves[node]:
            rule = (axnames[tree.feature[node]], tree.threshold[node])
            splits_by_depth.setdefault(int(node_depth[node]), []).append(rule)
    # `.get` instead of indexing: a tree without any split has no depth-0 entry at all
    return splits_by_depth.get(0, []) + splits_by_depth.get(1, [])

In [None]:
axnames = [",".join([i] + clusters["clusters"][i][:2]) for i in inputs.columns]
for clf, catname in zip(clfs, catnames):
    # Pad to the longest name over *all* categories (`catnames`). Iterating the current
    # `catname` instead would measure single characters of its entries, not the names.
    print(catname[1].ljust(max(len(i[1]) for i in catnames)), get_decisions(inputs, clf, catname, axnames))

## Let's plot!

In [None]:
# Re-run the one-vs-rest classification with trees limited to depth 2; this replaces the
# clfs / inputs / targets / scores / catnames produced by the earlier depth-3 run.
(clfs, inputs, targets, scores, catnames) = classify_shallowtree(
    clusters,
    embedding,
    descriptions,
    one_vs_rest=True,
    dt_depth=2,
    test_percentage_crossval=0,
    classes="Geonames",
    do_plot=False,
    verbose=False,
    return_features=True,
)

In [None]:
axnames = [",".join([i] + clusters["clusters"][i][:2]) for i in inputs.columns]
best_split = argmax(scores)
# Look at the classifier that belongs to the best-scoring category. The bare name `clf`
# would be whatever an earlier loop left behind (a tree from the previous depth-3 run).
best_clf = clfs[best_split]
print(f"Best category to split: {catnames[best_split][1]} with score {max(scores):.3f}")

important_feats = [i[0] for i in sorted({axnames[i]: elem for i, elem in enumerate(best_clf.feature_importances_) if elem > 0}.items(), key=lambda x:x[1], reverse=True)][:3]
early_splits = [i[0] for i in get_decisions(inputs, best_clf, catnames[best_split], axnames)]
print(important_feats)
print(early_splits)

In [None]:
def plot_boundary(inputs, clf, targets, catnames, axnames):
    """Show the samples and the decision regions of one shallow tree in a 3D figure.

    The axes are the features used by the tree's early splits (as returned by
    ``get_decisions``). Every axis is cut at its split threshold, which yields a grid of
    boxes; each box is coloured by the class the tree predicts for its lower corner, with
    all features that are not plotted set to 0.

    Args:
        inputs: DataFrame with one column per feature and one row per sample.
        clf: Fitted tree classifier (needs ``predict`` plus what ``get_decisions`` reads).
        targets: Per-sample indicator; truthy entries are drawn as the positive class.
        catnames: Names for this classifier; ``catnames[1]`` labels the positive class.
        axnames: Readable name per column of ``inputs``. Assumed to start with the column
            name followed by a comma (as built in ``plot_from_nr``).

    Note:
        A warning is issued when fewer than three distinct features are involved. With
        fewer than three split nodes the figure cannot be built and the later positional
        lookups (e.g. ``inputs.columns[2]``) fail with IndexError.
    """
    # One call is enough: get_decisions only inspects `clf` and `axnames`.
    decisions = get_decisions(inputs, clf, catnames, axnames)
    early_splits = [name for name, _ in decisions]
    if len(unique(early_splits)) < 3:
        warnings.warn("<3 dimensions!!")
    inputcols = inputs.columns
    inputs = inputs[[name.split(",")[0] for name in early_splits]]

    # Per axis two intervals: [0, threshold] and (threshold, column maximum].
    # `.iloc` because the lookup is positional; plain `[n]` on a label-indexed Series is deprecated.
    col_max = inputs.max()
    quaders = list(product(*[((0, thresh), (thresh+.001, col_max.iloc[n])) for n, (_, thresh) in enumerate(decisions)]))

    # Build one full-width feature row per box: lower corner on the plotted axes, 0 elsewhere.
    all_cols = list(inputcols)
    index_nrs = [all_cols.index(col) for col in inputs.columns]
    lower_corners = np.array([[interval[0] for interval in quad] for quad in quaders])
    tmp = [np.zeros(len(quaders)) for _ in range(len(inputcols))]
    for num, elem in enumerate(index_nrs):
        tmp[elem] = lower_corners.T[num]
    boundary_targets = clf.predict(np.column_stack(tmp))

    def get_coords(quad):
        """Return the eight corners of a box as [xs, ys, zs] in plotly's mesh-cube order."""
        corners = np.array(list(product(*quad)))
        corners = corners[[0,2,6,4,1,3,7,5]].T #this is the order from "Mesh Cube" from docs: https://plotly.com/python/3d-mesh/
        return [list(axis) for axis in corners]

    pos_inpt = inputs.iloc[np.where(targets)]
    pos_custom_data = [{"Name": name} for name in pos_inpt.index]
    neg_inpt = inputs.iloc[np.where(1-targets)]
    neg_custom_data = [{"Name": name} for name in neg_inpt.index]

    with ThreeDFigure(name=catnames[1]+" | "+",".join(inputs.columns)) as fig:
        fig.fig.update_layout(scene = dict(xaxis_title=inputs.columns[0], yaxis_title=inputs.columns[1], zaxis_title=inputs.columns[2]))
        fig.add_markers(pos_inpt, color="red", name=f"class: {catnames[1]}", custom_data=pos_custom_data)
        fig.add_markers(neg_inpt, color="blue", name="others", custom_data=neg_custom_data)
        for quad, target in zip(quaders, boundary_targets):
            if target == 1:
                fig.add_quader(get_coords(quad), name=f"Boundary for {catnames[1]}")
            else:
                fig.add_quader(get_coords(quad), name=f"Boundary for other", color="blue")

        fig.show()

In [None]:
def plot_from_nr(nr):
    """Plot the decision regions of the ``nr``-th one-vs-rest classifier.

    Reads the notebook globals ``inputs``, ``clusters``, ``clfs``, ``targets``, ``scores``
    and ``catnames``, prints the category together with its score, and delegates the
    drawing to ``plot_boundary``.
    """
    # readable label per column: the column name plus the first two entries of its cluster
    axis_labels = []
    for column in inputs.columns:
        axis_labels.append(",".join([column] + clusters["clusters"][column][:2]))
    print(f"Plotting {catnames[nr]}. Accuracy: {scores[nr]:.3f}")
    plot_boundary(inputs, clfs[nr], targets[nr], catnames[nr], axis_labels)

In [None]:
# Draw the boundary plot for the first category (index 0 into clfs / catnames).
plot_from_nr(0)