In [None]:
from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree
from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from derive_conceptualspace.util.result_analysis_tools import get_best_conf, highlight_nonzero_max, highlight_max
from derive_conceptualspace.cli.args_from_filename import get_filename, print_envvars
from misc_util.logutils import setup_logging

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import builtins
import pyperclip
from derive_conceptualspace.util.result_analysis_tools import df_to_latex
from misc_util.logutils import setup_logging
from misc_util.pretty_print import display, pretty_print as print
from derive_conceptualspace.util.threedfigure import ThreeDFigure
from derive_conceptualspace.semantic_directions.cluster_names import get_name_dict
from derive_conceptualspace.pipeline import SnakeContext, load_envfiles, cluster_loader
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs
from derive_conceptualspace.util.desc_object import DescriptionList
from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree

In [None]:
import pandas as pd
import numpy as np
import pyperclip
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs
from derive_conceptualspace.util.result_analysis_tools import df_to_latex, shorten_met
from derive_conceptualspace.util.desc_object import DescriptionList
from derive_conceptualspace.pipeline import cluster_loader
from joblib import parallel_backend
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from IPython.display import Markdown
def flatten(l):
    """Concatenate the items of every sub-iterable in ``l`` into one flat list (one level deep)."""
    return [item for sublist in l for item in sublist]

In [None]:
# Project helper; presumably configures logging for this session (defined in misc_util.logutils).
setup_logging()
# Load the environment files for the "siddata" setup (exact semantics live in the pipeline module).
load_envfiles("siddata")
# `configs` is iterated below as one config mapping per run (each read via conf["embed_dimensions"]);
# `print_cnf["embed_dimensions"]` is used below to enumerate the embedding dimensionalities.
configs, print_cnf = getfiles_allconfigs("clusters", verbose=True)

# All configs

In [None]:
def get_decisions(X_test, clf, catnames, axnames):
    """Return the split decisions taken in the top two levels of a fitted decision tree.

    Args:
        X_test: Unused; kept so existing callers do not break.
        clf: Fitted tree classifier exposing ``tree_`` with ``node_count``,
            ``children_left``, ``children_right``, ``feature`` and ``threshold``.
        catnames: Unused; kept so existing callers do not break.
        axnames: Lookup from feature index to a readable axis name.

    Returns:
        A list of ``(axis_name, threshold)`` tuples: first the split at depth 0,
        then the splits at depth 1 in node-id order. The list is empty when the
        tree consists of a single leaf (previously this case raised ``KeyError``).
    """
    tree = clf.tree_
    n_nodes = tree.node_count
    children_left = tree.children_left
    children_right = tree.children_right
    node_depth = np.zeros(shape=n_nodes, dtype=np.int64)
    is_leaves = np.zeros(shape=n_nodes, dtype=bool)

    # Iterative depth-first walk starting at the root (node id 0, depth 0);
    # popping from the stack guarantees every node is visited exactly once.
    stack = [(0, 0)]
    while stack:
        node_id, depth = stack.pop()
        node_depth[node_id] = depth
        # A node whose left and right child ids differ is a split node; leaves share the same id.
        if children_left[node_id] != children_right[node_id]:
            stack.append((children_left[node_id], depth + 1))
            stack.append((children_right[node_id], depth + 1))
        else:
            is_leaves[node_id] = True

    # Group the split nodes by depth, keeping node-id order within each depth.
    decisions_by_depth = {}
    for node_id in range(n_nodes):
        if is_leaves[node_id]:
            continue
        decision = (axnames[tree.feature[node_id]], tree.threshold[node_id])
        decisions_by_depth.setdefault(int(node_depth[node_id]), []).append(decision)

    # Only the two topmost levels are reported; a tree without any split yields an empty list.
    return decisions_by_depth.get(0, []) + decisions_by_depth.get(1, [])

In [None]:
# Gather, per category and embedding dimensionality, the most important decision-tree
# features of every config that yields enough clusters.
alls = None
nworked = 0
for conf in configs:
    ctx = SnakeContext.loader_context(config={**conf, "debug": False}, silent=True)
    clusters = ctx.load("clusters", loaders=dict(clusters=cluster_loader))
    # Require at least two clusters per embedding dimension; otherwise skip the config.
    if len(clusters["clusters"]) < conf["embed_dimensions"]*2:
        print(f'Skipping... ({len(clusters["clusters"])} clusters)')
        continue
    nworked += 1
    descriptions, embedding = ctx.load(
        "pp_descriptions", "embedding",
        loaders=dict(
            pp_descriptions=DescriptionList.from_json,
            clusters=cluster_loader,
            embedding=lambda **args: args["embedding"].embedding_,
        ),
    )
    clfs, inputs, targets, scores, classes, catnames = classify_shallowtree(
        clusters, embedding, descriptions, ctx.obj["dataset_class"],
        one_vs_rest=True, dt_depth=1, test_percentage_crossval=0.33,
        classes="fachbereich", verbose=False, return_features=True,
        balance_classes=True, do_plot=False, shutup=True,
    )
    if alls is None:
        # Create the result container lazily, once the category names are known.
        alls = {
            category[1]: {int(dim): [] for dim in print_cnf["embed_dimensions"]}
            for category in classes
        }

    # Position of a cluster in the clusters mapping -> its name (used to label tree features).
    axnames = dict(enumerate(clusters["clusters"].keys()))
    for clf, catname in zip(clfs, classes):
        used = {axnames[idx]: imp for idx, imp in enumerate(clf.feature_importances_) if imp > 0}
        ranked = sorted(used.items(), key=lambda pair: pair[1], reverse=True)
        # Keep at most the three most important features, importance rounded to 3 digits.
        feats = [(name, round(imp, 3)) for name, imp in ranked[:3]]
        alls[catname[1]][conf["embed_dimensions"]].append([name for name, _imp in feats])

In [None]:
from collections import Counter

In [None]:
# Run counts are read from the first category only
# (assumes every category received one entry per successful run — TODO confirm).
first_category = list(alls.values())[0]
num_goodruns = {ndim: len(runs) for ndim, runs in first_category.items()}
print(f"How many per dim have at least ndims*2 features with kappa > 0.5: {num_goodruns}")

# Per category and dimensionality: the set of top-ranked features over all runs.
flattened = {
    category: {ndim: {run[0] for run in runs} for ndim, runs in per_dim.items()}
    for category, per_dim in alls.items()
}
nums = {
    category: {ndim: len(uniques) for ndim, uniques in per_dim.items()}
    for category, per_dim in flattened.items()
}
print("How many unique per dim are there:")
nums

OK, let us ignore the 3-dimensional configurations from here on, because they obviously perform poorly.

In [None]:
# Drop the 3-dimensional runs, then pool the remaining dimensionalities per category.
flattened = {
    category: {ndim: uniques for ndim, uniques in per_dim.items() if ndim != 3}
    for category, per_dim in flattened.items()
}

fullflat = {category: flatten(per_dim.values()) for category, per_dim in flattened.items()}
nums = {category: len(set(phrases)) for category, phrases in fullflat.items()}
n_possible = sum(count for ndim, count in num_goodruns.items() if ndim != 3)
print(f"How many unique in sum are there (out of {n_possible} possible)")
nums

In [None]:
# Per category: the phrases that occur more than once in the pooled list, with their counts.
{
    category: {f"{phrase} ({count})" for phrase, count in Counter(phrases).items() if count > 1}
    for category, phrases in fullflat.items()
}

<br><br>

## Are intuitively appealing phrases among the semantic directions?

Given the task of manually embedding courses into a semantic space, there are some intuitive
candidates one may think of that capture important aspects of a course. For
example, a word like *computer* hints at computer-science-related courses. Other
obvious candidates that will be checked include *math*, *culture*, *science*, *school* and
*sport*.

In [None]:
# Display the pooled phrase lists per category (3-dimensional runs already excluded).
fullflat

In [None]:
# Plain-text table: left-justified category name followed by its comma-separated phrases.
# builtins.print is used because `print` is rebound to pretty_print in this notebook.
label_width = max((len(name) + 1 for name in fullflat.keys()), default=0)
table_rows = [name.ljust(label_width) + '   ' + ', '.join(phrases) for name, phrases in fullflat.items()]
builtins.print('\n'.join(table_rows))

In [None]:
# Intuitively appealing candidate words to look for among the pooled phrases.
consider = [
    "computer", "recht", "musik", "management", "literatur", "sprache", "psychologie",
    "wirtschaft", "geographie", "schule", "kultur", "wissenschaft", "sport",
]

print("Checking if it is a T^0.5 term")
for i in consider:
    # Categories whose pooled phrase list contains the candidate word verbatim.
    matching_categories = [k for k, v in fullflat.items() if i in v]
    if matching_categories:
        print(i, matching_categories)

In [None]:
def lst_to_di(lst):
    """Group ``(key, value)`` pairs into a dict mapping each key to the list of its values.

    Keys appear in first-seen order; the values of a key keep their input order.
    """
    grouped = {}
    for key, val in lst:
        if key in grouped:
            grouped[key].append(val)
        else:
            grouped[key] = [val]
    return grouped

# For every candidate word, collect per category the pooled phrases that contain it as a substring.
full = {}
for i in consider:
    lst = [(k, v2) for k, v in fullflat.items() for v2 in v if i in v2]
    # Merge into the existing lists. `full.update(...)` would replace the phrases a category
    # collected for earlier candidate words, so only the last matching word would be reported.
    for k, matches in lst_to_di(lst).items():
        full.setdefault(k, []).extend(matches)

print("Checking if it is part of a T^0.5 term")
# builtins.print is used because `print` is rebound to pretty_print in this notebook.
key_width = max((len(k) + 1 for k in full.keys()), default=0)
builtins.print('\n'.join(k.ljust(key_width)+'   '+(', '.join(v)) for k, v in full.items()))

# Ok, now for ALL ones

(also robustness assessment)

In [None]:
# Per embedding dimensionality: one set of cluster centers / cluster elements for each usable config.
all_centers = {int(j): [] for j in print_cnf["embed_dimensions"]}
all_elems = {int(j): [] for j in print_cnf["embed_dimensions"]}
nworked = 0
for conf in configs:
    ctx = SnakeContext.loader_context(config={**conf, "debug": False}, silent=True)
    clusters = ctx.load("clusters", loaders=dict(clusters=cluster_loader))
    # Same filter as above: at least two clusters per embedding dimension.
    if len(clusters["clusters"]) < conf["embed_dimensions"]*2:
        print(f'Skipping... ({len(clusters["clusters"])} clusters)')
        continue
    center_names = set(clusters["clusters"].keys())
    member_names = set(flatten(clusters["clusters"].values()))
    all_centers[conf["embed_dimensions"]].append(center_names)
    all_elems[conf["embed_dimensions"]].append(member_names)

In [None]:
# Exclude the 3-dimensional runs from the cluster-center statistics.
all_centers = {ndim: center_sets for ndim, center_sets in all_centers.items() if ndim != 3}

In [None]:
# For the 50- and 200-dimensional runs: number of runs, total number of centers, number of distinct centers.
for ndim in (50, 200):
    pooled_centers = flatten(all_centers[ndim])
    print(len(all_centers[ndim]), len(pooled_centers), len(set(pooled_centers)))

In [None]:
from derive_conceptualspace.util.result_analysis_tools import df_to_latex

In [None]:
import pandas as pd
# Table of candidate words (rows) versus (dimensionality, T) columns: in how many runs the word
# occurs among the cluster centers (T "0.5") and among all cluster elements (T "0.1").
column_tuples = [(ndim, threshold) for ndim in all_centers.keys() for threshold in ("0.1", "0.5")]
df = pd.DataFrame(
    index=consider,
    columns=pd.MultiIndex.from_tuples(column_tuples, names=("NDim", "T")),
)
for what, whatname in [(all_centers, "0.5"), (all_elems, "0.1")]:
    for ndim in all_centers.keys():
        runs = what[ndim]
        # Extra row "N": the number of runs available for this dimensionality.
        df.loc["N", (ndim, whatname)] = len(runs)
        for i in consider:
            df.loc[i, (ndim, whatname)] = sum(i in j for j in runs)
df

In [None]:
# Emit the table through the builtin print, because `print` is rebound to pretty_print in this notebook.
# df_to_latex is a project helper; presumably it returns LaTeX source for the styled frame — TODO confirm.
builtins.print(df_to_latex(df, lambda x:x.style))