In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs, display_metrics, show_lambda_elements, highlight_nonzero_max
from derive_conceptualspace.settings import DEFAULT_N_CPUS
from derive_conceptualspace.util.threadworker import WorkerPool

# Notebook-wide default matplotlib figure size (width, height).
plt.rcParams['figure.figsize'] = [16, 10]

## Prepare all parameter-combinations

In [None]:
# Initialise logging and load the env-files registered under the name "siddata"
# (both are project helpers; their exact effect is not visible in this notebook).
setup_logging()
load_envfiles("siddata")
# `configs` is consumed below as a sliceable sequence of dict-like configurations;
# `print_cnf` is indexed by configuration key, and a key whose entry is a list is
# treated below as one of the "special" parameters shown in the headings.
# NOTE(review): presumably a list means the parameter varies across configs - confirm.
configs, print_cnf = getfiles_allconfigs("clusters", verbose=True)

# Showing all metrics for the loaded parameter-combinations

In [None]:
# for conf in configs:
#     specials = {k: v for k, v in conf.items() if isinstance(print_cnf[k], list)}
#     display(Markdown("## "+", ".join(f"{k}: {v}" for k,v in specials.items())))
#     ctx = SnakeContext.loader_context(config=conf, silent=True)
#     clusters = ctx.load("clusters")
#     #show_metrics(clusters)
#     display(generate_comparertable(clusters["metrics"], minval=0.1))
#     #look at combis of bin2bin, f_one, k_r2r+_min, k_dig+_2, k_c2r+

In [None]:
# Disabled variant that processes every configuration with DEFAULT_N_CPUS-1 workers:
# with WorkerPool(DEFAULT_N_CPUS-1, pgbar="Fetching clusters..") as pool:
#     get_clusters = lambda conf: ((ctx := SnakeContext.loader_context(config=conf, silent=True)).get_important_settings(), ctx.load("clusters"))
#     cluster_list, interrupted = pool.work(configs, get_clusters)

def get_clusters(conf):
    """Build a loader context for `conf` and return (important settings, loaded "clusters")."""
    ctx = SnakeContext.loader_context(config=conf, silent=True)
    # The settings are fetched before the clusters are loaded, in that order.
    return ctx.get_important_settings(), ctx.load("clusters")


# Active variant: two workers, and only the first five configurations are fetched.
# Downstream cells zip `configs` with `cluster_list`, so they only see these five.
with WorkerPool(2, pgbar="Fetching clusters..") as pool:
    cluster_list, interrupted = pool.work(configs[:5], get_clusters)

In [None]:
# One section per fetched configuration: a heading naming its special parameters,
# a line with its important settings, then its metrics.
for conf, (important_settings, clusters) in zip(configs, cluster_list):
    # "Special" parameters are those whose entry in print_cnf is a list.
    specials = {name: value for name, value in conf.items() if isinstance(print_cnf[name], list)}
    heading = ", ".join(f"{name}: {value}" for name, value in specials.items())
    display(Markdown("## "+heading))
    settings_line = " - ".join(important_settings)
    display("Settings: "+settings_line)
    display_metrics(clusters["metrics"])

In [None]:
# Thresholds applied to every kappa metric: t1 counts entries with value >= lambda1,
# t2 counts entries with lambda2 <= value < lambda1. They are the same for every
# configuration, so they are set once instead of inside the loop.
lambda1, lambda2 = 0.5, 0.1

# alls maps (tuple of special-parameter values) -> {abbreviated metric name: t1}.
alls = {}
for conf, (important_settings, clusters) in zip(configs, cluster_list):
    # "Special" parameters are those whose entry in print_cnf is a list.
    specials = {k: v for k, v in conf.items() if isinstance(print_cnf[k], list)}
    display(Markdown("## "+", ".join(f"{k}: {v}" for k,v in specials.items())))
    #display("Settings:"+(": ".join(important_settings)))
    #show_lambda_elements(clusters["metrics"], 0.5, 0.1)
    # metlist maps a key (joined as text below) to a dict of metric name -> value.
    metlist = clusters["metrics"]
    res = {}
    # Metric names are read from the first entry; an empty metlist yields no metrics
    # instead of raising IndexError.
    for met in list(next(iter(metlist.values()), {})):
        # Only kappa-based metrics are counted, and the bin2bin variants are skipped.
        if "kappa" not in met or "bin2bin" in met:
            continue
        # Use the same >= comparison for the count and for the listed elements, so the
        # printed names always belong to the set that t1 counts.
        in_t1 = [k for k, v in metlist.items() if v[met] >= lambda1]
        t1 = len(in_t1)
        t2 = sum(1 for v in metlist.values() if v[met] >= lambda2) - t1
        if t1:
            # Only the first five matching keys are printed to keep the output short.
            print(f" {met}: T^{lambda1}: {t1}, T^{lambda2}: {t2}, in T^{lambda1}: {', '.join(in_t1[:5])}")
        # Abbreviate the metric name so it fits as a column header in the summary table.
        short_met = (
            met.replace("kappa", "k")
            .replace("dense", "d")
            .replace("rank2rank", "r2r")
            .replace("count2rank", "c2r")
            .replace("bin2bin", "b2b")
            .replace("f_one", "f1")
            .replace("digitized", "dig")
            .replace("_onlypos", "+")
        )
        res[short_met] = t1
    alls[tuple(specials.values())] = res

In [None]:
# Top-align the header cells of the rendered table, see https://stackoverflow.com/a/55904239/5122790
styles = [{'selector': 'th', 'props': [('vertical-align','top')]}]  #('border-style', 'solid')

# One row per parameter combination (the tuple keys of `alls`), one column per metric.
# The index level names come from `specials` as left behind by the aggregation loop,
# so that cell has to be run first.
df = pd.DataFrame(alls, columns=pd.MultiIndex.from_tuples(alls.keys(), names=list(specials.keys()))).T
df["mean"] = df.mean(axis=1)
res = df.style.apply(highlight_nonzero_max, axis=0).format(precision=0).set_table_styles(styles)
# Print the LaTeX export first and leave the styled table as the cell's final expression:
# a bare expression that is not the last statement of a cell is never rendered.
print(res.to_latex())
res

### Take-Aways from this huge table:
* assuming I want >= 400 directions, I must choose quadratic kappa-weights over linear ones.
* dcm_quant_measure == count is very good for digitized kappa, but consistently a lot worse than tf-idf and ppmi for ranking-based (which makes sense assuming there are many count==1 cases)
* I should go for 3 dimensions only sporadically, but for dcm_quant_measure in [tf-idf, ppmi] its performance is actually consistently competitive (huge surprise!!)

In [None]:
# Reduced table: keep combinations whose last entry is "quadratic", and among those drop
# every combination with "3" at position 2 unless position 3 is "tfidf" or "ppmi".
# NOTE(review): the positions depend on the key order of `specials`; presumably they are the
# kappa weights, the number of dimensions and dcm_quant_measure - confirm.
less = {
    combo: counts
    for combo, counts in alls.items()
    if combo[-1] == "quadratic" and (combo[2] != "3" or combo[3] in ["tfidf", "ppmi"])
}
less_columns = pd.MultiIndex.from_tuples(less.keys(), names=list(specials.keys()))
df = pd.DataFrame(less, columns=less_columns).T
df["mean"] = df.mean(axis=1)
df.style.apply(highlight_nonzero_max, axis=0).format(precision=0).set_table_styles(styles)

### More analysis:
* quantification_measure == tfidf, n_dims = 200 produces all of the best results
* can't really say that results are better if quantification_measure and dcm_quant_measure are the same vs one tf-idf one ppmi