In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML
import pyperclip

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs, display_metrics, show_lambda_elements, highlight_nonzero_max, df_to_latex, shorten_met
from derive_conceptualspace.settings import DEFAULT_N_CPUS
from derive_conceptualspace.util.threadworker import WorkerPool
from derive_conceptualspace.cli.args_from_filename import get_filename, print_envvars

# Notebook-wide default figure size for matplotlib, given as [width, height].
plt.rcParams['figure.figsize'] = [16, 10]

## Prepare all parameter-combinations

In [None]:
# Set up logging with the project helper from misc_util.logutils.
setup_logging()
# Load the env-files for "siddata" (presumably the dataset name that selects the settings - confirm in load_envfiles).
load_envfiles("siddata")
# configs:   indexable collection with one config per parameter-combination; each is used like a dict below.
# print_cnf: looked up by parameter name below; a list value is treated as "this parameter varies between configs"
#            (that is how the following cells use it - TODO confirm against getfiles_allconfigs).
configs, print_cnf = getfiles_allconfigs("featureaxes", verbose=True)

In [None]:
# Print the env-vars belonging to the first found configuration as a sample;
# the filename is only computed here (doprint=False), dependencies are not resolved.
print_envvars(get_filename(configs[0], get_dependencies=False, doprint=False))

In [None]:
def get_featureaxes(conf):
    """Load one configuration and return ``(important_settings, featureaxes)`` for it."""
    ctx = SnakeContext.loader_context(config=conf, silent=True)
    # Query the settings before loading the result, exactly in that order.
    important_settings = ctx.get_important_settings()
    return important_settings, ctx.load("featureaxes")


# Fetch the featureaxes of all configurations with a pool of workers.
# NOTE(review): `interrupted` is not checked afterwards - presumably it reports work that did not finish; confirm.
with WorkerPool(DEFAULT_N_CPUS-3, pgbar="Fetching featureaxes..") as pool:
    featureaxes_list, interrupted = pool.work(configs, get_featureaxes)

# Showing all metrics for the loaded parameter-combinations

In [None]:
# One section per configuration: a heading naming the varied parameters, then its settings and metrics.
for conf, (important_settings, featureaxes) in zip(configs, featureaxes_list):
    # Only parameters whose print_cnf entry is a list are used to tell the configurations apart.
    specials = {k: v for k, v in conf.items() if isinstance(print_cnf[k], list)}
    heading = ", ".join(f"{k}: {v}" for k, v in specials.items())
    display(Markdown(f"## {heading}"))
    settings_line = " - ".join(important_settings)
    display(f"Settings: {settings_line}")
    display_metrics(featureaxes["metrics"])

In [None]:
# For every configuration and every kappa-metric, count how many terms reach the thresholds:
#   T^lambda1: terms with score >= lambda1
#   T^lambda2: terms with lambda2 <= score < lambda1
# `alls` maps the tuple of varied parameter values -> {shortened metric name: |T^lambda1|}.
lambda1, lambda2 = 0.5, 0.1  # thresholds are the same for every configuration
alls = {}
for conf, (important_settings, featureaxes) in zip(configs, featureaxes_list):
    # Only parameters whose print_cnf entry is a list are used to tell the configurations apart.
    specials = {k: v for k, v in conf.items() if isinstance(print_cnf[k], list)}
    display(Markdown("## "+", ".join(f"{k}: {v}" for k,v in specials.items())))
    metlist = featureaxes["metrics"]  # term -> {metric name: score}
    # The metric names are taken from the first term; an empty metlist simply yields no metrics.
    metric_names = list(next(iter(metlist.values()), {}))
    res = {}
    for met in metric_names:
        if "kappa" not in met or "bin2bin" in met:
            continue
        # Use the same `>=` for the count and for the listed members, so a term that sits
        # exactly on the threshold is both counted and shown.
        in_t1 = [term for term, scores in metlist.items() if scores[met] >= lambda1]
        t1 = len(in_t1)
        t2 = len([scores for scores in metlist.values() if scores[met] >= lambda2]) - t1
        if t1:
            print(f" {met}: T^{lambda1}: {t1}, T^{lambda2}: {t2}, in T^{lambda1}: {', '.join(in_t1[:5])}")
        res[shorten_met(met)] = t1
    alls[tuple(specials.values())] = res

# Show complete Table with the Number of values in Kappa^0.5

In [None]:
# Table header cells are aligned to the top, see https://stackoverflow.com/a/55904239/5122790
styles = [{'selector': 'th', 'props': [('vertical-align','top')]}]


def styler(df):
    """Style ``df``: apply ``highlight_nonzero_max`` per column, format with precision 0, set ``styles``."""
    styled = df.style.apply(highlight_nonzero_max, axis=0)
    styled = styled.format(precision=0)
    return styled.set_table_styles(styles)


# Rows: one per parameter-combination (MultiIndex named after the varied parameters); columns: the metrics.
df = pd.DataFrame(
    alls,
    columns=pd.MultiIndex.from_tuples(alls.keys(), names=list(specials.keys())),
).T
df["mean"] = df.mean(axis=1)
# Put the LaTeX version of the table into the clipboard, then show the styled table as the cell output.
pyperclip.copy(df_to_latex(df, styler))
styler(df)

### Take-Aways from this huge table:
* assuming I want >= 400 directions, I must choose quadratic kappa-weights over linear ones.
* dcm_quant_measure == count is very good for digitized kappa, but consistently a lot worse than tf-idf and ppmi for ranking-based (which makes sense assuming there are many count==1 cases)
* I should go for 3 dimensions only sporadically, but for dcm_quant_measure in [tf-idf, ppmi] its performance is actually consistently competitive (huge surprise!!)

In [None]:
# Reduced table: drop every combination whose entry at position 2 is "3", unless its entry at
# position 3 is "tfidf" or "ppmi". Positions follow the order of `specials`.
# NOTE(review): presumably position 2 is the number of dimensions and position 3 the
# dcm_quant_measure - check with dict(zip(specials.keys(), list(alls.keys())[0])).
less = {
    k: v
    for k, v in alls.items()
    if k[2] != "3" or k[3] in ["tfidf", "ppmi"]
}
keys = list(specials.keys())

df = pd.DataFrame(
    less,
    columns=pd.MultiIndex.from_tuples(less.keys(), names=keys),
).T
df["mean"] = df.mean(axis=1)
# Put the LaTeX version of the table into the clipboard, then show the styled table as the cell output.
pyperclip.copy(df_to_latex(df, styler))
styler(df)

### More analysis:
* quantification_measure == tfidf, n_dims = 200 produces all of the best results
* can't really say that results are better when quantification_measure and dcm_quant_measure are the same than when one is tf-idf and the other ppmi

# Getting the parameter combination that yields the most candidates on average

In [None]:
METRIC = "mean"  # column of `df` by which the best parameter-combination is selected

# Rows: one per parameter-combination (MultiIndex named after the varied parameters); columns: the metrics.
df = pd.DataFrame(alls, columns=pd.MultiIndex.from_tuples(alls.keys(), names=list(specials.keys()))).T
df["mean"] = df.mean(axis=1)

# Index label (tuple of the varied parameter values) of the row with the highest METRIC.
best_ind = df[METRIC].idxmax()
best_config = dict(zip(df.index.names, best_ind))
# Add the parameters that are not lists in print_cnf, i.e. those not varied between the configurations.
best_config.update({k: v for k, v in print_cnf.items() if not isinstance(v, list)})

# Within the best row, pick the individual metric with the highest count. The derived "mean"
# column is not a metric name, so it is removed before taking the row-wise maximum.
best_metric = df.loc[best_ind, :].drop(labels=["mean"]).idxmax()
print(f"MA_CLASSIFIER_SUCCMETRIC={shorten_met(best_metric, reverse=True)}", end=";")
print_envvars(get_filename(best_config, get_dependencies=False, doprint=False))