In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from misc_util.logutils import setup_logging
from misc_util.pretty_print import display
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs
from derive_conceptualspace.util.desc_object import DescriptionList
from derive_conceptualspace.pipeline import cluster_loader

# Default size (width, height in inches) for every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = [16, 10]

In [None]:
# Configure logging through the project helper before anything else emits log output.
setup_logging()
# Load the env files registered under the name "siddata"
# (presumably dataset-specific settings used by the loaders below; TODO confirm).
load_envfiles("siddata")
# Collect the configurations available for the "clusters" step. `configs` is filtered
# further down by its "embed_dimensions" entry; `print_cnf` is not used in the cells
# visible here. NOTE(review): verbose=True presumably prints what was found; confirm.
configs, print_cnf = getfiles_allconfigs("clusters", verbose=True)

# Collecting the numbers for the dataset info table

In [None]:
# Configuration of the run to inspect: the variant with 200 embedding dimensions.
best_conf = dict(
    pp_components='mfauhcsd2',
    quantification_measure='tfidf',
    embed_dimensions='200',
    dcm_quant_measure='count',
    classifier_succmetric='kappa_digitized_onlypos_2',
    sec_lambda='0.2',
)
print("Best conf:", best_conf)

# Build a loader context for that configuration (debug switched off, not silent).
ctx = SnakeContext.loader_context(config=dict(best_conf, debug=False), silent=False)

# Load the five artifacts of this run. Three of them get a custom loader; the
# "embedding" loader keeps only the `embedding_` attribute of the loaded object.
descriptions, filtered_dcm, embedding, featureaxes, clusters = ctx.load(
    "pp_descriptions", "filtered_dcm", "embedding", "featureaxes", "clusters",
    loaders={
        "pp_descriptions": DescriptionList.from_json,
        "clusters": cluster_loader,
        "embedding": lambda **args: args["embedding"].embedding_,
    },
)

In [None]:
# Count the entries of featureaxes["metrics"] whose kappa score reaches each threshold.
# The comparison is inclusive (>=) so that it matches the printed label.
kappa_scores = [scores["kappa_digitized_onlypos_2"] for scores in featureaxes["metrics"].values()]
print("Kappa >= 0.1:", sum(score >= 0.1 for score in kappa_scores))
print("Kappa >= 0.5:", sum(score >= 0.5 for score in kappa_scores))

# Number of members per cluster; the 10th and 90th percentile of these sizes are
# shown as the cell output (last expression).
clslen = np.array([len(members) for members in clusters["clusters"].values()])
np.percentile(clslen, 10), np.percentile(clslen, 90)

In [None]:
# Configuration of the run to inspect: the variant with 50 embedding dimensions.
# Note that this rebinds best_conf, ctx and the loaded artifacts from the cell above.
best_conf = dict(
    pp_components='mfauhcsd2',
    quantification_measure='tfidf',
    embed_dimensions='50',
    dcm_quant_measure='count',
    classifier_succmetric='kappa_digitized_onlypos_2',
    sec_lambda='0.2',
)
print("Best conf:", best_conf)

# Build a loader context for that configuration (debug switched off, not silent).
ctx = SnakeContext.loader_context(config=dict(best_conf, debug=False), silent=False)

# Load the five artifacts of this run. Three of them get a custom loader; the
# "embedding" loader keeps only the `embedding_` attribute of the loaded object.
descriptions, filtered_dcm, embedding, featureaxes, clusters = ctx.load(
    "pp_descriptions", "filtered_dcm", "embedding", "featureaxes", "clusters",
    loaders={
        "pp_descriptions": DescriptionList.from_json,
        "clusters": cluster_loader,
        "embedding": lambda **args: args["embedding"].embedding_,
    },
)

In [None]:
# Count the entries of featureaxes["metrics"] whose kappa score reaches each threshold.
# The comparison is inclusive (>=) so that it matches the printed label.
kappa_scores = [scores["kappa_digitized_onlypos_2"] for scores in featureaxes["metrics"].values()]
print("Kappa >= 0.1:", sum(score >= 0.1 for score in kappa_scores))
print("Kappa >= 0.5:", sum(score >= 0.5 for score in kappa_scores))

# Number of members per cluster; the 10th and 90th percentile of these sizes are
# shown as the cell output (last expression).
clslen = np.array([len(members) for members in clusters["clusters"].values()])
np.percentile(clslen, 10), np.percentile(clslen, 90)

In [None]:
NUM_KEYS = 50  # how many clusters to list
NUM_VALS = 7   # how many members to show per cluster before eliding the rest


def _format_cluster(name, members, width, limit):
    """Render one cluster as 'name: m1, m2, ...' with the name right-aligned to `width`."""
    shown = ', '.join(members[:limit])
    if len(members) > limit:
        shown += ', ...'
    return f"{name.rjust(width)}: {shown}"


# Only the first NUM_KEYS clusters are previewed; names are padded to the longest of them.
first_clusters = list(clusters["clusters"].items())[:NUM_KEYS]
maxlen = max(len(name) for name, _ in first_clusters)

print("\n".join(_format_cluster(name, members, maxlen, NUM_VALS) for name, members in first_clusters))

# And an overall summary across the embedding dimensionalities

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list."""
    return [item for sublist in l for item in sublist]


def _has_ndims(conf, ndims):
    """Tell whether `conf` uses `ndims` embedding dimensions.

    `ndims` is a string such as "50". The stored value is accepted both as a
    number and as a string, because configs in this notebook are written with
    string values while the filter originally compared against an int.
    """
    value = conf["embed_dimensions"]
    return value == int(ndims) or str(value) == ndims


# For every embedding dimensionality, keep one entry per matching config:
#   greatsperdim[ndims][i] -> set of the cluster keys of the i-th config
#   goodsperdim[ndims][i]  -> set of all cluster members of the i-th config
greatsperdim, goodsperdim = {}, {}
for ndims in ["3", "50", "100", "200"]:
    greatsperdim[ndims], goodsperdim[ndims] = [], []
    for conf in (c for c in configs if _has_ndims(c, ndims)):
        # Loop-local names, so the notebook-level `ctx` / `clusters` loaded in
        # earlier cells are not silently replaced by whichever config comes last.
        conf_ctx = SnakeContext.loader_context(config={**conf, "debug": False}, silent=True)
        conf_clusters = conf_ctx.load("clusters", loaders=dict(clusters=cluster_loader))
        greatsperdim[ndims].append(set(conf_clusters["clusters"].keys()))
        goodsperdim[ndims].append(set(flatten(conf_clusters["clusters"].values())))

# Per dimensionality: how many configs ended up with how many clusters.
print({k: dict(Counter([len(i) for i in v])) for k, v in greatsperdim.items()})

# Per dimensionality: index of the first config that has exactly 2 * ndims clusters,
# wrapped in a list that is empty when no config qualifies.
consider_num = {
    k: [n for n, keys in enumerate(v) if len(keys) == int(k) * 2][:1]
    for k, v in greatsperdim.items()
}

In [None]:
# Dimensionalities for which a config was selected, mapped to that config's index
# (each consider_num entry is a list holding at most one index).
selected = {ndims: picks[0] for ndims, picks in consider_num.items() if picks}

# Union of the cluster keys of the selected configs. The denominator counts only the
# selected dimensionalities, so both sides of the ratio cover the same configs.
greats = {key for ndims, idx in selected.items() for key in greatsperdim[ndims][idx]}
n_expected = sum(int(ndims) * 2 for ndims in selected)
print(f"In Kappa_0.5: {len(greats)}/{n_expected}")

# Union of all cluster members of the selected configs.
goods = {member for ndims, idx in selected.items() for member in goodsperdim[ndims][idx]}
print(f"In Kappa_0.1: {len(goods)}")