In [None]:
from derive_conceptualspace.evaluate.shallow_trees import classify_shallowtree
from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from derive_conceptualspace.util.result_analysis_tools import get_best_conf
from derive_conceptualspace.cli.args_from_filename import get_filename, print_envvars
from misc_util.logutils import setup_logging

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Configure logging and load the environment files for the "siddata" dataset.
setup_logging()
load_envfiles("siddata")

# Configuration (and its performance) returned by get_best_conf for the "fachbereich" target,
# evaluated with depth-1 one-vs-rest trees, balanced classes and a 33% cross-validation split.
conf, perf = get_best_conf(
    "fachbereich",
    verbose=True,
    balance_classes=True,
    one_vs_rest=True,
    dt_depth=1,
    test_percentage_crossval=0.33,
)

In [None]:
# Show the environment variables that correspond to the selected configuration.
print_envvars(
    get_filename(conf, get_dependencies=False, doprint=False)
)

In [None]:
# Build a loader context for the selected configuration and print its important settings.
ctx = SnakeContext.loader_context(
    config=conf,
    silent=True,
    warn_filters=["DifferentFileWarning"],
)
ctx.print_important_settings()

# Load the four artifacts the rest of the notebook works with.
cluster_reprs, clusters, embedding, descriptions = ctx.load(
    "cluster_reprs", "clusters", "embedding", "pp_descriptions"
)

In [None]:
# The keys of `cluster_reprs` are the names of the detected semantic directions.
print("Detected Semantic Directions:", ", ".join(cluster_reprs.keys()))

## Can we recover the exact courses from the detected directions?

In [None]:
import pyperclip
from derive_conceptualspace.util.result_analysis_tools import df_to_latex

import numpy as np
import pandas as pd
from scipy.stats import rankdata
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm
from misc_util.pretty_print import pretty_print as print
from derive_conceptualspace.semantic_directions.cluster_names import get_name_dict
# Name of the cluster-representative algorithm; passed to `get_name_dict` in the next cell.
clus_rep_algo = "top_1"
# Split the loaded `clusters` object into its two values: the clusters themselves and the planes.
# NOTE(review): this rebinds `clusters`, so the cell is not idempotent - re-running it without
# re-running the loading cell unpacks the already-unpacked object.
clusters, planes = clusters.values()

In [None]:
# Display name for every cluster / semantic direction.
cluster_names = get_name_dict(clusters, cluster_reprs, clus_rep_algo)

# First we want, for every entity, the distance to the origin of each dimension (induced by the
# clusters); these distances induce the respective rankings
# (see DESC15 p.24u, proj2 of load_semanticspaces.load_projections).
axis_dists = {
    idx: {cluster_names[key]: plane.dist(embedding[idx]) for key, plane in planes.items()}
    for idx in range(len(embedding))
}

# For every dimension: the title of the entity with the maximal value along that dimension.
top_entity_per_dim = pd.DataFrame(axis_dists).T.idxmax().to_dict()
best_per_dim = {dim: descriptions._descriptions[ent].title for dim, ent in top_entity_per_dim.items()}

# Only the first 20 dimensions are printed; their names are padded to the longest of those 20.
name_width = max((len(name) for name in list(best_per_dim.keys())[:20]), default=0)
shown = [f"*b*{dim.ljust(name_width)}*b*: {title}" for dim, title in list(best_per_dim.items())[:20]]
print("Highest-ranking descriptions [with any class] per dimension:\n    "+"\n    ".join(shown))

In [None]:
# TODO also show places 2, 3, 4 - here we again see very similar ones
#      ("football stadium", "stadium", "fan" for "goalie")
# TODO axis_dists is all I need for the movietuner already!! I can say "give me something like X, only with more Y"

# One column per entity (keyed by its title), one row per semantic direction.
consider = pd.DataFrame({
    descriptions._descriptions[idx].title: axis_dists[idx]
    for idx in range(len(embedding))
})

# Rank the entities within every direction, then transpose to entities x directions.
ranked = (
    pd.DataFrame(
        [rankdata(row) for row in consider.values],
        index=consider.index,
        columns=consider.columns,
    )
    .astype(int)
    .T
)
# Looks better if we're doing relative rankings: divide by the number of entities.
ranked = ranked / len(ranked)

In [None]:
# Show a truncated view of the relative rankings with three decimals.
with pd.option_context(
    'display.max_rows', 10,
    'display.max_columns', 10,
    'display.expand_frame_repr', False,
    'display.float_format', '{:.3f}'.format,
):
    display(ranked)

So now we will test if we can recover one specific course perfectly against all others with a decision tree...

* With the rankings in the semantic directions rounded
* With only a random subset of the semantic directions.

In every `repeat`, we will test `ntests` random candidates with only `max_dirs` randomly selected directions, rounded to `roundto` decimal places. The selected directions differ in every `repeat`.

In [None]:
def test_recover_one(ranked, roundto=3, ntests=100, max_dirs=None, repeat=1):
    repeat_results = []
    with tqdm(total=ntests*repeat, leave=False) as pgbar:
        for ntrial in range(repeat):
            clone = ranked.copy().round(roundto)
            if max_dirs:
                clone = clone[np.random.choice(clone.columns, max_dirs)]
            results = []
            for i in np.random.choice(range(len(clone)), ntests):
                targets = [0]*len(clone)
                targets[i] = 1
                clf = DecisionTreeClassifier()
                clf.fit(clone.values, targets)
                results.append((clf.predict(clone.values) == targets).all())
                pgbar.update(1)
                
            repeat_results.append(sum(results)/len(results))
        return repeat_results

In [None]:
# Experiment size: rounds per setting and candidates tested per round.
REPEAT = 20
NTESTS = 200

# Grid of settings: rounding precision x number of randomly selected directions.
ROUND_TO = [3, 2, 1]
MAX_DIRS = [150, 100, 50, 20, 10, 5, 3]

n = 0
results = {}
total_runs = len(ROUND_TO)*len(MAX_DIRS)
for roundto in ROUND_TO:
    for max_dirs in MAX_DIRS:
        n += 1
        print(f"Run {n}/{total_runs}. Arguments: round-to={roundto}, max-dirs={max_dirs}")
        res = test_recover_one(ranked, roundto=roundto, ntests=NTESTS, max_dirs=max_dirs, repeat=REPEAT)
        accs = np.array(res)
        print(f"   Mean Accuracy: {accs.mean():.2%}, Standard-Deviation: {accs.std():.4f}, Best Result: {accs.max():.2%}")
        # Keep the per-round success rates, keyed by (precision, number of directions).
        results[(roundto, max_dirs)] = res

In [None]:
# Table styling: top/left-aligned header cells, all values rendered as percentages.
styles = [{'selector': 'th', 'props': [('vertical-align','top'),('text-align','left')]}]
styler = lambda df: df.style.format('{:.2%}'.format).set_table_styles(styles)

# One column per (precision, number of directions) setting, one row per round;
# then reduce every setting to its best and its mean success rate.
result_columns = pd.MultiIndex.from_tuples(list(results.keys()), names=["Precision", "Max-Dims"])
df = (
    pd.DataFrame(results, columns=result_columns)
    .agg([np.max, np.mean])
    .sort_index(axis=1)
)
styler(df)

With three **random** directions we can on average recover 95% of courses!!

## We don't even need to use classifiers, we can just look at the number of duplicates dependent on the number and precision of dimensions

In [None]:
def count_dups(ranked, digit_bins, max_dirs, noise_lvl=(0,), repeat=1):
    """Count entities that become indistinguishable after binning a random subset of directions.

    For every combination of bin count, number of directions and noise level, `repeat` trials are
    run. Each trial draws that many distinct columns of `ranked`, adds zero-mean Gaussian noise
    with the given standard deviation, bins every column with `np.digitize` on
    `np.linspace(0, 1, bins)`, and counts the rows that share their binned values with another row.

    Returns:
        dict keyed by `(bins, dirs, noise)`. Each value is a dict with `value_space`
        (`bins**dirs`, capped at `2**99`), `dup_num_max` / `dup_num_mean` (rows that are part of
        a duplicate group) and `dup_in_max` / `dup_in_mean` (distinct value combinations shared
        by more than one row), aggregated over the trials.
    """
    n_runs = len(digit_bins) * len(max_dirs) * len(noise_lvl) * repeat
    summary = {}
    with tqdm(total=n_runs) as pgbar:
        for bins in digit_bins:
            for dirs in max_dirs:
                for noise in noise_lvl:
                    rows_in_dups, shared_values = [], []
                    for _ in range(repeat):
                        # Random draws happen in this order: column subset first, then noise.
                        subset = ranked[np.random.choice(ranked.columns, dirs, replace=False)]
                        noisy = subset + np.random.normal(0, noise, subset.values.shape)
                        binned = noisy.apply(lambda col: np.digitize(col, bins=np.linspace(0, 1, bins)))

                        # Rows with identical binned values end up in the same group.
                        group_sizes = binned.groupby(binned.columns.tolist()).size()
                        shared = group_sizes[group_sizes > 1]
                        rows_in_dups.append(shared.sum())
                        shared_values.append(shared.count())
                        pgbar.update(1)

                    rows_in_dups = np.array(rows_in_dups)
                    shared_values = np.array(shared_values)
                    summary[(bins, dirs, noise)] = dict(
                        value_space=min(bins**dirs, 2**99),
                        dup_num_max=rows_in_dups.max(),
                        dup_num_mean=rows_in_dups.mean(),
                        dup_in_max=shared_values.max(),
                        dup_in_mean=shared_values.mean(),
                    )

    return summary

In [None]:
# Bin counts are chosen relative to the number of entities. `clone` only exists as a local
# variable inside the helper functions, so the entity count is taken from `ranked` directly.
n_entities = len(ranked)
nbins = [n_entities//5000, n_entities//1000, n_entities//500, n_entities//100, n_entities//50, n_entities//10]
ndims = [3, 5, 10, 20, 50, 100, 200]
noise = [0]

res = count_dups(ranked, nbins, ndims, noise_lvl=noise, repeat=20)
#display(pd.DataFrame(res, columns=pd.MultiIndex.from_arrays(list(zip(*res.keys())), names=["#bins", "#dims", "noise"])))
# value_fill: mean number of shared value combinations relative to the size of the value space;
# dup_perc:   mean number of entities inside a duplicate group relative to all entities.
res = {k: dict(value_fill = v["dup_in_mean"]/v["value_space"], dup_perc = v["dup_num_mean"]/n_entities) for k, v in res.items()}

styles = [{'selector': 'th', 'props': [('vertical-align','top'),('text-align','left')]}]  
styler = lambda df: df.style.format('{:.2%}'.format).set_table_styles(styles) 
df = pd.DataFrame(res, columns=pd.MultiIndex.from_arrays(list(zip(*res.keys())), names=["#bins", "#dims", "noise"]))
styler(df)

In [None]:
# Keep only the duplicate percentage and drop the noise level from the keys,
# then pivot so that rows are #dims and columns are #bins.
df = pd.DataFrame(
    {key[:2]: entry["dup_perc"] for key, entry in res.items()},
    columns=pd.MultiIndex.from_tuples([key[:2] for key in res.keys()], names=["#bins", "#dims"]),
    index=["dup_perc"],
)
df = df.T.unstack(level=[0])
styler(df)

In [None]:
# Copy the LaTeX rendering of the table to the clipboard.
# `multi_ind` was previously passed to `pyperclip.copy`, which only takes the text to copy;
# it belongs to the LaTeX export. NOTE(review): assumes `df_to_latex` accepts `multi_ind` - confirm.
# NOTE(review): the caption mentions Placetypes although this notebook loads "siddata" - confirm.
pyperclip.copy(df_to_latex(df, styler, rotate=False, caption="This algorithm on Placetypes", multi_ind=False))

<br><br><br><br><br><br>

## Analyzing what becomes duplicates may reveal actual duplicate courses!

In [None]:
# One concrete configuration: 3 random directions, binned relative to the number of entities.
max_dirs, digit_bins = 3, len(ranked)//500
print("n-cats:", digit_bins)

# Build the binned view explicitly, the same way count_dups does (without noise). Previously this
# cell relied on `clone` and `i` being left over in the kernel, which fails on a fresh run.
clone = ranked[np.random.choice(ranked.columns, max_dirs, replace=False)]
clone = clone.apply(lambda x: np.digitize(x, bins=np.linspace(0, 1, digit_bins)))

res = []
for _ in range(100):
    # Draw a new candidate each time; otherwise all 100 iterations repeat the same experiment.
    i = np.random.randint(len(clone))
    targets = [0]*len(clone)
    targets[i] = 1
    clf = DecisionTreeClassifier()
    clf.fit(clone.values, targets)
    result = clf.predict(clone.values)
    res.append((result == targets).all())
# Fraction of candidates that could be recovered exactly.
np.array(res).mean()

In [None]:
# Group entities by their binned values; groups with more than one member are duplicates.
n_dups = clone.groupby(clone.columns.tolist()).size()
shared = n_dups[n_dups > 1]
dup_num, dup_in = shared.sum(), shared.count()
print(f"Duplicates: {dup_num} ({dup_num/len(clone):.2%}) entities share {dup_in} values (value-space {dup_in/(digit_bins**max_dirs):.2%} filled)")

# Show every entity whose binned values equal those of the last tested candidate.
target_row = clone.iloc[np.argmax(np.array(targets))].values
display(clone[(clone == target_row).all(axis=1)])