In [None]:
from os.path import join

import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
from matplotlib.backends.backend_pdf import PdfPages

from misc_util.logutils import setup_logging
from misc_util.pretty_print import display

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles
from derive_conceptualspace.settings import DEFAULT_N_CPUS
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs
from derive_conceptualspace.util.threadworker import WorkerPool
from derive_conceptualspace.analysis.plots import scatter_2d, scatter_3d, set_seaborn

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Visualizing Data-Embeddings

In [None]:
# Set up logging via the project helper (the exact configuration lives in misc_util.logutils).
setup_logging()
# Load the env-files selected by the name "siddata".
# NOTE(review): presumably this populates the dataset-specific settings used by the pipeline - confirm.
load_envfiles("siddata")
# Collect all available configs for "clusters" (verbose output enabled).
# `configs` is handed to the worker pool in the next cell; `print_cnf` is not used in the visible cells.
configs, print_cnf = getfiles_allconfigs("clusters", verbose=True)

In [None]:
def get_featureaxes(conf):
    """Load the important settings and the clusters that belong to one config.

    Returns the triple ``(important_settings, clusters, conf)``; the config itself
    is passed through so that later cells can re-open the same context.
    """
    ctx = SnakeContext.loader_context(config=conf, silent=True)
    return ctx.get_important_settings(), ctx.load("clusters"), conf


# Fetch the clusters of all configs via the worker pool (one CPU is left free).
with WorkerPool(DEFAULT_N_CPUS-1, pgbar="Fetching clusters..") as pool:
    perconf_list, interrupted = pool.work(configs, get_featureaxes)

In [None]:
def load_best_conf(perconf_list, restrictions=None):
    """Pick the config with the most clusters and load its remaining artifacts.

    Args:
        perconf_list: Sequence of ``(settings, clusters, conf)`` triples, where
            ``clusters["clusters"]`` is a sized collection and ``conf`` is the
            config handed to ``SnakeContext.loader_context``.
        restrictions: Optional predicate called with each ``conf``; only triples
            for which it returns a truthy value are considered. If omitted (or
            falsy), every config is accepted.

    Returns:
        The tuple ``(ctx, embedding, descriptions, dissim_mat)`` for the selected
        config, where the last three are the results of loading "embedding",
        "pp_descriptions" and "dissim_mat" from ``ctx``.

    Raises:
        ValueError: If no config is left after applying ``restrictions``.
    """
    restrictions = restrictions or (lambda x: True)
    candidates = [elem for elem in perconf_list if restrictions(elem[2])]
    print("Number of clusters per config:", [len(x[1]["clusters"]) for x in candidates])
    if not candidates:
        # max() on an empty list would raise an opaque "max() arg is an empty sequence";
        # fail with the same exception type but say what actually went wrong.
        raise ValueError("No config satisfies the given restrictions - cannot select a best config.")
    display("Taking one of the configs with the most clusters...")
    # On ties, max() returns the first config with the maximal number of clusters.
    settings_str, _, conf = max(candidates, key=lambda x: len(x[1]["clusters"]))
    display(settings_str[1])
    display("Loading the rest of the necessary config...")
    ctx = SnakeContext.loader_context(config=conf, silent=True)
    embedding, descriptions, dissim_mat = ctx.load("embedding", "pp_descriptions", "dissim_mat")
    display("loading done.")
    return ctx, embedding, descriptions, dissim_mat

## Plotting original 3D-Embeddings

* The following plot visualizes an unaltered 3-dimensional MDS embedding, exactly as it was created by the algorithm.
* The 3D plot is interactive! You can twist & turn it, and also disable & enable individual categories using the legend.

In [None]:
def _is_3d(conf):
    """Accept only configs whose embedding has exactly three dimensions."""
    return conf["embed_dimensions"] == 3


# Load the 3D config with the most clusters.
ctx, embedding, descriptions, dissim_mat = load_best_conf(perconf_list, restrictions=_is_3d)
# Category lookup for "fachbereich", derived from this config's descriptions.
dataset_class = ctx.obj["dataset_class"]
getcat, hascat, catnames = dataset_class.get_custom_class("fachbereich", descriptions)
# Keep only the embedding rows selected by `hascat`.
embedding = embedding[hascat]

In [None]:
# One category value per entry of `hascat`, appended as the last column next to the coordinates.
faculty_column = [getcat(idx) for idx in hascat]
coords_with_faculty = np.column_stack((embedding, faculty_column))
df = pd.DataFrame(coords_with_faculty, columns=["x","y","z","faculty"])
# The trailing semicolon suppresses the repr of the return value in the notebook output.
scatter_3d(df, "faculty", catnames);

<br/><br/><br/><br/><br/><br/>
## Plotting the result of t-SNE of the best-performing dissimilarity Matrix

Note: the input here is the dissimilarity matrix, not the embedding! t-SNE itself performs the embedding.

In [None]:
# Load the config with the most clusters overall (no restriction on the number of dimensions).
ctx, embedding, descriptions, dissim_mat = load_best_conf(perconf_list)
# Re-derive the category lookup from the descriptions that were just loaded: the values computed
# earlier belong to the 3D config and need not line up with this config's descriptions.
getcat, hascat, catnames = ctx.obj["dataset_class"].get_custom_class("fachbereich", descriptions)

In [None]:
# Restrict the dissimilarity matrix to the entries selected by `hascat`, on rows and columns alike.
# The result gets its own name so that re-running this cell does not index an already-filtered matrix.
# NOTE(review): assumes dissim_mat[1] holds the square matrix itself - confirm against ctx.load("dissim_mat").
dissim_sub = dissim_mat[1][hascat].T[hascat].T
# tsne_emb = TSNE(n_components=2, random_state=0, metric="cosine"); tsne_emb.fit(embedding) #we could also do TSNE on the embedding here
# init="random" must be explicit: with metric="precomputed", scikit-learn rejects the "pca" init,
# which is the default since scikit-learn 1.2 ("random" was the default before that).
tsne_emb = TSNE(n_components=2, random_state=0, metric="precomputed", init="random")
tsne = tsne_emb.fit(dissim_sub)
df = pd.DataFrame(np.column_stack((tsne.embedding_, [getcat(i) for i in hascat])), columns=["x","y","faculty"])

In [None]:
# Build the output file name from the first important-settings string, minus its first and last three characters.
settings_tag = ctx.get_important_settings()[0][3:-3]
savepath = join(ctx.p.in_dir, f"scatter_mds_tsne_{settings_tag}.pdf")

set_seaborn()
# Render the 2D scatter plot and write it as a single page into the PDF.
with PdfPages(savepath) as pdf:
    fig = scatter_2d(df, "faculty", catnames)
    pdf.savefig(fig, bbox_inches='tight')
print(f"Saved under {savepath}")