# Visually checking a 3D-Embedding

In [None]:
import os
from os.path import join, isdir, isfile, abspath, dirname, splitext, basename, split
from parse import parse
import math
import random
import itertools

from IPython.display import Markdown, display
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sklearn
from sklearn.metrics import confusion_matrix

from misc_util.pretty_print import pretty_print as print
from derive_conceptualspace.pipeline import CustomContext, SnakeContext, load_envfiles
from derive_conceptualspace.settings import ENV_PREFIX, get_setting
from derive_conceptualspace import settings
from derive_conceptualspace.semantic_directions.create_candidate_svm import display_svm
from derive_conceptualspace.create_spaces.create_embedding import show_close_descriptions

plt.rcParams['figure.figsize'] = [8, 5]

In [None]:
load_envfiles()

# Settings pinned for this notebook. EMBED_DIMENSIONS is 3 because the plotting
# code further down asserts a three-column embedding.
_notebook_config = {
    "DEBUG": False,
    "EMBED_DIMENSIONS": 3,
    "VERBOSE": False,
    "CLASSIFIER_COMPARETO_RANKING": "ppmi",
}
ctx = SnakeContext.loader_context(config=_notebook_config, warn_filters=["DifferentFileWarning"])


def _unwrap_embedding(**kwargs):
    """Return the `embedding_` attribute of the object passed as `embedding`."""
    return kwargs["embedding"].embedding_


pp_descriptions, dcm, dissim_mat, embedding = ctx.load(
    "pp_descriptions",
    "filtered_dcm",
    "dissim_mat",
    "embedding",
    loaders={"embedding": _unwrap_embedding},
)
dcm.show_info(descriptions=pp_descriptions)

In [None]:
# First view: closest descriptions according to the second element of `dissim_mat`.
show_close_descriptions(dissim_mat[1], pp_descriptions)
# print(ctx.display_output("embedding", ignore_err=True))

# Second view: the same check on the embedding itself, titled with the configured measure.
_embedding_title = f"Embedding-Distances ({get_setting('DISSIM_MEASURE')})"
show_close_descriptions(embedding, pp_descriptions, is_embedding=True, title=_embedding_title)

## Checking whether known-similar descriptions are close together

In [None]:
# Pick a pair of descriptions that are known to be similar for the configured dataset.
_dataset_name = ctx.get_config("dataset")
if "siddata" in _dataset_name:
    CLOSE_DESCRIPTIONS = ["Informatik A: Algorithmen", "Informatik B: Objekt-orientierte Programmierung in Java"]
elif _dataset_name == "placetypes":
    CLOSE_DESCRIPTIONS = ["airplane cabin", "aircraft cabin"]
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError(f"No known-similar descriptions are defined for dataset {_dataset_name!r}")

close_inds = [n for n, i in enumerate(pp_descriptions._descriptions) if i.title in CLOSE_DESCRIPTIONS]
if len(close_inds) < 2:
    raise ValueError(f"Expected to find both of {CLOSE_DESCRIPTIONS} in the descriptions, found {len(close_inds)}")
close_dist = np.linalg.norm(embedding[close_inds[0]] - embedding[close_inds[1]])

# Baseline: mean distance over up to 500 random pairs drawn from up to 1000 random points.
# Both sample sizes are capped so that small embeddings do not make random.sample raise.
_sample_points = random.sample(list(embedding), min(1000, len(embedding)))
_candidate_pairs = list(itertools.combinations(_sample_points, 2))
random_pairs = random.sample(_candidate_pairs, min(500, len(_candidate_pairs)))
average_dist = np.mean([np.linalg.norm(p1 - p2) for p1, p2 in random_pairs])
print(f"Distances: Known close ones: {close_dist:.3f}, Average dist: {average_dist:.3f}")

## Visually checking whether the SVM separates the classes well and whether known-similar descriptions are close

In [None]:
def display_svm(X, y, svm, term=None, name=None, descriptions=None, quants=None, distances=None,
                highlight=None, stretch_fact=2, legend_inside=False, show_center=False, **kwargs):
    """Plot a 3D embedding together with the decision plane of a fitted linear SVM.

    NOTE(review): this definition shadows the `display_svm` imported at the top of the
    notebook. `ThreeDPlane`, `ThreeDFigure` and `shorten` are not imported in the visible
    part of this file and must already be in scope - confirm where they come from.

    Args:
        X: array with one row per sample and exactly three columns.
        y: per-sample labels; truthy entries are plotted as positive samples.
        svm: fitted classifier exposing `coef_` and `intercept_`.
        term: phrase passed to each description's `count_phrase`.
        name: forwarded to `ThreeDFigure` as `name`.
        descriptions: object whose `_descriptions` sequence provides `title`, `text`,
            `count_phrase` and `bow` for each sample.
        quants: optional per-sample values shown as "Quants" in the marker data.
        distances: optional per-sample values shown as "Distance" in the marker data.
        highlight: optional collection of titles to draw again as green markers.
        stretch_fact: half-length factor of the line drawn along the plane normal.
        legend_inside: if true, anchor the legend at the top left of the plot.
        show_center: if true, add a marker at the origin.
        **kwargs: forwarded to `ThreeDFigure`.
    """
    assert X.shape[1] == 3
    decision_plane = ThreeDPlane(svm.coef_[0], svm.intercept_[0])
    entries = descriptions._descriptions
    occurences = [entries[i].count_phrase(term) for i in range(len(X))]

    # Marker-size threshold: 70th percentile of the non-zero counts, computed once.
    # None when no description contains the term, so that no empty percentile is taken.
    nonzero_counts = [count for count in occurences if count]
    size_threshold = np.percentile(nonzero_counts, 70) if nonzero_counts else None

    has_text = entries[0].text is not None

    def marker_info(i):
        """Build the custom-data dict attached to the marker of sample `i`."""
        entry = entries[i]
        if has_text:
            detail = {"Description": shorten(entry.text, 200)}
        else:
            top_words = sorted(entry.bow().items(), key=lambda x: x[1], reverse=True)[:10]
            detail = {"BoW": ", ".join(f"{k}: {v}" for k, v in top_words)}
        info = {"Name": entry.title, "Occurences": occurences[i], "extra": detail}
        # Optional columns are only added when given; the BoW branch used to index
        # `quants` unconditionally and crashed when it was left at its default of None.
        if quants is not None:
            info["Quants"] = quants[i]
        if distances is not None:
            info["Distance"] = distances[i]
        return info

    extras = [marker_info(i) for i in range(len(X))]

    negative_inds = np.where(np.logical_not(y))[0]
    positive_inds = np.where(y)[0]
    positive_sizes = [9 if size_threshold is not None and occurences[i] > size_threshold else 4
                      for i in positive_inds]
    # Only indices that address a plotted sample are kept.
    highlight_inds = [n for n, entry in enumerate(entries)
                      if n < len(y) and entry.title in highlight] if highlight else []

    with ThreeDFigure(name=name, **kwargs) as fig:
        fig.add_markers(X[negative_inds], color="blue", size=0.7, custom_data=[extras[i] for i in negative_inds], linelen_right=50, name="Negative samples", opacity=0.3)
        fig.add_markers(X[positive_inds], color="red", size=positive_sizes, custom_data=[extras[i] for i in positive_inds], linelen_right=50, name="Positive samples")
        if highlight_inds:
            fig.add_markers(X[highlight_inds], color="green", size=9, custom_data=[extras[i] for i in highlight_inds], linelen_right=50, name="Highlighted")
        fig.add_surface(decision_plane, X, y, color="gray", name="Decision Plane", showlegend=True)
        # Orthogonal of the decision hyperplane, drawn through the mean of the points.
        center = X.mean(axis=0)
        fig.add_line(center - decision_plane.normal * stretch_fact, center + decision_plane.normal * stretch_fact, width=2, name="Dec.Plane Orthogonal")
        if show_center:
            fig.add_markers([0, 0, 0], size=3, name="Coordinate Center")  # coordinate center
        # fig.add_line(-decision_plane.normal * 5, decision_plane.normal * 5)  # orthogonal of decision hyperplane through [0,0,0]
        # fig.add_sample_projections(X, decision_plane.normal)  # orthogonal lines from the samples onto the decision hyperplane orthogonal
        if legend_inside:
            fig.fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
        fig.show()

In [None]:
def create_svm(term, embedding, dcm, descriptions, highlight=None, dataset_label="SIDDATA"):
    """Fit a linear SVM that separates descriptions containing `term` and plot the result.

    Labels are the boolean view of `dcm.term_quants(term)`. The SVM is evaluated on the
    same samples it was fitted on, and the resulting F1 score is shown in the figure title.

    Args:
        term: the term whose occurrence defines the positive class.
        embedding: per-sample coordinates; `display_svm` requires exactly three columns.
        dcm: object providing `term_quants(term)`.
        descriptions: forwarded to `display_svm`.
        highlight: optional titles forwarded to `display_svm` for highlighting.
        dataset_label: dataset name used in the figure title. The default keeps the
            previously hard-coded title.
    """
    # `import sklearn` alone does not guarantee that the `svm` submodule is loaded.
    from sklearn.svm import LinearSVC

    quants = dcm.term_quants(term)
    bin_labels = np.array(quants, dtype=bool)
    svm = LinearSVC(class_weight="balanced", loss="hinge", max_iter=20000)
    svm.fit(embedding, bin_labels)
    predictions = svm.decision_function(embedding) > 0
    tn, fp, fn, tp = confusion_matrix(bin_labels, predictions).ravel()
    # Guard the ratios: without predicted (or true) positives they are reported as 0
    # instead of dividing by zero and putting "nan" in the title.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f_one = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    display_svm(embedding, bin_labels.astype(int), svm, term=term, descriptions=descriptions, highlight=highlight, stretch_fact=0.28, bigfont=True,
                legend_inside=True, name=f"{dataset_label} 3D-MDS-Embedding and SVM splitting for term '{term}' (SVM F1: {f_one:.2f})")

In [None]:
# Choose the demo term for the configured dataset, then fit and plot a single SVM for it.
_dataset = ctx.get_config("dataset")
if _dataset == "placetypes":
    _demo_term = "nature"
elif "siddata" in _dataset:
    _demo_term = "mathematik"
else:
    _demo_term = None  # other datasets: nothing is plotted in this cell

if _demo_term is not None:
    create_svm(term=_demo_term, embedding=embedding, dcm=dcm, descriptions=pp_descriptions, highlight=CLOSE_DESCRIPTIONS)

In [None]:
NUM = 3
featureaxes = ctx.load("featureaxes")

# The NUM best terms by each of the two metrics, best first. A term that ranks
# high on both metrics appears twice in `best`, as before.
best = []
for _metric_name in ("kappa_digitized_onlypos_1", "f_one"):
    _ranked = sorted(featureaxes["metrics"].items(), key=lambda kv: kv[1][_metric_name], reverse=True)
    best += [_term for _term, _ in _ranked[:NUM]]

terms = best + list(dcm.all_terms.values())[:NUM]
if ctx.get_config("dataset") == "placetypes":
    terms += ['nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby', 'sign', 'marine', 'iowa', 'field',
              'buy', 'military', 'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot',
              'background', 'vintage', 'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze'][:NUM]

# Drop repeated terms but keep first-seen order, so the loop over `terms` does not
# fit and plot the same SVM more than once. Assumes the terms are hashable (strings).
terms = list(dict.fromkeys(terms))
terms

In [None]:
# One SVM fit and one 3D plot per selected term; nothing is highlighted here.
for term in terms:
    create_svm(term=term, embedding=embedding, dcm=dcm, descriptions=pp_descriptions)