# Visually checking a 3D-Embedding

In [None]:
import os
from os.path import join, isdir, isfile, abspath, dirname, splitext, basename, split
from parse import parse
import math
import random
import itertools

from IPython.display import Markdown, display
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import sklearn
from sklearn.metrics import confusion_matrix

from misc_util.pretty_print import pretty_print as print
from derive_conceptualspace.pipeline import CustomContext, SnakeContext, load_envfiles
from derive_conceptualspace.settings import ENV_PREFIX, get_setting
from derive_conceptualspace import settings
from derive_conceptualspace.semantic_directions.create_candidate_svm import display_svm
from derive_conceptualspace.create_spaces.create_embedding import show_close_descriptions

# Default size for every figure created in this notebook
# (matplotlib reads `figure.figsize` as [width, height] in inches).
plt.rcParams['figure.figsize'] = [8, 5]

In [None]:
# Env-files are loaded before the context is created
# (presumably they provide settings the context relies on - confirm in load_envfiles).
load_envfiles()

# Configuration overrides for this notebook; EMBED_DIMENSIONS=3 requests the
# 3-dimensional embedding that is inspected here.
ctx = SnakeContext.loader_context(
    config=dict(
        DEBUG=False,
        EMBED_DIMENSIONS=3,
        VERBOSE=False,
        CLASSIFIER_COMPARETO_RANKING="ppmi",
    ),
    warn_filters=["DifferentFileWarning"],
)

# The custom "embedding" loader keeps only the `embedding_` attribute of the
# loaded object, so `embedding` below holds that attribute, not the wrapper.
pp_descriptions, dcm, dissim_mat, embedding = ctx.load(
    "pp_descriptions",
    "filtered_dcm",
    "dissim_mat",
    "embedding",
    loaders={"embedding": lambda **kwargs: kwargs["embedding"].embedding_},
)
dcm.show_info(descriptions=pp_descriptions)

In [None]:
# First the close descriptions according to the second element of `dissim_mat`
# (presumably the dissimilarity matrix itself - confirm against the pipeline) ...
show_close_descriptions(dissim_mat[1], pp_descriptions)
#print(ctx.display_output("embedding", ignore_err=True))

# ... then the same check on the embedding coordinates. The title names the
# configured dissimilarity measure.
embedding_title = f"Embedding-Distances ({get_setting('DISSIM_MEASURE')})"
show_close_descriptions(embedding, pp_descriptions, is_embedding=True, title=embedding_title)

## Calculating if known similar descriptions are close

In [None]:
# Pick a pair of descriptions that is known to be similar for the loaded dataset.
dataset_name = ctx.get_config("dataset")
if "siddata" in dataset_name:
    CLOSE_DESCRIPTIONS = ["Informatik A: Algorithmen", "Informatik B: Grundlagen der Software-Entwicklung"]
elif dataset_name == "placetypes":
    CLOSE_DESCRIPTIONS = ["airplane cabin", "aircraft cabin"]
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(f"No known pair of similar descriptions defined for dataset {dataset_name!r}")

close_inds = [n for n, i in enumerate(pp_descriptions._descriptions) if i.title in CLOSE_DESCRIPTIONS]
if len(close_inds) < 2:
    # Without this check a missing title surfaces as a bare IndexError below.
    raise ValueError(f"Expected both of {CLOSE_DESCRIPTIONS} among the descriptions, but found {len(close_inds)}")
close_dist = np.linalg.norm(embedding[close_inds[0]] - embedding[close_inds[1]])

# Baseline: mean distance over (up to) 500 random pairs built from (up to) 1000
# random points. Both sample sizes are capped at what is available, because
# random.sample raises ValueError when asked for more items than the population holds.
all_points = list(embedding)
sampled_points = random.sample(all_points, min(1000, len(all_points)))
candidate_pairs = list(itertools.combinations(sampled_points, 2))
random_pairs = random.sample(candidate_pairs, min(500, len(candidate_pairs)))
average_dist = np.mean([np.linalg.norm(p1 - p2) for p1, p2 in random_pairs])
# Bare expression on purpose: as the last statement of the cell it is shown as the cell output.
f"Known close ones: {close_dist:.3f}, Average dist: {average_dist:.3f}"

## Visually checking whether the SVM separates the classes well and whether known similar descriptions are close

In [None]:
def create_svm(term, embedding, dcm, descriptions, highlight=None):
    """Fit a linear SVM that separates entities by occurrence of `term` and plot it.

    The binary label of an entity is the truthiness of its value in
    `dcm.term_quants(term)`. The classifier is fitted and scored on the same
    `embedding`, so the F1 shown in the plot name is a training score.

    Args:
        term: Term whose occurrence defines the positive class; also used in the plot name.
        embedding: Feature matrix handed to the SVM (one row per entity).
        dcm: Object providing `term_quants(term)`.
        descriptions: Forwarded unchanged to `display_svm`.
        highlight: Optional, forwarded unchanged to `display_svm`.
    """
    # Imported explicitly: a plain `import sklearn` does not guarantee that the
    # `sklearn.svm` submodule has been loaded.
    from sklearn.svm import LinearSVC

    quants = dcm.term_quants(term)
    bin_labels = np.array(quants, dtype=bool)
    svm = LinearSVC(class_weight="balanced", loss="hinge", max_iter=8000)
    svm.fit(embedding, bin_labels)
    # Positive side of the separating hyperplane == predicted to contain the term.
    predictions = svm.decision_function(embedding) > 0
    tn, fp, fn, tp = confusion_matrix(bin_labels, predictions).ravel()
    # Guard the ratios: if nothing is predicted positive, the plain divisions
    # would yield NaN (plus a RuntimeWarning) instead of a score of 0.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f_one = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    display_svm(embedding, np.array(bin_labels, dtype=int), svm, term=term, descriptions=descriptions, name=term+f" (F1: {f_one:.3f})", highlight=highlight)

In [None]:
# Choose one example term per supported dataset, then plot its SVM with the
# known-similar descriptions highlighted. Other datasets plot nothing.
if ctx.get_config("dataset") == "placetypes":
    example_term = "nature"
elif "siddata" in ctx.get_config("dataset"):
    example_term = "mathematik"
else:
    example_term = None

if example_term is not None:
    create_svm(example_term, embedding, dcm, pp_descriptions, highlight=CLOSE_DESCRIPTIONS)

In [None]:
NUM = 3
featureaxes = ctx.load("featureaxes")
axis_metrics = featureaxes["metrics"]

# Collect the NUM highest-scoring terms for each of the two metrics, best first.
# A term that ranks high on both metrics appears twice.
best = []
for metric_name in ("kappa_digitized_onlypos_1", "f_one"):
    ranked_terms = sorted(axis_metrics, key=lambda t: axis_metrics[t][metric_name], reverse=True)
    best += ranked_terms[:NUM]

# Add the first NUM terms of the doc-term matrix ...
terms = best + list(dcm.all_terms.values())[:NUM]
# ... and, for placetypes, the first NUM entries of a hand-picked list.
if ctx.get_config("dataset") == "placetypes":
    placetypes_terms = [
        'nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737', 'monument', 'baby',
        'sign', 'marine', 'iowa', 'field', 'buy', 'military', 'lounge', 'factory', 'road', 'education',
        '13thcentury', 'people', 'wait', 'travel', 'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage',
        'farmhouse', 'technology', 'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze',
    ]
    terms += placetypes_terms[:NUM]
# Bare expression: shown as the output of the cell.
terms

In [None]:
# Fit and plot one SVM per selected term; no descriptions are highlighted here
# (create_svm's `highlight` stays at its default of None).
for term in terms:
    create_svm(term, embedding, dcm, pp_descriptions)