In [None]:
import os
from os.path import join, isdir, isfile, abspath, dirname, splitext, basename, split
from parse import parse
import math
import random
import itertools

from IPython.display import Markdown, display
from dotenv import load_dotenv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import sklearn
from sklearn.metrics import confusion_matrix

from misc_util.pretty_print import pretty_print as print
from derive_conceptualspace.pipeline import CustomContext, SnakeContext
from derive_conceptualspace.settings import ENV_PREFIX, get_setting
from derive_conceptualspace import settings

# Set matplotlib's default figure size for the plots drawn in this notebook.
plt.rcParams['figure.figsize'] = [8, 5]

In [None]:
# Load the environment configuration, then build the pipeline context.
# The load_dotenv calls are deliberately kept OUT of the assert expressions:
# asserts are stripped when Python runs with -O, which would silently skip the
# loading itself instead of just skipping the check.
select_env_file = abspath(join(os.getcwd(), "..", "..", "config", "_select_env.env"))
select_env_loaded = load_dotenv(select_env_file)
assert select_env_loaded, f"load_dotenv did not load anything from {select_env_file}"

# MA_ENV_FILE is presumably provided by the file loaded above (or by the real
# environment) -- TODO confirm. A missing variable raises KeyError here.
main_env_file = os.environ["MA_ENV_FILE"]
main_env_loaded = load_dotenv(main_env_file)
assert main_env_loaded, f"load_dotenv did not load anything from {main_env_file}"

# Create the context with this notebook's configuration and load the
# pipeline artefacts that the following cells read from ctx.obj.
ctx = SnakeContext.loader_context(config={"DEBUG": False, "EMBED_DIMENSIONS": 100})
ctx.load("pp_descriptions", "filtered_dcm", "dissim_mat", "embedding")
ctx.obj["filtered_dcm"].show_info()

In [None]:
# Show whatever the context reports for the "embedding" entry.
# Note: `print` is misc_util's pretty_print (see the import cell), not the builtin.
embedding_step_output = ctx.display_output("embedding")
print(embedding_step_output)

In [None]:
# Pull the loaded pipeline artefacts out of the context.
dcm = ctx.obj["filtered_dcm"]
descriptions = ctx.obj["pp_descriptions"]
embedding = ctx.obj["embedding"].embedding_

# Fixed selection of candidate terms to look at.
# (Alternative, currently disabled: terms = list(dcm.all_terms.values()))
terms = [
    'nature', 'ceiling', 'engine', 'athlete', 'seafood', 'shadows', 'skyscrapers', 'b737',
    'monument', 'baby', 'sign', 'marine', 'iowa', 'field', 'buy', 'military',
    'lounge', 'factory', 'road', 'education', '13thcentury', 'people', 'wait', 'travel',
    'tunnel', 'treno', 'wings', 'hot', 'background', 'vintage', 'farmhouse', 'technology',
    'building', 'horror', 'realestate', 'crane', 'slipway', 'ruin', 'national', 'morze',
]

# One dcm.term_quants(...) result per term, in the same order as `terms`.
quants_s = [dcm.term_quants(candidate) for candidate in tqdm(terms, desc="Counting Terms")]

# The rest of the notebook works on the first term only.
term = terms[0]
quants = quants_s[0]

In [None]:
# Fit a linear SVM that separates entities with a non-zero quantification of
# `term` from the rest, then score every embedded entity with it.

# Import the estimator explicitly: `import sklearn` alone does not reliably
# expose the `sklearn.svm` submodule as an attribute, so `sklearn.svm.LinearSVC`
# can raise AttributeError on a fresh kernel.
from sklearn.svm import LinearSVC

# Any non-zero entry of `quants` becomes a positive label.
bin_labels = np.array(quants, dtype=bool)

# random_state pins the solver's random number generator so that re-running
# the notebook reproduces the same classifier.
svm = LinearSVC(class_weight="balanced", loss="hinge", max_iter=8000, random_state=0)
svm.fit(embedding, bin_labels)

# Decision-function value per entity; the next cell treats > 0 as a positive prediction.
svm_results = svm.decision_function(embedding)

In [None]:
# Evaluate the SVM on the data it was fitted on: an entity counts as
# predicted-positive when its decision-function value is above zero.
predicted_positive = svm_results > 0
tn, fp, fn, tp = confusion_matrix(bin_labels, predicted_positive).ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
accuracy = (tp + tn) / len(quants)

# F1 is the harmonic mean of precision and recall.
f_one = 2 * (precision * recall) / (precision + recall)
print("F1:", f_one)

In [None]:
from derive_conceptualspace.semantic_directions.create_candidate_svm import display_svm

# Display the fitted SVM for the current term, asking display_svm to highlight
# the two entities that are inspected in the following cells.
int_labels = np.array(bin_labels, dtype=int)
highlighted_titles = ["airplane cabin", "aircraft cabin"]
display_svm(
    embedding,
    int_labels,
    svm,
    term=term,
    descriptions=descriptions,
    name=term,
    highlight=highlighted_titles,
)

In [None]:
# Positions (within descriptions._descriptions) of the two entities whose
# titles suggest they should be near each other.
target_titles = ["airplane cabin", "aircraft cabin"]
close_inds = [
    position
    for position, description in enumerate(descriptions._descriptions)
    if description.title in target_titles
]
close_inds

In [None]:
# Show the embedding rows that belong to the indices collected in close_inds.
embedding[close_inds]

In [None]:
# Euclidean distance between the embedding vectors of the two entities.
# close_inds holds row indices (ints from enumerate), so the vectors have to be
# looked up in `embedding` first; subtracting the indices themselves would only
# yield the gap between the two row numbers.
np.linalg.norm(embedding[close_inds[0]] - embedding[close_inds[1]])

In [None]:
# Baseline: mean Euclidean distance between randomly chosen embedding rows.
# 1000 rows are drawn, all of their pairwise combinations are formed, and 500
# of those pairs are sampled.
sampled_points = random.sample(list(embedding), 1000)
candidate_pairs = list(itertools.combinations(sampled_points, 2))
random_pairs = random.sample(candidate_pairs, 500)

pair_distances = [np.linalg.norm(first - second) for first, second in random_pairs]
average_dist = np.mean(pair_distances)
average_dist

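OK, so the embeddings of two supposedly close entities ("airplane cabin" and "aircraft cabin") are not close at all compared with the average distance between random pairs.
(Caveat: this conclusion needs re-checking — the distance has to be taken between the two embedding vectors, `embedding[close_inds[0]]` and `embedding[close_inds[1]]`, not between the indices stored in `close_inds`.)
What about their rows in the dissimilarity matrix?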

In [None]:
# Same comparison one pipeline step earlier: take the rows of the two entities
# from the second element of the loaded "dissim_mat" object and measure the
# Euclidean distance between them.
dissim_matrix = ctx.obj["dissim_mat"][1]
orig1, orig2 = dissim_matrix[close_inds]
np.linalg.norm(orig1 - orig2)

In [None]:
# Baseline for the dissimilarity rows: mean Euclidean distance between randomly
# chosen rows. 1000 rows are drawn, all of their pairwise combinations are
# formed, and 500 of those pairs are sampled.
dissim_rows = list(ctx.obj["dissim_mat"][1])
sampled_rows = random.sample(dissim_rows, 1000)
candidate_pairs = list(itertools.combinations(sampled_rows, 2))
random_pairs = random.sample(candidate_pairs, 500)

pair_distances = [np.linalg.norm(first - second) for first, second in random_pairs]
average_dist = np.mean(pair_distances)
average_dist

-> In the dissimilarity matrix, the two entities are a lot closer to each other than the average random pair! So is there an error when creating the embedding?!