In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML
import pyperclip

from misc_util.logutils import setup_logging
from misc_util.pretty_print import Markdown, display

from derive_conceptualspace.pipeline import SnakeContext, load_envfiles, cluster_loader
from derive_conceptualspace.util.result_analysis_tools import getfiles_allconfigs, display_metrics, show_lambda_elements, highlight_nonzero_max
from derive_conceptualspace.settings import DEFAULT_N_CPUS
from derive_conceptualspace.util.threadworker import WorkerPool
from derive_conceptualspace.cli.args_from_filename import get_filename, print_envvars
from derive_conceptualspace.util.desc_object import DescriptionList

# Notebook-wide default figure size for matplotlib, given as [width, height] (matplotlib interprets these in inches).
plt.rcParams['figure.figsize'] = [16, 10]

In [2]:
# Initialise logging through the project's helper (called with its defaults).
setup_logging()
# Load the env-files for "placetypes" — presumably the dataset name, since it matches the
# DATASET value in the settings printed by the loading cell; TODO confirm.
load_envfiles("placetypes")
# Presumably gathers every stored configuration that has a "clusters" result — TODO confirm
# against getfiles_allconfigs. `configs` is indexed by position further down (configs[0]).
# NOTE(review): `print_cnf` is not used in the visible cells — verify it is needed later.
configs, print_cnf = getfiles_allconfigs("clusters", verbose=False, parse_all=True)

In [3]:
# One loader callable per requested artifact; the keys match the names passed to load() below.
loaders = {
    "clusters": cluster_loader,
    # Only the `embedding_` attribute of the stored embedding object is kept.
    "embedding": lambda **kwargs: kwargs["embedding"].embedding_,
    "pp_descriptions": DescriptionList.from_json,
}
# Load the three artifacts for the first configuration in `configs`.
clusters, embedding, descriptions = SnakeContext.loader_context(config=configs[0]).load(
    "clusters",
    "embedding",
    "pp_descriptions",
    loaders=loaders,
)
# `.values()` must yield exactly two items here: the first is re-bound to `clusters`,
# the second becomes `planes` (a mapping, iterated with `.items()` in the next cell).
clusters, planes = clusters.values()

<span style="font-size:11pt">env_vars demanded config EMBED_DIMENSIONS to be <span style="color: #ff0000">200</span>, but force overwrites it to <span style="color: #0000ff">50</span></span>

<span style="font-size:11pt">env_vars demanded config CLASSIFIER_SUCCMETRIC to be <span style="color: #ff0000">kappa_digitized_onlypos_2</span>, but force overwrites it to <span style="color: #0000ff">kappa_rank2rank_onlypos_max</span></span>

<span style="font-size:11pt">Running with the following settings [<span style="color: #00ff00">a3e25b6709</span>]: ALL_DESCRIPTIONS_LANG: <span style="color: #ff0000">en</span>, CANDIDATE_MIN_TERM_COUNT: <span style="color: #ff0000">50</span>, CANDS_USE_NDOCS_COUNT: <span style="color: #0000ff">True</span>, CLASSIFIER: <span style="color: #0000ff">SVM</span>, CLASSIFIER_SUCCMETRIC: <span style="color: #0000ff">kappa_rank2rank_onlypos_max</span>, CLUSTER_DIRECTION_ALGO: <span style="color: #0000ff">reclassify</span>, DATASET: <span style="color: #ff0000">placetypes</span>, DCM_QUANT_MEASURE: <span style="color: #0000ff">count</span>, DEBUG: <span style="color: #0000ff">False</span>, DISSIM_MAT_ONLY_PARTNERED: <span style="color: #0000ff">True</span>, DISSIM_MEASURE: <span style="color: #0000ff">norm_ang_dist</span>, EMBED_ALGO: <span style="color: #0000ff">mds</span>, EMBED_DIMENSIONS: <span style="color: #ff0000">50</span>, EXTRACTION_METHOD: <span style="color: #ff0000">all</span>, KAPPA_WEIGHTS: <span style="color: #0000ff">quadratic</span>, LANGUAGE: <span style="color: #ff0000">en</span>, MAX_NGRAM: <span style="color: #ff0000">None</span>, MIN_WORDS_PER_DESC: <span style="color: #0000ff">50</span>, MOST_DISTINCT_PERCENT: <span style="color: #0000ff">30</span>, NDIMS_NCANDS_FACTOR: <span style="color: #0000ff">2</span>, NGRAMS_IN_EMBEDDING: <span style="color: #0000ff">False</span>, PP_COMPONENTS: <span style="color: #ff0000">none</span>, PREPROCESSED_BOW: <span style="color: #ff0000">True</span>, PRIM_LAMBDA: <span style="color: #0000ff">0.5</span>, QUANTIFICATION_MEASURE: <span style="color: #ff0000">ppmi</span>, RANDOM_SEED: <span style="color: #ff0000">None</span>, SEC_LAMBDA: <span style="color: #0000ff">0.1</span>, TRANSLATE_POLICY: <span style="color: #0000ff">onlyorig</span></span>

In [4]:
# How many dimensions to list in the printed summary.
N_SHOWN_DIMS = 20

# One dict per embedded entity (row): for every dimension name in `planes`, the value of
# that plane's `dist()` evaluated at the entity's embedding vector.
axis_dists = [
    {dim_name: plane.dist(embedding[row]) for dim_name, plane in planes.items()}
    for row in range(len(embedding))
]

# idxmax() returns, per column (dimension), the row label holding the largest value. The frame
# is built from a plain list, so that label is the entity's position, which is then used to
# look up the title of the corresponding description.
best_per_dim = {
    dim_name: descriptions._descriptions[row].title
    for dim_name, row in pd.DataFrame(axis_dists).idxmax().to_dict().items()
}

# Truncate *before* formatting and compute the padding width once, instead of recomputing it
# for every line and formatting all dimensions only to discard most of them.
shown_dims = list(best_per_dim.items())[:N_SHOWN_DIMS]
name_width = max((len(dim_name) for dim_name, _ in shown_dims), default=0)  # default guards the empty case
print(
    "Highest-ranking descriptions per dimension:\n    "
    + "\n    ".join(f"{dim_name.ljust(name_width)}: {title}" for dim_name, title in shown_dims)
)

Highest-ranking descriptions per dimension:
    isawyoufirst           : beach
    workspace              : office
    nutrition              : restaurant
    goalie                 : stadium
    pumper                 : building
    starwoodhotels         : hotel room
    interstate10           : highway
    urban                  : interior
    tuolumne               : creek
    cabs                   : downtown
    investment             : school
    stripmall              : downtown
    michiganstateuniversity: school
    ews                    : railroad
    anchored               : boat
    a10                    : airport
    wc2                    : restaurant
    airbase                : airport
    joshuatreenationalpark : canyon
    clinker                : building
