In [1]:
import random

import altair as alt
import numpy as np
import pandas as pd

from typdiv_sampling.distance import (
    get_dist_matrix,
    get_first_point,
    get_summed_dist_dict,
)

# Fixed seed so that repeated runs of the notebook are reproducible.
# Only the stdlib `random` module is seeded here; numpy's RNG is not.
# NOTE(review): nothing in the cells shown here calls `random` directly;
# presumably the typdiv_sampling helpers rely on it -- confirm.
RAND_SEED = 123
random.seed(RAND_SEED)


def sample_maxsum(max_langs, dist_mtx):
    """
    Maximum Diversity Problem (greedy MaxSum heuristic).

    Sample k languages from N. The first pick is the language whose row
    sum in ``dist_mtx`` is largest; afterwards the candidate with the
    largest value reported by ``get_summed_dist_dict`` is added until k
    languages have been selected.

    Args:
        max_langs: Number of languages to sample (k), an integer.
        dist_mtx: 2-D numpy array with one row per language, indexed by
            integer language id (presumably the square pairwise distance
            matrix produced by ``get_dist_matrix`` -- confirm).

    Returns:
        List of selected language ids in selection order. An empty list
        is returned when ``max_langs`` is not positive.

    Raises:
        ValueError: If more languages are requested than ``dist_mtx`` has
            rows.
    """
    # Asking for nothing must yield nothing; previously the seed language
    # was returned even for max_langs <= 0.
    if max_langs <= 0:
        return []

    n_langs = len(dist_mtx)
    if max_langs > n_langs:
        # Fail early with a clear message instead of running the candidate
        # pool dry inside the loop.
        raise ValueError(
            f"cannot sample {max_langs} languages from only {n_langs}"
        )

    most_distant_langid = dist_mtx.sum(axis=1).argmax()
    langs = [most_distant_langid]
    # Candidate pool: every id that has not been selected yet.
    all_langs = [i for i in range(n_langs) if i != most_distant_langid]

    while len(langs) < max_langs:
        summed_dist = get_summed_dist_dict(dist_mtx, all_langs, langs)
        # max() keeps the first key on ties, i.e. the mapping's own order.
        next_most_distant = max(summed_dist, key=summed_dist.get)
        all_langs.remove(next_most_distant)
        langs.append(next_most_distant)

    return langs


def sample_maxmin(max_langs, dist_mtx):
    """
    MaxMin Diversity Problem (greedy heuristic).

    Sample k languages from N. The first point comes from
    ``get_first_point``, the second is the point farthest from it, and
    every further pick is the unselected point whose minimum distance to
    the already selected points is largest.

    Args:
        max_langs: Number of languages to sample (k), an integer.
        dist_mtx: 2-D numpy array of distances indexed by integer language
            id on both axes (presumably the square pairwise matrix from
            ``get_dist_matrix`` -- confirm).

    Returns:
        List of the selected language ids. The list is built from a set,
        so its order is not the selection order. An empty list is returned
        when ``max_langs`` is not positive.

    Raises:
        ValueError: If more languages are requested than ``dist_mtx`` has
            rows.
    """
    # Asking for nothing must yield nothing; previously two points were
    # always returned, even for max_langs of 0 or 1.
    if max_langs <= 0:
        return []

    n_langs = dist_mtx.shape[0]
    if max_langs > n_langs:
        # Fail early with a clear message instead of an argmax over an
        # empty candidate array.
        raise ValueError(
            f"cannot sample {max_langs} languages from only {n_langs}"
        )

    p1 = get_first_point(dist_mtx)
    if max_langs == 1:
        return [p1]
    p2 = dist_mtx[p1].argmax()

    S = {p1, p2}

    # Boolean mask of ids that are still available for selection.
    is_free = np.ones(n_langs, dtype=bool)
    is_free[[p1, p2]] = False

    # nearest[c] is the smallest distance from c to any selected point. It
    # is updated with one column per pick instead of re-slicing the full
    # candidate-by-selected submatrix on every iteration.
    nearest = np.minimum(dist_mtx[:, p1], dist_mtx[:, p2])

    while len(S) < max_langs:
        free_ids = np.flatnonzero(is_free)
        # argmax returns the first maximum, so ties go to the lowest id.
        pick = int(free_ids[nearest[free_ids].argmax()])
        S.add(pick)
        is_free[pick] = False
        nearest = np.minimum(nearest, dist_mtx[:, pick])

    return list(S)


# Point cloud to sample from: one row per "language".
# NOTE(review): the file name suggests 2000 normally distributed points
# (loc=1, scale=10) -- not verified here.
V = np.load("../data/normal_2000_loc1_scale10.npy")
# Row ids of every point; used below to plot the full cloud as a backdrop.
all_langs = list(range(len(V)))
# Despite its name this is handed to the samplers as their `dist_mtx` argument.
dist_dict = get_dist_matrix(V)

In [2]:
# Draw a sample of the same size with each strategy so the two plots can
# be compared side by side.
maxsum_sample = sample_maxsum(max_langs=50, dist_mtx=dist_dict)
maxmin_sample = sample_maxmin(max_langs=50, dist_mtx=dist_dict)

In [3]:
# Backdrop layer: every point in V, drawn semi-transparently without axes.
frame = (
    alt.Chart(pd.DataFrame(V[all_langs, :2], columns=["x", "y"]))
    .mark_point(filled=True, color="#6C8EBF", opacity=0.2)
    .encode(alt.X("x", axis=None), alt.Y("y", axis=None))
)
# Highlight layer: the MaxSum sample as opaque triangles.
sample = (
    alt.Chart(pd.DataFrame(V[maxsum_sample, :2], columns=["x", "y"]))
    .mark_point(filled=True, color="#FF4647", opacity=1, shape="triangle")
    .encode(alt.X("x", axis=None), alt.Y("y", axis=None))
)
maxsum_plot = alt.layer(frame, sample)
# The configured chart is only displayed as the cell output; `maxsum_plot`
# itself is left without this configuration.
maxsum_plot.configure_axis(grid=True, domain=False).configure_view(strokeWidth=0)

In [4]:
# Backdrop layer: every point in V, drawn semi-transparently without axes.
frame = (
    alt.Chart(pd.DataFrame(V[all_langs, :2], columns=["x", "y"]))
    .mark_point(filled=True, color="#6C8EBF", opacity=0.15)
    .encode(alt.X("x", axis=None), alt.Y("y", axis=None))
)
# Highlight layer: the MaxMin sample as opaque triangles.
sample = (
    alt.Chart(pd.DataFrame(V[maxmin_sample, :2], columns=["x", "y"]))
    .mark_point(filled=True, color="#FF4647", opacity=1, shape="triangle")
    .encode(alt.X("x", axis=None), alt.Y("y", axis=None))
)
maxmin_plot = alt.layer(frame, sample)
# The configured chart is only displayed as the cell output; `maxmin_plot`
# itself is left without this configuration.
maxmin_plot.configure_axis(grid=True, domain=False).configure_view(strokeWidth=0)

In [5]:
# Export the figures with the same configuration used for the inline
# display. `configure_*` returns a new chart rather than modifying the
# original, so `maxmin_plot` / `maxsum_plot` carry no configuration and
# would otherwise be written with the default view border.
(
    maxmin_plot.configure_axis(grid=True, domain=False)
    .configure_view(strokeWidth=0)
    .save("maxmin.pdf")
)
(
    maxsum_plot.configure_axis(grid=True, domain=False)
    .configure_view(strokeWidth=0)
    .save("maxsum.pdf")
)