In [None]:
from top2vec.Top2Vec import Top2Vec
from top2vec.similarity import (
    describe_closest_items,
    find_closest_items,
    generate_similarity_matrix,
    generate_csr_similarity_matrix,
)
import gensim
from sklearn.datasets import fetch_20newsgroups
import scipy.stats
import numpy as np

# Default figure size; plot_heuristic uses it as the default for its `figsize`
# argument, which is handed to plt.figure(figsize=...).
FIG_SIZE = (30, 10)

In [None]:
# Fetch the whole 20 newsgroups corpus (subset "all") with headers, footers
# and quotes removed from each post.
newsgroups_train = fetch_20newsgroups(
    remove=("headers", "footers", "quotes"),
    subset="all",
)
# For a quicker experiment, slice the corpus here instead,
# e.g. newsgroups_train.data[0:2000].
newsgroups_documents = newsgroups_train.data

# Fit Top2Vec with explicit document ids: the strings "0", "1", ... in corpus order.
doc_ids = list(map(str, range(len(newsgroups_documents))))
top2vec_model = Top2Vec(
    documents=newsgroups_documents,
    document_ids=doc_ids,
    workers=8,
    speed="fast-learn",
    umap_args={"random_state": 1337},
)

In [None]:
%matplotlib inline
import sklearn.metrics
from top2vec.cutoff_heuristics import (
    find_elbow_index,
    get_distances_from_line,
    find_cutoff,
    _get_shifted_second_derivative,
    ELBOW_HEURISTIC_STR,
    DERIVATIVE_HEURISTIC_STR,
    AVERAGE_HEURISTIC_STR,
)
import matplotlib.pyplot as plt

def get_cdf_of_differences(sorted_values):
    """Running total of the per-step shares of the first-to-last change.

    The per-step shares come from ``get_percent_of_total_differences``, whose
    first entry is 0, so entry ``A`` of the result is the share of the total
    variation covered in moving from index 0 to index ``A``.

    Author's note carried over from the original comment: the value at ``A``
    describes what has been accomplished BEFORE index ``A``, so an elbow
    found at ``A`` should be thought of as exclusive, not inclusive.
    """
    step_shares = get_percent_of_total_differences(sorted_values)
    running_total = np.cumsum(step_shares)
    # Round to 8 decimals to deal with floating point errors in the sum.
    return np.round(running_total, decimals=8)


def get_percent_of_total_differences(sorted_values):
    """Express each step between neighbours as a share of the overall change.

    Parameters
    ----------
    sorted_values : array-like
        One-dimensional sequence of numbers, expected to be sorted already.
        Plain Python sequences are accepted as well as numpy arrays.

    Returns
    -------
    numpy.ndarray
        Array with the same length as the input. Element 0 is always 0 and
        element ``i`` is ``(sorted_values[i] - sorted_values[i - 1])``
        divided by ``(sorted_values[-1] - sorted_values[0])``.
        An empty input gives an empty array. When the first and last values
        are equal (no overall change) the result is all zeros rather than
        the NaN/inf a division by zero would produce.
    """
    sorted_values = np.asarray(sorted_values)
    if sorted_values.size == 0:
        return np.zeros(0, dtype=float)

    total_difference = sorted_values[-1] - sorted_values[0]
    # Leading 0 keeps the output aligned with the input: there is no step
    # "into" the first element.
    differences = np.hstack((0, np.diff(sorted_values)))
    if total_difference == 0:
        # Nothing to apportion; avoid 0 / 0.
        return np.zeros(differences.shape, dtype=float)
    return differences / total_difference


def plot_heuristic(
    values,
    figure_num="1",
    figsize=FIG_SIZE,
    first_elbow=True,
    below_line_exclusive=True,
):
    """Plot the cutoff heuristics for ``values`` side by side.

    ``values`` is sorted in descending order and ``find_cutoff`` is run once
    for each of ``ELBOW_HEURISTIC_STR``, ``DERIVATIVE_HEURISTIC_STR`` and
    ``AVERAGE_HEURISTIC_STR``. Five panels are drawn on a 3x3 grid (only the
    first two columns are used):

    * left, top two rows: the sorted values, the straight line joining the
      first and last value, and one marker per heuristic;
    * left, bottom row: ``y_deltas`` from ``get_distances_from_line`` with
      the same three markers;
    * middle column, top to bottom: the distances from the line (up to the
      truncation index), the shifted second derivative, and the element-wise
      product of the two.

    Parameters
    ----------
    values : array-like
        Numbers to analyse; they do not need to be sorted.
    figure_num : str or int
        Passed to ``plt.figure(num=...)`` and also shown as the figure title.
        An existing figure with the same ``num`` is cleared and reused.
    figsize : tuple
        Passed to ``plt.figure(figsize=...)``.
    first_elbow : bool
        Forwarded to ``find_cutoff`` and ``get_distances_from_line``.
    below_line_exclusive : bool
        Forwarded to ``find_cutoff``.

    Returns
    -------
    The cutoff returned by ``find_cutoff`` for ``ELBOW_HEURISTIC_STR``.
    """
    sorted_values = -np.sort(-np.array(values))
    x = np.arange(sorted_values.size)

    # Straight line through the first and last sorted value.
    m = (sorted_values[-1] - sorted_values[0]) / (sorted_values.size - 1)
    line = x * m + sorted_values[0]

    # Uniform is an absolute value and therefore useless for detecting an inflection
    # y_distances = get_distances_from_line(
    #    sorted_values, m, sorted_values[0], metric="raw-y", first_elbow=False
    # ).distances

    elbow = find_cutoff(
        sorted_values,
        cutoff_heuristic=ELBOW_HEURISTIC_STR,
        first_elbow=first_elbow,
        below_line_exclusive=below_line_exclusive,
    )
    distances_tuple = get_distances_from_line(
        sorted_values, m, sorted_values[0], first_elbow=first_elbow
    )
    y_distances = distances_tuple.y_deltas

    slid_second_derivative = _get_shifted_second_derivative(
        sorted_values, distances_tuple.is_truncated, distances_tuple.truncation_index
    )
    truncated_distances = distances_tuple.distances[
        : distances_tuple.truncation_index + 1
    ]
    scores = truncated_distances * slid_second_derivative
    alt_elbow = find_cutoff(
        sorted_values,
        cutoff_heuristic=DERIVATIVE_HEURISTIC_STR,
        first_elbow=first_elbow,
        below_line_exclusive=below_line_exclusive,
    )
    average_elbow = find_cutoff(
        sorted_values,
        cutoff_heuristic=AVERAGE_HEURISTIC_STR,
        first_elbow=first_elbow,
        below_line_exclusive=below_line_exclusive,
    )
    ELBOW_COLOR = "blue"
    DERIVATIVE_COLOR = "orange"
    AVERAGE_COLOR = "green"
    # (cutoff, marker colour, legend label) for each heuristic, in draw order.
    cutoff_markers = (
        (elbow, ELBOW_COLOR, f"{ELBOW_HEURISTIC_STR} cutoff"),
        (alt_elbow, DERIVATIVE_COLOR, f"{DERIVATIVE_HEURISTIC_STR} cutoff"),
        (average_elbow, AVERAGE_COLOR, f"{AVERAGE_HEURISTIC_STR} cutoff"),
    )

    fig = plt.figure(num=figure_num, clear=True, figsize=figsize)
    fig.suptitle(str(figure_num))
    gs = fig.add_gridspec(nrows=3, ncols=3)

    # Sorted values, reference line and all three cutoffs.
    ax = fig.add_subplot(gs[:2, 0])
    ax.plot(line, label="first-to-last line")
    for cutoff, color, label in cutoff_markers:
        ax.scatter([cutoff], [sorted_values[cutoff]], color=color, label=label)
    ax.plot(sorted_values, label="sorted values")
    ax.set_title("Sorted values and cutoffs")
    ax.set_ylabel("value")
    ax.legend()

    # y_deltas relative to the reference line, with the same markers.
    ax_y = fig.add_subplot(gs[2, 0])
    ax_y.axhline(0, color="black")
    ax_y.plot(y_distances)
    for cutoff, color, _ in cutoff_markers:
        ax_y.scatter([cutoff], [y_distances[cutoff]], color=color)
    ax_y.set_title("y_deltas from the line")
    ax_y.set_xlabel("index in sorted values")

    # Distances from the line, up to the truncation index.
    ax_d = fig.add_subplot(gs[0, 1])
    ax_d.plot(truncated_distances)
    ax_d.axhline(0, color="black")
    ax_d.scatter([elbow], [distances_tuple.distances[elbow]], color=ELBOW_COLOR)
    ax_d.xaxis.set_ticklabels([])
    ax_d.set_title("Distance from the line")

    # Shifted second derivative with the derivative-heuristic cutoff.
    ax_val_second_d = fig.add_subplot(gs[1, 1])
    ax_val_second_d.plot(slid_second_derivative)
    ax_val_second_d.scatter(
        [alt_elbow], [slid_second_derivative[alt_elbow]], color=DERIVATIVE_COLOR
    )
    ax_val_second_d.axhline(0, color="black")
    ax_val_second_d.set_title("Shifted second derivative")

    # Product of the two series above.
    ax_val_scores = fig.add_subplot(gs[2, 1])
    ax_val_scores.plot(scores)
    ax_val_scores.scatter([alt_elbow], [scores[alt_elbow]], color=DERIVATIVE_COLOR)
    ax_val_scores.axhline(0, color="black")
    ax_val_scores.set_title("Distance x shifted second derivative")
    ax_val_scores.set_xlabel("index in sorted values")
    return elbow

In [None]:
# Describe every topic by its closest vocabulary terms. The result is indexed
# by topic number further down and unpacks into (terms, scores).
# topn is forwarded as-is; presumably it bounds how many terms are kept per
# topic -- confirm against describe_closest_items.
topn = 100_000
topic_descriptions = describe_closest_items(
    top2vec_model.topic_vectors, top2vec_model.word_vectors, top2vec_model.vocab, topn=topn
)

In [None]:
# Reduce the model to 20 topics when it found more than that, then describe
# the reduced topics. Note: topic_descriptions_reduced is only defined when
# the reduction actually runs.
if top2vec_model.get_num_topics() <= 20:
    print(f"Already at {top2vec_model.get_num_topics()} topics")
else:
    top2vec_model.hierarchical_topic_reduction(20)
    print(f"Reduced to {len(top2vec_model.topic_vectors_reduced)} topics.")
    topn = 100_000
    topic_descriptions_reduced = describe_closest_items(
        top2vec_model.topic_vectors_reduced, top2vec_model.word_vectors, top2vec_model.vocab, topn=topn
    )

In [None]:
# Inspect one topic: topic 20, or the last topic when the model has 20 or fewer.
topic_num = min(20, top2vec_model.get_num_topics() - 1)
print(f"TOPIC {topic_num}")
display_limit = -1
terms, scores = topic_descriptions[topic_num]
print(len(scores), " total similar terms found from raw data.")
# Cosine similarity between the topic vector and every word vector.
# pairwise_distances defaults to Euclidean distance, so the cosine metric has
# to be requested explicitly for `1 - distance` to be a cosine similarity.
raw_scores = (
    1
    - sklearn.metrics.pairwise_distances(
        np.array([top2vec_model.topic_vectors[topic_num]]),
        top2vec_model.word_vectors,
        metric="cosine",
    )
)[0]
plot_heuristic(
    raw_scores,
    f"Topic {topic_num} - Raw Cosine",
)

# Now show what the elbow would be if you ran it twice
# Need to specify first_elbow=False, otherwise you can run into some bad cases
elbow_twice = plot_heuristic(
    scores,
    f"Topic {topic_num} - Values",
    first_elbow=False,
)
print(f"Running heuristic twice gives {elbow_twice} terms: ", terms[:elbow_twice])

In [None]:
# Build both similarity matrices first: documents vs. topics and topics vs.
# vocabulary terms, each using the topn set in an earlier cell.
doc_topic_matrix = generate_similarity_matrix(
    top2vec_model.document_vectors,
    top2vec_model.topic_vectors,
    topn=topn,
)
topic_term_matrix = generate_similarity_matrix(
    top2vec_model.topic_vectors,
    top2vec_model.word_vectors,
    topn=topn,
)

# Sparsity here is the share of entries that are exactly zero.
doc_topic_num_zeroes = np.count_nonzero(doc_topic_matrix == 0)
topic_term_num_zeroes = np.count_nonzero(topic_term_matrix == 0)
doc_topic_sparsity = doc_topic_num_zeroes / doc_topic_matrix.size
topic_term_sparsity = topic_term_num_zeroes / topic_term_matrix.size

# Last expression of the cell: shown as the cell output.
doc_topic_sparsity, topic_term_sparsity