In [None]:
from top2vec.Top2Vec import Top2Vec
from top2vec.similarity import (
    describe_closest_items,
    find_closest_items,
    generate_similarity_matrix,
    generate_csr_similarity_matrix,
)
import gensim
from sklearn.datasets import fetch_20newsgroups
import scipy.stats
import numpy as np

# Default matplotlib figure size used as the `figsize` default of the
# plotting helpers defined later in this notebook.
FIG_SIZE = (30, 10)

In [None]:
# Fetch the 20 newsgroups corpus (subset="all") with headers, footers and
# quoted replies stripped from every post.
newsgroups_train = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)
# For a quicker experiment, slice this down, e.g. newsgroups_train.data[0:2000].
newsgroups_documents = newsgroups_train.data

# Train the Top2Vec model, identifying each document by its position in the
# corpus rendered as a string.
doc_ids = list(map(str, range(len(newsgroups_documents))))
top2vec_model = Top2Vec(
    documents=newsgroups_documents,
    document_ids=doc_ids,
    speed="fast-learn",
    workers=8,
    umap_args={"random_state": 1337},
)

In [None]:
%matplotlib inline
import sklearn.metrics
from top2vec.elbow_finding import find_elbow_index, get_distances_from_line
import matplotlib.pyplot as plt

# Going to make a cumulative density function showing the move from A to B.
# index[A] shows how much of the total variation has been accomplished BEFORE index A
# Therefore an elbow of A should be thought of as exclusive, not inclusive
def get_cdf_of_differences(sorted_values):
    """Return the running total of each step's share of the overall change.

    Entry ``i`` is the cumulative sum, up to and including index ``i``, of
    the values returned by ``get_percent_of_total_differences``.

    Parameters
    ----------
    sorted_values : numpy.ndarray
        One-dimensional, already sorted values.

    Returns
    -------
    numpy.ndarray
        Cumulative shares, rounded to 8 decimal places.
    """
    step_shares = get_percent_of_total_differences(sorted_values)
    running_total = np.cumsum(step_shares)
    # Rounding absorbs the floating point error accumulated by the sum.
    return np.round(running_total, decimals=8)

def get_percent_of_total_differences(sorted_values):
    """Express each step between consecutive values as a share of the total change.

    Parameters
    ----------
    sorted_values : array-like
        One-dimensional sorted values (ascending or descending). Lists and
        tuples are accepted as well as numpy arrays.

    Returns
    -------
    numpy.ndarray
        Array of the same length as the input. Entry 0 is always 0 and entry
        ``i`` is ``(sorted_values[i] - sorted_values[i - 1])`` divided by
        ``(sorted_values[-1] - sorted_values[0])``. When the first and last
        values are equal there is no change to apportion, so all zeros are
        returned instead of dividing by zero. Empty input gives an empty
        array.
    """
    values = np.asarray(sorted_values)
    if values.size == 0:
        return np.zeros(0)

    total_difference = values[-1] - values[0]
    # The leading 0 keeps the result aligned with the input: the first value
    # has no predecessor, so it contributes no change.
    differences = np.concatenate(([0], np.diff(values)))
    if total_difference == 0:
        # Avoid 0 / 0 (NaN plus a RuntimeWarning) for constant or
        # single-element input.
        return np.zeros(differences.shape)
    return differences / total_difference

def plot_heuristic(
    values,
    figure_num="1",
    derivatives="distance",
    figsize=FIG_SIZE,
    first_elbow=True,
    elbow_metric="manhattan",
    display_limit=100,
):
    """Locate the elbow of ``values`` and plot it with supporting diagnostics.

    The values are sorted in descending order, a straight line is drawn from
    the first to the last sorted value, the elbow index is obtained from
    ``find_elbow_index`` and everything is handed to ``plot_figure``.

    Parameters
    ----------
    values : array-like
        Scores to analyse; they are sorted here, so any order is accepted.
    figure_num : str
        Identifier of the matplotlib figure to draw into.
    derivatives : {"distance", "values"} or falsy
        Which series gets the derivative panels: the distances from the
        straight line (``"distance"``), the sorted values themselves
        (``"values"``), or no derivative panels at all (any falsy value).
    figsize : tuple
        Figure size forwarded to ``plot_figure``.
    first_elbow : bool
        Forwarded to ``find_elbow_index``.
    elbow_metric : str
        Metric forwarded to ``find_elbow_index`` and, for
        ``derivatives="distance"``, to ``get_distances_from_line``.
    display_limit : int
        Forwarded to ``plot_figure`` to limit how many points are drawn.

    Returns
    -------
    The elbow index reported by ``find_elbow_index``.

    Raises
    ------
    ValueError
        If ``derivatives`` is neither falsy, ``"distance"`` nor ``"values"``.
    """
    # Reject a bad option before doing any work or printing anything.
    if derivatives and derivatives not in ("distance", "values"):
        raise ValueError("Unknown derivatives requested.")

    sorted_vals = -np.sort(-np.array(values))
    x = np.arange(sorted_vals.size)

    # Straight line joining the first and the last sorted value.
    m = (sorted_vals[-1] - sorted_vals[0]) / (sorted_vals.size - 1)
    line = x * m + sorted_vals[0]
    # Uniform is an absolute value and therefore useless for detecting an inflection
    y_distances = get_distances_from_line(
        sorted_vals, m, sorted_vals[0], metric="raw-y", first_elbow=False
    )
    elbow = find_elbow_index(sorted_vals, first_elbow=first_elbow, metric=elbow_metric)
    print(f"Raw elbow: {elbow}")

    if not derivatives:
        distances = []
    elif derivatives == "distance":
        distances = get_distances_from_line(
            sorted_vals, m, sorted_vals[0], first_elbow=False, metric=elbow_metric
        )
    else:
        # derivatives == "values": differentiate the sorted values directly.
        distances = sorted_vals
    with_derivatives = bool(derivatives)

    plot_figure(
        sorted_vals,
        distances,
        elbow,
        line,
        figure_num=figure_num,
        with_derivatives=with_derivatives,
        y_distances=y_distances,
        figsize=figsize,
        display_limit=display_limit,
    )
    return elbow


def plot_figure(
    sorted_vals,
    distances,
    elbow,
    line,
    figure_num="1",
    with_derivatives=True,
    y_distances=None,
    figsize=FIG_SIZE,
    display_limit=-1,
):
    """Draw the elbow diagnostics for an already sorted series of values.

    The first column always shows ``sorted_vals`` against ``line`` with the
    elbow marked and, when ``y_distances`` is given, a panel of those
    distances underneath. With ``with_derivatives`` two more columns are
    added: ``distances`` with its first and second discrete differences, and
    the per-step share of the total change, its cumulative sum, and that
    cumulative sum against a straight line from 0 to 1.

    Parameters
    ----------
    sorted_vals : numpy.ndarray
        One-dimensional values, in plotting order.
    distances : sequence
        Series whose discrete differences are plotted. Only read when
        ``with_derivatives`` is true.
    elbow : int
        Index into ``sorted_vals`` (and ``y_distances``) to highlight.
    line : sequence
        Reference line drawn together with ``sorted_vals``.
    figure_num : str
        Passed as ``num`` to ``plt.figure``; the figure is cleared first.
    with_derivatives : bool
        Whether to add the derivative and share-of-total columns.
    y_distances : sequence, optional
        Extra series plotted below the main panel when provided.
    figsize : tuple
        Passed as ``figsize`` to ``plt.figure``.
    display_limit : int or None
        Number of leading points drawn in the truncated panels. A negative
        value (the default) or ``None`` draws every point. The cumulative
        share panels always show the full series.
    """
    # A negative (or missing) limit means "no limit". Slice with None rather
    # than the raw value: `series[:-1]` would silently drop the last point.
    show_all = display_limit is None or display_limit < 0
    stop = None if show_all else display_limit
    elbow_in_view = show_all or elbow < display_limit

    fig = plt.figure(num=figure_num, clear=True, figsize=figsize)
    grid = fig.add_gridspec(nrows=3, ncols=3 if with_derivatives else 1)

    # Left column: the main panel, plus the y-distance panel when available.
    if y_distances is None:
        ax = fig.add_subplot(grid[:, 0])
    else:
        ax = fig.add_subplot(grid[:2, 0])
        ax_y = fig.add_subplot(grid[2, 0], sharex=ax)
        ax_y.axhline(0, color="black")
        ax_y.plot(y_distances[:stop])
        if elbow_in_view:
            ax_y.scatter([elbow], [y_distances[elbow]])

    if with_derivatives:
        # Middle column: the distances and their discrete differences.
        distance_arr = np.asarray(distances)
        first_diff = np.concatenate(([0], np.diff(distance_arr)))
        # NOTE(review): the two leading zeros make this series one element
        # longer than `distances`; kept as in the original implementation --
        # confirm the resulting index offset is intended.
        second_diff = np.concatenate(([0, 0], np.diff(first_diff)))

        ax_d = fig.add_subplot(grid[0, 1])
        ax_d.plot(distance_arr[:stop])
        ax_d.axhline(0, color="black")
        ax_d.xaxis.set_ticklabels([])
        ax_d_prime = fig.add_subplot(grid[1, 1], sharex=ax_d)
        ax_d_prime.plot(first_diff[:stop])
        ax_d_prime.axhline(0, color="black")
        ax_d_prime_prime = fig.add_subplot(grid[2, 1])
        ax_d_prime_prime.plot(second_diff[:stop])
        ax_d_prime_prime.axhline(0, color="black")

        # Right column: per-step share of the total change and its CDF.
        percent_of_total = get_percent_of_total_differences(sorted_vals)
        cdf = np.cumsum(percent_of_total).round(decimals=8)
        other_elbow = np.argmax(percent_of_total)
        ax_percent_of_tot = fig.add_subplot(grid[0, 2])
        ax_percent_of_tot.plot(percent_of_total[:stop])
        ax_percent_of_tot.scatter([other_elbow], [percent_of_total[other_elbow]])
        ax_percent_of_tot.axhline(0, color="black")
        ax_cdf = fig.add_subplot(grid[1, 2], sharex=ax_percent_of_tot)
        ax_cdf.plot(cdf)
        ax_cdf.axhline(0, color="black")
        ax_cdf.axhline(1, color="black")
        print(f"Biggest % total difference: {other_elbow}")

        # What about an elbow of the CDF? Compare it with a straight line
        # rising from 0 at the first index to 1 at the last.
        ax_cdf_with_line = fig.add_subplot(grid[2, 2], sharex=ax_percent_of_tot)
        cdf_line_slope = 1 / (cdf.size - 1)
        cdf_line = np.arange(cdf.size) * cdf_line_slope
        ax_cdf_with_line.plot(cdf)
        ax_cdf_with_line.plot(cdf_line)
        cdf_dist_from_line = get_distances_from_line(cdf, cdf_line_slope, 0)
        cdf_elbow = np.argmax(cdf_dist_from_line)
        ax_cdf_with_line.scatter([cdf_elbow], [cdf[cdf_elbow]])
        if cdf_elbow != elbow:
            print(f"CDF Elbow Disagrees: {cdf_elbow}")

    # Main panel, drawn for every layout.
    ax.plot(line[:stop])
    if elbow_in_view:
        ax.scatter([elbow], [sorted_vals[elbow]])
    ax.plot(sorted_vals[:stop])

In [None]:
# Peek at the topics: describe each topic vector by the vocabulary terms
# closest to it, requesting `topn` items per topic.
topn = 100_000
topic_descriptions = describe_closest_items(
    top2vec_model.topic_vectors,
    top2vec_model.word_vectors,
    top2vec_model.vocab,
    topn=topn,
)

In [None]:
# When the model found more than 20 topics, merge them down to 20 and
# describe the reduced topics the same way as the original ones.
if top2vec_model.get_num_topics() <= 20:
    print(f"Already at {top2vec_model.get_num_topics()} topics")
else:
    top2vec_model.hierarchical_topic_reduction(20)
    print(f"Reduced to {len(top2vec_model.topic_vectors_reduced)} topics.")
    topn = 100000
    topic_descriptions_reduced = describe_closest_items(
        top2vec_model.topic_vectors_reduced,
        top2vec_model.word_vectors,
        top2vec_model.vocab,
        topn=topn,
    )
    

In [None]:
# Inspect a single topic: topic 20, or the last topic if there are fewer.
topic_num = min(20, top2vec_model.get_num_topics() - 1)
print(f"TOPIC {topic_num}")
display_limit = -1
terms, scores = topic_descriptions[topic_num]
print(len(scores), " total similar terms found from raw data.")

# Cosine similarity between the topic vector and every word vector.
# pairwise_distances defaults to the euclidean metric, so the cosine metric
# must be requested explicitly for `1 - distance` to be a cosine similarity.
raw_scores = (
    1
    - sklearn.metrics.pairwise_distances(
        np.array([top2vec_model.topic_vectors[topic_num]]),
        top2vec_model.word_vectors,
        metric="cosine",
    )
)[0]
plot_heuristic(
    raw_scores,
    f"Topic {topic_num} - Raw Cosine",
    derivatives="values",
    display_limit=display_limit,
)

# Now show what the elbow would be if you ran it twice
# Need to specify first_elbow=False, otherwise you can run into some bad cases
elbow_twice = plot_heuristic(
    scores,
    f"Topic {topic_num} - Values",
    derivatives="values",
    display_limit=display_limit,
    first_elbow=False,
)
print(f"Running heuristic twice gives {elbow_twice} terms: ", terms[:elbow_twice])


In [None]:
# Document-to-topic similarities; `topn` is the value set in an earlier cell.
doc_topic_matrix = generate_similarity_matrix(
    top2vec_model.document_vectors,
    top2vec_model.topic_vectors,
    topn=topn,
)
# Sparsity is the share of entries that are exactly zero.
doc_topic_num_zeroes = np.count_nonzero(doc_topic_matrix == 0)
doc_topic_sparsity = doc_topic_num_zeroes / doc_topic_matrix.size

# Same measurement for the topic-to-term similarities.
topic_term_matrix = generate_similarity_matrix(
    top2vec_model.topic_vectors,
    top2vec_model.word_vectors,
    topn=topn,
)
topic_term_num_zeroes = np.count_nonzero(topic_term_matrix == 0)
topic_term_sparsity = topic_term_num_zeroes / topic_term_matrix.size

doc_topic_sparsity, topic_term_sparsity