# Heuristics Visualization
This notebook demonstrates how the elbow-finding heuristics behave on different datasets.
It is intended for experimentation, and for deciding whether any adjustments are needed to handle cases such as an S-curve along the expected linear descent line.

In [None]:
import numpy as np
import sklearn.metrics
from top2vec.cutoff_heuristics.plot import plot_heuristic
from top2vec.cutoff_heuristics.cutoff_heuristics import (
    find_cutoff,
    get_distances_from_line,
    __edge_cases,
    __shifted_derivative_index,
    _get_shifted_second_derivative,
    ELBOW_HEURISTIC_STR,
    DERIVATIVE_HEURISTIC_STR,
)

%matplotlib inline

## Manufactured Data

In [None]:
# Reference direction, kept 2-D (1 x 2).
vector = np.array([[2, 1]])

# Rows pointing in a spread of directions.
base = np.array([[0, 1], [2, 1], [1, 0.5], [1, 0], [-1, 2]])

# Rows with a fixed first coordinate of -2; only the second coordinate varies.
opposite = np.array(
    [[-2, y] for y in (-0.8, -0.9, -0.95, -1, -1.05, -1.1, -1.2, -1.5)]
)

# Simple case: two multiples of `vector`, one other row, then `base`.
test_embedding = np.vstack(([[4, 2], [8, 4], [1, -1]], base))

# Simple case extended with the `opposite` rows (the multiple-elbow dataset).
test_embedding_multiple_elbows = np.vstack((test_embedding, opposite))

# A different divergent case where the S-curve goes the other way
test_embedding_multiple_elbows_2 = np.vstack(
    (base, opposite, np.tile([1, -1], (10, 1)))
)

# Cosine similarity (1 - cosine distance) between `vector` and every row of
# each manufactured embedding. Later cells read row 0 of each result.
sims = 1 - sklearn.metrics.pairwise_distances(
    X=vector, Y=test_embedding, metric="cosine"
)
sims_multiple_elbows = 1 - sklearn.metrics.pairwise_distances(
    X=vector, Y=test_embedding_multiple_elbows, metric="cosine"
)
sims_multiple_elbows_2 = 1 - sklearn.metrics.pairwise_distances(
    X=vector, Y=test_embedding_multiple_elbows_2, metric="cosine"
)

## Show heuristic performance on simple embedding - one elbow

In [None]:
# Current elbow-finding heuristic on the single-elbow similarities; the plot
# also shows how the distance from the line changes.
plot_heuristic(
    sims[0],
    cutoff_args=dict(first_elbow=False),
    figsize=(18, 8),
    figure_num="Basic - Distance",
)

## Show heuristic performance with multiple elbows in the graph

In [None]:
# Current elbow-finding heuristic on the two-elbow similarities, also showing
# changes in distance from the line. Every call gets the same input: the
# similarities with an extra -1.5 appended as the final value.

# Whole curve, below_line_exclusive=True.
plot_heuristic(
    np.append(sims_multiple_elbows[0], -1.5),
    cutoff_args=dict(first_elbow=False, below_line_exclusive=True),
    figsize=(18, 8),
    figure_num="2 Elbows - Exclusive",
)

# Whole curve, below_line_exclusive=False.
plot_heuristic(
    np.append(sims_multiple_elbows[0], -1.5),
    cutoff_args=dict(first_elbow=False, below_line_exclusive=False),
    figsize=(18, 8),
    figure_num="2 Elbows - Inclusive",
)

# first_elbow=True, below_line_exclusive=True.
plot_heuristic(
    np.append(sims_multiple_elbows[0], -1.5),
    cutoff_args=dict(first_elbow=True, below_line_exclusive=True),
    figsize=(18, 8),
    figure_num="2 Elbows - First Elbow",
)

## Yet another form of a graph with multiple elbows

In [None]:
# Current elbow-finding heuristic on the second multiple-elbow dataset, also
# showing changes in distance from the line.

# first_elbow=False.
plot_heuristic(
    sims_multiple_elbows_2[0],
    cutoff_args=dict(first_elbow=False, below_line_exclusive=False),
    figsize=(18, 8),
    figure_num="3 Elbows",
)

# first_elbow=True.
plot_heuristic(
    sims_multiple_elbows_2[0],
    cutoff_args=dict(first_elbow=True, below_line_exclusive=False),
    figsize=(18, 8),
    figure_num="3 Elbows - First Elbow",
)

## Data from online

In [None]:
# Example of running the heuristic only on values that had already been
# determined to be similar. For this data it appears that the elbow should be
# exclusive rather than inclusive.

# Similarity scores, listed in descending order.
sample_data = np.array(
    [
        0.87387407, 0.8490747, 0.83483994, 0.80989516,
        0.45845926, 0.45052826, 0.44408453, 0.4278804,
        0.4249642, 0.41800153, 0.415339, 0.40166456,
        0.4011852, 0.3939832, 0.38374978, 0.3823452,
        0.37897837, 0.37643087, 0.37551993, 0.37453377,
        0.37433827, 0.3703115, 0.36441594, 0.3591773,
        0.35516483, 0.35450447, 0.35071152, 0.34965813,
        0.3412869, 0.3399775, 0.33868152, 0.33400196,
        0.315881,
    ]
)

# Plot the heuristic on the sample data with first_elbow=True and
# below_line_exclusive=True.
plot_heuristic(
    sample_data,
    cutoff_args=dict(first_elbow=True, below_line_exclusive=True),
    figsize=(18, 8),
    figure_num="Online data",
)

## Speed Tests

In [None]:
# Time the elbow heuristic on the small multiple-elbow curve, first with
# first_elbow=False and then with first_elbow=True.
# -n 100000: loops per run; -r 10: number of runs.
%timeit -n 100000 -r 10 elbows = find_cutoff(sims_multiple_elbows_2[0], cutoff_heuristic=ELBOW_HEURISTIC_STR, first_elbow=False)
%timeit -n 100000 -r 10 elbows = find_cutoff(sims_multiple_elbows_2[0], cutoff_heuristic=ELBOW_HEURISTIC_STR, first_elbow=True)

In [None]:
# What if it is the same shape but much larger?
# Repeat the similarity row one million times along its last axis and print
# the shape before and after.
print(sims_multiple_elbows_2.shape)
big_sims = np.tile(sims_multiple_elbows_2, (1, 10**6))
print(big_sims.shape)

In [None]:
# Same comparison as the small-curve timing, but on the tiled `big_sims` row:
# first_elbow=False first, then first_elbow=True.
# -n 10: loops per run; -r 5: number of runs.
%timeit -n 10 -r 5  elbows = find_cutoff(big_sims[0], cutoff_heuristic=ELBOW_HEURISTIC_STR, first_elbow=False)
%timeit -n 10 -r 5  elbows = find_cutoff(big_sims[0], cutoff_heuristic=ELBOW_HEURISTIC_STR, first_elbow=True)

### Performance Metrics
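This data has 3 elbows, so we expected a substantial performance boost from using `first_elbow`. The timings below (two per size, presumably `first_elbow=False` then `first_elbow=True`, matching the order of the `%timeit` calls) instead show the two settings performing almost identically.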

#### Size 23
17.4 µs ± 505 ns per loop (mean ± std. dev. of 10 runs, 100,000 loops each)  
18.9 µs ± 428 ns per loop (mean ± std. dev. of 10 runs, 100,000 loops each)  


#### Size 230,000
5.43 ms ± 12.6 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)  
5.47 ms ± 31.2 µs per loop (mean ± std. dev. of 5 runs, 100 loops each)  

#### Size 2,300,000
57.7 ms ± 845 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)  
58.1 ms ± 157 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)  

#### Size 23,000,000
632 ms ± 1.5 ms per loop (mean ± std. dev. of 5 runs, 10 loops each)  
649 ms ± 573 µs per loop (mean ± std. dev. of 5 runs, 10 loops each)  

## Profiling the different heuristic options to see if there are major bottlenecks

In [None]:
# Load the line_profiler IPython extension, which provides the %lprun magic
# used by the profiling cells in this section.
%load_ext line_profiler

In [None]:
%lprun -T local_files/find_cutoff.recursive_elbow.line_profile -f find_cutoff -f get_distances_from_line -f __edge_cases find_cutoff(big_sims[0], first_elbow=False)
print(open("local_files/find_cutoff.recursive_elbow.line_profile", "r").read())

The good news is that the majority of the time is spent sorting the array.
Roughly 200 ms is spent calculating the cutoff (most of that on computing distances from the line), while almost 500 ms is spent sorting the values array.

In [None]:
%lprun -T local_files/find_cutoff.derivative.line_profile -f find_cutoff -f _get_shifted_second_derivative -f __shifted_derivative_index find_cutoff(big_sims[0], cutoff_heuristic=DERIVATIVE_HEURISTIC_STR, first_elbow=False)
print(open("local_files/find_cutoff.derivative.line_profile", "r").read())

It takes 15 ms to get the shifted derivative index when we truncate on the first elbow.
Running over all the data takes about 280 ms to compute the index, of which 116 ms is spent getting the shifted derivative index.