In [None]:
from sklearn.cluster import HDBSCAN
from utils.utils import load_corpus, load_embeddings, pickle_load
from utils.paths import *
import numpy as np
import plotly.express as px

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Pretrained sentence transformer used to perform semantic search.
# Other tested pretrained models are 'paraphrase-distilroberta-base-v1'
# and 'msmarco-distilbert-base-v3'.
pretrained_model = 'paraphrase-distilroberta-base-v2'

# Pretrained cross-encoder used to perform re-ranking. Another tested
# pretrained cross-encoder is 'cross-encoder/ms-marco-MiniLM-L-6-v2'.
pretrained_crossencoder = 'cross-encoder/stsb-distilroberta-base'

# Maximum number of tokens processed by the sentence transformer.
# By default, the maximum length is 128 WordPiece tokens. If a sequence
# exceeds `max_seq_length`, it is trimmed.
max_seq_length = 512

# The encoding strategy must be a string containing the names of
# the features to include in the input of the encoder, separated by
# underscores ('_'). For example, to encode the title and the overview,
# `encoding_strategy` must be either `title_overview` or `overview_title`.
# Currently supported features are 'title', 'authors' and 'overview'.
# For further information, have a look at the utils
# ``prepare_input_encoder`` function.
encoding_strategy = 'title_overview'

# Number of trees to use in the ANNOY index.
n_trees = 576

# Summarization strategy. We recommend using the top 5 sentences.
summarization = 'top5sent'  # other values noted by the author: 'top4sent', ''

# Load the corpus from disk. Beware that the loaded corpus must be
# consistent with the summarization technique you wish to use (e.g.,
# for the 'top5sent' strategy, the dataset that must be loaded is
# 'books_processed_top5sent.csv').
corpus = load_corpus(PATH_BOOKS_TOP5S)


# Path of the ANNOY index file; it is derived from the model name, the number
# of trees, the summarization strategy and the encoding strategy, so each
# configuration maps to its own '.ann' file.
annoy_index_path = f'{DIR_ANNOY}{pretrained_model}/t{n_trees}_{summarization}_{encoding_strategy}.ann'

# Filepath where the computed embeddings are stored on disk, or from which
# previously computed embeddings are read. You may choose any filepath. If the
# directory does not exist, we will attempt to create it.
embeddings_cache_path = f'{DIR_EMBEDDINGS}{pretrained_model}/{summarization}_{encoding_strategy}.pkl'

In [3]:
# Read the cached embeddings for the configuration above. The helper returns a
# pair: the embeddings and the encoder inputs (the texts that are grouped by
# cluster further down; presumably the i-th text produced the i-th embedding
# -- confirm against `load_embeddings`).
embs, enc_input = load_embeddings(embeddings_cache_path)

In [4]:
# Only constructs the estimator; nothing is fitted in this cell.
# `min_cluster_size=5` is the smallest group size HDBSCAN reports as a cluster.
cluster = HDBSCAN(min_cluster_size=5)

In [5]:
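# Clustering step, left commented out: the following cell loads stored
# assignments instead (presumably the saved result of this very call -- confirm).
# ass = cluster.fit_predict(embs)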

In [6]:
# Load one cluster label per sample from disk.
# NOTE(review): the file name suggests these were produced with
# min_cluster_size=5 -- confirm they match `embs`/`enc_input` loaded above.
# NOTE(review): if `pickle_load` unpickles the file, only load trusted files.
ass = pickle_load("cluster/assignments_min_size_5.pkl")

In [7]:
# Distinct labels and how many samples carry each one. In the recorded output
# label -1 (the label scikit-learn's HDBSCAN gives to noise) covers 9073
# samples, while the 83 clusters hold between 5 and 36 samples each.
np.unique(ass, return_counts=True)

(array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
        50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
        67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82]),
 array([9073,    5,    6,    7,    6,    5,    5,    7,    5,    6,    6,
           6,    6,    5,   11,    9,   11,    8,    5,   10,    7,    9,
          12,    6,    8,    7,    5,    8,    7,    5,    7,    6,    8,
           7,    5,    7,    5,    6,   36,    9,    7,   10,    9,    5,
          16,   12,    7,    5,    8,    6,    8,    7,   10,    5,    6,
           5,    8,   11,    5,   20,    6,    7,    6,   12,   17,    6,
          15,   14,    5,    9,    7,    6,   23,    7,   13,   11,   17,
           6,   13,   19,    9,   21,   15,    8]))

In [8]:
# Group the encoder inputs by cluster label: {label: set of texts}.
# Samples with a negative label are left out.
cluster_ass = {}
for label, text in zip(ass, enc_input):
    cluster_id = int(label)  # keys are plain Python ints
    if cluster_id >= 0:
        cluster_ass.setdefault(cluster_id, set()).add(text)

In [9]:
# Display the full mapping from cluster label to its set of encoder inputs.
cluster_ass

{49: {"Harry Potter Boxset (Harry Potter)[SEP]The exciting tales of Harry Potter, the young wizard-in-training, have taken the world by storm, and fans just can't get enough of the magical world of Hogwarts and beyond. If you buy one of the Harry Potter books, we guarantee you'll want the next... and the next... and the next so why not have them all, right at your fingertips? With the Harry Potter Boxed Set Years 1-7, Barnes & Noble.com offers simple one-stop shopping for your Harry Potter library! As easy as the wave of a magic wand, you can get all seven Harry Potter books delivered to your doorstep at once. The Harry Potter Boxed Set includes hardcover editions of Harry Potter and the Sorcerer's Stone, Harry Potter and the Chamber of Secrets, Harry Potter and the Prisoner of Azkaban, Harry Potter and the Goblet of Fire, Harry Potter and the Order of the Phoenix, Harry Potter and the Half-Blood Prince, and Harry Potter and the Deathly Hallows.",
  "Harry Potter Collection (Harry Pott

In [10]:
# Print the set of texts of every cluster, one cluster per line.
for members in cluster_ass.values():
    print(members)

{"Harry Potter and the Sorcerer's Stone (Harry Potter)[SEP]But his fortune changes when he receives a letter that tells him the truth about himself: he's a wizard. A mysterious visitor rescues him from his relatives and takes him to his new home, Hogwarts School of Witchcraft and Wizardry. But even within the Wizarding community, he is special. He is the boy who lived: the only person to have ever survived a killing curse inflicted by the evil Lord Voldemort, who launched a brutal takeover of the Wizarding world, only to vanish after failing to kill Harry. Though Harry's first year at Hogwarts is the best of his life, not everything is perfect.", "James Potter and the Hall of Elders' Crossing (James Potter)[SEP]What’s it like to be the son of the most famous wizard of all time? James Potter thinks he knows, but as he begins his own adventure at Hogwarts, he discovers just how much of a challenge it really is to live up to the legend of the great Harry Potter. As if it wasn’t enough dea

In [11]:
# from sklearn.manifold import TSNE

In [12]:
# tsne = TSNE()
# embs_2d = tsne.fit_transform(embs)

In [None]:
from utils.plotter import project
def scatterplot(word_embeddings,
                color_text=None,
                random_state=None,
                reduction_strategy='pca',
                marker_size=12,
                _graph_showlegend=True,
                tsne_params=None,
                **scatter_params):
    """Reduce embeddings with ``project`` and draw them as a 2D scatter plot.

    Parameters
    ----------
    word_embeddings
        Embeddings handed unchanged to ``project``.
    color_text : str, optional
        Label shown for the ``color`` property. Ignored when falsy.
    random_state
        Forwarded to ``project``.
    reduction_strategy : str
        Forwarded to ``project``. Defaults to ``'pca'``.
    marker_size : int
        Marker size applied to every trace of the figure.
    _graph_showlegend : bool
        Whether the figure legend is shown.
    tsne_params : dict, optional
        Extra keyword arguments forwarded to ``project``. ``None`` (the
        default) forwards nothing extra.
    **scatter_params
        Extra keyword arguments forwarded to ``plotly.express.scatter``
        (e.g. ``color``, ``color_continuous_scale``).

    Returns
    -------
    tuple
        ``(fig, dim_compressor)``: the Plotly figure and the second value
        returned by ``project`` (presumably the fitted reducer -- confirm
        against ``utils.plotter.project``).
    """
    # A ``None`` default avoids sharing one mutable dict between calls.
    extra_project_kwargs = {} if tsne_params is None else tsne_params

    # Get the representations in the reduced space (2 is passed as the
    # second positional argument of ``project``).
    data, dim_compressor = project(
        word_embeddings, 2,
        reduction_strategy=reduction_strategy,
        random_state=random_state,
        **extra_project_kwargs)

    # Labels for the graph axes, keyed by the column names Plotly derives
    # from ``x=0`` and ``y=1``.
    # NOTE(review): the axes are labelled 'PC' whatever `reduction_strategy`
    # is; confirm whether non-PCA strategies should get different labels.
    fig_labels = {'0': 'PC 1', '1': 'PC 2'}
    # If there is a label for the `color` property, add it to the labels.
    if color_text:
        fig_labels['color'] = color_text

    # Create a 2D scatter plot with the projected data.
    # x=0, y=1 take the values of the first and second
    # dimension of the transformed data.
    fig = px.scatter(data, x=0, y=1, opacity=0.6,
                     labels=fig_labels, **scatter_params)

    # Change the text position and the size of the markers.
    fig.update_traces(textposition='top center', marker=dict(size=marker_size))

    # Update legend title text, and whether to show the legend or not.
    fig.update_layout(showlegend=_graph_showlegend,
                      legend_title_text='Colour legend')

    # The figure is returned rather than shown; the caller displays it.
    return fig, dim_compressor

In [54]:
# Colour value per sample: non-negative labels are kept as they are, negative
# labels are shifted down by 100 (presumably so that they stand apart from the
# real clusters on the continuous colour scale).
ass_color = [
    (label if label >= 0 else label - 100)
    for label in ass
]

In [None]:
# Plot every embedding with small markers, coloured by its (shifted) cluster
# label on a sequential blue scale, then display the figure.
fig, pca = scatterplot(
    embs,
    color=ass_color,
    color_continuous_scale=px.colors.sequential.Blues,
    marker_size=3,
)
fig

In [39]:
# Share of the variance attributed to each of the two plotted components
# (assumes `pca` is a fitted PCA-like reducer -- confirm). The recorded output
# is roughly 4.2% and 3.6%, so the 2D picture keeps under 8% of the variance.
pca.explained_variance_ratio_

array([0.04228232, 0.03582292])