### Overview

The goal of this notebook is to embed the 20 newsgroups dataset and explore it interactively using Bokeh widgets. We will use UMAP and HDBSCAN to embed and cluster the documents. An overview of embedding text using UMAP can be seen [here](https://umap-learn.readthedocs.io/en/latest/document_embedding.html).

In [1]:
import pandas as pd
import umap
import umap.plot
import hdbscan

# Used to get the data
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.spatial.distance import cdist


# Some plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
from bokeh.plotting import show, save, output_notebook, output_file
from bokeh.resources import INLINE
output_notebook(resources=INLINE)

### Get the data

In [2]:
# Fetch the whole 20 newsgroups corpus (subset='all'); the documents are
# shuffled with a fixed seed so their order is the same on every run.
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)



In [3]:
# Translate every numeric target id into its newsgroup name. The resulting
# 'category' column is what the hover tooltip of the final plot displays.
category_labels = [dataset.target_names[target_id] for target_id in dataset.target]
hover_df = pd.DataFrame({'category': category_labels})

Use a bag-of-words approach and apply TF-IDF normalization.

In [4]:
# TF-IDF weighted bag of words over the raw posts: English stop words are
# removed and min_df=5 discards terms that are too rare to be informative.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=5,
)
word_doc_matrix = vectorizer.fit_transform(dataset.data)

In [5]:
# Show the summary of the sparse TF-IDF matrix (one row per document in
# dataset.data, one column per vocabulary term kept by the vectorizer).
word_doc_matrix

<18846x34880 sparse matrix of type '<class 'numpy.float64'>'
	with 1939023 stored elements in Compressed Sparse Row format>

Embed using [UMAP](https://umap-learn.readthedocs.io/en/latest/)

In [6]:
%%time
embedding = umap.UMAP(n_components=2, metric='hellinger').fit(word_doc_matrix)

CPU times: user 2min 20s, sys: 1.06 s, total: 2min 21s
Wall time: 1min 57s


Cluster the embedding using [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/)

In [7]:
# Cluster the 2-D UMAP coordinates with HDBSCAN. fit_predict fits the model
# and returns the per-point label array (the same one kept on clusterer.labels_).
clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
labels = clusterer.fit_predict(embedding.embedding_)

In [8]:
# Put each document's cluster label next to its newsgroup category.
hover_df = hover_df.assign(cluster=labels)

In [9]:
# Keep only the first 140 characters of every post; this preview is what the
# "Abstract" row of the hover tooltip shows.
short_abstracts = [post[:140] for post in dataset.data]

### Find ranked points

Here we want to rank each point in a cluster by its distance to the cluster centroid/medoid. There is [a notebook](https://github.com/scikit-learn-contrib/hdbscan/blob/master/notebooks/Looking%20at%20cluster%20consistency.ipynb) which explains this in more detail.

In [10]:
from utils.ranked_points import RankedPoints

In [11]:
# Helper from utils.ranked_points that ranks the points of each cluster by
# their (euclidean) distance to the cluster centre in the 2-D embedding.
examples = RankedPoints(
    embedding.embedding_,
    clusterer,
    metric='euclidean',
)

In [12]:
# Compute the point-to-center distances for all clusters up front.
# NOTE(review): presumably required before get_closest_samples_for_cluster is
# called below — confirm in utils.ranked_points.
examples.calculate_all_distances_to_center()

In [13]:
# Maps cluster id -> HTML ordered list with the newsgroup categories of that
# cluster's most central ("representative") documents.
top_posts = {}

# Iterate over the *distinct* cluster ids only. `labels` holds one entry per
# document, so looping over it directly recomputed the same HTML snippet once
# for every document instead of once per cluster. pd.unique keeps the
# first-seen order, so the resulting dict is the same as before.
for cluster_id in pd.unique(labels):
    closest_samples = examples.get_closest_samples_for_cluster(cluster_id)
    rep_categories = hover_df.iloc[closest_samples.index]['category'].values
    # int() turns the numpy integer into a plain Python int dict key.
    top_posts[int(cluster_id)] = '<ol>' + ''.join(f'<li>{c}</li>' for c in rep_categories) + '</ol>'

### Make an interactive widget to explore the embedding

This code was modified from [this repository](https://github.com/MaksimEkin/COVID19-Literature-Clustering).

In [15]:
from utils.callbacks import input_callback, filter_callback, js_filter_points_code
from utils.interactive_text import header, toolbox_header, description_search, description_slider, description_checkbox, citation
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS, Slider, CDSView, BooleanFilter, CheckboxGroup
from bokeh.models import CustomJSFilter
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap, transform
from bokeh.io import output_file, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import RadioButtonGroup, TextInput, Div, Paragraph
from bokeh.layouts import column, widgetbox, row, layout

In [16]:
# show on notebook
output_notebook()

# target labels: one HDBSCAN cluster id per document
y_labels = labels
# HDBSCAN numbers real clusters 0..k-1 and uses -1 for noise, so the number of
# clusters is max(label) + 1. Counting the unique labels and subtracting one
# only gives k when the -1 noise label happens to be present; with no noise
# points it would under-count by one.
num_clusters = int(labels.max()) + 1

# data sources: everything the plot, the hover tooltip and the JS callbacks read.
# x_backup / y_backup hold a second copy of the coordinates
# (presumably so the JS callback can restore points it moved — confirm in utils.callbacks).
source = ColumnDataSource(data=dict(
    x=embedding.embedding_[:, 0],
    y=embedding.embedding_[:, 1],
    x_backup=embedding.embedding_[:, 0],
    y_backup=embedding.embedding_[:, 1],
    desc=y_labels,                         # numeric label, used by the colour mapper
    category=hover_df['category'],
    abstract=short_abstracts,
    labels=[str(x) for x in y_labels],     # string form, shown in the tooltip
))

# One checkbox, initially unticked, that lets the user hide outlying points.
checkboxes = CheckboxGroup(active=[], labels=['Ignore outlying points'])

# Whenever the checkbox state changes, emit a change event on the data source.
refresh_source = CustomJS(args={'source': source}, code="source.change.emit();")
checkboxes.js_on_change("active", refresh_source)

# Browser-side filter (JS body comes from utils.callbacks); it receives the
# checkbox widget and the cluster labels as arguments.
outlier_filter = CustomJSFilter(
    args={'checkboxes': checkboxes, 'labels': labels},
    code=js_filter_points_code,
)

# Tooltip rows displayed while the mouse is over a point.
hover = HoverTool(
    point_policy="follow_mouse",
    tooltips=[
        ("Category", "@category{safe}"),
        ("Abstract", "@abstract{safe}"),
        ("Cluster", "@labels{safe}"),
    ],
)

# Colour points by their numeric cluster label ('desc' column), mapped
# linearly from the smallest to the largest label onto the Category20 palette.
mapper = linear_cmap(
    field_name='desc',
    palette=Category20[20],
    low=min(y_labels),
    high=max(y_labels),
)

# The figure itself, with the hover tool plus basic navigation tools.
plot = figure(
    title="Clustering of the 20 newsgroups dataset using UMAP and HDBSCAN",
    tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
    toolbar_location="above",
    plot_width=1200,
    plot_height=850,
)

# Draw the documents through a view so the outlier filter decides which
# points are rendered.
filtered_view = CDSView(source=source, filters=[outlier_filter])
plot.scatter(
    'x', 'y',
    source=source,
    view=filtered_view,
    size=5,
    fill_color=mapper,
    line_color="black",
    line_alpha=0.3,
)

# Banner handed to the JS callback together with the per-cluster HTML lists
# in top_posts (a Div, presumably so that HTML renders — confirm in utils.callbacks).
text_banner = Div(text= 'Categories of 5 representative points ', height=45)

# WIDGETS
# The slider runs from the smallest label up to num_clusters and starts at
# num_clusters; the text box takes a search keyword.
slider = Slider(title="Cluster #", start=min(labels), end=num_clusters, value=num_clusters, step=1)
keyword = TextInput(title="Search:")

# A single JS callback serves both widgets; it is given references to them
# through its args so it can read their current values.
input_callback_1 = input_callback(plot, source, text_banner, top_posts, num_clusters)
input_callback_1.args["text"] = keyword
input_callback_1.args["slider"] = slider

for control in (slider, keyword):
    control.js_on_change('value', input_callback_1)

# Styling
# NOTE(review): the font-family values below end with ';'. A CSS value normally
# carries no trailing separator — check that the browser actually applies them.
header.sizing_mode = "stretch_width"
header.style = {'color': '#2e484c', 'font-family': 'Julius Sans One, sans-serif;'}
header.margin = 5

# The three description blurbs share one look. Applying it in a loop also fixes
# a copy-paste slip: description_search.sizing_mode used to be set twice while
# description_checkbox never received its sizing mode.
description_style = {'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'}
for description in (description_slider, description_search, description_checkbox):
    description.style = dict(description_style)  # separate dict per model
    description.sizing_mode = "stretch_width"
description_search.margin = 5

# Input widgets: full width with a bit of breathing room.
for input_widget in (slider, keyword):
    input_widget.sizing_mode = "stretch_width"
    input_widget.margin = 15

# Banner that shows the representative categories.
text_banner.style = {'color': '#0269A4', 'font-family': 'Helvetica Neue, Helvetica, Arial, sans-serif;', 'font-size': '1.1em'}
text_banner.sizing_mode = "stretch_both"
text_banner.margin = 15

plot.sizing_mode = "scale_both"
plot.margin = 5

# Page structure from top to bottom; every inner list is one row.
page_rows = [
    [header],
    [description_slider, description_search],
    [slider, keyword],
    [description_checkbox],
    [checkboxes],
    [text_banner],
    [plot],
    [citation],
]
l = layout(page_rows)
# Set on the finished layout object only (not passed to layout()), so the
# sizing modes chosen for the individual children stay untouched.
l.sizing_mode = "scale_both"

# Render: register the standalone HTML output file, then show the layout.
output_file('plots/umap_20_newsgroups_interactive.html')
show(l)