# Plot and investigate UMAP embeddings

After using two internal clustering metrics, we don't see any strong evidence for a particular number of k-means clusters for any value of k from 4 to 19. HDBSCAN with default parameters returned a very large number of clusters (~150), with > 90% of sentences being unclassified.


In [1]:
%config Completer.use_jedi = False

import os
import sys
from datetime import datetime
import numpy as np
import pandas as pd
import csv                      # to handle quotes when calling pd.read_csv 
import matplotlib
import matplotlib.pyplot as plt
from pathlib import Path
import umap, umap.plot

try:
    import plotly
except ImportError as e:
    ! pip install plotly==4.14.3
    import plotly
import plotly.express as px
print(f"Plotly version: {plotly.__version__}")

DATA_DIR = Path("../data/")
DATA_PATH = DATA_DIR/"reddit_wsb.csv"
SENTENCES_DIR = DATA_DIR/"sentences_2021-03-07_12:45:54"
SENTENCES_PATH = next(SENTENCES_DIR.glob("part*.csv"))
EMB_CL_PATH = next(SENTENCES_DIR.glob("HDBSCAN*.csv"))

print("Sentences:\n", "  "+str(SENTENCES_PATH))
print("Embedded & clustered:\n", "  "+str(EMB_CL_PATH))

Plotly version: 4.14.3
Sentences:
   ../data/sentences_2021-03-07_12:45:54/part-00000-e0a3b398-ae1e-4b50-b0df-5a518a114681-c000.csv
Embedded & clustered:
   ../data/sentences_2021-03-07_12:45:54/HDBSCAN_clustered_2021-03-08_05 59 47_dim=5_min_cl_size=20_ep=0.3.csv


# Read the data

In [2]:
# The sentence file is tab-separated; QUOTE_NONE tells the parser to treat
# quote characters as ordinary text instead of field delimiters.
sentence_read_options = {"sep": "\t", "quoting": csv.QUOTE_NONE}
sents = pd.read_csv(str(SENTENCES_PATH), **sentence_read_options)
sents.head()

Unnamed: 0,post_id,chunk_num,long_sent
0,l6wydu,0,It seems as though Robinhood is blocking peopl...
1,l6wzmx,0,RH is no longer supporting GME! Although I am ...
2,l6yw6k,0,"""Redditors of WSB we \""WILL\"" change the world..."
3,l6ywcd,0,Entered at $390 let's go boys . Bought some sh...
4,l6yyv2,0,Can someone dumb down and explain what's happe...


In [3]:
# Cluster labels plus the z0..z4 embedding coordinates, one row per
# (post_id, chunk_num).
embcl = pd.read_csv(filepath_or_buffer=EMB_CL_PATH)
embcl.head(n=5)

Unnamed: 0,post_id,chunk_num,cluster,z0,z1,z2,z3,z4
0,lc7aj4,0,-1,11.520249,10.176333,10.322124,-0.28396,9.739444
1,l8q7yg,0,-1,11.060429,9.986012,9.87345,-0.197139,10.654449
2,l70zah,0,-1,11.421103,9.091762,11.08176,-0.059767,7.91419
3,l71n3m,0,-1,10.93525,10.059787,10.542604,1.500129,9.25162
4,l6wyy7,0,-1,10.73695,10.322493,9.951429,1.937703,8.488021


Two quick sanity checks:

In [4]:
# Both checks are asserted: a notebook cell only displays its last expression,
# so a bare `np.all(...)` in the middle of the cell would go unnoticed.

# verify that post_id columns agree up to ordering
S1 = sents.post_id.sort_values().reset_index(drop=True)
S2 = embcl.post_id.sort_values().reset_index(drop=True)
# Compare lengths first; comparing Series of different lengths raises instead
# of returning False.
post_ids_agree = len(S1) == len(S2) and bool(np.all(S1 == S2))
assert post_ids_agree, "post_id values differ between sents and embcl"

# verify that chunk_num columns agree up to ordering
S1 = sents.chunk_num.sort_values().reset_index(drop=True)
S2 = embcl.chunk_num.sort_values().reset_index(drop=True)
chunk_nums_agree = len(S1) == len(S2) and bool(np.all(S1 == S2))
assert chunk_nums_agree, "chunk_num values differ between sents and embcl"

post_ids_agree and chunk_nums_agree

True

Drop the uncommon topics:

Adjoin UMAP embeddings computed in [Colab notebook](../WSB_clustering.ipynb):

In [5]:
# Names of the five embedding coordinates stored in the clustered file.
z_columns = [f"z{i}" for i in range(5)]
Z5 = embcl.loc[:, z_columns]
# `df` is the working frame for the following cells; at this point it is the
# same object as `embcl`.
df = embcl

In [6]:
THRESHOLD = 200  # keep only clusters with more than this many rows

# Always filter from the untouched `embcl` frame rather than from `df`: the
# cell then gives the same result however many times it is run, and
# `dominant_clusters` is always defined before it is printed.
vcs = embcl.cluster.value_counts()

print(f"Starting with {embcl.shape[0]} rows and {vcs.shape[0]} clusters.")
print(f"Thresholding from below at cluster size {THRESHOLD}...")
# Boolean indexing keeps the counts as integers (where().dropna() made floats).
dominant_clusters = vcs[vcs > THRESHOLD]
df = embcl[embcl.cluster.isin(dominant_clusters.index)]

print(f"df has {df.shape[0]} rows and {dominant_clusters.shape[0]} clusters.")
print("\n")
print("Cluster sizes:", list(dominant_clusters))
df.info()

# redefine Z5 here to drop rows
Z5 = df[[f"z{i}" for i in range(5)]]

Starting with 20694 rows and 35 clusters.
Thresholding from below at cluster size 200...
df has 19302 rows and 10 clusters.


Cluster sizes: [8242.0, 4042.0, 2160.0, 2153.0, 690.0, 560.0, 512.0, 448.0, 266.0, 229.0]
<class 'pandas.core.frame.DataFrame'>
Int64Index: 19302 entries, 0 to 20693
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   post_id    19302 non-null  object 
 1   chunk_num  19302 non-null  int64  
 2   cluster    19302 non-null  int64  
 3   z0         19302 non-null  float64
 4   z1         19302 non-null  float64
 5   z2         19302 non-null  float64
 6   z3         19302 non-null  float64
 7   z4         19302 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 1.3+ MB


90% of the data lives in 8 clusters. 40% of the data is treated as noise, but that's reasonable for the purpose of topic discovery. We embed in two dimensions to visualize:

In [None]:
%%time
reducer = umap.UMAP(n_neighbors=300)
mapper = reducer.fit(Z5)
Z2 = mapper.transform(Z5)
Z2 = pd.DataFrame(Z2, columns=["x", "y"])

In [None]:
# Attach the sentence text to every clustered row; the join keys are spelled
# out so the merge does not depend on which column names happen to overlap.
MERGE_KEYS = ["post_id", "chunk_num"]
clustered_sents = (
    df
    .merge(sents, on=MERGE_KEYS)
    .reset_index(drop=True)
    .drop(columns=[f"z{i}" for i in range(5)])
)

# Z2 is attached by row position, so the merge must not add or drop rows;
# otherwise the 2-D coordinates would silently pair with the wrong sentences.
assert len(clustered_sents) == len(Z2), (
    f"merge produced {len(clustered_sents)} rows but Z2 has {len(Z2)}; "
    "check for duplicated or missing (post_id, chunk_num) keys"
)

ddf = pd.concat([clustered_sents, Z2], axis=1)
ddf

In [None]:
umap.plot.output_notebook()

# Columns shown in the hover tooltip. Backslash-escaped double quotes in the
# sentence text are unescaped here; the result is stored back into
# `long_sent` (a bare `.assign(...)` call would discard it).
hover_data = ddf[["post_id", "cluster", "long_sent"]].assign(
    long_sent=lambda frame: frame.long_sent.str.replace('\\"', '"', regex=False)
)

# make my own keymap so that noise is distinct (lighter in color):
# a light gray entry is placed ahead of the tab20 colors.
# NOTE(review): this relies on the noise label (-1) being assigned the first
# entry of the list -- confirm against umap.plot's handling of list color keys.
# plt.get_cmap is used because matplotlib.cm.get_cmap is removed in newer
# matplotlib releases.
cmap = plt.get_cmap("tab20")
my_color_key = [matplotlib.colors.to_hex(c)
                for c in [(.9, .9, .9)] + list(cmap.colors)]

hover_plot = umap.plot.interactive(mapper,
                                   labels=ddf.cluster,
                                   hover_data=hover_data,
                                   point_size=1.6,
                                   color_key=my_color_key,
                                   background="black"
                                  )
hover_plot.title.text = "r/WallStreetBets excerpts cleaned (SparkNLP), embedded (UMAP), clustered (HDBSCAN), then projected (UMAP)."
umap.plot.show(hover_plot)

## Scratch: alternative approach via Plotly Express

In [14]:
import textwrap

In [15]:
def _wrap_for_hover(txt):
    """Break `txt` into lines of at most 40 characters joined by HTML <br> tags."""
    return '<br>'.join(textwrap.wrap(txt, width=40))


# Plotly-ready copy of ddf: sentences wrapped for the hover box, and cluster
# labels cast to strings.
wrapped_sentences = ddf.long_sent.apply(_wrap_for_hover)
cluster_labels = ddf.cluster.astype(str)
plot_df = ddf.assign(long_sent=wrapped_sentences, cluster=cluster_labels)

In [144]:
# Tooltip spec for Plotly: hide the raw coordinates, show the wrapped sentence.
# Named separately so it does not overwrite the `hover_data` DataFrame used by
# the umap.plot cell.
px_hover_spec = {"x": False, "y": False, "long_sent": True}

sample_size = 10000
SAMPLE_SEED = 1  # fixed so the same points are drawn on every run
# Cap the request at the number of available rows; DataFrame.sample raises
# when asked for more rows than exist (without replacement).
n_points = min(sample_size, len(plot_df))

fig = px.scatter(
    plot_df.sample(n_points, random_state=SAMPLE_SEED),
    x="x",
    y="y",
    color="cluster",
    hover_data=px_hover_spec,
    color_discrete_sequence=px.colors.diverging.Armyrose_r,
    title="Embedding of WSB posts after preprocessing, splitting, and then recombining sentences"
)
fig.show()