# Plot and investigate UMAP embeddings
In [another notebook](../WSB_clustering.ipynb) we found a useful low-dimensional representation of the data using UMAP, and then found a promising clustering of the data using HDBSCAN. We now conduct topic modeling using this, and create a UMAP plot.

In [1]:
%config Completer.use_jedi = False

import os
import sys
from datetime import datetime
import numpy as np
import pandas as pd
import csv  # to handle quotes when calling pd.read_csv 
import matplotlib  # to create custom color map
import matplotlib.pyplot as plt
from pathlib import Path
import umap, umap.plot
from bokeh.plotting import output_file
from bokeh.models.markers import Circle

import yake  # for keyword extraction; avilable here:
             # https://github.com/LIAAD/yake


import sparknlp
import pyspark.sql.functions as F
from pyspark.sql import types as T
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA

# Point the JVM-backed Spark runtime at a Java 8 installation and put its
# binaries first on PATH.
# NOTE(review): this JDK location is machine-specific — adjust for your system.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk"
os.environ["PATH"] = f"{os.environ['JAVA_HOME']}/bin:{os.environ['PATH']}"

# `spark` is the session object used below for spark.read and spark.sql.
spark = sparknlp.start()

# to allow importing from parent directory of notebooks folder
sys.path.append('..')
# autoreload mode 1 reloads only modules registered through %aimport, so edits
# to lda_pipeline are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport lda_pipeline

In [2]:
DATA_DIR = Path("../data/")
# sys.path.append('..')
DATA_PATH = DATA_DIR/"reddit_wsb.csv"
SENTENCES_DIR = DATA_DIR/"sentences_2021-03-07_12-45-54"


def _first_match(directory, pattern):
    """Return the first path (in sorted order) in `directory` matching `pattern`.

    Raises:
        FileNotFoundError: if nothing matches. This replaces the bare
            StopIteration that next() on an empty glob would produce.
    """
    # Sorting makes the choice deterministic when several files match.
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(
            f"No file matching {pattern!r} found in {directory}")
    return matches[0]


# Spark writes its CSV output as part-*.csv inside the output directory.
SENTENCES_PATH = _first_match(SENTENCES_DIR, "part*.csv")
EMB_CL_PATH = _first_match(SENTENCES_DIR, "HDBSCAN*.csv")

print("Sentences:\n", "  "+str(SENTENCES_PATH))
print("Embedded & clustered:\n", "  "+str(EMB_CL_PATH))

Sentences:
   ../data/sentences_2021-03-07_12-45-54/part-00000-e0a3b398-ae1e-4b50-b0df-5a518a114681-c000.csv
Embedded & clustered:
   ../data/sentences_2021-03-07_12-45-54/HDBSCAN_clustered_2021-03-08_05_59_47_dim=5_min_cl_size=20_ep=0.3.csv


# Read data and build shared data frame

Get sentences.

In [3]:
# Load the tab-separated sentence chunks. multiLine + quote let a quoted field
# span several physical lines. No schema inference is requested, so every
# column is read as a string.
sents = spark.read.csv(
    str(SENTENCES_PATH),
    sep="\t",
    header=True,
    multiLine=True,
    quote="\"",
)

# Make the frame queryable from Spark SQL under the name "sents".
# createOrReplaceTempView replaces the deprecated registerTempTable.
sents.createOrReplaceTempView("sents")

# sents.show(5)

Get UMAP embeddings and labels from clustering.

In [4]:
# TODO: set sep to \t for consistency with above
# Load the embedded + clustered rows (comma-separated by default). No schema
# inference is requested, so every column — including `cluster` and the
# z* coordinates — is read as a string.
embcl = spark.read.csv(
    str(EMB_CL_PATH),
    header=True,
    multiLine=True,
    quote="\"",
    escape="\"",
)

# Make the frame queryable from Spark SQL under the name "embcl".
# createOrReplaceTempView replaces the deprecated registerTempTable.
embcl.createOrReplaceTempView("embcl")

# embcl.show(5)

In [5]:
# Row counts of the two Spark frames; sents_count is reused further down as
# the denominator of the retained-sentence percentage.
sents_count, embcl_count = sents.count(), embcl.count()
counts_are_equal = sents_count == embcl_count
print(f"The data frames have the same number of rows"
      if counts_are_equal
      else "Some rows have been lost")

The data frames have the same number of rows


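Impose a size threshold to discard small clusters: only clusters larger than a chosen threshold are kept. (Note that the query below measures cluster size as the number of distinct posts in the cluster, not the number of sentences.)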

In [6]:
THRESHOLD = 200

# Build the query first, then run it: a CTE selects the clusters whose
# distinct-post count exceeds THRESHOLD, and the main SELECT joins sentence
# text onto the embedded/clustered rows of those clusters.
retained_query = f"""
/*CTE to get sizes of all cluster having > 200 posts*/
WITH value_counts AS
(SELECT embcl.cluster, 
        COUNT(DISTINCT post_id) AS count
 FROM embcl
 GROUP BY embcl.cluster
 HAVING count > {THRESHOLD}     /* threshold */
)
/*select the sentences from these clusters*/
SELECT embcl.*, long_sent AS sent
FROM sents 
    JOIN embcl
      ON (sents.post_id = embcl.post_id
          AND sents.chunk_num = embcl.chunk_num)
    JOIN value_counts
      ON (embcl.cluster = value_counts.cluster)
"""
df = spark.sql(retained_query)

# Number of retained rows; also shown in the plot title later on.
size_df = df.count()

retained_pct = round(100*size_df/sents_count)
print(f"{retained_pct}% of sentences retained "
      f"using threshold {THRESHOLD}.")

93% of sentences retained using threshold 200.


# Plotting

In [7]:
# Collect the three Spark DataFrames to the driver as pandas DataFrames for
# plotting. Conversions run in order, so if a later one fails the earlier
# pd_* names have already been assigned.
try:
    pd_sents = sents.toPandas()
    pd_embcl = embcl.toPandas()
    pd_df = df.toPandas()
except AttributeError:
    # An object without .toPandas() ends up here; only a hint is printed, so
    # a pd_* name may be left undefined for the cells that follow.
    print("Invalid types; perhaps some data Spark DataFrame", 
          "has already been converted to Pandas?")

Two quick sanity checks:

In [8]:
# verify that post_id columns agree up to ordering
S1 = pd_sents.iloc[pd_sents.post_id.argsort()].post_id.reset_index(drop=True)
S2 = pd_embcl.iloc[pd_embcl.post_id.argsort()].post_id.reset_index(drop=True)
print(np.all(S1==S2))
# verify that chunk_num columns agree up to ordering
S1 = pd_sents.iloc[pd_sents.chunk_num.argsort()].chunk_num.reset_index(drop=True)
S2 = pd_embcl.iloc[pd_embcl.chunk_num.argsort()].chunk_num.reset_index(drop=True)
print(np.all(S1==S2))

True
True


Using embeddings computed in [Colab notebook](../WSB_clustering.ipynb), apply a second UMAP embedding into two dimensions for plotting.


In [9]:
%%time
reducer = umap.UMAP(n_neighbors=300)
mapper = reducer.fit(
    pd_df[[f"z{i}" for i in range(5)]]  # just the embedding vars
)
Z2 = mapper.transform(
    pd_df[[f"z{i}" for i in range(5)]]  # again
)
Z2 = pd.DataFrame(Z2, columns=["x", "y"])

CPU times: user 2min 57s, sys: 2.13 s, total: 2min 59s
Wall time: 1min 12s


In [10]:
# Attach the original sentence text (long_sent) to every retained row. With
# no `on=` given, the merge uses the columns the two frames share.
merged = pd_df.merge(pd_sents, how="left")

# concat(axis=1) aligns on the index. If the merge had duplicated or dropped
# rows, the x/y coordinates would silently attach to the wrong sentences.
assert len(merged) == len(Z2), (
    "merge changed the row count; 2-D coordinates would be misaligned")

ddf = pd.concat([merged, Z2], axis=1)
ddf

Unnamed: 0,post_id,chunk_num,cluster,z0,z1,z2,z3,z4,sent,long_sent,x,y
0,l6wydu,0,-1,10.135037,9.804133,11.581939,0.72037655,8.067224,It seems as though Robinhood is blocking peopl...,It seems as though Robinhood is blocking peopl...,9.408455,3.221573
1,l6wzmx,0,-1,8.827715,10.168931,11.86203,-0.010687463,7.96494,RH is no longer supporting GME! Although I am ...,RH is no longer supporting GME! Although I am ...,10.262827,1.082921
2,l6yw6k,0,33,11.345362,9.165419,11.371122,-0.8557228,8.271662,"Redditors of WSB we ""WILL"" change the world NO...","Redditors of WSB we ""WILL"" change the world NO...",5.323113,5.618005
3,l6ywcd,0,-1,10.304863,10.191217,10.934265,0.15972324,8.571029,Entered at $390 let's go boys . Bought some sh...,Entered at $390 let's go boys . Bought some sh...,8.835866,3.586563
4,l6yyv2,0,-1,7.772799,10.225097,11.015197,-0.85345316,7.6593246,Can someone dumb down and explain what's happe...,Can someone dumb down and explain what's happe...,9.045388,-1.945948
...,...,...,...,...,...,...,...,...,...,...,...,...
19297,lbc61s,0,27,11.366164,10.627874,9.857017,1.0392779,9.499864,I love this community and I will to zero with ...,I love this community and I will to zero with ...,10.380409,13.612749
19298,lbercy,1,33,9.469345,9.876454,11.724865,1.5465696,8.728923,Thanks for checking out this subreddit but whe...,Thanks for checking out this subreddit but whe...,12.060952,3.709996
19299,lc174t,0,-1,10.491447,9.115634,11.79601,1.2849495,9.017485,Hey SEC guy monitoring this forum. Friggin wha...,Hey SEC guy monitoring this forum. Friggin wha...,10.941757,5.156502
19300,lc174t,1,27,10.592887,10.38389,9.145164,-0.6576598,10.660142,Anyways I hope you see some wild shit it's a g...,Anyways I hope you see some wild shit it's a g...,8.287837,18.580954


In [11]:
umap.plot.output_notebook()

# Columns shown in the hover tooltip. Escaped double quotes (backslash +
# quote) in the sentence text are replaced by plain quotes. DataFrame.assign
# returns a new frame, so its result must be kept; the original call
# discarded it and the clean-up never took effect.
hover_data = ddf[["post_id", "cluster", "long_sent"]]
hover_data = hover_data.assign(
    long_sent=hover_data.long_sent.str.replace('\\"', '"', regex=False)
)

# make my own keymap so that noise is distinct (lighter in color)
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and
# removed in recent Matplotlib releases.
cmap = plt.get_cmap("tab20")
my_color_key = [matplotlib.colors.to_hex(c)
                for c in [(.9,.9,.9)]+list(cmap.colors)]
# np.random.seed(1)
# plot_sample = np.random.randint(0,20, size_df, dtype=int) <  1
hover_plot = umap.plot.interactive(mapper,
                                   labels=ddf.cluster,
                                   hover_data=hover_data,
                                   point_size=0,
#                                    subset_points = plot_sample,
                                   width=600,
                                   height=600,
                                   color_key = my_color_key,
                                   background="black")

# Also write the plot to a standalone HTML file.
output_file("../assets/wsb_emb.html")

# point_size=0 above; the marker size is set here in data units instead.
for circle in hover_plot.select(Circle):
    circle.radius = .005


hover_plot.title.text = ("r/WallStreetBets posts labeled using a deep, " +
                         f"attention-based topic model ({size_df} points).")
umap.plot.show(hover_plot)

<iframe src="/assets/wsb_emb.html"
    sandbox="allow-same-origin allow-scripts"
    width="100%"
    height="500"
    scrolling="no"
    seamless="seamless"
    frameborder="0">
</iframe>

# Topic Modeling
We try two approaches to extract topics from the clusters.

## YAKE! model?

Spark NLP has a built-in topic extractor which is an implementation of [YAKE!](https://www.sciencedirect.com/science/article/abs/pii/S0020025519308588?via%3Dihub). We tried running that after aggregating our texts by cluster, but the Spark NLP implementation was very slow. We also tried the implementation provided by the authors of the paper.

### Via the implementation from the YAKE! authors.

Drop noise.

In [27]:
# Rows per cluster in the retained data (the noise cluster is still present
# here), collected to pandas and indexed by cluster label.
size_per_cluster = df.groupby("cluster").agg(F.count("cluster").alias("size"))
cluster_sizes = size_per_cluster.toPandas().set_index("cluster")
cluster_sizes.T

cluster,-1,15,11,30,28,27,17,25,33,21
size,8242,2160,229,690,560,2153,512,448,4042,266


In [28]:
# Discard the noise rows (cluster label "-1"; labels are strings here), then
# join each cluster's sentences into one space-separated text.
non_noise = df.where(F.col("cluster") != "-1")

ddf = (
    non_noise
    .groupby("cluster")
    .agg(F.concat_ws(" ", F.collect_list("sent")).alias("text"))
)

In [29]:
# Join every cluster's text into one corpus-wide string (a one-row,
# one-column Spark frame).
everything_sdf = (
    ddf
    .select("text")
    .groupby()
    .agg(F.concat_ws(" ",
                     F.collect_list(ddf['text']))
         .alias('everything'))
)
everything_sdf.show()

# Read the single cell with .iloc[0, 0]. The former .iloc[0][0] indexed a
# label-indexed Series by position, which pandas has deprecated.
everything = everything_sdf.toPandas().iloc[0, 0]
everything[:500]

+--------------------+
|          everything|
+--------------------+
|Robinhood contact...|
+--------------------+



'Robinhood contact information for C level assistance. Vlad Tenev. CEO. 571 224 8804 vlad robinhood.com. Jim Swartwout COO. 3215419434 jim.swartwout robinhood.com. Jason Warnick CFO Mobile . 2069155858 jason.warnick robinhood.com. Scott Hershorin. Chief Compliance Officer 6103295474. Kelly Zigaitis Chief Compliance Officer securities. 3148353050. Director Business Operations Nafeesa Remtilla . 4156275528. Brianna Bain Marker Operations Head . 503 806 1430. Jacob Schwartz. Senior manager customer '

In [30]:
# One row per cluster, indexed by cluster label, with the joined text.
cluster_texts = ddf.toPandas().set_index("cluster")

In [16]:
%%time
keyword_extractor = yake.KeywordExtractor(n=2, top=300)
yake_keywords_all = keyword_extractor.extract_keywords(everything)
yake_keywords_all_just_words = [word for word, _ in yake_keywords_all]

CPU times: user 51.3 s, sys: 514 ms, total: 51.8 s
Wall time: 52 s


Now get keywords cluster-by-cluster, and delete any that were also identified as keywords for the corpus as a whole. First, a look at the corpus-wide keywords:

In [17]:
# Peek at the first 100 corpus-wide keywords, in the order the extractor
# returned them.
print(yake_keywords_all_just_words[:100])

['buy gme', 'gme', 'gme amc', 'hold gme', 'gme shares', 'gme stock', 'amc gme', 'hold hold', 'gme short', 'sell gme', 'gme gme', 'robinhood', 'wall street', 'fuck robinhood', 'gme buy', 'buy amc', 'gme stocks', 'buy buy', 'buying gme', 'hedge funds', 'holding gme', 'gme hold', 'buy', 'hold', 'amc', 'gme nok', 'gme shorts', 'buy hold', 'hold buy', 'short gme', 'short squeeze', 'gme squeeze', 'stock market', 'gme share', 'melvin capital', 'amc nok', 'stock', 'gme price', 'short interest', 'gme robinhood', 'gme buys', 'buy shares', 'shares gme', 'short', 'amc amc', 'hold amc', 'stock price', 'shares', 'gme today', 'fucking hold', 'sell', 'stock gme', 'bought gme', 'gme trading', 'gamestop gme', 'trading gme', 'trade gme', 'buys gme', 'long gme', 'market', 'stocks', 'buy stocks', 'gme calls', 'gme dip', 'amc stock', 'wsb', 'tldr gme', 'buy nok', 'short positions', 'fucking', 'market manipulation', 'positions gme', 'gme position', 'fucking sell', 'nok amc', 'nok gme', 'share price', 'buy ba

In [18]:
%%time
yake_keywords_by_cluster = {}
for clid in cluster_texts.index:
    keyword_extractor = yake.KeywordExtractor(n=2, top=200)
    keywords = keyword_extractor.extract_keywords(cluster_texts.loc[clid][0])
    keywords = [(word, score) for word, score in keywords 
                if word not in yake_keywords_all_just_words]
    yake_keywords_by_cluster[clid] = keywords

CPU times: user 1min 4s, sys: 122 ms, total: 1min 4s
Wall time: 1min 4s


In [19]:
# Show each cluster's remaining keywords without their scores.
for clid, scored_keywords in yake_keywords_by_cluster.items():
    words_only = [pair[0] for pair in scored_keywords]
    print(clid, words_only, "\n")

15 ['gme positions', 'gme posts', 'position gme', 'dip buy', 'good', 'post', 'app', 'retards', 'prices', 'lot', 'gme funds', 'gme fidelity', 'nok buy', 'manipulation', 'broker', 'investors', 'options', 'calls', 'gme holders', 'advice', 'order', 'squeeze gme', 'money back', 'short sells'] 

11 ['good', 'trades', 'advice', 'thing', 'gme closed', 'stocks short', 'support gme', 'calls', 'short ratio', 'volume', 'rich', 'long short', 'fractional shares', 'post', 'things', 'cnbc', 'love', 'point', 'stock buy', 'stock share', 'years', 'gme investor', 'cover', 'started gme', 'risk', 'gme galactic', 'gme spreads', 'real', 'market buys', 'robinhood insider', 'cash', 'literally', 'nio', 'prices', 'screwrobinhood gme', 'market open', 'shorts friday', 'move', 'float short', 'called gme', 'media', 'monday', 'make short', 'sell shorts', 'investors', 'broker', 'make amc', 'started', 'brokerage', 'fucking gamestop', 'gme transactions', 'margin', 'funds hold', 'float', 'put', 'orders', 'world', 'gme cha

We see a downside of our method: since the clusters are very imbalanced with Cluster 33 dominant, keywords representing the whole corpus are likely to represent the dominant cluster.

To do: use mutual information to compare with results from LDA? Should also improve this with lemmatization.

## TF-IDF approach

Working cluster-by-cluster, we keep terms with a high tf-idf score. 

I don't currently like the results. It might be beneficial to check this with spaCy or scikit-learn.

Drop noise.

In [26]:
# Keep the identifying columns and the text of every non-noise sentence
# (noise has cluster label "-1"; labels are strings here).
kept_columns = ["post_id", "chunk_num", "cluster", "sent"]
ddf = df.filter(F.col("cluster") != "-1").select(*kept_columns)
ddf.show(5, truncate=40)

+-------+---------+-------+----------------------------------------+
|post_id|chunk_num|cluster|                                    sent|
+-------+---------+-------+----------------------------------------+
| l6yw6k|        0|     33|Redditors of WSB we "WILL" change the...|
| l6zu42|        0|     17|How do I even buy GME if brokers are ...|
| l6zvww|        0|     15|Robinhood contact information for C l...|
| l6zxpv|        0|     15|Rate Robinhood 1 star on the app stor...|
| l701oy|        0|     30|IT'S WORKING Several Hedge Funds Stun...|
+-------+---------+-------+----------------------------------------+
only showing top 5 rows



Compute cluster sizes.

In [27]:
# Rows per cluster after noise removal, collected to pandas and indexed by
# cluster label.
size_per_cluster = ddf.groupby("cluster").agg(F.count("cluster").alias("size"))
cluster_sizes = size_per_cluster.toPandas().set_index("cluster")
cluster_sizes.T

cluster,15,11,30,28,27,17,25,33,21
size,2160,229,690,560,2153,512,448,4042,266


In [12]:
# Build the preprocessing pipeline from the local lda_pipeline module, fit it
# on the non-noise sentences and apply it.
# NOTE(review): presumably this adds the `finished_unigrams` column consumed
# by the CountVectorizer below — confirm in lda_pipeline.
# `ddf` is rebound to the transformed frame, so re-running this cell feeds
# already-transformed data back into the pipeline.
pipeline = lda_pipeline.build_embcl_pipeline()
pipeline_model = pipeline.fit(ddf)
ddf = pipeline_model.transform(ddf)

Count term frequencies once, text-by-text, then compute inverse document frequencies cluster-by-cluster.

In [15]:
%%time

cluster_stats = {}
tf_model = (
    CountVectorizer()
    .setInputCol('finished_unigrams')
    .setOutputCol('term_freq')
    .fit(ddf)
)
term_freq = tf_model.transform(ddf)
term_freq.cache()

for clid in cluster_sizes.index:
    idf_model = (
        IDF()
        .setMinDocFreq(5)
        .setInputCol('term_freq')
        .setOutputCol('tfidf')
        .fit(term_freq.where(F.col("cluster")==clid))
    )

    tfidf = idf_model.transform(term_freq)
    cluster_stats[clid] = {"tf_model": tf_model,
                           "term_freq": term_freq,
                           "idf_model": idf_model,
                           "tfidf":tfidf}

CPU times: user 116 ms, sys: 11.1 ms, total: 127 ms
Wall time: 42.3 s


In [16]:
NUM_KEYWORDS = 10

# For each cluster, take the NUM_KEYWORDS vocabulary terms with the largest
# IDF weight (ascending order, largest last).
# NOTE(review): only the IDF vector is ranked here. Term frequency is never
# used, so this favours terms that are rare within the cluster rather than
# terms with a high tf-idf score.
keywords_dict = {}
for clid in cluster_sizes.index:
    stats = cluster_stats[clid]
    vocab = stats['tf_model'].vocabulary
    idf_weights = stats['idf_model'].idf
    top_positions = np.argsort(idf_weights)[-NUM_KEYWORDS:]
    keywords_dict[clid] = [vocab[pos] for pos in top_positions]

In [20]:
# One row per cluster: label, size and selected keywords.
# cluster_sizes.loc[clid, "size"] selects the cell by label. The former
# .loc[clid][0] indexed a label-indexed Series by position, which pandas has
# deprecated.
rows = [[clid, cluster_sizes.loc[clid, "size"], keywords]
        for clid, keywords in keywords_dict.items()]
keywords_df = pd.DataFrame(rows, columns=["clid", "cl_size", "keywords"]).set_index("clid")
# pd.set_option('max_colwidth', 80)
print(keywords_df)

      cl_size  \
clid            
15       2160   
11        229   
30        690   
28        560   
27       2153   
17        512   
25        448   
33       4042   
21        266   

                                                                             keywords  
clid                                                                                   
15    [balls, caught, spite, equipment, hence, facebook, study, assumption, report...  
11         [worth, illegal, years, anyway, in, double, rest, month, create, tomorrow]  
30                 [hey, autist, focus, sky, others, final, 9, asshole, left, easily]  
28      [event, night, absolutely, curious, robin, major, problem, exposure, r, land]  
27      [cold, degree, unfold, plunge, genius, accurate, effectively, song, john, mr]  
17           [works, slv, expire, yeah, insane, barely, effect, mad, ride, recommend]  
25     [e, gamma, true, spread, quite, billionaires, problem, online, bought, course]  
33       [$4000, 3x,

This doesn't seem to be working.

## Scratch: Yake via Spark NLP

This didn't finish within an hour, even though my by-hand version finishes in ~2 minutes. On the other hand, it is very quick if I don't first aggregate the texts by cluster.

Drop noise and aggregate all posts by cluster.

In [19]:
# Remove noise rows (cluster label "-1"), then collapse each cluster into a
# single space-joined text column.
sentences_by_cluster = (
    df
    .filter(df["cluster"] != "-1")
    .groupby("cluster")
    .agg(F.collect_list("sent").alias("sents"))
)

ddf = sentences_by_cluster.select(
    "cluster",
    F.concat_ws(" ", "sents").alias("text"),
)

In [13]:
# Same pipeline as in the TF-IDF section, but built with use_finisher=False.
# NOTE(review): presumably this keeps the `unigrams` annotation column that
# YakeModel reads below instead of a finished string column — confirm in
# lda_pipeline.
# `ddf` is rebound to the transformed frame, so this cell is not idempotent.
pipeline = lda_pipeline.build_embcl_pipeline(use_finisher=False)
pipeline_model = pipeline.fit(ddf)
ddf = pipeline_model.transform(ddf)

In [14]:
from sparknlp.annotator import YakeModel
from sparknlp.base import Finisher

# Configure Spark NLP's YAKE annotator step by step: it reads the `unigrams`
# column and writes `keywords`, one keyword, n-grams up to length 3.
keyword_extractor = YakeModel()
keyword_extractor.setNKeywords(1)
keyword_extractor.setInputCols(["unigrams"])
keyword_extractor.setOutputCol("keywords")
keyword_extractor.setMaxNGrams(3)

# The Finisher processes the `keywords` annotation column.
finisher = Finisher()
finisher.setInputCols(["keywords"])

# Despite its name, `yake_model` is the transformed DataFrame, not a model.
yake_model = keyword_extractor.transform(ddf)
kws = finisher.transform(yake_model)


In [11]:
# # This never terminated
# %%time
# kws.select('finished_keywords').show(truncate=100)

## Scratch: alternative approach to plotting via Plotly Express

In [14]:
import textwrap
try:
    import plotly
except ImportError as e:
    ! pip install plotly==4.14.3
    import plotly
import plotly.express as px
print(f"Plotly version: {plotly.__version__}")

In [15]:
# `ddf` is rebound to a Spark DataFrame by the topic-modeling cells earlier
# in this notebook, so the pandas frame (retained rows + sentence text + 2-D
# coordinates) is rebuilt here rather than taken from whatever `ddf`
# currently names.
embedded_pd = pd.concat([pd_df.merge(pd_sents, how="left"), Z2], axis=1)

plot_df = embedded_pd.assign(
    # wrap long sentences at 40 characters so the hover box stays readable
    long_sent=embedded_pd.long_sent.apply(
        lambda txt: '<br>'.join(textwrap.wrap(txt, width=40))
    ),
    # string labels make Plotly treat the clusters as discrete categories
    cluster=embedded_pd.cluster.astype(str),
)

In [144]:
# Show only the sentence text in the hover box; hide the raw coordinates.
hover_data = {"x":False, "y":False, "long_sent":True}
sample_size = 10000
# DataFrame.sample raises if asked for more rows than exist, so cap the
# request. The fixed random_state makes the plotted subset reproducible.
n_points = min(sample_size, len(plot_df))
fig = px.scatter(
    plot_df.sample(n_points, random_state=0),
    x="x",
    y="y",
    color="cluster",
    hover_data=hover_data,
    color_discrete_sequence=px.colors.diverging.Armyrose_r,
    title = "Embedding of WSB posts after preprocessing splitting and then recomibining sentences"
          )
fig.show()