In [6]:
import pandas as pd
import numpy as np
from collections import defaultdict
import umap
import hdbscan
import matplotlib.pyplot as plt
import matplotlib.colors
import colorcet

from bokeh.plotting import show, output_file, save
import bokeh.io

# Uncomment if you want to see bokeh output in the notebook
# Warning that this can increase the filesize quite a bit after visualization!
# bokeh.io.output_notebook()

# If you have the latest development UMAP, use this instead of the packaged plot
#from umap import plot
from plot import plot

import os
import pickle
import spacy
from spacy_langdetect import LanguageDetector

# Extract all post titles

Extract the post titles from the provided database.  We write them to a file so that the unmodified BERT repository can generate embeddings for all of them.  The only thing to be aware of is that the author and post title are aligned by index (i.e. line number), so any change to the database will break that alignment.

# Filter by language
Since we're using an English BERT model, we first filter out post titles that are likely French, using a language detector added to spaCy's `en_core_web_md` pipeline.

In [7]:
def is_likely_french(row):
    """Return True when a row's title is confidently detected as French.

    Runs the global ``nlp`` pipeline on ``row["title"]`` and reads the
    ``doc._.language`` result.  The title counts as French only when the
    detected language is ``"fr"`` with a score strictly above 0.99.
    """
    detected = nlp(row["title"])._.language
    if detected["language"] != "fr":
        return False
    return bool(detected["score"] > 0.99)

if os.path.exists("./data/filtered_submission_titles.txt"):
    # Cached run: the titles file already exists, so reuse the filtered table.
    print("Found filtered submissions!  Loading...")
    submissions = pd.read_feather("./data/filter_submissions.feather")
else:
    print("Submissions not yet filtered.  Filtering...")
    data = pd.read_feather('./data/reddit_combined_small.feather')
    data['created_utc_dt'] = pd.to_datetime(data['created_utc'])

    # Keep submissions only; other item types are dropped here.
    is_submission = data["item_type"] == "submission"
    submissions = data[is_submission]

    # is_likely_french() looks this pipeline up through the global name `nlp`.
    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

    # Restrict to the subreddits of interest before the per-row language check.
    in_target_subreddit = submissions["subreddit"].isin(['britishcolumbia', 'Quebec', 'alberta', 'canadaguns', 'CanadianForces', 'Edmonton', 'CanadaPolitics', 'canada', 'vancouver', 'onguardforthee', 'metacanada', 'ontario'])
    submissions = submissions[in_target_subreddit]

    # Drop every row whose title is confidently detected as French.
    french_mask = submissions.apply(is_likely_french, axis=1)
    submissions = submissions[~french_mask]

    # The feather table and the titles file are written in the same row order.
    submissions.reset_index(drop=True).to_feather("./data/filter_submissions.feather")
    submissions["title"].to_csv("./data/filtered_submission_titles.txt", header=None, index=False)

print("Done!")

Found filtered submissions!  Loading...
Done!


In [8]:
# Count of submissions with a non-null author per subreddit, largest first.
posts_per_subreddit = submissions.groupby("subreddit")["author"].count()
posts_per_subreddit.sort_values(ascending=False)

subreddit
canada             47273
vancouver          31995
metacanada         26890
CanadaPolitics     15621
Edmonton           14024
onguardforthee     11257
ontario            11224
canadaguns          8221
alberta             6944
CanadianForces      5360
britishcolumbia     3980
Quebec              3025
Name: author, dtype: int64

# Generate BERT embeddings

This will be much faster if you have GPUs available.  **Note that we have already generated these for this particular dataset, so we can skip down to the clustering and plotting steps unless we have new data we want to process.**  Uncomment the lines if you have a base TensorFlow environment ready to go, or paste them in a terminal.

Install BERT

In [9]:
#!git clone https://github.com/google-research/bert.git
#!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
#!unzip uncased_L-12_H-768_A-12.zip

Extract features.  You'll need to either have a BERT-compatible TensorFlow installed into your base environment, or run this in a separate terminal.  This is a pretty space-inefficient way to do this (generating the full embedding requires ~25GB of disk space, of which most is discarded when we filter to just the CLS tokens), but it requires the least custom code.  Modifying the "extract_features" script to just return CLS tokens can bypass this if disk space is an issue.

In [10]:
#!python bert/extract_features.py \
#  --input_file=./data/filtered_submission_titles.txt \
#  --output_file=./data/filtered_submission_title_embeddings.jsonl \
#  --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt \
#  --bert_config_file=./uncased_L-12_H-768_A-12/bert_config.json \
#  --init_checkpoint=./uncased_L-12_H-768_A-12/bert_model.ckpt \
#  --layers=-1 \
#  --max_seq_length=128 \
#  --batch_size=8

In [11]:
# Now try running it on the suspicious comments from the old transparency report
# !python bert/extract_features.py \
#   --input_file=./data/suspicious_submission_titles.txt \
#   --output_file=./data/suspicious_titles.jsonl \
#   --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt \
#   --bert_config_file=./uncased_L-12_H-768_A-12/bert_config.json \
#   --init_checkpoint=./uncased_L-12_H-768_A-12/bert_model.ckpt \
#   --layers=-1 \
#   --max_seq_length=128 \
#   --batch_size=8

Filter to just the CLS embeddings (and save them).  **Start here if you already have generated embeddings.**

In [12]:
# One path for the existence check, the write and the read.  Previously the
# write went to "suspicious_titles_cls.json" in the working directory, so the
# cache under ./data/ was never created and the JSONL was re-parsed every run.
suspicious_cls_path = "./data/suspicious_titles_cls.json"

if not os.path.exists(suspicious_cls_path):
    print("Suspicious CLS embeddings do not exist.  Generating...")
    # Stream the feature file in chunks of 100 lines instead of loading it whole.
    lines = pd.read_json("./data/suspicious_titles.jsonl", chunksize=100, lines=True)

    cls_tokens = []

    for chunk in lines:
        for it, line in chunk.iterrows():
            # Keep only the first token's values from the first exported layer
            # (the [CLS] embedding, per the notebook text; extraction uses --layers=-1).
            cls_tokens.append(line["features"][0]['layers'][0]['values'])

    susp_df = pd.DataFrame(cls_tokens)
    susp_df.to_json(suspicious_cls_path)
else:
    print("Found suspicious CLS embeddings.  Loading...")
    susp_df = pd.read_json(suspicious_cls_path)
    
print("Done!")

Found suspicious CLS embeddings.  Loading...
Done!


We just want the \[CLS\] embedding from the last layer for each title, so let's extract those and save them separately.

In [13]:
if os.path.exists("./data/filtered_submission_title_embeddings_cls.json"):
    # Cached run: the reduced embeddings were already written.
    print("Found submission CLS embeddings.  Loading...")
    tokens_df = pd.read_json("./data/filtered_submission_title_embeddings_cls.json")
else:
    print("Submission CLS embeddings do not exist.  Generating...")
    # Stream the feature file in chunks of 100 lines.
    lines = pd.read_json("./data/filtered_submission_title_embeddings.jsonl", chunksize=100, lines=True)

    # For every title keep the first token's values from the first exported layer.
    cls_tokens = [
        token_features[0]['layers'][0]['values']
        for chunk in lines
        for token_features in chunk["features"]
    ]

    tokens_df = pd.DataFrame(cls_tokens)
    tokens_df.to_json("./data/filtered_submission_title_embeddings_cls.json")

print("Done!")

Found submission CLS embeddings.  Loading...
Done!


Label source of each data point correctly 

In [14]:
# One provenance label per embedding row: the regular submissions first, then
# the suspicious ones, matching the order in which the frames are stacked below.
n_canada = tokens_df.shape[0]
n_suspicious = susp_df.shape[0]
source_labels = pd.concat(
    [pd.DataFrame(['canada'] * n_canada), pd.DataFrame(['suspicious'] * n_suspicious)]
).reset_index(drop=True)

# Stack the embeddings in that same order (the index is left as-is here).
tokens_df = pd.concat([tokens_df, susp_df])

# Load the suspicious submissions and align their column names with `submissions`.
suspicious_submissions = pd.read_csv("./data/submissions.csv").rename(
    columns={"subreddit_name_prefixed": "subreddit", "id": "full_id", "author.name": "author"}
)
submissions = pd.concat([submissions, suspicious_submissions], sort=False).reset_index(drop=True)

# Columns shown when hovering over a point in the interactive plots.
hover_columns = ["title", "subreddit", "author", "full_id"]
hover_df = submissions[hover_columns].reset_index(drop=True)
submissions['source'] = source_labels[0]

# Perform UMAP and clustering on Post Titles

In [15]:
# This class lets us plot a saved embedding array with the built-in UMAP plotting functions
# Useful to avoid having to pickle/unpickle the model (instead just saving/loading the embedding as Tensorboard compatible TSV)
class UMAP_Facade:
    """Minimal stand-in for a fitted UMAP model.

    It carries nothing but an ``embedding_`` attribute, so a saved embedding
    array can be handed to code in this notebook that reads ``model.embedding_``.
    """

    # Class-level default; every instance overrides it in __init__.
    embedding_ = None

    def __init__(self, embedding=None):
        """Store ``embedding`` unchanged as this object's ``embedding_``."""
        self.embedding_ = embedding

In 2 dimensions

In [16]:
# Output as Tensorboard-friendly TSV
if not os.path.exists("./data/posts_by_title_embedding_2d.tsv"):
    print("Did not find 2D UMAP embedding.  Generating...")
    # This usually takes 10 minutes or so on my machine
    umap_model_2d = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine', random_state=0, low_memory=True).fit(tokens_df)
    
    # Read the embedding from the model fitted just above.  The previous code
    # referenced an undefined `umap_model`, which raises NameError on a fresh kernel.
    umap_df_2d = pd.DataFrame(umap_model_2d.embedding_)
    umap_df_2d.to_csv("./data/posts_by_title_embedding_2d.tsv", sep='\t', index=False, header=False)
    # Save the hover labels next to the embedding so the cached branch can reload them.
    hover_df.to_csv("./data/posts_by_title_embedding_labels.tsv", sep='\t', index=False)
else:
    print("Found UMAP embedding.  Loading...")
    umap_df_2d = pd.read_csv("./data/posts_by_title_embedding_2d.tsv", sep='\t', header=None)
    hover_df = pd.read_csv("./data/posts_by_title_embedding_labels.tsv", sep='\t')
    # Wrap the saved array so it exposes `embedding_` like a fitted model.
    umap_model_2d = UMAP_Facade(umap_df_2d.to_numpy())
    
print("Done!")

print("Generating clusters...")
# Cluster the 2D coordinates (not the raw 768-d vectors) with HDBSCAN.
cluster_2d = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=15, metric='euclidean', cluster_selection_method='eom').fit(umap_model_2d.embedding_)
cluster_labels_2d = pd.Series(cluster_2d.labels_)
print("Done!")

Found UMAP embedding.  Loading...
Done!
Generating clusters...
Done!


In [17]:
# Output as Tensorboard-friendly TSV
if not os.path.exists("./data/posts_by_title_embedding_3d.tsv"):
    print("Did not find 3D UMAP embedding.  Generating...")
    # This usually takes 10 minutes or so on my machine
    umap_model_3d = umap.UMAP(n_neighbors=15, n_components=3, metric='cosine', random_state=0, low_memory=True).fit(tokens_df)
    
    # Read the embedding from the model fitted just above.  The previous code
    # referenced an undefined `umap_model`, which raises NameError on a fresh kernel.
    umap_df_3d = pd.DataFrame(umap_model_3d.embedding_)
    umap_df_3d.to_csv("./data/posts_by_title_embedding_3d.tsv", sep='\t', index=False, header=False)
    # Save the hover labels next to the embedding so the cached branch can reload them.
    hover_df.to_csv("./data/posts_by_title_embedding_labels.tsv", sep='\t', index=False)
else:
    print("Found UMAP embedding.  Loading...")
    umap_df_3d = pd.read_csv("./data/posts_by_title_embedding_3d.tsv", sep='\t', header=None)
    hover_df = pd.read_csv("./data/posts_by_title_embedding_labels.tsv", sep='\t')
    # Wrap the saved array so it exposes `embedding_` like a fitted model.
    umap_model_3d = UMAP_Facade(umap_df_3d.to_numpy())
    
print("Done!")

print("Generating clusters...")
# Cluster the 3D coordinates (not the raw 768-d vectors) with HDBSCAN.
cluster_3d = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=15, metric='euclidean', cluster_selection_method='eom').fit(umap_model_3d.embedding_)
cluster_labels_3d = pd.Series(cluster_3d.labels_)
print("Done!")

Found UMAP embedding.  Loading...
Done!
Generating clusters...
Done!


# Plot 2D interactive HTML Bokeh Plots
If these don't interest you, you can skip this section.

In [46]:
# Theme "darkgreen" looks nice
def plot_interactive_bokeh(filename, model, labels, hover_data, point_size=4, width=1800, height=1800, theme="blue"):
    """Build an interactive plot for ``model`` and save it to ``filename``.

    All keyword options are forwarded unchanged to ``plot.interactive``; the
    resulting figure is written with ``save`` under the title "Bokeh Plot".
    Nothing is returned.
    """
    plot_options = dict(
        labels=labels,
        hover_data=hover_data,
        point_size=point_size,
        width=width,
        height=height,
        theme=theme,
    )
    figure = plot.interactive(model, **plot_options)
    save(figure, filename=filename, title="Bokeh Plot")

In [47]:
# (output file, point labels, size overrides) for each 2D post-title plot.
# Only the cluster plot is rendered larger than the function's default size.
post_plot_specs = [
    ("./vis/post_plot_with_cluster.html", cluster_labels_2d, {"width": 2000, "height": 2000}),
    ("./vis/post_plot_with_authors.html", submissions["author"], {}),
    ("./vis/post_plot_with_subreddit.html", submissions["subreddit"], {}),
    ("./vis/post_plot_with_source.html", source_labels[0], {}),
]

for out_path, point_labels, size_overrides in post_plot_specs:
    plot_interactive_bokeh(out_path, umap_model_2d, labels=point_labels, hover_data=hover_df, **size_overrides)

In [None]:
# Plot at smaller scale for visually-appealing density map
# plot.points(umap_model_2d, width=1000, height=1000, theme="fire")

# Calculate Average Title Embedding by User

Let's do some clustering on users with 10+ posts in the collected time range, based on the average [CLS] embedding of their post titles.

In [21]:
# Renumber the stacked embeddings 0..n-1 so that index labels equal row positions.
tokens_df = tokens_df.reset_index(drop=True)

In [22]:
# Minimum number of posts an author needs to be included in the user-level analysis.
post_threshold = 10

# Pull the author column out once.  Indexing this array by position replaces
# `submissions.iloc[it]["author"]`, which built a full row object per embedding.
authors_by_row = submissions["author"].to_numpy()

# Group each title embedding under its author.  `it` is the row label of
# tokens_df, used as a position into the submissions table; this relies on
# tokens_df having a 0..n-1 index that lines up row-for-row with `submissions`.
embeddings_by_author = defaultdict(list)
for it, emb in tokens_df.iterrows():
    embeddings_by_author[authors_by_row[it]].append(emb)

# Authors with at least `post_threshold` embeddings, in first-appearance order.
active_users = [
    (user, emb_list)
    for user, emb_list in embeddings_by_author.items()
    if len(emb_list) >= post_threshold
]

# Take the vector width from the data instead of hard-coding 768.
embedding_dim = tokens_df.shape[1]

avg_embeddings = []
for u, emb_list in active_users:
    # Accumulate sequentially in the original order so the averages stay
    # numerically identical to the previous implementation.
    avg_emb = np.zeros(embedding_dim)

    for emb in emb_list:
        avg_emb += np.array(emb)

    avg_emb /= len(emb_list)
    avg_embeddings.append((u, avg_emb))

# Same averages without the author names, one row per active user.
embedding_matrix = [user_avg for _, user_avg in avg_embeddings]

emb_df = pd.DataFrame(embedding_matrix)
avg_emb_df = pd.DataFrame(avg_embeddings, columns=["author", "embedding"])

Arrange the labels in the right order, grab some example post titles.

In [23]:
# Keep only the submissions written by authors that have an averaged embedding.
active_author_names = avg_emb_df["author"].tolist()
is_active_author = submissions["author"].isin(active_author_names)
subs_by_active_users = submissions[is_active_author]

In [24]:
# Number of example titles shown per author (columns post1..post5 below).
samples_per_author = 5

# Collect each author's first titles in a single grouped pass instead of
# re-scanning the whole frame with a boolean mask once per author.
first_titles = {
    author: titles.head(samples_per_author).tolist()
    for author, titles in subs_by_active_users.groupby("author", sort=False)["title"]
}

# Keep the rows in the same order as avg_emb_df so they line up with the
# averaged embeddings; an author with no matching rows gets an empty list.
hover_samples = {author: first_titles.get(author, []) for author in avg_emb_df["author"]}

# orient="index" pads short lists, and reindex guarantees exactly five sample
# columns, so authors with fewer than five titles (or an empty author list) no
# longer raise when the frame is built or the columns are renamed.
hover_df_sub = pd.DataFrame.from_dict(hover_samples, orient="index").reindex(columns=range(samples_per_author))
hover_df_sub = hover_df_sub.reset_index()
hover_df_sub.columns = ["author", "post1", "post2", "post3", "post4", "post5"]

Reduce these average user representations into 2D and 3D visualizable spaces, and cluster with HDBSCAN on both EOM and Leaf settings

In [25]:
%%time 
# Project the per-user average embeddings down to 2 dimensions.
umap_model_avg_2d = umap.UMAP(
    n_neighbors=15,
    n_components=2,
    metric='cosine',
    random_state=0,
).fit(emb_df)

CPU times: user 12.7 s, sys: 216 ms, total: 12.9 s
Wall time: 12.6 s


In [26]:
%%time
# Project the per-user average embeddings down to 3 dimensions.
umap_model_avg_3d = umap.UMAP(
    n_neighbors=15,
    n_components=3,
    metric='cosine',
    random_state=0,
).fit(emb_df)

CPU times: user 8.19 s, sys: 43 ms, total: 8.23 s
Wall time: 7.51 s


In [27]:
# Wrap both user-level projections in DataFrames ...
umap_user_df_2d = pd.DataFrame(umap_model_avg_2d.embedding_)
umap_user_df_3d = pd.DataFrame(umap_model_avg_3d.embedding_)

# ... and write each one as a header-less, index-less TSV.
user_projection_outputs = (
    (umap_user_df_2d, "./data/users_2d.tsv"),
    (umap_user_df_3d, "./data/users_3d.tsv"),
)
for user_frame, tsv_path in user_projection_outputs:
    user_frame.to_csv(tsv_path, sep='\t', index=False, header=False)

In [49]:
def cluster_and_save_labels(embedding, mode, dimension, min_cluster_size=15, min_samples=15):
    """Cluster ``embedding`` with HDBSCAN and write the labels to a TSV file.

    ``mode`` is passed as the cluster selection method.  ``dimension`` is only
    used, together with ``mode`` and both size parameters, to build the output
    file name under ``./data/``.  The file holds one label per line with no
    header and no index column.  Nothing is returned.
    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method=mode,
    )
    fitted = clusterer.fit(embedding)
    out_path = f"./data/users_hdbscan_labels_{dimension}d_{mode}_mc{min_cluster_size}_ms{min_samples}.tsv"
    pd.Series(fitted.labels_).to_csv(out_path, sep='\t', index=False, header=False)

In [50]:
# Cluster the 2D and 3D user projections under both selection methods.
for selection_mode in ('eom', 'leaf'):
    cluster_and_save_labels(umap_model_avg_2d.embedding_, selection_mode, '2')
    cluster_and_save_labels(umap_model_avg_3d.embedding_, selection_mode, '3')

Interactive bokeh plot for 2D user map

In [33]:
# The labels file is written without a header row (header=False in
# cluster_and_save_labels), so it must be read header-less too.  With the default
# header handling the first label was consumed as the column name, leaving one
# label too few and shifting every remaining label up by one user.
# The single column is named "0" because the plotting cell selects it by that name.
cluster_labels_avg_2d = pd.read_csv('./data/users_hdbscan_labels_2d_eom_mc15_ms15.tsv', sep='\t', header=None, names=["0"])

In [41]:
# Interactive 2D user map coloured by the saved EOM cluster labels.
user_plot_options = dict(
    labels=cluster_labels_avg_2d["0"],
    hover_data=hover_df_sub,
    point_size=4,
    width=800,
    height=800,
    theme="darkgreen",
)
agg_bert = plot.interactive(umap_model_avg_2d, **user_plot_options)
save(agg_bert, filename="./vis/user_plot.html", title="Bokeh Plot")

Create source labels for user plot

In [42]:
# Distinct authors seen in each source of submissions.
submission_source = submissions['source']
canada_authors = submissions.loc[submission_source == 'canada', 'author'].unique()
susp_authors = submissions.loc[submission_source == 'suspicious', 'author'].unique()

In [43]:
# Authors in the same order as the averaged embeddings.
reduced_source_labels = pd.DataFrame({'author': avg_emb_df['author']})

# An author is labelled 'canada' whenever they appear among the canada authors
# (even if they also posted suspicious submissions); everyone else is 'suspicious'.
raw_labels = []
for author_name in reduced_source_labels['author']:
    if author_name in canada_authors:
        raw_labels.append('canada')
    else:
        raw_labels.append('suspicious')

# Attach the labels to the hover table and save it alongside the other outputs.
hover_df_sub['source'] = raw_labels
hover_df_sub.to_csv("./data/users_by_title_embeddings_source_labels.tsv", sep='\t', index=False)

In [48]:
# Same 2D user map, coloured by source label instead of cluster.
source_plot_options = dict(
    labels=pd.Series(raw_labels),
    hover_data=hover_df_sub,
    point_size=4,
    width=800,
    height=800,
    theme="blue",
)
agg_bert_anno = plot.interactive(umap_model_avg_2d, **source_plot_options)
save(agg_bert_anno, filename="./vis/user_plot_with_source.html", title="Bokeh Plot")

'/hdd/dd/reddit-aaai2021/clean/vis/user_plot_with_source.html'