In [1]:
# Import libraries for generic data preprocessing
import os
import numpy as np
import pandas as pd
from itertools import chain, product
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Import libraries for preprocessing embeddings
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sentence_transformers import SentenceTransformer

# Import libraries for clustering and topic classification
import hdbscan
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler, MinMaxScaler

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Set Random Seed

In [2]:
np.random.seed(10)
os.environ["TOKENIZERS_PARALLELISM"] = 'false'

### Load Preprocessed Data

In [3]:
articles = pd.read_parquet('../data/proc_articles.gzip')
articles = articles.reset_index(drop=True)

In [4]:
articles.head()

Unnamed: 0,title,author,publication,content,party
0,Breitbart Launches ’Border Wall Construction C...,Milo,Breitbart,Last weekend at church I confessed my sin of p...,right
1,IDF Airstrike Eliminates 4 Islamic State-Linke...,Breitbart Jerusalem,Breitbart,The Times of Israel reports: An Israeli airst...,right
2,Oracle Funds Anti-Google Effort that Outs Hill...,Chriss W. Street,Breitbart,The Oracle Corporation is using its deep finan...,right
3,Silicon Valley Urges Giving Election Day Off t...,Chriss W. Street,Breitbart,Apparently worried about the populist movement...,right
4,Illegal Migrant Abandoned in Desert Calls 911 ...,Bob Price,Breitbart,A severely dehydrated illegal alien called 911...,right


### Remove Stop Words and Punctuation

In [5]:
# Load in stop words, stemmer, and regex tokenizer
stop = stopwords.words('english')
punc = RegexpTokenizer(r'\w+')
# stemmer = PorterStemmer()

# Define function for de-lemmatizing words, removing whitespace, and stop words
def reformat_articles(w):
    # lowecase and remove stop words
    words = [word.lower() for word in w.split() if word.lower() not in stop]
    # remove punctuation
    words = [punc.tokenize(word) for word in words]
    # remove whitespace
    words = list(chain.from_iterable(words))
    # remove stems
    # words = [stemmer.stem(word) for word in words]  # TODO -- SEE IF IMPROVED AFTER DELETING THIS LINE
    # convert from list to string
    new_article = ' '.join(words)
    return new_article

# Apply lemmatization, whitespace removal, and stop word removal
articles['content'] = articles['content'].apply(reformat_articles)
articles.head()

Unnamed: 0,title,author,publication,content,party
0,Breitbart Launches ’Border Wall Construction C...,Milo,Breitbart,last weekend church confessed sin personal van...,right
1,IDF Airstrike Eliminates 4 Islamic State-Linke...,Breitbart Jerusalem,Breitbart,times israel reports israeli airstrike killed ...,right
2,Oracle Funds Anti-Google Effort that Outs Hill...,Chriss W. Street,Breitbart,oracle corporation using deep financial resour...,right
3,Silicon Valley Urges Giving Election Day Off t...,Chriss W. Street,Breitbart,apparently worried populist movement led donal...,right
4,Illegal Migrant Abandoned in Desert Calls 911 ...,Bob Price,Breitbart,severely dehydrated illegal alien called 911 p...,right


### Embed Articles

In [6]:
# Load in pre-trained DistilBERT model
# DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased , runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [7]:
# Encode embeddings
embeddings = model.encode(articles['content'], show_progress_bar=True)

Batches:   0%|          | 0/1318 [00:00<?, ?it/s]

### Standardize Embeddings

In [8]:
# Perform mean-centering standardization on embeddings
std_embeddings = StandardScaler(with_mean=True).fit_transform(embeddings)

### Neighborhood-Based Dimensionality Reduction

In [9]:
# Initialize hyper-parameters
neighbors = list(range(10,31,5))
distances = [0.01, 0.02, 0.5, 0.75]
l_umap = {}

# Perform non-linear dimensionality reduction
# n_neighbors = 20
# n_components = 5
# min_dist = 0.01
# metric = cosine
for n, d in tqdm(list(product(neighbors,distances))):
    umap_embeddings = umap.UMAP(n_neighbors=n, n_components=5, min_dist=d, metric='cosine', random_state=42).fit_transform(std_embeddings)
    std_umap_embeddings = MinMaxScaler().fit_transform(umap_embeddings)
    l_umap[f"{n}_{d}"] = std_umap_embeddings

  0%|          | 0/20 [00:00<?, ?it/s]

### Density-Based Clustering

In [10]:
# Initialize hyper-parameters
clusters = [5, 10, 50, 100, 250, 500]
l_clust = {}

# Perform density-based clustering on dimensionality reduced embeddings
# min_cluster_size=100
# metric_euclidean
for k, v in l_umap.items():
    for c in clusters:
        cluster = hdbscan.HDBSCAN(min_cluster_size=c, metric='euclidean').fit(v)
        l_clust[f"{k}_{c}"] = {'umap': v, 'cluster': cluster}

In [11]:
# Frequencies of each cluster/label
for k, v in l_clust.items():
    unique, counts = np.unique(v['cluster'].labels_, return_counts=True)
    print(k)
    print(np.asarray((unique, counts)).T)
    print('-----')

# With removing stop words and punctuation:
# 10_0.01_500: 16k -- 08 -- 12k, 06k, 04k
# 10_0.02_250: 14k -- 12 -- 12k, 06k, 03k
# 15_0.01_100: 13k -- 25 -- 12k, 04k, 02k
# 10_0.02_100: 18k -- 35 -- 06k, 03k, 02k
# 25_0.02_050: 13k -- 42 -- 10k, 03k, 02k

# Without removing stop words or punctuation
# 10_0.01_100: 11k -- 018 -- 12k, 06k, 04k
# 10_0.01_250: 09k -- 010 -- 13k, 10k, 04k
# 20_0.01_100: 10k -- 013 -- 13k, 08k, 04k
# 10_0.02_100: 11k -- 018 -- 10k, 05k, 05k
# 20_0.02_100: 15k -- 019 -- 11k, 08k, 01k

10_0.01_5
[[   -1 20815]
 [    0    72]
 [    1   454]
 ...
 [  856    30]
 [  857    13]
 [  858   124]]
-----
10_0.01_10
[[   -1 20846]
 [    0    72]
 [    1    21]
 [    2   454]
 [    3    28]
 [    4   112]
 [    5    82]
 [    6    27]
 [    7    44]
 [    8   530]
 [    9    35]
 [   10    27]
 [   11    15]
 [   12    14]
 [   13    11]
 [   14    99]
 [   15    56]
 [   16    19]
 [   17    63]
 [   18    55]
 [   19    36]
 [   20    83]
 [   21    11]
 [   22   226]
 [   23    66]
 [   24   116]
 [   25    13]
 [   26    12]
 [   27    21]
 [   28    37]
 [   29    99]
 [   30    31]
 [   31    31]
 [   32    20]
 [   33    52]
 [   34    18]
 [   35    43]
 [   36    50]
 [   37    15]
 [   38    16]
 [   39    98]
 [   40    17]
 [   41   238]
 [   42    33]
 [   43    17]
 [   44    14]
 [   45    33]
 [   46    12]
 [   47    42]
 [   48    17]
 [   49    31]
 [   50    12]
 [   51    28]
 [   52    15]
 [   53    52]
 [   54   100]
 [   55    11]
 [   56   223]
 [   57

-----
25_0.02_50
[[   -1 13250]
 [    0    70]
 [    1   453]
 [    2   117]
 [    3   239]
 [    4  2186]
 [    5   240]
 [    6   116]
 [    7   135]
 [    8   250]
 [    9   360]
 [   10   525]
 [   11    73]
 [   12    52]
 [   13    80]
 [   14   302]
 [   15    52]
 [   16   244]
 [   17    65]
 [   18 10270]
 [   19    58]
 [   20   636]
 [   21   111]
 [   22    96]
 [   23   141]
 [   24   253]
 [   25    52]
 [   26   165]
 [   27   214]
 [   28   114]
 [   29   260]
 [   30   198]
 [   31    67]
 [   32    66]
 [   33   370]
 [   34  2190]
 [   35  1643]
 [   36    95]
 [   37  2576]
 [   38  2451]
 [   39    65]
 [   40   770]
 [   41   188]
 [   42   292]]
-----
25_0.02_100
[[   -1 15517]
 [    0   102]
 [    1   132]
 [    2   239]
 [    3   453]
 [    4   116]
 [    5  6406]
 [    6   516]
 [    7   235]
 [    8   233]
 [    9   276]
 [   10   312]
 [   11   353]
 [   12   211]
 [   13   548]
 [   14   426]
 [   15  1716]
 [   16   207]
 [   17  3021]
 [   18   322]
 [  

30_0.02_250
[[   -1 17500]
 [    0   537]
 [    1  6256]
 [    2   451]
 [    3   346]
 [    4   467]
 [    5   502]
 [    6   611]
 [    7  1034]
 [    8  1649]
 [    9  7283]
 [   10  2830]
 [   11  1244]
 [   12  1440]]
-----
30_0.02_500
[[   -1 17589]
 [    0 11306]
 [    1  5543]
 [    2  1161]
 [    3  3087]
 [    4  3464]]
-----
30_0.5_5
[[   -1  2487]
 [    0     9]
 [    1    66]
 [    2   433]
 [    3 39155]]
-----
30_0.5_10
[[   -1  2169]
 [    0    62]
 [    1    16]
 [    2   433]
 [    3 39470]]
-----
30_0.5_50
[[   -1  5963]
 [    0    57]
 [    1   429]
 [    2 35701]]
-----
30_0.5_100
[[   -1 21757]
 [    0   426]
 [    1   141]
 [    2  7867]
 [    3   287]
 [    4   261]
 [    5 10724]
 [    6   373]
 [    7   162]
 [    8   152]]
-----
30_0.5_250
[[   -1 24373]
 [    0   417]
 [    1  7486]
 [    2   667]
 [    3  9207]]
-----
30_0.5_500
[[   -1 22571]
 [    0  8457]
 [    1 11122]]
-----
30_0.75_5
[[   -1  4417]
 [    0    59]
 [    1     6]
 [    2   422]
 [    3 