In [2]:
import hdbscan
import numpy as np
import umap.umap_ as umap
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

### Create Test Article

In [4]:
articles = pd.DataFrame({'content': ['The Republican Party won the presedential election in 2016, but the Democrats won the presedential election in 2020.']})

### Tokenize Article

In [5]:
# Load in pre-trained DistilBERT model
# DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than bert-base-uncased , runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language understanding benchmark
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [6]:
# Encode embeddings
embeddings = model.encode(articles['content'], show_progress_bar=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

### Standardize Article

In [7]:
# Perform mean-centering standardization on embeddings
std_embeddings = StandardScaler(with_mean=True).fit_transform(embeddings)

### Neighborhood-Based Dimensionality Reduction

In [8]:
# Perform non-linear dimensionality reduction
# n_neighbors = 30
# n_components = 5
# min_dist = 0.1
# metric = cosine
umap_embeddings = umap.UMAP(n_neighbors=10, n_components=5, min_dist=0.02, metric='cosine').fit_transform(std_embeddings)

# Perform min-max standardization on embeddings
std_umap_embeddings = MinMaxScaler().fit_transform(umap_embeddings)

### Density-Based Clustering

In [9]:
# Perform density-based clustering on dimensionality reduced embeddings
# min_cluster_size=300
# metric_euclidean
cluster = hdbscan.HDBSCAN(min_cluster_size=250, metric='euclidean').fit(std_umap_embeddings)

ValueError: k must be less than or equal to the number of training points