## Install and Import Libraries

In [None]:
!pip install datasets sentence-transformers umap-learn hdbscan keybert

In [31]:
# manage data
from datasets import load_dataset
import pandas as pd

# embeddings
from sentence_transformers import SentenceTransformer

# dimensionality reduction
import umap
from sklearn.decomposition import PCA

# clustering
import hdbscan

# extract keywords from texts
# used to assign meaningful names to clusters
from keybert import KeyBERT

# visualization
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

## Download and Prepare Dataset

In [3]:
# download data
dataset = load_dataset("ag_news", split="train")
print(dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 120000
})


In [4]:
# keep only first 3k articles to make computations faster
dataset_subset = dataset.train_test_split(train_size=3000)["train"]
print(dataset_subset)

Dataset({
    features: ['text', 'label'],
    num_rows: 3000
})


In [5]:
# convert dataset to pandas dataframe
df = pd.DataFrame(dataset_subset).drop("label", axis=1)
df.head()

Unnamed: 0,text
0,Gunmen rob cricket legend Imran Pakistani cric...
1,Asia-Pacific summit ends with anti-terror pled...
2,UT safety suspended after felony arrest Tennes...
3,Most Japanese Women Shun Birth Control Pill By...
4,IBM to take supercomputing crown A new incarna...


## Create Articles Embeddings

In [6]:
# download the sentence embeddings model
embedder = SentenceTransformer('all-mpnet-base-v2')

In [7]:
# embed article texts
corpus_embeddings = embedder.encode(df["text"].values)
print(corpus_embeddings.shape)

(3000, 768)


## Reduce Embeddings Size

In [42]:
# reduce the size of the embeddings using UMAP
# reduced_embeddings = umap.UMAP(n_components=2, n_neighbors=100, min_dist=0.02).fit_transform(corpus_embeddings)
reduced_embeddings = PCA(n_components=2).fit_transform(corpus_embeddings) #replaced umap with PCA

print(reduced_embeddings.shape)

# put the values of the two dimensions inside the dataframe
df["x"] = reduced_embeddings[:, 0]
df["y"] = reduced_embeddings[:, 1]

# substring of the full text, for visualization purposes
df["text_short"] = df["text"].str[:100]

(3000, 2)


## Embeddings Visualization

In [43]:
# scatter plot
hover_data = {
    "text_short": True,
    "x": False,
    "y": False
}
fig = px.scatter(df, x="x", y="y", 
                template="plotly_dark",
                title="Embeddings", hover_data=hover_data)
fig.update_layout(showlegend=False)
fig.show()

## Clustering

In [44]:
# clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=9)
labels = clusterer.fit_predict(reduced_embeddings)
df["label"] = [str(label) for label in labels]
print(f"Num of clusters: {labels.max()}")

Num of clusters: 11


In [45]:
# number of outliers
num_outliers = len(df[df["label"] == "-1"])
print(f"Num of outliers: {num_outliers} ({num_outliers / len(df) * 100:.2f} % of total)")

Num of outliers: 915 (30.50 % of total)


In [36]:
# remove outliers
df_no_outliers = df[df["label"] != "-1"]

# scatter plot
hover_data = {
    "text_short": True,
    "x": False,
    "y": False
}
fig = px.scatter(df_no_outliers, x="x", y="y", template="plotly_dark",
                   title="Embeddings", color="label", hover_data=hover_data)
fig.show()

In [46]:
# show articles in a specific cluster
cluster = "0"
df_subset = df[df["label"] == cluster].reset_index()
for i,row in df_subset.iterrows():
  print(f"- {row['text_short']}")
  if i == 10:
    break

- BASEBALL ROUNDUP Redman Helps Put A #39;s Back on Track ark Redman pitched seven strong innings for 
- Cardinals #39; Smith, 35, leads oldies but goodies hit parade By Paul Connors, AP. The NFL #39;s eld
- F1: Grand Prix hopes splutter into life The British Grand Prix has been given a stay of execution af
- Hawks sign free agent Jelani McCoy Atlanta, GA (Sports Network) - The Atlanta Hawks signed free agen
- Jets Say Pennington Bothered by Shoulder Some throws fell woefully short, skittering across the turf
- Heat scores style points at home exhibition MIAMI  Monday night #39;s 92-82 victory over the Atlanta
- NASA Celebrates Life of Astronaut Gordon Cooper By PAM EASTON    HOUSTON (AP) -- Mercury program ast
-  #39;Canes keep reeling CLEMSON 24, MIAMI 17: A week after a loss to North Carolina, the  #39;Canes 
- Cards, Dolphins in store for long season In his 14th year as an NFL head coach, Dick Vermeil has now
- Marino, Young Nominated for Hall of Fame (AP) AP - Quarterbacks Dan Mar

## Give Meaningful Names to Clusters

In [47]:
# extracting keywords from texts with KeyBERT
cluster = "0"
df_subset = df[df["label"] == cluster].reset_index()
texts_concat = ". ".join(df_subset["text"].values)
keywords_and_scores = KeyBERT().extract_keywords(texts_concat,
                                    keyphrase_ngram_range=(1, 1), top_n=10)
print(keywords_and_scores)

[('sprinter', 0.3747), ('outfielder', 0.3737), ('shortstop', 0.3661), ('sprinters', 0.3609), ('mccoy', 0.3534), ('runner', 0.3511), ('fielder', 0.3481), ('diamondbacks', 0.3454), ('baseman', 0.3384), ('racing', 0.3346)]


In [48]:
# keep only the keywords with different stem
# def filter_keywords(keywords, n_keep=3):
def filter_keywords(keywords, n_keep=10):
  new_keywords = []
  for candidate_keyword in keywords:
    is_ok = True
    for compare_keyword in keywords:
      if candidate_keyword == compare_keyword:
        continue
      if compare_keyword in candidate_keyword:
        is_ok = False
        break
    if is_ok:
      new_keywords.append(candidate_keyword)
      if len(new_keywords) >= n_keep:
        break
  return new_keywords

keywords = [t[0] for t in keywords_and_scores]
keywords_filtered = filter_keywords(keywords)
print(keywords_filtered)

['sprinter', 'shortstop', 'mccoy', 'runner', 'fielder', 'diamondbacks', 'baseman', 'racing']


In [49]:
# assign a meaningful name to each cluster

def get_cluster_name(df, cluster):
  df_subset = df[df["label"] == cluster].reset_index()
  texts_concat = ". ".join(df_subset["text"].values)
  kw_model = KeyBERT()
  keywords_and_scores = kw_model.extract_keywords(texts_concat, keyphrase_ngram_range=(1, 1),
                                      top_n=10)
  keywords = [t[0] for t in keywords_and_scores]
  keywords_filtered = filter_keywords(keywords)
  return " - ".join(keywords_filtered)

# get all the new cluster names
all_clusters = df_no_outliers["label"].unique()
d_cluster_name_mapping = {}
for cluster in all_clusters:
  if cluster == "-1":
    d_cluster_name_mapping[cluster] = "outliers"
  else:
    d_cluster_name_mapping[cluster] = get_cluster_name(df_no_outliers, cluster)

# rename clusters
df_no_outliers["label"] = df_no_outliers["label"].apply(lambda label: d_cluster_name_mapping[label])

In [50]:
# scatter plot
hover_data = {
    "text_short": True,
    "x": False,
    "y": False
}
fig = px.scatter(df_no_outliers, x="x", y="y", template="simple_white",
                   title="Embeddings", color="label", hover_data=hover_data)
fig.show()

# References

- [2.12 Project: Clustering Newspaper Articles — Practical NLP with Python](https://www.nlplanet.org/course-practical-nlp/02-practical-nlp-first-tasks/12-clustering-articles)

- [My practice on different types of clustering](https://github.com/emrulk1/AI-ML-Data-Science-Practice/tree/main/ML_practice/Unsupervised_Learning/01_Clustering)