In [36]:
# manage data
import pandas as pd
from urllib.parse import quote_plus
from sqlalchemy import create_engine
from tqdm.auto import tqdm  # Updated import

# embeddings
from sentence_transformers import SentenceTransformer

# translation
from transformers import pipeline

# dimensionality reduction
import umap

# clustering
import hdbscan

# extract keywords from texts
from keybert import KeyBERT

# visualization
import plotly.express as px

In [37]:
# —— CONFIG —— 
SAMPLE_SIZE = 200
BATCH_SIZE = 32
EMBED_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
UMAP_KWARGS = dict(n_components=10, n_neighbors=15, min_dist=0.01)
HDBSCAN_KWARGS = dict(
    min_cluster_size=3,    # Small cluster sizes allowed
    min_samples=1,         # Higher sensitivity for density
    cluster_selection_epsilon=0.05 # Explicitly control distance for merging clusters
)

In [38]:
# Postgres creds (URL-encode the password)
USER = "postgres"
PASSWORD = quote_plus("4b.3O_XD?C9")
HOST = "18.162.51.182"
PORT = 5432
DBNAME = "mydb"

DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"
engine = create_engine(DB_URL)
query = f"""
    SELECT id, title, content_en 
    FROM news 
    LIMIT {SAMPLE_SIZE}
"""
df = pd.read_sql(query, engine)
print(f"Loaded {len(df)} articles")
print(df.head())

Loaded 200 articles
   id                                title  \
0   1       \n  谷歌在美砸250億美元 建資料中心和AI基礎設施\n   
1   2                  \n  川普再度施壓Fed調降利率\n   
2   3            \n  台電員工遭高壓線路電擊　卓榮泰關注不捨\n   
3   4   \n  輝達飆4％、台積電ADR漲3％ 台指期夜盤劍指23K關卡\n   
4   5           \n  大樂透頭獎連13摃  下期上看4.4億元\n   

                                          content_en  
0  Here's the translation of the news article in ...  
1  Here's the translation of the news article:\n\...  
2  Here's the translation of the news article:\n\...  
3  Here's the translation of the news article in ...  
4  Here's the translation of the news article:\n\...  


In [39]:
# Initialize embedding model
embedder = SentenceTransformer(EMBED_MODEL)

# Embed content
print("Creating embeddings...")
corpus_embeddings = embedder.encode(df["content_en"].values, batch_size=BATCH_SIZE, show_progress_bar=True)
print(f"Embeddings shape: {corpus_embeddings.shape}")

Creating embeddings...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Embed content
print("Creating embeddings...")
corpus_embeddings = embedder.encode(df["content_en"].values, batch_size=BATCH_SIZE, show_progress_bar=True)
print(f"Embeddings shape: {corpus_embeddings.shape}")

Reducing dimensions with UMAP...
Reduced embeddings shape: (200, 5)


In [None]:
# Dimensionality reduction with UMAP
print("Reducing dimensions with UMAP...")
reduced_embeddings = umap.UMAP(**UMAP_KWARGS).fit_transform(corpus_embeddings)
print(f"✅ Reduced embeddings shape: {reduced_embeddings.shape}")


In [None]:
# Clustering with HDBSCAN
print("🧩 Clustering with HDBSCAN...")
clusterer = hdbscan.HDBSCAN(**HDBSCAN_KWARGS)
labels = clusterer.fit_predict(reduced_embeddings)
df["label"] = [str(label) for label in labels]
print(f"✅ Number of clusters found: {len(set(labels)) - (1 if -1 in labels else 0)}")

✅ Number of clusters found: 3


In [None]:
# Add 2D coordinates to dataframe
df["x"] = reduced_embeddings[:, 0]
df["y"] = reduced_embeddings[:, 1]
df["text_short"] = df["content_en"].str[:100] + "..."


🔍 Embeddings diagnostics:
Embeddings range: -6.14 to 8.66
Embeddings mean: 2.93 ± 4.31

🔍 Finding optimal cluster count...
For n_clusters = 2, silhouette score = 0.56
For n_clusters = 3, silhouette score = 0.44
For n_clusters = 4, silhouette score = 0.52
For n_clusters = 5, silhouette score = 0.59
For n_clusters = 6, silhouette score = 0.62
For n_clusters = 7, silhouette score = 0.61
For n_clusters = 8, silhouette score = 0.58
For n_clusters = 9, silhouette score = 0.48
For n_clusters = 10, silhouette score = 0.50

✅ Optimal cluster count: 6 (score: 0.62)

🔄 Running final clustering...

🔄 Creating visualization...

🔄 Naming clusters...



✅ Cluster distribution:
cluster_name
states google | federal reserve | power grid             49
market restricts | baoshan market | restricted time      49
taiwan power | injured zhuō | article typhoon            48
outfielder chen | win chen | derby chen                  25
resignation | resignation tomorrow | continue working    21
supernumerary teeth | teeth multiple | abnormal teeth     8
Name: count, dtype: int64


In [None]:
# Optional: Extract keywords per cluster
extract_keywords = False
if extract_keywords:
    kw_model = KeyBERT(model=embedder)
    cluster_keywords = {}
    for label in df["label"].unique():
        if label == "-1": continue  # Skip noise
        texts = df[df["label"] == label]["content_en"].tolist()
        joined_text = " ".join(texts)
        keywords = kw_model.extract_keywords(joined_text, top_n=5)
        cluster_keywords[label] = [kw[0] for kw in keywords]
    print("🗝️ Cluster keywords:", cluster_keywords)

Number of outliers: 0


In [None]:
# Visualization
print("📊 Plotting results...")
fig = px.scatter(
    df,
    x="x", y="y",
    color="label",
    hover_data=["title", "text_short"],
    title="News Article Clustering"
)
fig.show()

Generating cluster names...


Cluster 0: electricity google - energy china - reuters trump
Cluster 1: taiwan power - power taiwan - taiwan ministry
Cluster 2: taiwan lottery - taoyuan team - chén
Cluster 3: quitting company - leaving job - left workplace
Error processing cluster 4: sequence item 0: expected str instance, NoneType found
