In [6]:
# manage data
import pandas as pd
from urllib.parse import quote_plus
from sqlalchemy import create_engine
from tqdm.auto import tqdm  # Updated import

# embeddings
from sentence_transformers import SentenceTransformer

# translation
from transformers import pipeline

# dimensionality reduction
import umap

# clustering
import hdbscan

# extract keywords from texts
from keybert import KeyBERT

# visualization
import plotly.express as px

In [7]:
# —— CONFIG —— 
SAMPLE_SIZE = 100  # Only process 100 articles for now
BATCH_SIZE = 32
EMBED_MODEL = "all-mpnet-base-v2"
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
UMAP_KWARGS = dict(n_components=2, n_neighbors=50, min_dist=0.1)
HDBSCAN_KWARGS = dict(min_cluster_size=10)

In [8]:
# Postgres creds (URL-encode the password)
USER = "postgres"
PASSWORD = quote_plus("4b.3O_XD?C9")
HOST = "54.46.7.169"
PORT = 5432
DBNAME = "mydb"

DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"
engine = create_engine(DB_URL)
query = f"""
    SELECT id, title, content 
    FROM news 
    WHERE content IS NOT NULL AND LENGTH(content) > 50
    LIMIT {SAMPLE_SIZE}
"""
df = pd.read_sql(query, engine)
print(f"Loaded {len(df)} articles")
print(df.head())

Loaded 100 articles
   id                                title  \
0   1       \n  谷歌在美砸250億美元 建資料中心和AI基礎設施\n   
1   2                  \n  川普再度施壓Fed調降利率\n   
2   3            \n  台電員工遭高壓線路電擊　卓榮泰關注不捨\n   
3   4   \n  輝達飆4％、台積電ADR漲3％ 台指期夜盤劍指23K關卡\n   
4   5           \n  大樂透頭獎連13摃  下期上看4.4億元\n   

                                             content  
0  \n科技巨頭谷歌15日宣布，未來兩年將砸下250億美元，在美國最大電網區建造資料中心與AI基...  
1  \n美國總統川普15日在自家社群平台Truth Social連發數文，再度施壓美國聯準會（F...  
2  \n丹娜絲颱風造成嘉義縣逾26萬戶停電，台電人員不分晝夜全力搶修，但在15日傳出一起高壓線路...  
3  \n市場傳出美國政府同意解除中國科技禁令，放行輝達H20銷往中國，資金受激勵追捧輝達概念股，...  
4  \n大樂透本期(7/15)頭獎未開出，目前連13摃，台灣彩券表示，下期(7/18)預估銷售1...  


In [10]:
# 2. Translation with Immediate Output
print("🔄 Initializing translator...")
translator = pipeline("translation", model=TRANSLATE_MODEL)

def translate_and_print(idx, text):
    if pd.isna(text) or len(text) < 20:
        print(f"\n📄 Article {idx}: [SKIPPED - Too short]")
        return ""
    
    try:
        print(f"\n📄 Original (Article {idx}):")
        print(text[:200] + ("..." if len(text) > 200 else ""))
        
        translated = translator(text[:200], max_length=250)[0]['translation_text']
        
        print(f"\n🌍 Translation:")
        print(translated)
        print("-"*50)
        return translated
    except Exception as e:
        print(f"\n⚠️ Translation error in article {idx}: {e}")
        return ""

🔄 Initializing translator...


Device set to use mps:0


In [12]:
print("🔄 Initializing translator...")
translator = pipeline("translation", model=TRANSLATE_MODEL)

def translate_and_print(idx, text):
    if pd.isna(text) or len(text) < 20:
        print(f"\n📄 Article {idx}: [SKIPPED - Too short]")
        return ""
    
    try:
        print(f"\n📄 Original (Article {idx}):")
        print(text[:200] + ("..." if len(text) > 200 else ""))
        
        translated = translator(text[:200], max_length=250)[0]['translation_text']
        
        print(f"\n🌍 Translation:")
        print(translated)
        print("-"*50)
        return translated
    except Exception as e:
        print(f"\n⚠️ Translation error in article {idx}: {e}")
        return ""
    
# Apply translation with progress tracking
print("🚀 Starting translation process...")
tqdm.pandas(desc="Overall progress")
results = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    translated = translate_and_print(idx+1, row['content'])
    results.append(translated)
    
df['content_en'] = results

# Filter failed translations
initial_count = len(df)
df = df[df['content_en'].str.strip().astype(bool)]
print(f"\n✅ Successfully translated {len(df)}/{initial_count} articles")

🔄 Initializing translator...


Device set to use mps:0


🚀 Starting translation process...


  0%|          | 0/100 [00:00<?, ?it/s]


📄 Original (Article 1):

科技巨頭谷歌15日宣布，未來兩年將砸下250億美元，在美國最大電網區建造資料中心與AI基礎設施。

谷歌投資鎖定的地點，橫跨中大西洋區及中西部、南部共13州，這些地區由全美最大電網營運機構「PJM電網聯營系統」（PJM Interconnection）負責供電。PJM供電區域，也涵蓋在北維吉尼亞州的全球最大資料中心聚落。

谷歌透露將花費30億美元，用於賓州兩座水力發電廠的現代化工程，以迎合資料...

🌍 Translation:
Google, the tech giant Google, announced that it would cost $25 billion over the next two years to build a data center in the largest electricity grid in the United States and an AI infrastructure. Google investment in a targeted location across the Mid-Atlantic region and the Mid-West, and 13 states in the South, where the PJM Networking System, the country’s largest power network, is responsible for power supply.
--------------------------------------------------

📄 Original (Article 2):

美國總統川普15日在自家社群平台Truth Social連發數文，再度施壓美國聯準會（Fed）調降利率。
川普表示：「美國消費者物價指數很低。Fed現在就該降息！Fed應該將利率大幅下調3個百分點。目前通貨膨脹非常低，如果降息的話，一年能為美國節省1兆美元！」

🌍 Translation:
US President Truth Social 15 posted a series of tweets on his own social platform, “Truth Social, again pressuri

In [24]:
# Initialize embedding model
embedder = SentenceTransformer(EMBED_MODEL)

# Embed translated content
print("Creating embeddings...")
corpus_embeddings = embedder.encode(df["content_en"].values, batch_size=BATCH_SIZE, show_progress_bar=True)
print(f"Embeddings shape: {corpus_embeddings.shape}")

Creating embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: (100, 768)


In [35]:
# Reduce dimensionality with UMAP
print("Reducing dimensions with UMAP...")
reduced_embeddings = umap.UMAP(n_components=2, n_neighbors=100, min_dist=0.02).fit_transform(corpus_embeddings)
print(f"Reduced embeddings shape: {reduced_embeddings.shape}")

Reducing dimensions with UMAP...
Reduced embeddings shape: (100, 2)



n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [26]:
# Add to dataframe
df["x"] = reduced_embeddings[:, 0]
df["y"] = reduced_embeddings[:, 1]
df["text_short"] = df["content_en"].str[:100] + "..."

In [37]:
# clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=9)
labels = clusterer.fit_predict(reduced_embeddings)
df["label"] = [str(label) for label in labels]
print(f"Num of clusters: {labels.max()}")

Num of clusters: -1


In [47]:
# Improved clustering pipeline with better parameters and diagnostics
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kw_model = KeyBERT()
# 1. First verify the embeddings quality
print("🔍 Embeddings diagnostics:")
print(f"Embeddings range: {reduced_embeddings.min():.2f} to {reduced_embeddings.max():.2f}")
print(f"Embeddings mean: {reduced_embeddings.mean():.2f} ± {reduced_embeddings.std():.2f}")

# 2. Try KMeans first to find optimal clusters
print("\n🔍 Finding optimal cluster count...")
range_n_clusters = range(2, 11)
best_score = -1
best_n = 2

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(reduced_embeddings)
    silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)
    print(f"For n_clusters = {n_clusters}, silhouette score = {silhouette_avg:.2f}")
    
    if silhouette_avg > best_score:
        best_score = silhouette_avg
        best_n = n_clusters

print(f"\n✅ Optimal cluster count: {best_n} (score: {best_score:.2f})")

# 3. Apply final clustering
print("\n🔄 Running final clustering...")
final_kmeans = KMeans(n_clusters=best_n, random_state=42)
df['cluster'] = final_kmeans.fit_predict(reduced_embeddings)

# 4. Visualize with UMAP (for visualization only)
print("\n🔄 Creating visualization...")
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=min(15, len(reduced_embeddings)-1),  # Fix neighbor count
    min_dist=0.1,
    metric='cosine'
)
vis_embeddings = reducer.fit_transform(reduced_embeddings)
df[['x','y']] = vis_embeddings

# 5. Generate cluster names using original Chinese text
print("\n🔄 Naming clusters...")
cluster_names = {}
for cluster_id in df['cluster'].unique():
    cluster_texts = df[df['cluster'] == cluster_id]['content'].str.cat(sep=' ')[:5000]
    keywords = kw_model.extract_keywords(
        cluster_texts,
        keyphrase_ngram_range=(1, 2),
        stop_words=None,
        top_n=3
    )
    cluster_names[cluster_id] = " | ".join([kw[0] for kw in keywords])

df['cluster_name'] = df['cluster'].map(cluster_names)

# 6. Final visualization
fig = px.scatter(
    df, x='x', y='y',
    color='cluster_name',
    hover_data=['title'],
    title=f'News Clusters (n={len(df)}, k={best_n})',
    width=1000,
    height=700
)
fig.show()

print("\n✅ Cluster distribution:")
print(df['cluster_name'].value_counts())

🔍 Embeddings diagnostics:
Embeddings range: 2.64 to 11.84
Embeddings mean: 7.28 ± 2.87

🔍 Finding optimal cluster count...
For n_clusters = 2, silhouette score = 0.34
For n_clusters = 3, silhouette score = 0.41
For n_clusters = 4, silhouette score = 0.43
For n_clusters = 5, silhouette score = 0.41
For n_clusters = 6, silhouette score = 0.43
For n_clusters = 7, silhouette score = 0.42
For n_clusters = 8, silhouette score = 0.37
For n_clusters = 9, silhouette score = 0.34
For n_clusters = 10, silhouette score = 0.36

✅ Optimal cluster count: 6 (score: 0.43)

🔄 Running final clustering...

🔄 Creating visualization...

🔄 Naming clusters...



✅ Cluster distribution:
cluster_name
谷歌透露將花費30億美元 用於賓州兩座水力發電廠的現代化工程 | 也涵蓋在北維吉尼亞州的全球最大資料中心聚落 谷歌透露將花費30億美元 | 在美國最大電網區建造資料中心與ai基礎設施 谷歌投資鎖定的地點                19
恢復電力的台電公司主管及員工務必注意安全 丹娜絲颱風侵襲中南部 | 行政院長卓榮泰相當不捨 第一時間即請經濟部及台電公司提供受傷同仁及家屬最即時的協助 | 第一時間即請經濟部及台電公司提供受傷同仁及家屬最即時的協助 李慧芝說明    19
就算你向做為上班族的主管學習 在公司裡做出成績 | 因為 你現在能在公司裡工作 | 在公司裡做出成績 你還是無法培養生意頭腦                                                       17
陳晨威自認 漢草 | 但隊友林子偉分析型態 很看好陳晨威 | 隊友林子偉分析擊球類型 認為陳晨威爆發力好                                                                 17
materials德州旗艦工廠生產的美國製稀土磁鐵 雙方未來也將共同在加州設立最先進的稀土回收產線 | 蘋果將採購mp materials德州旗艦工廠生產的美國製稀土磁鐵 | materials德州旗艦工廠生產的美國製稀土磁鐵    15
美國10年期國債殖利率走跌 現貨黃金短線上漲6美元 | 美國洛杉磯港6月貨櫃吞吐量改寫歷史新高 原因是業者趁著美中貿易休戰期間趕緊拉貨 | 美國6月cpi年增2 為2月以來新高                             13
Name: count, dtype: int64


In [28]:
# Number of outliers
num_outliers = len(df[df["cluster"] == "-1"])
print(f"Number of outliers: {num_outliers}")

# Remove outliers for visualization
df_no_outliers = df[df["cluster"] != "-1"]

Number of outliers: 100


In [29]:
# Function to filter similar keywords
def filter_keywords(keywords, n_keep=3):
    new_keywords = []
    for candidate_keyword in keywords:
        is_ok = True
        for compare_keyword in keywords:
            if candidate_keyword == compare_keyword:
                continue
            if compare_keyword in candidate_keyword:
                is_ok = False
                break
        if is_ok:
            new_keywords.append(candidate_keyword)
            if len(new_keywords) >= n_keep:
                break
    return new_keywords

# Function to get cluster names
def get_cluster_name(df, cluster):
    df_subset = df[df["cluster"] == cluster].reset_index()
    texts_concat = ". ".join(df_subset["content_en"].values)
    kw_model = KeyBERT()
    keywords_and_scores = kw_model.extract_keywords(
        texts_concat, 
        keyphrase_ngram_range=(1, 2),
        top_n=10
    )
    keywords = [t[0] for t in keywords_and_scores]
    keywords_filtered = filter_keywords(keywords)
    return " - ".join(keywords_filtered)

In [30]:
# Get all cluster names
print("Generating cluster names...")
all_clusters = df_no_outliers["cluster"].unique()
d_cluster_name_mapping = {"-1": "outliers"}

Generating cluster names...


In [31]:
for cluster in all_clusters:
    if cluster == "-1":
        continue
    try:
        cluster_name = get_cluster_name(df_no_outliers, cluster)
        d_cluster_name_mapping[cluster] = cluster_name
        print(f"Cluster {cluster}: {cluster_name}")
    except Exception as e:
        print(f"Error processing cluster {cluster}: {e}")
        d_cluster_name_mapping[cluster] = f"cluster_{cluster}"

# Rename clusters
df_no_outliers["cluster_name"] = df_no_outliers["cluster"].apply(
    lambda label: d_cluster_name_mapping.get(label, f"cluster_{label}")
)


In [38]:
# Visualize clusters
print("Creating visualization...")
hover_data = {
    "text_short": True,
    "title": True,
    "x": False,
    "y": False
}
fig = px.scatter(
    df_no_outliers, 
    x="x", 
    y="y", 
    color="cluster_name",
    hover_data=hover_data,
    template="plotly_dark",
    title="News Article Clusters"
)
fig.show()

Creating visualization...
