In [6]:
# manage data
import pandas as pd
from urllib.parse import quote_plus
from sqlalchemy import create_engine
from tqdm.auto import tqdm  # Updated import

# embeddings
from sentence_transformers import SentenceTransformer

# translation
from transformers import pipeline

# dimensionality reduction
import umap

# clustering
import hdbscan

# extract keywords from texts
from keybert import KeyBERT

# visualization
import plotly.express as px

In [7]:
# ‚Äî‚Äî CONFIG ‚Äî‚Äî 
SAMPLE_SIZE = 100  # Only process 100 articles for now
BATCH_SIZE = 32
EMBED_MODEL = "all-mpnet-base-v2"
TRANSLATE_MODEL = "Helsinki-NLP/opus-mt-zh-en"
UMAP_KWARGS = dict(n_components=2, n_neighbors=50, min_dist=0.1)
HDBSCAN_KWARGS = dict(min_cluster_size=10)

In [8]:
# Postgres creds (URL-encode the password)
USER = "postgres"
PASSWORD = quote_plus("4b.3O_XD?C9")
HOST = "54.46.7.169"
PORT = 5432
DBNAME = "mydb"

DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DBNAME}"
engine = create_engine(DB_URL)
query = f"""
    SELECT id, title, content 
    FROM news 
    WHERE content IS NOT NULL AND LENGTH(content) > 50
    LIMIT {SAMPLE_SIZE}
"""
df = pd.read_sql(query, engine)
print(f"Loaded {len(df)} articles")
print(df.head())

Loaded 100 articles
   id                                title  \
0   1       \n  Ë∞∑Ê≠åÂú®ÁæéÁ†∏250ÂÑÑÁæéÂÖÉ Âª∫Ë≥áÊñô‰∏≠ÂøÉÂíåAIÂü∫Á§éË®≠ÊñΩ\n   
1   2                  \n  Â∑ùÊôÆÂÜçÂ∫¶ÊñΩÂ£ìFedË™øÈôçÂà©Áéá\n   
2   3            \n  Âè∞ÈõªÂì°Â∑•ÈÅ≠È´òÂ£ìÁ∑öË∑ØÈõªÊìä„ÄÄÂçìÊ¶ÆÊ≥∞ÈóúÊ≥®‰∏çÊç®\n   
3   4   \n  ËºùÈÅîÈ£Ü4ÔºÖ„ÄÅÂè∞Á©çÈõªADRÊº≤3ÔºÖ Âè∞ÊåáÊúüÂ§úÁõ§ÂäçÊåá23KÈóúÂç°\n   
4   5           \n  Â§ßÊ®ÇÈÄèÈ†≠ÁçéÈÄ£13ÊëÉ  ‰∏ãÊúü‰∏äÁúã4.4ÂÑÑÂÖÉ\n   

                                             content  
0  \nÁßëÊäÄÂ∑®È†≠Ë∞∑Ê≠å15Êó•ÂÆ£Â∏ÉÔºåÊú™‰æÜÂÖ©Âπ¥Â∞áÁ†∏‰∏ã250ÂÑÑÁæéÂÖÉÔºåÂú®ÁæéÂúãÊúÄÂ§ßÈõªÁ∂≤ÂçÄÂª∫ÈÄ†Ë≥áÊñô‰∏≠ÂøÉËàáAIÂü∫...  
1  \nÁæéÂúãÁ∏ΩÁµ±Â∑ùÊôÆ15Êó•Âú®Ëá™ÂÆ∂Á§æÁæ§Âπ≥Âè∞Truth SocialÈÄ£ÁôºÊï∏ÊñáÔºåÂÜçÂ∫¶ÊñΩÂ£ìÁæéÂúãËÅØÊ∫ñÊúÉÔºàF...  
2  \n‰∏πÂ®úÁµ≤È¢±È¢®ÈÄ†ÊàêÂòâÁæ©Á∏£ÈÄæ26Ëê¨Êà∂ÂÅúÈõªÔºåÂè∞Èõª‰∫∫Âì°‰∏çÂàÜÊôùÂ§úÂÖ®ÂäõÊê∂‰øÆÔºå‰ΩÜÂú®15Êó•ÂÇ≥Âá∫‰∏ÄËµ∑È´òÂ£ìÁ∑öË∑Ø...  
3  \nÂ∏ÇÂ†¥ÂÇ≥Âá∫ÁæéÂúãÊîøÂ∫úÂêåÊÑèËß£Èô§‰∏≠ÂúãÁßëÊäÄÁ¶Å‰ª§ÔºåÊîæË°åËºùÈÅîH20Èä∑ÂæÄ‰∏≠ÂúãÔºåË≥áÈáëÂèóÊøÄÂãµËøΩÊçß

In [10]:
# 2. Translation with Immediate Output
print("üîÑ Initializing translator...")
translator = pipeline("translation", model=TRANSLATE_MODEL)

def translate_and_print(idx, text):
    if pd.isna(text) or len(text) < 20:
        print(f"\nüìÑ Article {idx}: [SKIPPED - Too short]")
        return ""
    
    try:
        print(f"\nüìÑ Original (Article {idx}):")
        print(text[:200] + ("..." if len(text) > 200 else ""))
        
        translated = translator(text[:200], max_length=250)[0]['translation_text']
        
        print(f"\nüåç Translation:")
        print(translated)
        print("-"*50)
        return translated
    except Exception as e:
        print(f"\n‚ö†Ô∏è Translation error in article {idx}: {e}")
        return ""

üîÑ Initializing translator...


Device set to use mps:0


In [12]:
print("üîÑ Initializing translator...")
translator = pipeline("translation", model=TRANSLATE_MODEL)

def translate_and_print(idx, text):
    if pd.isna(text) or len(text) < 20:
        print(f"\nüìÑ Article {idx}: [SKIPPED - Too short]")
        return ""
    
    try:
        print(f"\nüìÑ Original (Article {idx}):")
        print(text[:200] + ("..." if len(text) > 200 else ""))
        
        translated = translator(text[:200], max_length=250)[0]['translation_text']
        
        print(f"\nüåç Translation:")
        print(translated)
        print("-"*50)
        return translated
    except Exception as e:
        print(f"\n‚ö†Ô∏è Translation error in article {idx}: {e}")
        return ""
    
# Apply translation with progress tracking
print("üöÄ Starting translation process...")
tqdm.pandas(desc="Overall progress")
results = []
for idx, row in tqdm(df.iterrows(), total=len(df)):
    translated = translate_and_print(idx+1, row['content'])
    results.append(translated)
    
df['content_en'] = results

# Filter failed translations
initial_count = len(df)
df = df[df['content_en'].str.strip().astype(bool)]
print(f"\n‚úÖ Successfully translated {len(df)}/{initial_count} articles")

üîÑ Initializing translator...


Device set to use mps:0


üöÄ Starting translation process...


  0%|          | 0/100 [00:00<?, ?it/s]


üìÑ Original (Article 1):

ÁßëÊäÄÂ∑®È†≠Ë∞∑Ê≠å15Êó•ÂÆ£Â∏ÉÔºåÊú™‰æÜÂÖ©Âπ¥Â∞áÁ†∏‰∏ã250ÂÑÑÁæéÂÖÉÔºåÂú®ÁæéÂúãÊúÄÂ§ßÈõªÁ∂≤ÂçÄÂª∫ÈÄ†Ë≥áÊñô‰∏≠ÂøÉËàáAIÂü∫Á§éË®≠ÊñΩ„ÄÇ

Ë∞∑Ê≠åÊäïË≥áÈéñÂÆöÁöÑÂú∞ÈªûÔºåÊ©´Ë∑®‰∏≠Â§ßË•øÊ¥ãÂçÄÂèä‰∏≠Ë•øÈÉ®„ÄÅÂçóÈÉ®ÂÖ±13Â∑ûÔºåÈÄô‰∫õÂú∞ÂçÄÁî±ÂÖ®ÁæéÊúÄÂ§ßÈõªÁ∂≤ÁáüÈÅãÊ©üÊßã„ÄåPJMÈõªÁ∂≤ËÅØÁáüÁ≥ªÁµ±„ÄçÔºàPJM InterconnectionÔºâË≤†Ë≤¨‰æõÈõª„ÄÇPJM‰æõÈõªÂçÄÂüüÔºå‰πüÊ∂µËìãÂú®ÂåóÁ∂≠ÂêâÂ∞º‰∫ûÂ∑ûÁöÑÂÖ®ÁêÉÊúÄÂ§ßË≥áÊñô‰∏≠ÂøÉËÅöËêΩ„ÄÇ

Ë∞∑Ê≠åÈÄèÈú≤Â∞áËä±Ë≤ª30ÂÑÑÁæéÂÖÉÔºåÁî®ÊñºË≥ìÂ∑ûÂÖ©Â∫ßÊ∞¥ÂäõÁôºÈõªÂª†ÁöÑÁèæ‰ª£ÂåñÂ∑•Á®ãÔºå‰ª•ËøéÂêàË≥áÊñô...

üåç Translation:
Google, the tech giant Google, announced that it would cost $25 billion over the next two years to build a data center in the largest electricity grid in the United States and an AI infrastructure. Google investment in a targeted location across the Mid-Atlantic region and the Mid-West, and 13 states in the South, where the PJM Networking System, the country‚Äôs largest power network, is responsible for power supply.
----------

In [24]:
# Initialize embedding model
embedder = SentenceTransformer(EMBED_MODEL)

# Embed translated content
print("Creating embeddings...")
corpus_embeddings = embedder.encode(df["content_en"].values, batch_size=BATCH_SIZE, show_progress_bar=True)
print(f"Embeddings shape: {corpus_embeddings.shape}")

Creating embeddings...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings shape: (100, 768)


In [35]:
# Reduce dimensionality with UMAP
print("Reducing dimensions with UMAP...")
reduced_embeddings = umap.UMAP(n_components=2, n_neighbors=100, min_dist=0.02).fit_transform(corpus_embeddings)
print(f"Reduced embeddings shape: {reduced_embeddings.shape}")

Reducing dimensions with UMAP...
Reduced embeddings shape: (100, 2)



n_neighbors is larger than the dataset size; truncating to X.shape[0] - 1



In [26]:
# Add to dataframe
df["x"] = reduced_embeddings[:, 0]
df["y"] = reduced_embeddings[:, 1]
df["text_short"] = df["content_en"].str[:100] + "..."

In [37]:
# clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=9)
labels = clusterer.fit_predict(reduced_embeddings)
df["label"] = [str(label) for label in labels]
print(f"Num of clusters: {labels.max()}")

Num of clusters: -1


In [47]:
# Improved clustering pipeline with better parameters and diagnostics
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

kw_model = KeyBERT()
# 1. First verify the embeddings quality
print("üîç Embeddings diagnostics:")
print(f"Embeddings range: {reduced_embeddings.min():.2f} to {reduced_embeddings.max():.2f}")
print(f"Embeddings mean: {reduced_embeddings.mean():.2f} ¬± {reduced_embeddings.std():.2f}")

# 2. Try KMeans first to find optimal clusters
print("\nüîç Finding optimal cluster count...")
range_n_clusters = range(2, 11)
best_score = -1
best_n = 2

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(reduced_embeddings)
    silhouette_avg = silhouette_score(reduced_embeddings, cluster_labels)
    print(f"For n_clusters = {n_clusters}, silhouette score = {silhouette_avg:.2f}")
    
    if silhouette_avg > best_score:
        best_score = silhouette_avg
        best_n = n_clusters

print(f"\n‚úÖ Optimal cluster count: {best_n} (score: {best_score:.2f})")

# 3. Apply final clustering
print("\nüîÑ Running final clustering...")
final_kmeans = KMeans(n_clusters=best_n, random_state=42)
df['cluster'] = final_kmeans.fit_predict(reduced_embeddings)

# 4. Visualize with UMAP (for visualization only)
print("\nüîÑ Creating visualization...")
reducer = umap.UMAP(
    n_components=2,
    n_neighbors=min(15, len(reduced_embeddings)-1),  # Fix neighbor count
    min_dist=0.1,
    metric='cosine'
)
vis_embeddings = reducer.fit_transform(reduced_embeddings)
df[['x','y']] = vis_embeddings

# 5. Generate cluster names using original Chinese text
print("\nüîÑ Naming clusters...")
cluster_names = {}
for cluster_id in df['cluster'].unique():
    cluster_texts = df[df['cluster'] == cluster_id]['content'].str.cat(sep=' ')[:5000]
    keywords = kw_model.extract_keywords(
        cluster_texts,
        keyphrase_ngram_range=(1, 2),
        stop_words=None,
        top_n=3
    )
    cluster_names[cluster_id] = " | ".join([kw[0] for kw in keywords])

df['cluster_name'] = df['cluster'].map(cluster_names)

# 6. Final visualization
fig = px.scatter(
    df, x='x', y='y',
    color='cluster_name',
    hover_data=['title'],
    title=f'News Clusters (n={len(df)}, k={best_n})',
    width=1000,
    height=700
)
fig.show()

print("\n‚úÖ Cluster distribution:")
print(df['cluster_name'].value_counts())

üîç Embeddings diagnostics:
Embeddings range: 2.64 to 11.84
Embeddings mean: 7.28 ¬± 2.87

üîç Finding optimal cluster count...
For n_clusters = 2, silhouette score = 0.34
For n_clusters = 3, silhouette score = 0.41
For n_clusters = 4, silhouette score = 0.43
For n_clusters = 5, silhouette score = 0.41
For n_clusters = 6, silhouette score = 0.43
For n_clusters = 7, silhouette score = 0.42
For n_clusters = 8, silhouette score = 0.37
For n_clusters = 9, silhouette score = 0.34
For n_clusters = 10, silhouette score = 0.36

‚úÖ Optimal cluster count: 6 (score: 0.43)

üîÑ Running final clustering...

üîÑ Creating visualization...

üîÑ Naming clusters...



‚úÖ Cluster distribution:
cluster_name
Ë∞∑Ê≠åÈÄèÈú≤Â∞áËä±Ë≤ª30ÂÑÑÁæéÂÖÉ Áî®ÊñºË≥ìÂ∑ûÂÖ©Â∫ßÊ∞¥ÂäõÁôºÈõªÂª†ÁöÑÁèæ‰ª£ÂåñÂ∑•Á®ã | ‰πüÊ∂µËìãÂú®ÂåóÁ∂≠ÂêâÂ∞º‰∫ûÂ∑ûÁöÑÂÖ®ÁêÉÊúÄÂ§ßË≥áÊñô‰∏≠ÂøÉËÅöËêΩ Ë∞∑Ê≠åÈÄèÈú≤Â∞áËä±Ë≤ª30ÂÑÑÁæéÂÖÉ | Âú®ÁæéÂúãÊúÄÂ§ßÈõªÁ∂≤ÂçÄÂª∫ÈÄ†Ë≥áÊñô‰∏≠ÂøÉËàáaiÂü∫Á§éË®≠ÊñΩ Ë∞∑Ê≠åÊäïË≥áÈéñÂÆöÁöÑÂú∞Èªû                19
ÊÅ¢Âæ©ÈõªÂäõÁöÑÂè∞ÈõªÂÖ¨Âè∏‰∏ªÁÆ°ÂèäÂì°Â∑•ÂãôÂøÖÊ≥®ÊÑèÂÆâÂÖ® ‰∏πÂ®úÁµ≤È¢±È¢®‰æµË•≤‰∏≠ÂçóÈÉ® | Ë°åÊîøÈô¢Èï∑ÂçìÊ¶ÆÊ≥∞Áõ∏Áï∂‰∏çÊç® Á¨¨‰∏ÄÊôÇÈñìÂç≥Ë´ãÁ∂ìÊøüÈÉ®ÂèäÂè∞ÈõªÂÖ¨Âè∏Êèê‰æõÂèóÂÇ∑Âêå‰ªÅÂèäÂÆ∂Â±¨ÊúÄÂç≥ÊôÇÁöÑÂçîÂä© | Á¨¨‰∏ÄÊôÇÈñìÂç≥Ë´ãÁ∂ìÊøüÈÉ®ÂèäÂè∞ÈõªÂÖ¨Âè∏Êèê‰æõÂèóÂÇ∑Âêå‰ªÅÂèäÂÆ∂Â±¨ÊúÄÂç≥ÊôÇÁöÑÂçîÂä© ÊùéÊÖßËäùË™™Êòé    19
Â∞±ÁÆó‰Ω†ÂêëÂÅöÁÇ∫‰∏äÁè≠ÊóèÁöÑ‰∏ªÁÆ°Â≠∏Áøí Âú®ÂÖ¨Âè∏Ë£°ÂÅöÂá∫ÊàêÁ∏æ | Âõ†ÁÇ∫ ‰Ω†ÁèæÂú®ËÉΩÂú®ÂÖ¨Âè∏Ë£°Â∑•‰Ωú | Âú®ÂÖ¨Âè∏Ë£°ÂÅöÂá∫ÊàêÁ∏æ ‰Ω†ÈÇÑÊòØÁÑ°Ê≥ïÂüπÈ§äÁîüÊÑèÈ†≠ËÖ¶                                                       17
Èô≥Êô®Â®ÅËá™Ë™ç Êº¢Ëçâ | ‰ΩÜÈöäÂèãÊûóÂ≠êÂÅâÂàÜÊûêÂûãÊÖã ÂæàÁúãÂ•ΩÈô≥Êô®Â®Å | ÈöäÂèãÊûóÂ≠êÂÅâÂàÜÊûêÊìäÁêÉÈ°ûÂûã Ë™ç

In [28]:
# Number of outliers
num_outliers = len(df[df["cluster"] == "-1"])
print(f"Number of outliers: {num_outliers}")

# Remove outliers for visualization
df_no_outliers = df[df["cluster"] != "-1"]

Number of outliers: 100


In [29]:
# Function to filter similar keywords
def filter_keywords(keywords, n_keep=3):
    new_keywords = []
    for candidate_keyword in keywords:
        is_ok = True
        for compare_keyword in keywords:
            if candidate_keyword == compare_keyword:
                continue
            if compare_keyword in candidate_keyword:
                is_ok = False
                break
        if is_ok:
            new_keywords.append(candidate_keyword)
            if len(new_keywords) >= n_keep:
                break
    return new_keywords

# Function to get cluster names
def get_cluster_name(df, cluster):
    df_subset = df[df["cluster"] == cluster].reset_index()
    texts_concat = ". ".join(df_subset["content_en"].values)
    kw_model = KeyBERT()
    keywords_and_scores = kw_model.extract_keywords(
        texts_concat, 
        keyphrase_ngram_range=(1, 2),
        top_n=10
    )
    keywords = [t[0] for t in keywords_and_scores]
    keywords_filtered = filter_keywords(keywords)
    return " - ".join(keywords_filtered)

In [30]:
# Get all cluster names
print("Generating cluster names...")
all_clusters = df_no_outliers["cluster"].unique()
d_cluster_name_mapping = {"-1": "outliers"}

Generating cluster names...


In [31]:
for cluster in all_clusters:
    if cluster == "-1":
        continue
    try:
        cluster_name = get_cluster_name(df_no_outliers, cluster)
        d_cluster_name_mapping[cluster] = cluster_name
        print(f"Cluster {cluster}: {cluster_name}")
    except Exception as e:
        print(f"Error processing cluster {cluster}: {e}")
        d_cluster_name_mapping[cluster] = f"cluster_{cluster}"

# Rename clusters
df_no_outliers["cluster_name"] = df_no_outliers["cluster"].apply(
    lambda label: d_cluster_name_mapping.get(label, f"cluster_{label}")
)


In [38]:
# Visualize clusters
print("Creating visualization...")
hover_data = {
    "text_short": True,
    "title": True,
    "x": False,
    "y": False
}
fig = px.scatter(
    df_no_outliers, 
    x="x", 
    y="y", 
    color="cluster_name",
    hover_data=hover_data,
    template="plotly_dark",
    title="News Article Clusters"
)
fig.show()

Creating visualization...
