In [None]:
!pip install bertopic sentence-transformers umap-learn hdbscan pandas
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic


In [None]:
import math
from itertools import combinations
from collections import Counter

import numpy as np
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score

from bertopic import BERTopic
import matplotlib.pyplot as plt


# ================== 1) Load data ==================
df = pd.read_excel("ai_articles_with_person_cleaned.xlsx")
# Keep only articles that have at least one image (for later multimodal
# pairing) AND a non-null cleaned text.  Both filters are applied to `df`
# itself so that `df` and `texts` stay row-aligned: per-document results
# (e.g. cluster labels) are later written back into `df` positionally, which
# fails with a length mismatch if `texts` silently dropped rows that `df` kept.
has_image = df["num_images"] > 0
has_text = df["text_cleaned_final"].notna()
df = df[has_image & has_text].copy()
texts = df["text_cleaned_final"].astype(str).tolist()


# ================== 2) Build sentence embeddings ==================
embedding_model = SentenceTransformer("thenlper/gte-small")

# Encode the corpus in fixed-size slices so tqdm can report progress; the
# per-slice vectors are accumulated row by row and stacked at the end.
embeddings = []
batch_size = 32
batch_starts = range(0, len(texts), batch_size)
for start in tqdm(batch_starts, desc="Embedding"):
    chunk = texts[start:start + batch_size]
    embeddings.extend(
        embedding_model.encode(
            chunk,
            show_progress_bar=False,
            normalize_embeddings=False,
        )
    )
embeddings = np.array(embeddings)


# ================== 3) Dimensionality reduction (UMAP) ==================
# Fixed UMAP settings; random_state pins the projection so reruns are comparable.
UMAP_KW = {
    "n_neighbors": 57,
    "n_components": 5,
    "min_dist": 0.0233,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**UMAP_KW)
embeddings_umap = umap_model.fit_transform(embeddings)


# ================== 4) NPMI computation ==================
def compute_npmi(topics_words, tokenized_texts):
    """
    Compute per-topic NPMI (normalised pointwise mutual information) coherence.

    Args:
        topics_words: list[list[str]] -- top words for each topic.
        tokenized_texts: list[list[str]] -- one token list per document; a token
            repeated inside a document is counted once (document frequency).

    Returns:
        list[float] -- one score per topic, in input order: the mean NPMI over
        the topic's word pairs that co-occur in at least one document. Pairs
        that never co-occur are skipped; a topic without any co-occurring pair
        gets NaN.
    """
    N = len(tokenized_texts)

    # Only words that belong to some topic are ever looked up below, so the
    # document-frequency tables are restricted to that vocabulary. Counting
    # pairs over every unique token of every document would be quadratic in
    # the document vocabulary and yields exactly the same scores.
    topic_vocab = {w for words in topics_words for w in words}

    df_word = Counter()
    df_pair = Counter()

    # document-frequency counts (single and pair)
    for doc in tokenized_texts:
        present = sorted(topic_vocab.intersection(doc))
        df_word.update(present)
        # `present` is sorted, so every pair key is stored as (smaller, larger)
        df_pair.update(combinations(present, 2))

    def npmi(w1, w2):
        # normalise the key order to match how pairs were stored
        key = (w1, w2) if w1 <= w2 else (w2, w1)
        c12 = df_pair.get(key, 0)
        if c12 == 0:
            return None
        p1 = df_word.get(w1, 0) / N
        p2 = df_word.get(w2, 0) / N
        p12 = c12 / N
        # PMI with tiny eps for numerical stability, then normalise
        pmi = math.log((p12 + 1e-12) / (p1 * p2 + 1e-12))
        return pmi / (-math.log(p12 + 1e-12))

    def topic_npmi(words):
        vals = [
            v
            for a, b in combinations(words, 2)
            if (v := npmi(a, b)) is not None
        ]
        return float(np.mean(vals)) if vals else float("nan")

    return [topic_npmi(ws) for ws in topics_words]


# ================== 5) Scan candidate k; pick the best ==================
candidate_k = [10, 15, 20, 25, 30]
results = []

# Tokenise the corpus once, outside the loop: the documents do not depend on k.
# The analyzer of an equivalent CountVectorizer is used (lower-casing, the
# default token pattern, English stop-word removal) so that document tokens
# are produced the same way as the topic words they are compared against.
# A plain whitespace split keeps case and attached punctuation, which makes
# topic words miss their own occurrences and biases the NPMI estimate.
npmi_analyzer = CountVectorizer(stop_words="english").build_analyzer()
tokenized_texts = [npmi_analyzer(t) for t in texts]

for k in candidate_k:
    # Step 1: KMeans clustering in UMAP space
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(embeddings_umap)

    # Step 2: Build BERTopic (vectorizer tightened for speed)
    vectorizer_model = CountVectorizer(min_df=2, stop_words="english")
    topic_model = BERTopic(
        embedding_model=None,          # we already provide embeddings
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        verbose=False,
        low_memory=True,
    )
    # fit once to initialise internal structures
    # NOTE(review): this fit does not depend on k; presumably it could be done
    # once before the loop — confirm update_topics is safe to call repeatedly.
    topic_model.fit(texts, embeddings=embeddings)

    # Crucial: overwrite clusters with our KMeans labels
    topic_model.update_topics(texts, topics=cluster_labels, vectorizer_model=vectorizer_model)

    # Step 3: Silhouette on UMAP space (undefined for a single cluster)
    if len(set(cluster_labels)) > 1:
        sil = silhouette_score(embeddings_umap, cluster_labels)
    else:
        sil = float("nan")

    # Step 4: NPMI over top-10 words per topic (topic -1 is skipped)
    top_words = [
        [w for w, _ in words[:10]]
        for tid, words in topic_model.get_topics().items()
        if tid != -1
    ]
    per_topic_npmi = compute_npmi(top_words, tokenized_texts)
    mean_npmi = float(np.nanmean(per_topic_npmi))

    results.append({"k": k, "silhouette": sil, "npmi": mean_npmi})
    print(f"K={k}: silhouette={sil:.4f}, npmi={mean_npmi:.4f}")

# choose k by a simple normalised sum of silhouette and NPMI
# (each metric is min-max scaled over the scanned k; eps guards a zero range)
res_df = pd.DataFrame(results)
sil_norm = (res_df["silhouette"] - res_df["silhouette"].min()) / (res_df["silhouette"].max() - res_df["silhouette"].min() + 1e-12)
npmi_norm = (res_df["npmi"] - res_df["npmi"].min()) / (res_df["npmi"].max() - res_df["npmi"].min() + 1e-12)
res_df["score"] = sil_norm + npmi_norm
best_k = int(res_df.loc[res_df["score"].idxmax(), "k"])
print("\nBest k:", best_k)
print(res_df)


# ================== 6) Rebuild the final model with best k and save ==================
# Re-cluster the UMAP-reduced embeddings with the selected number of clusters.
final_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
final_labels = final_kmeans.fit_predict(embeddings_umap)

# Same recipe as in the scan: fit BERTopic once, then replace its clusters
# with the KMeans labels so the topic words describe the KMeans partition.
final_vectorizer = CountVectorizer(min_df=2, stop_words="english")
final_bertopic_kwargs = {
    "embedding_model": None,
    "vectorizer_model": final_vectorizer,
    "calculate_probabilities": True,
    "verbose": True,
    "low_memory": True,
}
final_topic_model = BERTopic(**final_bertopic_kwargs)
final_topic_model.fit(texts, embeddings=embeddings)
final_topic_model.update_topics(
    texts,
    topics=final_labels,
    vectorizer_model=final_vectorizer,
)

# Attach the cluster id and the model's topic name to every article;
# ids without a name in the topic table are labelled "Unknown".
df["topic_kmeans"] = final_labels
topic_names = final_topic_model.get_topic_info().set_index("Topic")["Name"]
topic_info = topic_names.to_dict()
df["topic_label_kmeans"] = df["topic_kmeans"].map(topic_info).fillna("Unknown")

df.to_excel("articles_with_kmeans_topics.xlsx", index=False)
print("Saved final topics to articles_with_kmeans_topics.xlsx")


# ================== 7) (Optional) Plot “Topic Word Scores” ==================
def pick_top_topic_ids(topic_model: BERTopic, k: int = 8, by: str = "Count"):
    """Return the ids of the first `k` topics when ranked by column `by`.

    The ranking is taken from `topic_model.get_topic_info()`, sorted in
    descending order of `by`; the row with topic id -1 is excluded.
    """
    table = topic_model.get_topic_info()
    ranked = table.loc[table["Topic"] != -1].sort_values(by=by, ascending=False)
    return ranked["Topic"].iloc[:k].tolist()

def plot_topic_word_scores(topic_model: BERTopic, topic_ids=None, top_n_words=5,
                           title="Topic Word Scores", ncols=4,
                           figsize=(18, 9), outfile=None):
    """Draw one horizontal bar chart of top word scores per topic.

    Args:
        topic_model: model exposing `get_topics()` (topic id -> list of
            (word, score) pairs).
        topic_ids: topics to plot; when None, the 8 largest topics by "Count"
            are chosen via `pick_top_topic_ids`.
        top_n_words: number of leading words shown per topic.
        title: figure title.
        ncols: maximum number of subplot columns.
        figsize: figure size passed to matplotlib.
        outfile: optional path; when given the figure is also saved there.

    Returns:
        None. If there are no topics to plot, the function returns without
        creating a figure.
    """
    if topic_ids is None:
        topic_ids = pick_top_topic_ids(topic_model, k=8, by="Count")
    topic_ids = list(topic_ids)

    n = len(topic_ids)
    if n == 0:
        # Nothing to draw: an empty grid would otherwise divide by zero below.
        return

    topics_dict = topic_model.get_topics()
    ncols = max(1, min(ncols, n))
    nrows = int(np.ceil(n / ncols))
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    fig.suptitle(title, fontsize=22, y=1.02)

    for i, tid in enumerate(topic_ids):
        r, c = divmod(i, ncols)
        ax = axes[r, c]
        ws = topics_dict.get(tid, [])[:top_n_words]
        if not ws:
            ax.axis("off")
            continue
        words, scores = zip(*ws)
        # reverse so the highest-scoring word ends up at the top of the chart
        words, scores = list(words)[::-1], list(scores)[::-1]
        ax.barh(words, scores)
        ax.set_xlim(left=0)
        ax.set_title(f"Topic {tid}", fontsize=14)
        for y, v in enumerate(scores):
            ax.text(v, y, f" {v:.3f}", va="center", ha="left", fontsize=9)

    # turn off any unused subplots (grid cells beyond the n plotted topics)
    for j in range(n, nrows * ncols):
        r, c = divmod(j, ncols)
        axes[r, c].axis("off")

    plt.tight_layout()
    if outfile:
        plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.show()

# Example usage: chart the eight largest topics of the final model and save
# the figure next to the notebook.
top_ids = pick_top_topic_ids(final_topic_model, k=8, by="Count")
plot_topic_word_scores(
    final_topic_model,
    topic_ids=top_ids,
    top_n_words=5,
    title="Topic Word Scores",
    outfile="topic_word_scores.png",
)


In [None]:
# Step 1: Local k-scan

# Check whether k=35 is a robust optimum by scanning nearby values of k.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import pandas as pd

# NOTE(review): X_umap is not assigned in any cell visible here (the reduced
# embeddings computed earlier are named `embeddings_umap`) — presumably it is
# defined elsewhere or loaded from disk; confirm before running this cell.
K_LIST = [30, 32, 35, 38, 40]  # Scan near the optimal value
rows = []
for k in K_LIST:
    clusterer = KMeans(n_clusters=k, n_init=50, max_iter=500, random_state=42)
    assignments = clusterer.fit_predict(X_umap)
    # Three internal validity indices computed on the same partition.
    sil = silhouette_score(X_umap, assignments)
    ch = calinski_harabasz_score(X_umap, assignments)
    db = davies_bouldin_score(X_umap, assignments)
    rows.append((k, sil, ch, db))
    print(f"K={k}: silhouette={sil:.4f}, CH={ch:.1f}, DB={db:.4f}")

# Persist the sweep so the metrics can be compared outside the notebook.
sweep_columns = ["k", "silhouette", "calinski", "davies"]
df_k = pd.DataFrame(rows, columns=sweep_columns)
df_k.to_excel("k_local_sweep.xlsx", index=False)


In [None]:
# Step 2: Stability test. Check whether the k=35 clustering stays consistent under different initializations and under subsampling.

from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np

# Reference partition that every other run is compared against.
base_labels = KMeans(n_clusters=35, n_init=50, random_state=0).fit_predict(X_umap)

# 1) Multiple initializations: refit on the full data with other seeds and
#    measure agreement with the reference via the adjusted Rand index.
aris = [
    adjusted_rand_score(
        base_labels,
        KMeans(n_clusters=35, n_init=50, random_state=seed).fit_predict(X_umap),
    )
    for seed in range(1, 6)
]
print("Init reseed stability ARI mean±std:", np.mean(aris), np.std(aris))

# 2) Subsampling: refit on a random 90% of the points (drawn without
#    replacement) and compare with the reference labels of those same points.
rng = np.random.default_rng(42)
n_points = len(X_umap)
aris_sub = []
for seed in range(5):
    idx = rng.choice(n_points, size=int(0.9*n_points), replace=False)
    sub_model = KMeans(n_clusters=35, n_init=30, random_state=seed)
    lab_sub = sub_model.fit_predict(X_umap[idx])
    aris_sub.append(adjusted_rand_score(base_labels[idx], lab_sub))
print("Bootstrap stability ARI mean±std:", np.mean(aris_sub), np.std(aris_sub))

In [None]:
# Step 3: Topic interpretability. Assess the semantic quality of the topics using NPMI (coherence) and Diversity.
# NPMI & Diversity (fast) computed from an existing topics file
# Requirements: numpy, pandas, scikit-learn, openpyxl (for Excel I/O)

import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from itertools import combinations
from datetime import datetime

# -----------------------------
# 1) Load texts and existing topic labels
# -----------------------------
src_articles = "ai_articles_with_person_cleaned.xlsx"
src_topics   = "articles_topics_bestk.xlsx"  # previously exported file with column 'main_topic'

# Fail early, with a readable message, if either input file is missing.
for file_kind, file_name in (("Data", src_articles), ("Topics", src_topics)):
    assert os.path.exists(file_name), f"{file_kind} file not found: {file_name}"

# Articles: keep those with at least one image, then those with cleaned text.
df_all = pd.read_excel(src_articles)
df_all = df_all[df_all["num_images"] > 0].copy()
df_txt = df_all.loc[df_all["text_cleaned_final"].notna()].copy()
texts  = df_txt["text_cleaned_final"].astype(str).tolist()

topics_df = pd.read_excel(src_topics)

# If both tables share an identifier column, use the first such column to
# verify that their rows are in the same order.
candidate_keys = ["article_id", "id", "url_hash"]
shared_key = next(
    (key for key in candidate_keys
     if key in df_txt.columns and key in topics_df.columns),
    None,
)
if shared_key is not None:
    assert list(df_txt[shared_key].tolist()) == list(topics_df[shared_key].tolist()), \
        f"Row order mismatch detected on key '{shared_key}'. Ensure the same ordering before proceeding."

# Labels are matched to texts purely by position, so lengths must agree.
assert len(topics_df) == len(texts), f"Length mismatch: topics={len(topics_df)}, texts={len(texts)}"
labels = topics_df["main_topic"].to_numpy()

# -----------------------------
# 2) Bag-of-words + class-based TF-IDF (c-TF-IDF) to get Top-N words per topic
# -----------------------------
TOPN = 10
vec = CountVectorizer(
    min_df=8,
    max_df=0.9,
    max_features=20000,
    stop_words="english",
    ngram_range=(1, 1),
)
X = vec.fit_transform(texts)  # document-term count matrix
vocab = np.array(vec.get_feature_names_out())

topics = np.unique(labels)
t2r = {t: i for i, t in enumerate(topics)}  # topic id -> row of X_cls

# Sum the term counts of all documents belonging to each topic.
X_cls = np.zeros((len(topics), X.shape[1]), dtype=np.float64)
for t in topics:
    members = np.flatnonzero(labels == t)
    if members.size:
        X_cls[t2r[t]] = np.asarray(X[members].sum(axis=0)).ravel()

# Class-based TF-IDF: term share within the topic, weighted by how few
# topics contain the term at all.
topic_totals = X_cls.sum(axis=1, keepdims=True)
tf = X_cls / (topic_totals + 1e-12)
df_term_topics = (X_cls > 0).sum(axis=0)
idf = np.log(1.0 + len(topics) / (1.0 + df_term_topics))
ctfidf = tf * idf  # shape: (n_topics, n_terms)

# Top-N words per topic, highest c-TF-IDF first; a topic whose scores are
# all zero gets an empty list.
topic_top_words = {}
for t in topics:
    scores = ctfidf[t2r[t]]
    if scores.sum() == 0:
        topic_top_words[t] = []
        continue
    shortlist = np.argpartition(scores, -TOPN)[-TOPN:]
    ranked = shortlist[np.argsort(-scores[shortlist])]
    topic_top_words[t] = vocab[ranked].tolist()

# -----------------------------
# 3) Fast NPMI + Diversity (restricted to the union of Top-k words)
# -----------------------------
def fast_npmi_and_diversity_from_X(X, topics_words, vec, topk=10):
    """Compute per-topic NPMI coherence and overall topic diversity.

    Args:
        X: sparse document-term count matrix (documents in rows) whose columns
            follow `vec.get_feature_names_out()`.
        topics_words: list of word lists, one per topic, best word first.
        vec: fitted vectorizer providing `get_feature_names_out()`.
        topk: number of leading words per topic taken into account.

    Returns:
        (npmi_scores, diversity):
        npmi_scores -- list with exactly one float per entry of `topics_words`:
            the mean NPMI (clipped to [-1, 1]) over the topic's word pairs that
            co-occur in at least one document, or NaN when the topic has fewer
            than two known words or no co-occurring pair. The list always has
            one entry per topic, so it can be put in a per-topic table even
            when none of the words is in the vocabulary.
        diversity -- number of distinct words among all topics' top-k words
            divided by the total number of selected words (0.0 if none).
    """
    # Diversity depends only on the word lists, so it is computed up front and
    # returned on every path.
    per_topic_counts = [min(topk, len(tw)) for tw in topics_words if tw]
    total_selected = int(np.sum(per_topic_counts)) if per_topic_counts else 0
    all_words = [w for tw in topics_words for w in tw[:topk]]
    diversity = (len(set(all_words)) / total_selected) if total_selected > 0 else 0.0

    vocab = vec.get_feature_names_out()
    vocab2idx = {w: i for i, w in enumerate(vocab)}

    # Use only the union of each topic's Top-k words to speed up
    topic_indices = [
        [vocab2idx[w] for w in tw[:topk] if w in vocab2idx]
        for tw in topics_words
    ]
    cols_set = {g for idxs in topic_indices for g in idxs}

    if not cols_set:
        # No topic word is in the vocabulary: coherence is undefined for every
        # topic, but callers still get one (NaN) score per topic.
        return [float("nan")] * len(topic_indices), diversity

    cols = np.fromiter(sorted(cols_set), dtype=int)
    X_sub = X[:, cols].tocsr()

    # Binarize and compute co-occurrence counts
    B = X_sub.copy()
    B.data[:] = 1  # set all stored entries to 1 (presence/absence)
    C = (B.T @ B).astype(np.int32).toarray()  # (m x m) co-occurrence
    N = X_sub.shape[0]

    # Document frequencies (diagonal of C) and probabilities
    dfw = np.diag(C).astype(np.float64)
    dfw[dfw == 0] = np.nan
    p = dfw / N  # P(w)

    # Map original column index -> compact position inside X_sub / C
    pos = {g: i for i, g in enumerate(cols)}
    topic_pos = [[pos[g] for g in idxs if g in pos] for idxs in topic_indices]

    def npmi_for(idxs):
        if len(idxs) < 2:
            return float("nan")
        vals = []
        for i, j in combinations(idxs, 2):
            c12 = C[i, j]
            if c12 == 0:
                # pairs that never co-occur are skipped, not scored
                continue
            p12 = c12 / N
            # NPMI: log( p12 / (p1*p2) ) / -log(p12)
            v = np.log((p12 + 1e-12) / ((p[i] * p[j]) + 1e-12)) / (-np.log(p12 + 1e-12))
            if np.isfinite(v):
                vals.append(np.clip(v, -1.0, 1.0))
        return float(np.mean(vals)) if vals else float("nan")

    npmi_scores = [npmi_for(idxs) for idxs in topic_pos]

    return npmi_scores, diversity

# Score the topics, in the same order as `topics`.
topics_words = [topic_top_words.get(t, []) for t in topics]
npmi_vals, div_val = fast_npmi_and_diversity_from_X(X, topics_words, vec, topk=10)
npmi_mean = float(np.nanmean(npmi_vals))

# -----------------------------
# 4) Export per-topic NPMI, Top words, and summary
# -----------------------------
# Lightweight topic name: the first three top words, or "misc" when a topic
# has no top words.
name3 = {}
for t in topics:
    words_t = topic_top_words.get(t)
    name3[t] = ", ".join(words_t[:3]) if words_t else "misc"

per_topic = pd.DataFrame({
    "topic": topics,
    "topic_name": [name3[t] for t in topics],
    "npmi": npmi_vals,
    "top_words": [", ".join(topic_top_words.get(t, [])) for t in topics],
    "size": [int((labels == t).sum()) for t in topics],
}).sort_values("topic")

summary = pd.DataFrame({
    "metric": ["NPMI_mean", "Diversity"],
    "value": [npmi_mean, float(div_val)],
})

# Record the settings used, so the exported numbers can be reproduced.
config_rows = [
    ("TOPN", TOPN),
    ("Vectorizer.min_df", 8),
    ("Vectorizer.max_df", 0.9),
    ("Vectorizer.max_features", 20000),
    ("Vectorizer.ngram_range", "(1,1)"),
    ("Timestamp", datetime.now().isoformat(timespec="seconds")),
]
config = pd.DataFrame({
    "param": [name for name, _ in config_rows],
    "value": [value for _, value in config_rows],
})

# One workbook, one sheet per table.
with pd.ExcelWriter("topic_quality.xlsx") as w:
    for sheet, table in (("per_topic", per_topic), ("summary", summary), ("config", config)):
        table.to_excel(w, sheet_name=sheet, index=False)

print("Saved -> topic_quality.xlsx")
print("NPMI mean:", npmi_mean, "| Diversity:", float(div_val))


In [None]:
#Topic share over time
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

OUT_DIR = "outputs_topic_over_time_yearly"
os.makedirs(OUT_DIR, exist_ok=True)

# 1) Load file (support two possible names; the first one that exists wins)
candidates = ["articles_topics_bestk1.xlsx", "articles_topics_bestk.xlsx"]
path = next((p for p in candidates if os.path.exists(p)), None)
if path is None:
    # Name every file that was tried so the message matches what was checked.
    raise FileNotFoundError(f"Could not find any of: {', '.join(candidates)}.")

df = pd.read_excel(path)

# 2) Smart column detection
def find_col(cols, patterns):
    """Return the first column whose lower-cased name matches a pattern.

    Patterns are tried in the given order and, for each pattern, columns are
    scanned in their own order, so an earlier pattern takes precedence over
    column position. Returns None when nothing matches.
    """
    hits = (
        col
        for pattern in patterns
        for col in cols
        if re.search(pattern, str(col).lower())
    )
    return next(hits, None)

date_col   = find_col(df.columns, [r"year"])
topic_col  = find_col(df.columns, [r"topic_label"])
prob_col   = find_col(df.columns, [r"prob", r"score", r"weight"])
bucket_col = find_col(df.columns, [r"topic[_ ]?bucket", r"bucket"])  # may not exist

if date_col is None or topic_col is None:
    raise ValueError(f"Could not automatically find date or topic column. Available columns: {list(df.columns)}")

# 3) Parse date → year
raw_dates = df[date_col]
if pd.api.types.is_numeric_dtype(raw_dates):
    # A numeric column (e.g. 2020) must not go through pd.to_datetime directly:
    # numbers are interpreted as nanoseconds since the epoch, which would put
    # every row in 1970 without any error. Use the year-only path instead.
    dt = None
else:
    dt = pd.to_datetime(raw_dates, errors="coerce")
if dt is None or dt.isna().all():
    # Year-only values: take the first 4-digit run and anchor it on Jan 1
    year = raw_dates.astype(str).str.extract(r"(\d{4})")[0]
    dt = pd.to_datetime(year + "-01-01", errors="coerce")

# Rows whose date could not be parsed are dropped.
df = df.loc[~dt.isna()].copy()
df["_year"] = dt.dt.to_period("Y").dt.to_timestamp()  # year timestamp (Jan 1 each year)

# 4) Yearly topic shares
# Each row contributes its probability/score/weight column when one was
# detected (unparseable or missing values count as 1.0), otherwise 1.0.
df[topic_col] = df[topic_col].astype(str)
df["_w"] = pd.to_numeric(df[prob_col], errors="coerce") if prob_col else 1.0
df["_w"] = df["_w"].fillna(1.0)

counts_topic_year = (
    df.pivot_table(index="_year", columns=topic_col, values="_w", aggfunc="sum")
    .fillna(0.0)
    .sort_index()
)
# Normalise each year to sum to 1; a year with zero total weight gets shares of 0.
shares_topic_year = counts_topic_year.div(
    counts_topic_year.sum(axis=1).replace(0, np.nan), axis=0
).fillna(0.0)

# Choose the top N topics by total weight and plot
TOP_N = 8
top_topics = counts_topic_year.sum(axis=0).sort_values(ascending=False).head(TOP_N).index
plot_mat = shares_topic_year[top_topics]

# Save CSVs
counts_topic_year.to_csv(os.path.join(OUT_DIR, "topic_year_counts.csv"))
shares_topic_year.to_csv(os.path.join(OUT_DIR, "topic_year_shares.csv"))

# 5) Stacked area chart of yearly shares (Top-N topics)
plt.figure(figsize=(10, 6))
x = plot_mat.index
y = plot_mat.values.T  # shape: topics x years
plt.stackplot(x, y, labels=top_topics)
plt.title(f"Topic share over time (yearly, top {TOP_N} topics)")
plt.xlabel("Year")
plt.ylabel("Share")
plt.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0))
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "topic_over_time_stacked_area_yearly.png"), dpi=200)
plt.show()

# 6) If a bucket column exists: line chart of yearly bucket shares
if bucket_col:
    df[bucket_col] = df[bucket_col].fillna("Other").astype(str)
    counts_bucket_year = (
        df.pivot_table(index="_year", columns=bucket_col, values="_w", aggfunc="sum")
        .fillna(0.0)
        .sort_index()
    )
    shares_bucket_year = counts_bucket_year.div(
        counts_bucket_year.sum(axis=1).replace(0, np.nan), axis=0
    ).fillna(0.0)

    shares_bucket_year.to_csv(os.path.join(OUT_DIR, "bucket_year_shares.csv"))

    plt.figure(figsize=(10, 6))
    for c in shares_bucket_year.columns:
        plt.plot(shares_bucket_year.index, shares_bucket_year[c], marker="o", label=c)
    plt.title("Bucket share over time (yearly)")
    plt.xlabel("Year")
    plt.ylabel("Share")
    plt.legend(loc="upper left", bbox_to_anchor=(1.02, 1.0))
    plt.tight_layout()
    plt.savefig(os.path.join(OUT_DIR, "bucket_over_time_lines_yearly.png"), dpi=200)
    plt.show()


In [None]:
#UMAP Theme Map (Color Point Cloud)
import numpy as np, matplotlib.pyplot as plt

# Inputs saved earlier: the UMAP coordinates and one cluster label per point.
X = np.load("X_umap_5d.npy")          # presumably (n, 5), judging by the file name
labels = np.load("labels.npy")        # presumably (n,)
xy = X[:, :2]                         # only the first two dimensions are displayed
topics = np.unique(labels)

# Point cloud, coloured by cluster label
plt.figure(figsize=(6,5), dpi=140)
x_coord, y_coord = xy[:,0], xy[:,1]
scatter = plt.scatter(x_coord, y_coord, c=labels, s=6, alpha=0.6)

# Cluster centroids (mean 2-D position of each cluster's points)
per_topic_means = [xy[labels==t].mean(axis=0) for t in topics]
centroids = np.stack(per_topic_means)
plt.scatter(centroids[:,0], centroids[:,1], s=60, marker="X", edgecolors="k")

plt.title("Topic Map (UMAP 2D) with KMeans labels")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.tight_layout()
plt.show()


In [None]:
#UMAP topic Map
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from collections import Counter

# === Data ===
X = np.load("X_umap_5d.npy")[:, :2]
labels = np.load("labels.npy")
topics = np.unique(labels)

# Short display name per topic, read from the quality workbook; if that fails
# for any reason, fall back to "Topic <id>".
try:
    per_topic = pd.read_excel("topic_quality.xlsx", sheet_name="per_topic")
    topic_ids_col = per_topic["topic"].astype(int)
    topic_names_col = per_topic["topic_name"].astype(str)
    name_map = dict(zip(topic_ids_col, topic_names_col))
except Exception:
    name_map = {int(t): f"Topic {int(t)}" for t in topics}

# === Centroids & margin (core vs boundary) ===
# margin = distance to the second-nearest centroid minus distance to the
# nearest one; the 10% of points with the smallest margin count as boundary.
centroids = np.stack([X[labels == t].mean(axis=0) for t in topics])   # (k, 2)
offsets = X[:, None, :] - centroids[None, :, :]
D = np.linalg.norm(offsets, axis=2)
Ds = np.sort(D, axis=1)
margin = Ds[:, 1] - Ds[:, 0]
q10 = np.quantile(margin, 0.10)
core = margin > q10

# === Color only the top-N largest clusters, others in gray ===
N = 10
sizes = Counter(labels)
top_topics = [t for t, _count in sizes.most_common(N)]
other = ~np.isin(labels, top_topics)

# Colors: the first N entries of tab20, assigned in order of cluster size
base = plt.get_cmap("tab20").colors
cmapN = ListedColormap(base[:N])
color_of = {}
for rank, t in enumerate(top_topics):
    color_of[t] = cmapN(rank)

plt.figure(figsize=(7.2, 5.2), dpi=160)

# 1) Boundary samples: light gray, bottom layer
boundary = ~core
plt.scatter(X[boundary, 0], X[boundary, 1], s=6, c="#c7c7c7", alpha=0.15,
            rasterized=True, zorder=1, label="boundary (10%)")

# 2) Core samples: colored for top-N clusters
for t in top_topics:
    mask = (labels == t) & core
    plt.scatter(
        X[mask, 0], X[mask, 1],
        s=8, color=color_of[t], alpha=0.75, rasterized=True, zorder=3,
    )

# 3) Core samples of other clusters: medium gray
mask_other_core = other & core
plt.scatter(
    X[mask_other_core, 0], X[mask_other_core, 1],
    s=6, c="#aaaaaa", alpha=0.35, rasterized=True, zorder=2,
)

# 4) Centroids & labels (for top-N clusters)
def place_label(cx, cy, r=0.18, i=0, n=1):
    """Return a point at distance `r` from (cx, cy), at angle 2*pi*i/n.

    Giving each of `n` labels a different index `i` spreads their offset
    directions evenly around the circle.
    """
    angle = (2 * np.pi * i) / n
    dx = r * np.cos(angle)
    dy = r * np.sin(angle)
    return cx + dx, cy + dy

# Centroid lookup by integer topic id (centroids are stored in `topics` order)
cent_map = {int(t): centroids[row] for row, t in enumerate(topics)}
n_labelled = len(top_topics)
label_box = dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.8)
for idx, t in enumerate(top_topics):
    topic_id = int(t)
    cx, cy = cent_map[topic_id]
    # Centroid marker, filled with the cluster's colour
    plt.scatter(cx, cy, s=90, marker="X", edgecolors="k", linewidths=0.9,
                facecolors=color_of[t], zorder=5)
    # Label: offset from the centroid in a direction that depends on the
    # cluster's rank, so labels are spread around rather than stacked
    tx, ty = place_label(cx, cy, r=0.22, i=idx, n=n_labelled)
    label_txt = name_map.get(topic_id, f"Topic {topic_id}")
    plt.text(tx, ty, label_txt, fontsize=8, bbox=label_box, zorder=6)

plt.title("UMAP Topic Map (k=35): core vs boundary, centroids & labels", fontsize=12)
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
# Keep a short legend entry
plt.legend(loc="lower right", frameon=True, fontsize=8)
# Widen both axes by 2% on each side to avoid cutting labels
x0, x1 = plt.xlim()
y0, y1 = plt.ylim()
pad_x = (x1 - x0) * 0.02
pad_y = (y1 - y0) * 0.02
plt.xlim(x0 - pad_x, x1 + pad_x)
plt.ylim(y0 - pad_y, y1 + pad_y)

plt.tight_layout()
for figure_file, save_kwargs in (
    ("fig_topic_map_core_centroids.png", {"dpi": 300}),
    ("fig_topic_map_core_centroids.pdf", {}),
):
    plt.savefig(figure_file, **save_kwargs)
plt.show()


In [None]:
#margin distribution
import numpy as np, matplotlib.pyplot as plt

X = np.load("X_umap_5d.npy")[:, :2]
labels = np.load("labels.npy")
topics = np.unique(labels)

# Cluster centres: mean 2-D position of each cluster's points
centers = np.stack([X[labels==t].mean(axis=0) for t in topics])
# Distance from every point to every centre
dists = np.linalg.norm(X[:,None,:] - centers[None,:,:], axis=2)     # (n,k)
# Margin: second-nearest minus nearest centre distance
sorted_dists = np.sort(dists, axis=1)
nearest, second_nearest = sorted_dists[:,0], sorted_dists[:,1]
margin = second_nearest - nearest
q10 = np.quantile(margin, 0.10)

# Histogram of margins with the 10% quantile marked as a dashed line
plt.figure(figsize=(6,4), dpi=140)
plt.hist(margin, bins=40, alpha=0.8)
plt.axvline(q10, linestyle="--")
plt.title("Margin Distribution (10% cutoff)")
plt.xlabel("second-nearest minus nearest centroid distance")
plt.ylabel("count")
plt.tight_layout()
plt.show()


In [None]:
# Configuration comparison: NPMI vs Diversity (Pareto scatter)

# Content: read four configurations (unigram / bigram / bigram+stop / bigram+stop+core) from topic_quality_compare.xlsx, plot NPMI_mean on the horizontal axis and Diversity on the vertical axis, and label each point with its configuration name.

# Purpose: a single figure showing whether refining the evaluation setup improves coherence (NPMI) and distinctiveness (Diversity) at the same time.
import pandas as pd, matplotlib.pyplot as plt

cmp = pd.read_excel("topic_quality_compare.xlsx", sheet_name="summary")
npmi_axis, diversity_axis = cmp["NPMI_mean"], cmp["Diversity"]

plt.figure(figsize=(6,5), dpi=140)
plt.scatter(npmi_axis, diversity_axis, s=60)

# Annotate every point with the name of its configuration
for x_val, y_val, config_name in zip(npmi_axis, diversity_axis, cmp["config"]):
    plt.text(x_val, y_val, config_name)

plt.title("NPMI vs Diversity across evaluation configs")
plt.xlabel("NPMI_mean")
plt.ylabel("Diversity")
plt.tight_layout()
plt.show()


In [None]:
#BERTopic: Topic Intensity by Year (Top 15 Topics)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1) Load data
file_path = "articles_topics_bestk1.xlsx"  # change to your path
data = pd.read_excel(file_path, sheet_name="Sheet1")

# 2) Parse date and extract year
data["date"] = pd.to_datetime(data["date"], errors="coerce")
# Nullable integer dtype: if any date fails to parse, a plain `.dt.year`
# becomes float (NaN for the bad rows), and the years would then show up as
# "2020.0" in the tick labels and in the saved CSV. Rows without a year are
# still left out of the year × topic table by groupby.
data["year"] = data["date"].dt.year.astype("Int64")

# 3) Select the Top-N topics by total count
top_n = 15
topic_counts = data["topic_label"].value_counts().head(top_n).index.tolist()

# 4) Build the Year × Topic count matrix
year_topic_counts = (
    data[data["topic_label"].isin(topic_counts)]
    .groupby(["year", "topic_label"])
    .size()
    .unstack(fill_value=0)
    .sort_index()
)

# Ensure columns are in the same order as 'topic_counts'
year_topic_counts = year_topic_counts.reindex(columns=topic_counts)

# Optional: save the matrix
year_topic_counts.to_csv("topic_year_matrix.csv", index=True)

# 5) Plot a heatmap with matplotlib (no seaborn, no explicit colors)
fig, ax = plt.subplots(figsize=(12, 7))
im = ax.imshow(year_topic_counts.values, aspect="auto")

# Axes and ticks: one tick per topic (columns) and per year (rows)
ax.set_xticks(np.arange(len(year_topic_counts.columns)))
ax.set_yticks(np.arange(len(year_topic_counts.index)))
ax.set_xticklabels(year_topic_counts.columns, rotation=45, ha="right")
ax.set_yticklabels(year_topic_counts.index)

# The title follows `top_n` so it cannot drift from the number actually plotted
ax.set_title(f"BERTopic: Topic Intensity by Year (Top {top_n} Topics)")
ax.set_xlabel("Topic")
ax.set_ylabel("Year")

# Colorbar
cbar = fig.colorbar(im, ax=ax)
cbar.set_label("Number of Articles")

plt.tight_layout()
plt.savefig("topic_year_heatmap.png", dpi=200, bbox_inches="tight")
plt.show()


Classification

In [None]:
import pandas as pd

# 1) Load data
file_path = "articles_topics_bestk1.xlsx"
df = pd.read_excel(file_path, sheet_name="Sheet1")

# 2) Get unique topics: one row per distinct (topic id, topic label) pair
topics = df[["main_topic", "topic_label"]].drop_duplicates()
topics = topics.rename(
    columns={"main_topic": "topic_id", "topic_label": "topic_label_full"}
)

# 3) Define the category mapping: topic id -> (big category, mid category).
#    The ids are listed per category and flattened into a lookup table below.
_TOPIC_IDS_BY_CATEGORY = {
    ("Consumer / Tech", "Tech Companies & Consumer Electronics"): [2, 6, 9, 29, 33, 34],
    ("Consumer / Tech", "AI Research & Companies"): [3, 8, 15],
    ("Culture / Media", "AI in Culture / Deepfake / Copyright"): [4, 11],
    ("Culture / Media", "Media & Culture"): [7, 20, 32],
    ("Politics / Society", "Politics & Governance"): [0, 10, 12, 16, 17, 18, 23],
    ("Politics / Society", "War & Security"): [25, 26],
    ("Consumer / Tech", "Economy & Finance"): [24, 28, 30, 31],
    ("Consumer / Tech", "Science & Future Exploration"): [1, 21, 22, 27],
}
category_map = {
    topic_id: category
    for category, topic_ids in _TOPIC_IDS_BY_CATEGORY.items()
    for topic_id in topic_ids
}

# 4) Map the categories; topic ids missing from the table fall back to
#    ("Consumer / Tech", "Other").
_DEFAULT_CATEGORY = ("Consumer / Tech", "Other")
_assigned = [category_map.get(tid, _DEFAULT_CATEGORY) for tid in topics["topic_id"]]
topics["big_category"] = [big for big, _mid in _assigned]
topics["mid_category"] = [mid for _big, mid in _assigned]

# 5) Prepare the output table, ordered category-first
_hierarchy_columns = ["big_category", "mid_category", "topic_id", "topic_label_full"]
hierarchy = topics[_hierarchy_columns].sort_values(
    by=["big_category", "mid_category", "topic_id"]
)

# 6) Save as CSV
csv_out = "topic_hierarchy.csv"
hierarchy.to_csv(csv_out, index=False)

# 7) Show a quick preview and the output path
hierarchy.head(20), csv_out
