In [2]:
import os
import sys
import time
import string
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.manifold import TSNE
from sklearn.metrics import (
    silhouette_score,
    adjusted_rand_score,
    normalized_mutual_info_score,
    homogeneity_score,
    completeness_score,
    v_measure_score,
    confusion_matrix
)
from sklearn.preprocessing import LabelEncoder

# Plotting is optional: when matplotlib or seaborn cannot be imported,
# PLOTTING_ENABLED is False and each plot_* function in this file returns
# immediately without drawing anything.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set_style("whitegrid")
    PLOTTING_ENABLED = True
except ImportError:
    PLOTTING_ENABLED = False

# NOTE: this silences every warning category for the whole process, including
# warnings raised by the libraries imported above.
warnings.filterwarnings("ignore")

# ---------------- CONFIG ----------------
# Input: folder scanned recursively for *.txt articles; the name of the
# immediate parent folder of each file is used as its category label.
ROOT_FOLDER = "bbc"

# Output artefacts, written relative to the current working directory.
OUTPUT_CSV = "clustered_articles.csv"
OUTPUT_METRICS = "evaluation_metrics.txt"
OUTPUT_CONFUSION_MATRIX_PLOT = "confusion_matrix.png"
OUTPUT_CLUSTER_DISTRIBUTION_PLOT = "cluster_distribution.png"
OUTPUT_SILHOUETTE_PLOT = "silhouette_scores.png"
OUTPUT_ELBOW_PLOT = "elbow_method_plot.png"  # written by plot_elbow_method
OUTPUT_TSNE_PLOT = "tsne_visualization.png"

MAX_TFIDF_FEATURES = 5000        # passed as max_features to TfidfVectorizer
K_SEARCH_RANGE = range(2, 11)    # k values scored by find_best_k_silhouette
K_ELBOW_RANGE = range(1, 16)     # k values fitted by plot_elbow_method
SVD_COMPONENTS = 50              # upper bound on TruncatedSVD components before t-SNE
TSNE_PERPLEXITY = 30             # upper bound; lowered to n_samples - 1 on small inputs
TSNE_RANDOM_STATE = 42
MAX_TSNE_SAMPLES = 3000          # t-SNE plot is skipped when there are more rows than this
# ----------------------------------------

# --- DUMMY DATA SETUP ---
def create_dummy_data():
    """Create a tiny demonstration corpus under ``ROOT_FOLDER``.

    Does nothing when ``ROOT_FOLDER`` already exists. Otherwise one sub-folder
    per category is created, holding three numbered ``.txt`` files; each file
    contains its headline repeated 20 times, separated by spaces.
    """
    if os.path.exists(ROOT_FOLDER):
        return

    print("[SETUP] Creating dummy data for demonstration...")
    sample_headlines = {
        "business": ["dollar rises", "stock market rally", "corporate profits soar"],
        "sport": ["england wins rugby", "chelsea league title", "federer wins tennis"],
        "tech": ["new smartphone", "broadband speeds increase", "social media grows"],
        "entertainment": ["blockbuster film", "new music album", "award ceremony honors actors"],
        "politics": ["election campaign", "government new policy", "parliament debates bill"]
    }

    os.makedirs(ROOT_FOLDER, exist_ok=True)
    for topic, headlines in sample_headlines.items():
        topic_dir = os.path.join(ROOT_FOLDER, topic)
        os.makedirs(topic_dir, exist_ok=True)
        # Files are numbered from 001 within each category folder.
        for number, headline in enumerate(headlines, start=1):
            target = os.path.join(topic_dir, f"{number:03d}.txt")
            with open(target, "w") as handle:
                handle.write(f"{headline} " * 20)

# --- UTILITY FUNCTIONS ---
def log(message):
    """Print ``message`` to stdout prefixed with ``[INFO]``."""
    print("[INFO] {}".format(message))
def log_timer(message, start_time):
    """Print how many seconds have elapsed since ``start_time``.

    ``start_time`` is a ``time.time()`` timestamp; the elapsed time is printed
    with two decimals after a ``[TIMER]`` prefix.
    """
    elapsed = time.time() - start_time
    print("[TIMER] {} finished in {:.2f} seconds.".format(message, elapsed))

# scikit-learn's English stop-word list, materialised once as a set; it is the
# default ``stop_words`` argument of preprocess().
STOP_WORDS = set(ENGLISH_STOP_WORDS)

def load_articles_from_subfolders(root_folder):
    """Load every non-empty ``*.txt`` file below ``root_folder`` into a DataFrame.

    Args:
        root_folder: Directory to scan recursively (``str`` or path-like).

    Returns:
        A ``pandas.DataFrame`` with the columns ``filename``, ``category`` and
        ``text``, one row per article, ordered by file path. ``category`` is
        the name of the file's immediate parent folder, or missing for files
        that sit directly in ``root_folder``. The three columns are present
        even when no article was found, so callers can index them safely.

    Exits the process with status 1 when ``root_folder`` is not a directory.
    """
    columns = ["filename", "category", "text"]

    root = Path(root_folder)
    if not root.is_dir():
        log(f"ERROR: ROOT_FOLDER '{root_folder}' not found.")
        sys.exit(1)

    records = []
    for fpath in sorted(root.glob("**/*.txt")):
        # A directory whose name ends in ".txt" also matches the glob.
        if not fpath.is_file():
            continue
        # Undecodable bytes are dropped rather than aborting the whole load.
        txt = fpath.read_text(encoding="utf-8", errors="ignore").strip()
        if not txt:
            continue
        category = fpath.parent.name if fpath.parent != root else None
        records.append({"filename": fpath.name, "category": category, "text": txt})

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

def preprocess(text, stop_words=STOP_WORDS):
    """Normalise ``text`` for TF-IDF vectorisation.

    The text is lower-cased, every ASCII punctuation character is replaced by
    a space, and the result is split on whitespace. Tokens that are a single
    character long or that appear in ``stop_words`` are dropped; the remaining
    tokens are returned joined by single spaces.
    """
    punct_to_space = str.maketrans({symbol: " " for symbol in string.punctuation})
    lowered = text.lower().translate(punct_to_space)
    kept = (
        word
        for word in lowered.split()
        if len(word) > 1 and word not in stop_words
    )
    return " ".join(kept)

# --- CLUSTERING ANALYSIS FUNCTIONS ---
def find_best_k_silhouette(X, k_range):
    """Score KMeans clusterings of ``X`` with the silhouette coefficient.

    Args:
        X: Feature matrix with one row per sample (anything accepted by
            ``KMeans.fit_predict`` that exposes ``shape``).
        k_range: Iterable of candidate cluster counts.

    Returns:
        ``(best_k, scores)`` where ``scores`` maps each evaluated ``k`` to its
        silhouette score and ``best_k`` is the first ``k`` with the highest
        score. ``best_k`` is ``None`` and ``scores`` is empty when no
        candidate could be evaluated.
    """
    best_k, best_score, scores = None, -1.0, {}
    log(f"Searching for best k (Silhouette) in range: {list(k_range)}")

    n_samples = X.shape[0]
    for k in k_range:
        # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range it raises ValueError, so skip instead of crashing.
        if not 2 <= k <= n_samples - 1:
            print(f"  - Skipping k={k}: need 2 <= k <= n_samples - 1 (n_samples={n_samples})")
            continue

        labels = KMeans(n_clusters=k, random_state=42, n_init='auto').fit_predict(X)

        # With duplicate points KMeans can return fewer distinct labels than
        # requested, which silhouette_score rejects as well.
        n_labels = len(np.unique(labels))
        if not 2 <= n_labels <= n_samples - 1:
            print(f"  - Skipping k={k}: clustering produced {n_labels} distinct label(s)")
            continue

        score = silhouette_score(X, labels)
        scores[k] = score
        print(f"  - Silhouette score for k={k}: {score:.4f}")
        if score > best_score:
            best_score, best_k = score, k

    return best_k, scores

def top_keywords_per_cluster(vectorizer, kmeans, n_terms=12):
    """Return the highest-weighted vocabulary terms of every cluster centre.

    ``vectorizer`` must provide ``get_feature_names_out()`` and ``kmeans`` a
    2-D ``cluster_centers_`` array whose columns line up with those feature
    names. The result maps each cluster index to a list of at most
    ``n_terms`` terms, ordered from largest to smallest centre weight.
    """
    vocabulary = vectorizer.get_feature_names_out()
    centers = kmeans.cluster_centers_
    # Ascending argsort per row, reversed so the heaviest features come first.
    ranked = centers.argsort()[:, ::-1]

    keywords = {}
    for cluster_id in range(len(centers)):
        top_columns = ranked[cluster_id, :n_terms]
        keywords[cluster_id] = [vocabulary[col] for col in top_columns]
    return keywords

# --- PLOTTING FUNCTIONS ---

def plot_elbow_method(X, k_range):
    """Plot KMeans inertia (WCSS) against ``k`` and save it to ``OUTPUT_ELBOW_PLOT``.

    Args:
        X: Feature matrix with one row per sample.
        k_range: Iterable of candidate cluster counts.

    Candidate values outside ``1 <= k <= n_samples`` are skipped, because
    KMeans cannot fit more clusters than there are samples. Nothing is drawn
    when plotting is disabled or when no candidate is usable.
    """
    if not PLOTTING_ENABLED:
        return
    log(f"Calculating Inertia for Elbow Method (k in {list(k_range)})...")

    n_samples = X.shape[0]
    tested_k, inertia_values = [], []
    for k in k_range:
        if not 1 <= k <= n_samples:
            print(f"  - Skipping k={k}: need 1 <= k <= n_samples (n_samples={n_samples})")
            continue
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        tested_k.append(k)
        inertia_values.append(kmeans.inertia_)
        print(f"  - Inertia for k={k}: {kmeans.inertia_:.2f}")

    if not tested_k:
        log("No valid k for the Elbow Method; skipping plot.")
        return

    plt.figure(figsize=(10, 6))
    try:
        plt.plot(tested_k, inertia_values, marker='o', linestyle='--')
        plt.title("Elbow Method for Optimal k")
        plt.xlabel("Number of Clusters (k)")
        plt.ylabel("Within-Cluster Sum of Squares (WCSS / Inertia)")
        plt.xticks(tested_k)
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(OUTPUT_ELBOW_PLOT)
        log(f"Saved Elbow Method plot to: {OUTPUT_ELBOW_PLOT}")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close()


def plot_silhouette_scores(scores, best_k):
    """Plot silhouette score against ``k`` and save it to ``OUTPUT_SILHOUETTE_PLOT``.

    ``scores`` maps each evaluated ``k`` to its silhouette score. When
    ``best_k`` is truthy it is marked with a red vertical line. Nothing is
    drawn when plotting is disabled or ``scores`` is empty.
    """
    if not (PLOTTING_ENABLED and scores):
        return

    k_values = list(scores.keys())
    score_values = list(scores.values())

    plt.figure(figsize=(10, 6))
    plt.plot(k_values, score_values, marker='o', linestyle='--')
    plt.title("Silhouette Score vs. Number of Clusters (k)")
    plt.xlabel("k")
    plt.ylabel("Silhouette Score")
    if best_k:
        plt.axvline(x=best_k, color='r', linestyle='--', label=f'Best k = {best_k}')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUTPUT_SILHOUETTE_PLOT)
    log(f"Saved silhouette score plot to: {OUTPUT_SILHOUETTE_PLOT}")
    plt.close()

def plot_tsne_visualization(X, df):
    """Save a 2-D t-SNE scatter plot of ``X`` coloured by ``df['cluster']``.

    ``X`` is first reduced with TruncatedSVD (at most ``SVD_COMPONENTS``
    components) and then embedded with t-SNE. The function returns without
    plotting when plotting is disabled, when ``X`` has fewer than 3 or more
    than ``MAX_TSNE_SAMPLES`` rows, when fewer than 2 SVD components are
    possible, or when the usable perplexity drops below 5.
    """
    if not PLOTTING_ENABLED:
        return
    n_rows = X.shape[0]
    if n_rows < 3 or n_rows > MAX_TSNE_SAMPLES:
        return

    log("Running t-SNE for visualization...")
    start_time_viz = time.time()

    # The SVD rank is capped by the config value and by both matrix dimensions.
    n_svd = min(SVD_COMPONENTS, n_rows - 1, X.shape[1] - 1)
    if n_svd < 2:
        return
    reducer = TruncatedSVD(n_components=n_svd, random_state=42)
    X_red = reducer.fit_transform(X)

    # Perplexity is kept below the number of samples.
    perplexity = min(TSNE_PERPLEXITY, X_red.shape[0] - 1)
    if perplexity < 5:
        return
    embedder = TSNE(
        n_components=2,
        perplexity=perplexity,
        random_state=TSNE_RANDOM_STATE,
        init="random",
    )
    embedding = embedder.fit_transform(X_red)

    df_plot = df.copy()
    df_plot['tsne_1'] = embedding[:, 0]
    df_plot['tsne_2'] = embedding[:, 1]

    plt.figure(figsize=(12, 10))
    sns.scatterplot(
        data=df_plot,
        x='tsne_1',
        y='tsne_2',
        hue='cluster',
        palette='tab10',
        s=80,
        legend='full',
    )
    plt.title('t-SNE Visualization of Clusters')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.tight_layout()
    plt.savefig(OUTPUT_TSNE_PLOT)
    log(f"Saved t-SNE visualization to: {OUTPUT_TSNE_PLOT}")
    plt.close()
    log_timer("Visualization generation", start_time_viz)

def plot_cluster_analysis(df):
    """Plot how true categories are distributed over the predicted clusters.

    Args:
        df: DataFrame with a ``category`` column (true labels, may contain
            missing values) and a ``cluster`` column (predicted labels).

    Two figures are written: a category-by-cluster count heatmap
    (``OUTPUT_CONFUSION_MATRIX_PLOT``) and a stacked bar chart of the category
    proportions inside each cluster (``OUTPUT_CLUSTER_DISTRIBUTION_PLOT``).
    Nothing is drawn when plotting is disabled or no row has a category.
    """
    if not PLOTTING_ENABLED or df["category"].isnull().all():
        return
    log("Generating confusion matrix and cluster distribution plots...")

    # Categories and clusters live in different label spaces, so
    # sklearn's confusion_matrix(labels=<categories>) would discard every
    # sample (no prediction is ever a category name) and yield all zeros.
    # A contingency table counts each (category, cluster) pair directly;
    # rows with a missing category are left out by crosstab.
    contingency = pd.crosstab(df["category"], df["cluster"])

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        contingency.to_numpy(),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=list(contingency.columns),
        yticklabels=list(contingency.index),
    )
    plt.title('Confusion Matrix: True Category vs. Predicted Cluster')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('True Category')
    plt.tight_layout()
    plt.savefig(OUTPUT_CONFUSION_MATRIX_PLOT)
    plt.close()
    log(f"Saved confusion matrix plot to: {OUTPUT_CONFUSION_MATRIX_PLOT}")

    # Same counts with clusters as rows, normalised so every bar sums to 1.
    crosstab = pd.crosstab(df['cluster'], df['category'])
    proportions = crosstab.div(crosstab.sum(axis=1), axis=0)
    proportions.plot(kind='bar', stacked=True, figsize=(12, 8), colormap='tab20')
    plt.title('Distribution of True Categories within Each Cluster')
    plt.xlabel('Predicted Cluster')
    plt.ylabel('Proportion')
    plt.legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.savefig(OUTPUT_CLUSTER_DISTRIBUTION_PLOT)
    plt.close()
    log(f"Saved cluster distribution plot to: {OUTPUT_CLUSTER_DISTRIBUTION_PLOT}")

# --- MAIN PIPELINE ---
# Pipeline: load articles -> clean text -> TF-IDF -> choose k -> KMeans ->
# evaluate against folder labels -> plots -> CSV export.
if __name__ == "__main__":
    total_start_time = time.time()
    create_dummy_data()

    df = load_articles_from_subfolders(ROOT_FOLDER)
    log(f"Loaded {len(df)} articles.")
    if df.empty:
        sys.exit(1)

    df["clean_text"] = df["text"].apply(preprocess)
    df = df[df["clean_text"].str.strip().astype(bool)].reset_index(drop=True)
    log(f"{len(df)} articles remain after preprocessing.")
    if df.empty:
        # TfidfVectorizer cannot build a vocabulary from zero documents.
        log("ERROR: No articles left after preprocessing.")
        sys.exit(1)

    vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
    X = vectorizer.fit_transform(df["clean_text"])
    log(f"TF-IDF matrix shape: {X.shape}")

    # --- Find Optimal k ---
    plot_elbow_method(X, K_ELBOW_RANGE)
    best_k_sil, sil_scores = find_best_k_silhouette(X, K_SEARCH_RANGE)
    plot_silhouette_scores(sil_scores, best_k_sil)

    # --- Final Clustering ---
    # Prefer the number of folder labels; otherwise fall back to the
    # silhouette winner, and to 5 when no silhouette score is available.
    num_categories = df['category'].nunique()
    final_k = num_categories if num_categories > 1 else (best_k_sil if best_k_sil else 5)
    # KMeans cannot fit more clusters than there are samples.
    final_k = min(final_k, len(df))
    log(f"Selected k={final_k} for final clustering based on ground truth / silhouette score.")

    kmeans = KMeans(n_clusters=final_k, random_state=42, n_init='auto')
    df["cluster"] = kmeans.fit_predict(X)
    df['cluster'] = 'Cluster ' + df['cluster'].astype(str)

    print("\n--- Top Keywords per Cluster ---")
    for c, terms in top_keywords_per_cluster(vectorizer, kmeans).items():
        print(f"Cluster {c}: {', '.join(terms)}")

    # --- Evaluation ---
    # Only rows that actually carry a folder label can be scored; files that
    # sit directly in ROOT_FOLDER have no category.
    labeled = df["category"].notnull()
    if labeled.any():
        y_true = df.loc[labeled, "category"]
        y_pred = df.loc[labeled, "cluster"]
        metrics = {
            "Adjusted Rand Index (ARI)": adjusted_rand_score(y_true, y_pred),
            "Normalized Mutual Info (NMI)": normalized_mutual_info_score(y_true, y_pred),
            "Homogeneity": homogeneity_score(y_true, y_pred),
            "Completeness": completeness_score(y_true, y_pred),
            "V-measure": v_measure_score(y_true, y_pred),
        }
        print("\n--- Clustering Evaluation vs. Folder Labels ---")
        with open(OUTPUT_METRICS, "w", encoding="utf-8") as f:
            for name, score in metrics.items():
                line = f"{name}: {score:.4f}"
                print(line)
                f.write(line + "\n")
        log(f"Saved evaluation metrics to: {OUTPUT_METRICS}")

    # --- Visualization ---
    plot_tsne_visualization(X, df)
    plot_cluster_analysis(df)

    df.to_csv(OUTPUT_CSV, index=False)
    log(f"Saved results to: {OUTPUT_CSV}")

    log_timer("Total execution time", total_start_time)

[INFO] Loaded 2225 articles.
[INFO] 2225 articles remain after preprocessing.
[INFO] TF-IDF matrix shape: (2225, 5000)
[INFO] Calculating Inertia for Elbow Method (k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])...
  - Inertia for k=1: 2159.01
  - Inertia for k=2: 2140.89
  - Inertia for k=3: 2121.47
  - Inertia for k=4: 2106.65
  - Inertia for k=5: 2087.66
  - Inertia for k=6: 2078.24
  - Inertia for k=7: 2068.74
  - Inertia for k=8: 2058.69
  - Inertia for k=9: 2051.29
  - Inertia for k=10: 2042.15
  - Inertia for k=11: 2037.68
  - Inertia for k=12: 2031.63
  - Inertia for k=13: 2025.09
  - Inertia for k=14: 2019.72
  - Inertia for k=15: 2015.68
[INFO] Saved Elbow Method plot to: elbow_method_plot.png
[INFO] Searching for best k (Silhouette) in range: [2, 3, 4, 5, 6, 7, 8, 9, 10]
  - Silhouette score for k=2: 0.0081
  - Silhouette score for k=3: 0.0124
  - Silhouette score for k=4: 0.0134
  - Silhouette score for k=5: 0.0157
  - Silhouette score for k=6: 0.0157
  - Silhouet