**Note: this notebook was run on Google Colab, and certain commands reflect the Colab environment**

# text clustering

The three cells below (one per data group: combined, weibo only, fbq only) run the same code; only the input file differs.

In [None]:
!pip install pandas openpyxl jieba snownlp transformers sentence-transformers matplotlib umap-learn adjustText

## combined

In [None]:
import pandas as pd
import re
import jieba
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Load and prepare data
print("Loading and preprocessing data...")
file_path = "combined_text_data.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Drop rows without usable text: NaN, empty strings, stray "Description"
# header rows, and the "No text detected" placeholder. The previous filter
# only listed "Description", so placeholder rows were kept even though the
# comment said they were removed.
df = df[df["Description"].notna()].copy()  # .copy(): the next line assigns a column
df["Description"] = df["Description"].astype(str)  # Convert all to string
df = df[~df["Description"].str.strip().isin(["", "Description", "No text detected"])]

# Text preprocessing (segmentation and stopwords)
def enhanced_preprocess_text(text):
    """Clean and segment one text into a list of content tokens.

    Removes every character that is neither a word character nor whitespace,
    collapses whitespace runs to a single space, segments the result with
    jieba (cut_all=False), and keeps only tokens longer than one character
    that are not in the stopword set.

    Args:
        text: Raw text string.

    Returns:
        List of token strings (possibly empty).
    """
    stopwords = {"的", "了", "和", "是", "在", "我", "有", "你", "这", "那"}

    cleaned = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    kept_tokens = []
    for token in jieba.lcut(cleaned, cut_all=False):
        if len(token) > 1 and token not in stopwords:
            kept_tokens.append(token)
    return kept_tokens

preprocessed_texts = [enhanced_preprocess_text(text) for text in df["Description"]]

# Filter out texts that became empty after preprocessing, keeping df aligned
valid_indices = [i for i, words in enumerate(preprocessed_texts) if len(words) > 0]
preprocessed_texts = [preprocessed_texts[i] for i in valid_indices]
df = df.iloc[valid_indices].reset_index(drop=True)

texts_for_embedding = [" ".join(words) for words in preprocessed_texts]

# Embedding generation
print("Generating embeddings...")
model = SentenceTransformer("thenlper/gte-large-zh")
embeddings = model.encode(texts_for_embedding, convert_to_numpy=True)

# Dimensionality reduction: standardize -> PCA -> 2-D UMAP
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)
# PCA cannot keep more components than min(n_samples, n_features), so the cap
# of 50 must also respect the number of rows (small data groups would raise).
# random_state pins the randomized solver so reruns give the same projection.
pca = PCA(n_components=min(50, *embeddings_scaled.shape), random_state=42)
pca_embeddings = pca.fit_transform(embeddings_scaled)
umap = UMAP(n_components=2, metric='cosine', random_state=42)
embeddings_reduced = umap.fit_transform(pca_embeddings)

# Cluster evaluation for k = 2 to k = 15
print("\nEvaluating clusters from K=2 to K=15...")
def evaluate_clusters(embeddings, min_k=2, max_k=15):
    """Sweep K for KMeans, plot every clustering, and tabulate quality metrics.

    For each K in ``min_k..max_k`` (inclusive) a KMeans model is fitted on
    ``embeddings`` and its silhouette score, Davies-Bouldin score and inertia
    are recorded. Each clustering is drawn on a shared 2-D t-SNE projection;
    afterwards the metrics are printed as a table and plotted against K.

    Args:
        embeddings: Feature matrix passed directly to KMeans, the metric
            functions and TSNE.
        min_k: Smallest number of clusters to evaluate.
        max_k: Largest number of clusters to evaluate (inclusive).

    Returns:
        pandas.DataFrame with columns 'K', 'Silhouette Score',
        'Davies-Bouldin Score' and 'Inertia', one row per K.
    """
    k_values = range(min_k, max_k + 1)
    silhouette_scores = []
    davies_bouldin_scores = []
    inertia_scores = []

    # The t-SNE projection depends only on the data and the fixed seed, not on
    # K, so it is computed once instead of being refit for every K.
    tsne = TSNE(n_components=2, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    # Size the subplot grid from the number of K values; a fixed 4x4 grid
    # fails as soon as more than 16 values are requested.
    n_cols = 4
    n_rows = max(1, (len(k_values) + n_cols - 1) // n_cols)
    plt.figure(figsize=(18, 3 * n_rows))

    # Compute scores and create one visualization subplot per K
    for i, k in enumerate(k_values):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
        clusters = kmeans.fit_predict(embeddings)

        silhouette = silhouette_score(embeddings, clusters)
        davies_bouldin = davies_bouldin_score(embeddings, clusters)

        silhouette_scores.append(silhouette)
        davies_bouldin_scores.append(davies_bouldin)
        inertia_scores.append(kmeans.inertia_)

        plt.subplot(n_rows, n_cols, i + 1)
        plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1],
                    c=clusters, cmap="Spectral", alpha=0.7, s=25)
        plt.title(f"K={k}\nSil={silhouette:.2f}", pad=10)
        plt.xticks([])
        plt.yticks([])

    plt.tight_layout()
    plt.show()

    # Create metrics table
    metrics_df = pd.DataFrame({
        'K': k_values,
        'Silhouette Score': silhouette_scores,
        'Davies-Bouldin Score': davies_bouldin_scores,
        'Inertia': inertia_scores
    })

    print("\nCluster Evaluation Metrics:")
    print(metrics_df.to_string(index=False))

    # Plot metrics against K
    plt.figure(figsize=(15, 10))

    # Silhouette scores
    plt.subplot(2, 2, 1)
    plt.plot(k_values, silhouette_scores, 'bo-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Scores')
    plt.grid(True)

    # Davies-Bouldin scores
    plt.subplot(2, 2, 2)
    plt.plot(k_values, davies_bouldin_scores, 'go-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Davies-Bouldin Score')
    plt.title('Davies-Bouldin Scores')
    plt.grid(True)

    # Inertia scores
    plt.subplot(2, 2, 3)
    plt.plot(k_values, inertia_scores, 'ro-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.title('Inertia Scores')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return metrics_df

# Run the K sweep on the 2-D UMAP embeddings of the combined data; the
# returned table holds one row of metrics per K.
metrics_df = evaluate_clusters(embeddings_reduced)

## weibo only

### weibo text extraction

In [None]:
!pip install easyocr pandas openpyxl tqdm

In [None]:
import easyocr
import pandas as pd
import os
from tqdm import tqdm

def batch_ocr_process(folder_path, output_file='ocr_results.xlsx',
                      extensions=('.jpg', '.jpeg', '.png'),
                      error_log_file='ocr_errors.log'):
    """Run EasyOCR over every image in a folder and save the text to Excel.

    Args:
        folder_path: Directory containing the images.
        output_file: Path of the Excel file to write.
        extensions: File-name suffixes (case-insensitive) treated as images.
            Defaults to the same jpg/jpeg/png set the image-clustering cells
            read, so both steps see the same files.
        error_log_file: Path of the text file that receives one line per
            image that raised an error; only written when errors occurred.

    Returns:
        pandas.DataFrame with columns 'Image Name' and 'Detected Text'.
        Images where nothing was recognized get the text 'No text detected'.
    """
    # Simplified Chinese + English recognizer.
    reader = easyocr.Reader(
        lang_list=['ch_sim', 'en'],
        gpu=True,  # request GPU; NOTE(review): confirm easyocr's CPU fallback behavior
        quantize=True
    )

    results = []
    error_log = []

    suffixes = tuple(ext.lower() for ext in extensions)
    # Sorted so the output row order is stable across runs and machines.
    image_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(suffixes))

    for filename in tqdm(image_files, desc='Processing Images'):
        try:
            file_path = os.path.join(folder_path, filename)
            # detail=0 returns plain strings; paragraph=True groups them into paragraphs.
            paragraphs = reader.readtext(
                file_path,
                detail=0,
                paragraph=True,
                batch_size=4  # batch size forwarded to easyocr for this single image
            )

            combined_text = '\n'.join(paragraphs) if paragraphs else 'No text detected'
            results.append({'Image Name': filename, 'Detected Text': combined_text})

        except Exception as e:
            # Best effort: record the failure and continue with the next image.
            error_log.append(f"{filename}: {str(e)}")

    # Explicit columns keep the schema intact even when no image succeeded.
    df = pd.DataFrame(results, columns=['Image Name', 'Detected Text'])
    df.to_excel(output_file, index=False, engine='openpyxl')

    # File names may contain non-ASCII characters, so write the log as UTF-8.
    if error_log:
        with open(error_log_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(error_log))

    return df


# Run OCR over the Weibo image folder and write the results to
# weibo_image_text.xlsx (the returned DataFrame is shown as the cell output).
batch_ocr_process('/content/weibo_images_all', 'weibo_image_text.xlsx')


### clustering

In [None]:
import pandas as pd
import re
import jieba
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Load and prepare data
print("Loading and preprocessing data...")
file_path = "weibo_image_text.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# batch_ocr_process writes the OCR text under "Detected Text", while the rest
# of this cell reads "Description"; rename so the freshly generated file loads
# without a KeyError.
if "Description" not in df.columns and "Detected Text" in df.columns:
    df = df.rename(columns={"Detected Text": "Description"})

# Drop rows without usable text: NaN, empty strings, stray "Description"
# header rows, and the "No text detected" placeholder written by the OCR step.
# The previous filter only listed "Description", so placeholder rows were kept.
df = df[df["Description"].notna()].copy()  # .copy(): the next line assigns a column
df["Description"] = df["Description"].astype(str)  # Convert all to string
df = df[~df["Description"].str.strip().isin(["", "Description", "No text detected"])]

# Text preprocessing (segmentation and stopwords)
def enhanced_preprocess_text(text):
    """Clean and segment one text into a list of content tokens.

    Removes every character that is neither a word character nor whitespace,
    collapses whitespace runs to a single space, segments the result with
    jieba (cut_all=False), and keeps only tokens longer than one character
    that are not in the stopword set.

    Args:
        text: Raw text string.

    Returns:
        List of token strings (possibly empty).
    """
    stopwords = {"的", "了", "和", "是", "在", "我", "有", "你", "这", "那"}

    cleaned = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    kept_tokens = []
    for token in jieba.lcut(cleaned, cut_all=False):
        if len(token) > 1 and token not in stopwords:
            kept_tokens.append(token)
    return kept_tokens

preprocessed_texts = [enhanced_preprocess_text(text) for text in df["Description"]]

# Filter out texts that became empty after preprocessing, keeping df aligned
valid_indices = [i for i, words in enumerate(preprocessed_texts) if len(words) > 0]
preprocessed_texts = [preprocessed_texts[i] for i in valid_indices]
df = df.iloc[valid_indices].reset_index(drop=True)

texts_for_embedding = [" ".join(words) for words in preprocessed_texts]

# Embedding generation
print("Generating embeddings...")
model = SentenceTransformer("thenlper/gte-large-zh")
embeddings = model.encode(texts_for_embedding, convert_to_numpy=True)

# Dimensionality reduction: standardize -> PCA -> 2-D UMAP
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)
# PCA cannot keep more components than min(n_samples, n_features), so the cap
# of 50 must also respect the number of rows (small data groups would raise).
# random_state pins the randomized solver so reruns give the same projection.
pca = PCA(n_components=min(50, *embeddings_scaled.shape), random_state=42)
pca_embeddings = pca.fit_transform(embeddings_scaled)
umap = UMAP(n_components=2, metric='cosine', random_state=42)
embeddings_reduced = umap.fit_transform(pca_embeddings)

# Cluster evaluation for k = 2 to k = 15
print("\nEvaluating clusters from K=2 to K=15...")
def evaluate_clusters(embeddings, min_k=2, max_k=15):
    """Sweep K for KMeans, plot every clustering, and tabulate quality metrics.

    For each K in ``min_k..max_k`` (inclusive) a KMeans model is fitted on
    ``embeddings`` and its silhouette score, Davies-Bouldin score and inertia
    are recorded. Each clustering is drawn on a shared 2-D t-SNE projection;
    afterwards the metrics are printed as a table and plotted against K.

    Args:
        embeddings: Feature matrix passed directly to KMeans, the metric
            functions and TSNE.
        min_k: Smallest number of clusters to evaluate.
        max_k: Largest number of clusters to evaluate (inclusive).

    Returns:
        pandas.DataFrame with columns 'K', 'Silhouette Score',
        'Davies-Bouldin Score' and 'Inertia', one row per K.
    """
    k_values = range(min_k, max_k + 1)
    silhouette_scores = []
    davies_bouldin_scores = []
    inertia_scores = []

    # The t-SNE projection depends only on the data and the fixed seed, not on
    # K, so it is computed once instead of being refit for every K.
    tsne = TSNE(n_components=2, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    # Size the subplot grid from the number of K values; a fixed 4x4 grid
    # fails as soon as more than 16 values are requested.
    n_cols = 4
    n_rows = max(1, (len(k_values) + n_cols - 1) // n_cols)
    plt.figure(figsize=(18, 3 * n_rows))

    # Compute scores and create one visualization subplot per K
    for i, k in enumerate(k_values):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
        clusters = kmeans.fit_predict(embeddings)

        silhouette = silhouette_score(embeddings, clusters)
        davies_bouldin = davies_bouldin_score(embeddings, clusters)

        silhouette_scores.append(silhouette)
        davies_bouldin_scores.append(davies_bouldin)
        inertia_scores.append(kmeans.inertia_)

        plt.subplot(n_rows, n_cols, i + 1)
        plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1],
                    c=clusters, cmap="Spectral", alpha=0.7, s=25)
        plt.title(f"K={k}\nSil={silhouette:.2f}", pad=10)
        plt.xticks([])
        plt.yticks([])

    plt.tight_layout()
    plt.show()

    # Create metrics table
    metrics_df = pd.DataFrame({
        'K': k_values,
        'Silhouette Score': silhouette_scores,
        'Davies-Bouldin Score': davies_bouldin_scores,
        'Inertia': inertia_scores
    })

    print("\nCluster Evaluation Metrics:")
    print(metrics_df.to_string(index=False))

    # Plot metrics against K
    plt.figure(figsize=(15, 10))

    # Silhouette scores
    plt.subplot(2, 2, 1)
    plt.plot(k_values, silhouette_scores, 'bo-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Scores')
    plt.grid(True)

    # Davies-Bouldin scores
    plt.subplot(2, 2, 2)
    plt.plot(k_values, davies_bouldin_scores, 'go-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Davies-Bouldin Score')
    plt.title('Davies-Bouldin Scores')
    plt.grid(True)

    # Inertia scores
    plt.subplot(2, 2, 3)
    plt.plot(k_values, inertia_scores, 'ro-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.title('Inertia Scores')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return metrics_df

# Run the K sweep on the 2-D UMAP embeddings of the Weibo text; the returned
# table holds one row of metrics per K.
metrics_df = evaluate_clusters(embeddings_reduced)

## fbq only

In [None]:
import pandas as pd
import re
import jieba
import numpy as np
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from umap import UMAP

# Load and prepare data
print("Loading and preprocessing data...")
file_path = "fbq_image_data.xlsx"
df = pd.read_excel(file_path, engine="openpyxl")

# Drop rows without usable text: NaN, empty strings, stray "Description"
# header rows, and the "No text detected" placeholder. The previous filter
# only listed "Description", so placeholder rows were kept even though the
# comment said they were removed.
df = df[df["Description"].notna()].copy()  # .copy(): the next line assigns a column
df["Description"] = df["Description"].astype(str)  # Convert all to string
df = df[~df["Description"].str.strip().isin(["", "Description", "No text detected"])]

# Text preprocessing (segmentation and stopwords)
def enhanced_preprocess_text(text):
    """Clean and segment one text into a list of content tokens.

    Removes every character that is neither a word character nor whitespace,
    collapses whitespace runs to a single space, segments the result with
    jieba (cut_all=False), and keeps only tokens longer than one character
    that are not in the stopword set.

    Args:
        text: Raw text string.

    Returns:
        List of token strings (possibly empty).
    """
    stopwords = {"的", "了", "和", "是", "在", "我", "有", "你", "这", "那"}

    cleaned = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    kept_tokens = []
    for token in jieba.lcut(cleaned, cut_all=False):
        if len(token) > 1 and token not in stopwords:
            kept_tokens.append(token)
    return kept_tokens

preprocessed_texts = [enhanced_preprocess_text(text) for text in df["Description"]]

# Filter out texts that became empty after preprocessing, keeping df aligned
valid_indices = [i for i, words in enumerate(preprocessed_texts) if len(words) > 0]
preprocessed_texts = [preprocessed_texts[i] for i in valid_indices]
df = df.iloc[valid_indices].reset_index(drop=True)

texts_for_embedding = [" ".join(words) for words in preprocessed_texts]

# Embedding generation
print("Generating embeddings...")
model = SentenceTransformer("thenlper/gte-large-zh")
embeddings = model.encode(texts_for_embedding, convert_to_numpy=True)

# Dimensionality reduction: standardize -> PCA -> 2-D UMAP
scaler = StandardScaler()
embeddings_scaled = scaler.fit_transform(embeddings)
# PCA cannot keep more components than min(n_samples, n_features), so the cap
# of 50 must also respect the number of rows (small data groups would raise).
# random_state pins the randomized solver so reruns give the same projection.
pca = PCA(n_components=min(50, *embeddings_scaled.shape), random_state=42)
pca_embeddings = pca.fit_transform(embeddings_scaled)
umap = UMAP(n_components=2, metric='cosine', random_state=42)
embeddings_reduced = umap.fit_transform(pca_embeddings)

# Cluster evaluation for k = 2 to k = 15
print("\nEvaluating clusters from K=2 to K=15...")
def evaluate_clusters(embeddings, min_k=2, max_k=15):
    """Sweep K for KMeans, plot every clustering, and tabulate quality metrics.

    For each K in ``min_k..max_k`` (inclusive) a KMeans model is fitted on
    ``embeddings`` and its silhouette score, Davies-Bouldin score and inertia
    are recorded. Each clustering is drawn on a shared 2-D t-SNE projection;
    afterwards the metrics are printed as a table and plotted against K.

    Args:
        embeddings: Feature matrix passed directly to KMeans, the metric
            functions and TSNE.
        min_k: Smallest number of clusters to evaluate.
        max_k: Largest number of clusters to evaluate (inclusive).

    Returns:
        pandas.DataFrame with columns 'K', 'Silhouette Score',
        'Davies-Bouldin Score' and 'Inertia', one row per K.
    """
    k_values = range(min_k, max_k + 1)
    silhouette_scores = []
    davies_bouldin_scores = []
    inertia_scores = []

    # The t-SNE projection depends only on the data and the fixed seed, not on
    # K, so it is computed once instead of being refit for every K.
    tsne = TSNE(n_components=2, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    # Size the subplot grid from the number of K values; a fixed 4x4 grid
    # fails as soon as more than 16 values are requested.
    n_cols = 4
    n_rows = max(1, (len(k_values) + n_cols - 1) // n_cols)
    plt.figure(figsize=(18, 3 * n_rows))

    # Compute scores and create one visualization subplot per K
    for i, k in enumerate(k_values):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
        clusters = kmeans.fit_predict(embeddings)

        silhouette = silhouette_score(embeddings, clusters)
        davies_bouldin = davies_bouldin_score(embeddings, clusters)

        silhouette_scores.append(silhouette)
        davies_bouldin_scores.append(davies_bouldin)
        inertia_scores.append(kmeans.inertia_)

        plt.subplot(n_rows, n_cols, i + 1)
        plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1],
                    c=clusters, cmap="Spectral", alpha=0.7, s=25)
        plt.title(f"K={k}\nSil={silhouette:.2f}", pad=10)
        plt.xticks([])
        plt.yticks([])

    plt.tight_layout()
    plt.show()

    # Create metrics table
    metrics_df = pd.DataFrame({
        'K': k_values,
        'Silhouette Score': silhouette_scores,
        'Davies-Bouldin Score': davies_bouldin_scores,
        'Inertia': inertia_scores
    })

    print("\nCluster Evaluation Metrics:")
    print(metrics_df.to_string(index=False))

    # Plot metrics against K
    plt.figure(figsize=(15, 10))

    # Silhouette scores
    plt.subplot(2, 2, 1)
    plt.plot(k_values, silhouette_scores, 'bo-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Silhouette Score')
    plt.title('Silhouette Scores')
    plt.grid(True)

    # Davies-Bouldin scores
    plt.subplot(2, 2, 2)
    plt.plot(k_values, davies_bouldin_scores, 'go-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Davies-Bouldin Score')
    plt.title('Davies-Bouldin Scores')
    plt.grid(True)

    # Inertia scores
    plt.subplot(2, 2, 3)
    plt.plot(k_values, inertia_scores, 'ro-')
    plt.xlabel('Number of clusters (K)')
    plt.ylabel('Inertia')
    plt.title('Inertia Scores')
    plt.grid(True)

    plt.tight_layout()
    plt.show()

    return metrics_df

# Run the K sweep on the 2-D UMAP embeddings of the fbq text; the returned
# table holds one row of metrics per K.
metrics_df = evaluate_clusters(embeddings_reduced)

## actual clustering generation <br>
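Run this cell immediately after the evaluation cell of a particular data group, once its optimal k has been determined: it reuses the `embeddings_reduced`, `preprocessed_texts`, and `df` variables left by whichever clustering cell ran last. It clusters the text with the chosen number of clusters and outputs a t-SNE visualization plus an Excel spreadsheet containing each image's text and its assigned cluster. Set `desired_clusters` and `output_file` to match the data group before running.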

In [None]:
# Perform clustering with specified number of clusters
def perform_clustering(n_clusters, embeddings, preprocessed_texts, df):
    """Cluster the embeddings with KMeans and report the result.

    Fits KMeans with ``n_clusters`` clusters, stores the labels in ``df`` under
    the column ``Cluster_<n_clusters>`` (``df`` is modified in place), shows a
    t-SNE scatter plot colored by cluster, prints the cluster sizes, and
    displays the ten most frequent terms of every cluster.

    Args:
        n_clusters: Number of clusters to fit.
        embeddings: Feature matrix with one row per text.
        preprocessed_texts: One token list per text, in the same order as the
            rows of ``embeddings``.
        df: DataFrame with one row per text; receives the label column.

    Returns:
        Array of cluster labels, one per row of ``embeddings``.
    """
    print(f"\n{'='*40}\nAnalyzing {n_clusters} clusters\n{'='*40}")

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
    clusters = kmeans.fit_predict(embeddings)
    df[f'Cluster_{n_clusters}'] = clusters

    # t-SNE visualization
    tsne = TSNE(n_components=2, random_state=42)
    tsne_embeddings = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    plt.scatter(tsne_embeddings[:, 0], tsne_embeddings[:, 1],
                c=clusters, cmap="Spectral", alpha=0.8, s=50)
    plt.title(f"t-SNE Visualization of {n_clusters} Clusters", pad=20)
    plt.xticks([])
    plt.yticks([])
    plt.colorbar()
    plt.show()

    print(f"\nCluster Distribution for {n_clusters} clusters:")
    print(pd.Series(clusters).value_counts().sort_index())

    # Top-term table: one (Term, Count, %) column group per cluster
    top_terms_dfs = []
    for cluster_num in sorted(np.unique(clusters)):
        cluster_mask = (clusters == cluster_num)
        term_freq = Counter(
            word
            for i, words in enumerate(preprocessed_texts) if cluster_mask[i]
            for word in words
        )

        count_col = f'Count (Cluster {cluster_num})'
        pct_col = f'% of Cluster {cluster_num}'
        temp_df = pd.DataFrame(term_freq.most_common(10), columns=['Term', count_col])

        # Share of ALL term occurrences in the cluster. Dividing by the sum of
        # the ten displayed counts (as before) made the column always add up to
        # 100% and overstated how dominant each term is within the cluster.
        total_terms = sum(term_freq.values())
        temp_df[pct_col] = ((temp_df[count_col] / total_terms) * 100).round(2)
        top_terms_dfs.append(temp_df)

    # Combine all cluster DataFrames side by side
    combined_df = pd.concat(top_terms_dfs, axis=1)

    print(f"\nTop 10 Terms per Cluster ({n_clusters} clusters):")
    try:
        from IPython.display import display
        display(combined_df)
    except ImportError:
        print(combined_df.to_string())

    return clusters


desired_clusters = 4  # SET OPTIMAL NUMBER OF CLUSTERS (based on results from earlier cells)
# Uses the embeddings_reduced / preprocessed_texts / df globals left by the
# most recently run text-clustering cell; perform_clustering adds the label
# column to df in place.
clusters = perform_clustering(desired_clusters, embeddings_reduced, preprocessed_texts, df)

# output excel with results
# NOTE(review): the file name is hard-coded for the fbq group; change it when
# running this cell for another data group.
output_file = "fbq_text_clusters.xlsx"
df.to_excel(output_file, index=False, engine="openpyxl")
print(f"\nResults saved to {output_file}")

# image clustering

The three cells below (one per data group) run the same code; only the input image folder differs.

In [None]:
!cd /content

In [None]:
!unzip images_all.zip

In [None]:
!unzip weibo_images_all.zip

In [None]:
!unzip fabiaoqing_images.zip

In [None]:
!pip install kneed torch torchvision opencv-python pillow scikit-learn pandas matplotlib numpy tqdm git+https://github.com/openai/CLIP.git umap-learn kneed seaborn

### combined

In [None]:
import torch
import clip  
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.manifold import TSNE
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors
from PIL import Image


# Configuration for the combined image set.
IMAGE_FOLDER = '/content/images_all'
K_VALUES = range(3, 21)  # cluster counts to evaluate: k = 3 .. 20 inclusive
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, else CPU
PCA_VARIANCE = 0.85  # passed to PCA(n_components=...): fraction of variance to retain

print("Loading CLIP model...")
# clip.load returns the model and the matching image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=DEVICE)  # Using Vision Transformer base model
model.eval()

# CLIP preprocessing of image
def clip_preprocess(image):
    """Convert an image array into a one-item batch tensor on DEVICE.

    The array is wrapped in a PIL image, run through the `preprocess`
    transform returned by clip.load, given a leading batch dimension, and
    moved to DEVICE.
    """
    pil_image = Image.fromarray(image)
    tensor = preprocess(pil_image)
    batched = tensor.unsqueeze(0)
    return batched.to(DEVICE)

# feature extraction
def extract_features(image_path):
    """Return the L2-normalized CLIP image embedding for one image file.

    Args:
        image_path: Path of the image to read with OpenCV.

    Returns:
        Flat numpy array with the embedding, or None when the image could not
        be read or any step raised (the error is printed, not re-raised).
    """
    try:
        bgr = cv2.imread(image_path)
        if bgr is None:
            # cv2.imread signals failure by returning None instead of raising
            raise ValueError(f"Could not read image: {image_path}")

        # OpenCV loads BGR; convert to RGB before handing the array to PIL/CLIP
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        batch = clip_preprocess(rgb)

        with torch.no_grad():
            embedding = model.encode_image(batch)
            # Scale to unit length along the feature axis
            embedding /= embedding.norm(dim=-1, keepdim=True)

        return embedding.cpu().numpy().flatten()

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# load images and extract features
print("Loading images and extracting CLIP features...")
# Sorted listing: os.listdir returns entries in arbitrary order, which would
# change the row order (and therefore the seeded UMAP/KMeans results) between
# machines and runs.
candidate_paths = sorted(
    os.path.join(IMAGE_FOLDER, img) for img in os.listdir(IMAGE_FOLDER)
    if img.lower().endswith(("jpg", "png", "jpeg"))
)

# extract_features returns None for unreadable images. Drop those from BOTH the
# feature matrix and the path list so row i of `features` always belongs to
# image_paths[i]; previously only the features were filtered.
extracted = [(path, extract_features(path)) for path in candidate_paths]
image_paths = [path for path, feat in extracted if feat is not None]
features = np.array([feat for _, feat in extracted if feat is not None])
if len(features) == 0:
    raise RuntimeError(f"No image features could be extracted from {IMAGE_FOLDER}")

# feature processing (on top of clip)
print("Processing features...")
normalizer = Normalizer()
features_normalized = normalizer.fit_transform(features)

# Dimensionality reduction: keep enough components for PCA_VARIANCE of the variance
pca = PCA(n_components=PCA_VARIANCE, random_state=42)
features_reduced = pca.fit_transform(features_normalized)

# Additional UMAP reduction (skipped when umap-learn is not installed)
try:
    from umap import UMAP
    reducer = UMAP(n_components=min(50, features_reduced.shape[1]),
                  metric='cosine', random_state=42)
    features_reduced = reducer.fit_transform(features_reduced)
    print("Using UMAP for dimensionality reduction")
except ImportError:
    print("UMAP not available, using PCA only")

print(f"Final feature dimensions: {features_reduced.shape[1]}")

# Clustering evaluation: fit KMeans once per k in K_VALUES on the reduced
# features and collect one metrics dict per k.
print("Evaluating clustering...")
results = []

for k in K_VALUES:
    clusterer = KMeans(n_clusters=k, init='k-means++', n_init=20,
                      max_iter=500, random_state=42)

    labels = clusterer.fit_predict(features_reduced)

    # Calculate multiple metrics.
    # The `k > 1` guards never take the None branch with the current K_VALUES
    # (which start at 3); they only matter if the range is changed to include 1.
    metrics = {
        'K': k,
        'Algorithm': 'kmeans',
        'Inertia': clusterer.inertia_,
        'Silhouette': silhouette_score(features_reduced, labels) if k > 1 else None,
        'DBI': davies_bouldin_score(features_reduced, labels) if k > 1 else None
    }
    results.append(metrics)

# One row per k with columns K, Algorithm, Inertia, Silhouette, DBI
results_df = pd.DataFrame(results)

# computing optimal k (but our final result is determined via visual observation / weighing of metrics against each other)
def find_optimal_k(results_df):
    """Pick the K with the best combined clustering score.

    Rows with K <= 1 are ignored. Each metric is min-max scaled to [0, 1]
    across the remaining rows; 'DBI' and 'Inertia' are inverted so that higher
    is better for every metric. The combined score is
    ``0.4*Silhouette + 0.2*DBI + 0.1*Inertia`` (the weights do not sum to 1,
    which does not affect which K wins).

    Args:
        results_df: DataFrame with columns 'K', 'Silhouette', 'DBI', 'Inertia'.

    Returns:
        The 'K' value of the row with the highest combined score (the first
        such row on ties).

    Raises:
        ValueError: If no row has K > 1.
    """
    df = results_df[results_df['K'] > 1].copy()
    if df.empty:
        raise ValueError("results_df contains no rows with K > 1")

    def _min_max(series):
        # Scale to [0, 1]. A metric with no spread (single K, or identical
        # values) carries no ranking information, so it maps to all zeros
        # instead of the NaNs a 0/0 division would produce.
        series = series.astype(float)
        low = series.min()
        span = series.max() - low
        if not span > 0:
            return pd.Series(0.0, index=series.index)
        return (series - low) / span

    # Higher is better
    for metric in ['Silhouette']:
        df[metric] = _min_max(df[metric])
    # Lower is better, so invert after scaling
    for metric in ['DBI', 'Inertia']:
        df[metric] = 1 - _min_max(df[metric])

    # Combined score (weighted sum)
    df['Combined'] = 0.4*df['Silhouette'] + 0.2*df['DBI'] + 0.1*df['Inertia']

    return df.loc[df['Combined'].idxmax(), 'K']

# Heuristic pick of k from the combined metric score.
optimal_k = int(find_optimal_k(results_df))
print(f"\nOptimal k determined: {optimal_k}")

# --- FINAL CLUSTERING ---
# Refit KMeans with the selected k; final_labels holds one label per feature row.
final_clusterer = KMeans(n_clusters=optimal_k, init='k-means++',
                        n_init=20, max_iter=500, random_state=42)

final_labels = final_clusterer.fit_predict(features_reduced)

# t-sne visualization for all k values
# The projection is computed once and reused for every k; perplexity is capped
# at a quarter of the number of samples.
print("\nGenerating t-SNE visualizations for all K values...")
tsne = TSNE(n_components=2, perplexity=min(30, len(features_reduced)//4),
            metric='cosine', random_state=42)
features_tsne = tsne.fit_transform(features_reduced)

# Create a figure with subplots for each k value
plt.figure(figsize=(20, 20))
n_cols = 4  
n_rows = int(np.ceil(len(K_VALUES)/n_cols)) 

for i, k in enumerate(K_VALUES):
    # Create clusterer for this k value (same parameters as the evaluation
    # loop, so this repeats that fit to recover the labels for plotting)
    clusterer = KMeans(n_clusters=k, init='k-means++', n_init=20,
                     max_iter=500, random_state=42)

    labels = clusterer.fit_predict(features_reduced)

    # Create subplot
    plt.subplot(n_rows, n_cols, i+1)

    # Plot the t-SNE results
    scatter = plt.scatter(features_tsne[:, 0], features_tsne[:, 1],
                         c=labels, cmap='Spectral', alpha=0.7, s=30)

    # Add cluster centers if available
    if hasattr(clusterer, 'cluster_centers_'):
        try:
            # Centers live in the reduced feature space, not in t-SNE space, so
            # each center is drawn at the t-SNE position of its nearest sample.
            nbrs = NearestNeighbors(n_neighbors=1).fit(features_reduced)
            _, indices = nbrs.kneighbors(clusterer.cluster_centers_)
            plt.scatter(features_tsne[indices, 0], features_tsne[indices, 1],
                        c='black', marker='X', s=100, alpha=0.8)
        except Exception as e:
            print(f"Could not plot centers for k={k}: {str(e)}")

    # Calculate silhouette score for this k (shown in the subplot title)
    sil_score = silhouette_score(features_reduced, labels) if k > 1 else 0

    plt.title(f'k={k}\nSilhouette: {sil_score:.2f}')
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()
plt.suptitle('t-SNE Visualizations for Different Cluster Counts', y=1.02, fontsize=16)
plt.show()


print("\nGenerating clustering metrics analysis...")

# Create a figure for metrics visualization: three panels on a 2x2 grid
# (the fourth slot is left empty).
plt.figure(figsize=(15, 10))

# Silhouette Score
plt.subplot(2, 2, 1)
# results_df has one row per K, so the groupby mean just yields a Series indexed by K
silhouette_scores = results_df.groupby('K')['Silhouette'].mean()
plt.plot(K_VALUES, silhouette_scores, marker='o')
plt.title('Silhouette Scores')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Score')
plt.xticks(K_VALUES)
plt.grid(True)

# Inertia (Elbow Method)
plt.subplot(2, 2, 2)
inertia_scores = results_df.groupby('K')['Inertia'].mean()
plt.plot(K_VALUES, inertia_scores, marker='o')
plt.title('Inertia')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.xticks(K_VALUES)
plt.grid(True)

# Davies-Bouldin Index
plt.subplot(2, 2, 3)
db_scores = results_df.groupby('K')['DBI'].mean()
plt.plot(K_VALUES, db_scores, marker='o')
plt.title('Davies-Bouldin Index')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Score')
plt.xticks(K_VALUES)
plt.grid(True)

plt.tight_layout()
plt.show()



print("\nClustering Metrics Tables:")

# Create a summary table for all metrics (one row per K)
metrics_summary = results_df[['K', 'Silhouette', 'Inertia', 'DBI']]
metrics_summary = metrics_summary.groupby('K').mean().reset_index()

# Format the metrics for better display
formatted_metrics = metrics_summary.copy()
formatted_metrics['Silhouette'] = formatted_metrics['Silhouette'].round(3)
formatted_metrics['Inertia'] = formatted_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
formatted_metrics['DBI'] = formatted_metrics['DBI'].round(3)

print("\nAll Metrics Summary Table:")
print(formatted_metrics.to_string(index=False))

# Individual detailed tables
print("\nDetailed Metrics by K Value:")
for k in K_VALUES:
    # .copy() makes the slice an independent frame, so the formatting
    # assignments below do not raise SettingWithCopyWarning.
    k_metrics = results_df.loc[results_df['K'] == k,
                               ['Algorithm', 'Silhouette', 'Inertia', 'DBI']].copy()
    k_metrics['Silhouette'] = k_metrics['Silhouette'].round(3)
    k_metrics['Inertia'] = k_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
    k_metrics['DBI'] = k_metrics['DBI'].round(3)

    print(f"\nMetrics for k={k}:")
    print(k_metrics.to_string(index=False))

# save results (unformatted values)
results_df.to_csv('combined_clustering_results_clip.csv', index=False)
print("\nSaved all clustering results to 'combined_clustering_results_clip.csv'")

print("\nFinal Analysis:")
print(f"Optimal number of clusters: {optimal_k}")

### weibo

In [None]:
import torch
import clip  
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.manifold import TSNE
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors
from PIL import Image


# Configuration for the Weibo image set.
IMAGE_FOLDER = '/content/weibo_images_all'
K_VALUES = range(3, 21)  # cluster counts to evaluate: k = 3 .. 20 inclusive
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available, else CPU
PCA_VARIANCE = 0.85  # passed to PCA(n_components=...): fraction of variance to retain

print("Loading CLIP model...")
# clip.load returns the model and the matching image preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=DEVICE)  # Using Vision Transformer base model
model.eval()

# CLIP preprocessing of image
def clip_preprocess(image):
    """Convert an image array into a one-item batch tensor on DEVICE.

    The array is wrapped in a PIL image, run through the `preprocess`
    transform returned by clip.load, given a leading batch dimension, and
    moved to DEVICE.
    """
    pil_image = Image.fromarray(image)
    tensor = preprocess(pil_image)
    batched = tensor.unsqueeze(0)
    return batched.to(DEVICE)

# feature extraction
def extract_features(image_path):
    """Return the L2-normalized CLIP image embedding for one image file.

    Args:
        image_path: Path of the image to read with OpenCV.

    Returns:
        Flat numpy array with the embedding, or None when the image could not
        be read or any step raised (the error is printed, not re-raised).
    """
    try:
        bgr = cv2.imread(image_path)
        if bgr is None:
            # cv2.imread signals failure by returning None instead of raising
            raise ValueError(f"Could not read image: {image_path}")

        # OpenCV loads BGR; convert to RGB before handing the array to PIL/CLIP
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        batch = clip_preprocess(rgb)

        with torch.no_grad():
            embedding = model.encode_image(batch)
            # Scale to unit length along the feature axis
            embedding /= embedding.norm(dim=-1, keepdim=True)

        return embedding.cpu().numpy().flatten()

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# load images and extract features
print("Loading images and extracting CLIP features...")
# Sorted listing: os.listdir returns entries in arbitrary order, which would
# change the row order (and therefore the seeded UMAP/KMeans results) between
# machines and runs.
candidate_paths = sorted(
    os.path.join(IMAGE_FOLDER, img) for img in os.listdir(IMAGE_FOLDER)
    if img.lower().endswith(("jpg", "png", "jpeg"))
)

# extract_features returns None for unreadable images. Drop those from BOTH the
# feature matrix and the path list so row i of `features` always belongs to
# image_paths[i]; previously only the features were filtered.
extracted = [(path, extract_features(path)) for path in candidate_paths]
image_paths = [path for path, feat in extracted if feat is not None]
features = np.array([feat for _, feat in extracted if feat is not None])
if len(features) == 0:
    raise RuntimeError(f"No image features could be extracted from {IMAGE_FOLDER}")

# feature processing (on top of clip)
print("Processing features...")
normalizer = Normalizer()
features_normalized = normalizer.fit_transform(features)

# Dimensionality reduction: keep enough components for PCA_VARIANCE of the variance
pca = PCA(n_components=PCA_VARIANCE, random_state=42)
features_reduced = pca.fit_transform(features_normalized)

# Additional UMAP reduction (skipped when umap-learn is not installed)
try:
    from umap import UMAP
    reducer = UMAP(n_components=min(50, features_reduced.shape[1]),
                  metric='cosine', random_state=42)
    features_reduced = reducer.fit_transform(features_reduced)
    print("Using UMAP for dimensionality reduction")
except ImportError:
    print("UMAP not available, using PCA only")

print(f"Final feature dimensions: {features_reduced.shape[1]}")

# Clustering evaluation: fit KMeans once per k in K_VALUES on the reduced
# features and collect one metrics dict per k.
print("Evaluating clustering...")
results = []

for k in K_VALUES:
    clusterer = KMeans(n_clusters=k, init='k-means++', n_init=20,
                      max_iter=500, random_state=42)

    labels = clusterer.fit_predict(features_reduced)

    # Calculate multiple metrics.
    # The `k > 1` guards never take the None branch with the current K_VALUES
    # (which start at 3); they only matter if the range is changed to include 1.
    metrics = {
        'K': k,
        'Algorithm': 'kmeans',
        'Inertia': clusterer.inertia_,
        'Silhouette': silhouette_score(features_reduced, labels) if k > 1 else None,
        'DBI': davies_bouldin_score(features_reduced, labels) if k > 1 else None
    }
    results.append(metrics)

# One row per k with columns K, Algorithm, Inertia, Silhouette, DBI
results_df = pd.DataFrame(results)

# computing optimal k (but our final result is determined via visual observation / weighing of metrics against each other)
def find_optimal_k(results_df):
    """Combine several clustering metrics into one score and return the best k.

    Args:
        results_df: DataFrame with one row per candidate k and the columns
            'K', 'Silhouette', 'DBI' and 'Inertia'.

    Returns:
        The 'K' value of the row with the highest combined score.

    Raises:
        ValueError: if no row has K > 1.
    """
    def _min_max(series):
        # Scale to [0, 1]. A constant (or all-NaN) metric cannot separate
        # the candidates, so it contributes 0 instead of producing 0/0 = NaN.
        span = series.max() - series.min()
        if not span > 0:
            return pd.Series(0.0, index=series.index)
        return (series - series.min()) / span

    df = results_df[results_df['K'] > 1].copy()
    if df.empty:
        raise ValueError("results_df contains no rows with K > 1")

    # Higher silhouette is better; lower DBI and inertia are better, so
    # those two are inverted after scaling.
    df['Silhouette'] = _min_max(df['Silhouette'])
    for metric in ['DBI', 'Inertia']:
        df[metric] = 1 - _min_max(df[metric])

    # Weighted sum. The weights do not add up to 1, which is harmless
    # because only the ranking of the scores is used.
    df['Combined'] = 0.4*df['Silhouette'] + 0.2*df['DBI'] + 0.1*df['Inertia']

    return df.loc[df['Combined'].idxmax(), 'K']

# Pick k from the combined metric score.
optimal_k = int(find_optimal_k(results_df))
print(f"\nOptimal k determined: {optimal_k}")

# --- FINAL CLUSTERING ---
final_clusterer = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=20,
    max_iter=500,
    random_state=42,
)
final_labels = final_clusterer.fit_predict(features_reduced)

# 2-D t-SNE embedding, computed once and shared by every plot below.
print("\nGenerating t-SNE visualizations for all K values...")
tsne_perplexity = min(30, len(features_reduced)//4)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity,
            metric='cosine', random_state=42)
features_tsne = tsne.fit_transform(features_reduced)

# Create a figure with one subplot per k value
plt.figure(figsize=(20, 20))
n_cols = 4
n_rows = int(np.ceil(len(K_VALUES)/n_cols))

# The nearest-neighbour index depends only on the data, not on k, so it is
# built once here instead of once per subplot.
nbrs = NearestNeighbors(n_neighbors=1).fit(features_reduced)

# Silhouette was already computed for every k during evaluation; reuse it
# rather than recomputing it for each subplot.
silhouette_by_k = results_df.groupby('K')['Silhouette'].mean()

for i, k in enumerate(K_VALUES):
    # Labels are not stored by the evaluation loop, so refit for this k
    # (same parameters and seed as the evaluation run).
    clusterer = KMeans(n_clusters=k, init='k-means++', n_init=20,
                       max_iter=500, random_state=42)
    labels = clusterer.fit_predict(features_reduced)

    plt.subplot(n_rows, n_cols, i+1)

    # Points coloured by cluster label in t-SNE space
    scatter = plt.scatter(features_tsne[:, 0], features_tsne[:, 1],
                          c=labels, cmap='Spectral', alpha=0.7, s=30)

    # Centroids have no t-SNE coordinates of their own, so mark the sample
    # closest to each centroid instead. Best effort: a failure only skips
    # the markers for this subplot.
    try:
        _, indices = nbrs.kneighbors(clusterer.cluster_centers_)
        indices = indices.ravel()
        plt.scatter(features_tsne[indices, 0], features_tsne[indices, 1],
                    c='black', marker='X', s=100, alpha=0.8)
    except Exception as e:
        print(f"Could not plot centers for k={k}: {str(e)}")

    # Fall back to computing the score if this k is missing from results_df.
    sil_score = silhouette_by_k.get(k)
    if sil_score is None or pd.isna(sil_score):
        sil_score = silhouette_score(features_reduced, labels) if k > 1 else 0

    plt.title(f'k={k}\nSilhouette: {sil_score:.2f}')
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()
plt.suptitle('t-SNE Visualizations for Different Cluster Counts', y=1.02, fontsize=16)
plt.show()


print("\nGenerating clustering metrics analysis...")

# One figure, three panels: silhouette, inertia (elbow) and Davies-Bouldin.
plt.figure(figsize=(15, 10))

# Per-k mean of each metric.
silhouette_scores = results_df.groupby('K')['Silhouette'].mean()
inertia_scores = results_df.groupby('K')['Inertia'].mean()
db_scores = results_df.groupby('K')['DBI'].mean()

# (subplot position, series, title, y-axis label)
metric_panels = [
    (1, silhouette_scores, 'Silhouette Scores', 'Score'),
    (2, inertia_scores, 'Inertia', 'Inertia'),
    (3, db_scores, 'Davies-Bouldin Index', 'Score'),
]

for position, scores, panel_title, y_label in metric_panels:
    plt.subplot(2, 2, position)
    plt.plot(K_VALUES, scores, marker='o')
    plt.title(panel_title)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel(y_label)
    plt.xticks(K_VALUES)
    plt.grid(True)

plt.tight_layout()
plt.show()



print("\nClustering Metrics Tables:")

# Summary table: mean of each metric per k
metrics_summary = results_df[['K', 'Silhouette', 'Inertia', 'DBI']]
metrics_summary = metrics_summary.groupby('K').mean().reset_index()

# Display copy: rounded scores, inertia with thousands separators
formatted_metrics = metrics_summary.copy()
formatted_metrics['Silhouette'] = formatted_metrics['Silhouette'].round(3)
formatted_metrics['Inertia'] = formatted_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
formatted_metrics['DBI'] = formatted_metrics['DBI'].round(3)

print("\nAll Metrics Summary Table:")
print(formatted_metrics.to_string(index=False))

# Individual detailed tables
print("\nDetailed Metrics by K Value:")
for k in K_VALUES:
    # Explicit .copy(): the formatting below must write to an independent
    # frame, not to a filtered slice of results_df (SettingWithCopyWarning).
    k_metrics = results_df.loc[results_df['K'] == k,
                               ['Algorithm', 'Silhouette', 'Inertia', 'DBI']].copy()
    k_metrics['Silhouette'] = k_metrics['Silhouette'].round(3)
    k_metrics['Inertia'] = k_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
    k_metrics['DBI'] = k_metrics['DBI'].round(3)

    print(f"\nMetrics for k={k}:")
    print(k_metrics.to_string(index=False))

# save the raw (unformatted) results
results_df.to_csv('combined_clustering_results_clip.csv', index=False)
print("\nSaved all clustering results to 'combined_clustering_results_clip.csv'")

print("\nFinal Analysis:")
print(f"Optimal number of clusters: {optimal_k}")

### fbq

In [None]:
import torch
import clip  
import numpy as np
import cv2
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.manifold import TSNE
from kneed import KneeLocator
from sklearn.neighbors import NearestNeighbors
from PIL import Image


# Folder whose jpg/png/jpeg files are listed and fed to CLIP further down.
IMAGE_FOLDER = '/content/fabiaoqing_images'
K_VALUES = range(3, 21)  # computing k = 3 to k = 20
# Run on the GPU when PyTorch reports one, otherwise on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PCA_VARIANCE = 0.85  # passed to PCA(n_components=...) below; 85% variation

print("Loading CLIP model...")
# clip.load returns the model plus the matching image preprocessing callable.
model, preprocess = clip.load("ViT-B/32", device=DEVICE)  # Using Vision Transformer base model
# Switch the model to evaluation mode; it is only used for feature extraction.
model.eval()

# CLIP preprocessing of image
def clip_preprocess(image):
    """Convert an image array into a one-image CLIP input batch on DEVICE.

    Args:
        image: image array accepted by ``PIL.Image.fromarray`` (the caller
            passes an RGB array).

    Returns:
        The preprocessed image with a leading batch axis, moved to DEVICE.
    """
    pil_image = Image.fromarray(image)
    image_tensor = preprocess(pil_image)
    # CLIP expects a batch, so add a leading axis of size 1.
    batch = image_tensor.unsqueeze(0)
    return batch.to(DEVICE)

# feature extraction
def extract_features(image_path):
    """Return the normalised CLIP image embedding for one file.

    Args:
        image_path: path of the image to read with OpenCV.

    Returns:
        A flattened numpy array holding the embedding divided by its norm,
        or None when the image cannot be read or processed (the error is
        printed rather than raised).
    """
    try:
        bgr = cv2.imread(image_path)
        if bgr is None:
            raise ValueError(f"Could not read image: {image_path}")

        # OpenCV loads BGR; convert to RGB before CLIP preprocessing.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        batch = clip_preprocess(rgb)

        with torch.no_grad():
            embedding = model.encode_image(batch)
            # Scale the embedding to unit length.
            embedding /= embedding.norm(dim=-1, keepdim=True)
            flat = embedding.cpu().numpy().flatten()
        return flat

    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

# load images and extract features
print("Loading images and extracting CLIP features...")
candidate_paths = [os.path.join(IMAGE_FOLDER, img) for img in os.listdir(IMAGE_FOLDER)
                   if img.lower().endswith(("jpg", "png", "jpeg"))]

# Run CLIP exactly once per image and keep each path next to its result.
# Images that fail to load are dropped from BOTH lists, so image_paths[i]
# always describes features[i] (and therefore the i-th cluster label later).
extracted = [(path, extract_features(path)) for path in candidate_paths]
image_paths = [path for path, feat in extracted if feat is not None]
features = np.array([feat for _, feat in extracted if feat is not None])

# Fail here with a readable message instead of deep inside sklearn.
if len(features) == 0:
    raise RuntimeError(f"No usable images found in {IMAGE_FOLDER}")

# --- post-processing on top of the CLIP embeddings ---
print("Processing features...")

# Rescale every embedding row with sklearn's Normalizer.
normalizer = Normalizer()
features_normalized = normalizer.fit_transform(features)

# First reduction stage: PCA, with PCA_VARIANCE passed as n_components.
pca = PCA(n_components=PCA_VARIANCE, random_state=42)
features_reduced = pca.fit_transform(features_normalized)

# Second, optional stage: UMAP (cosine metric) down to at most 50 dimensions.
# If umap-learn is not installed the PCA output is used unchanged.
try:
    from umap import UMAP
    umap_dims = min(50, features_reduced.shape[1])
    reducer = UMAP(n_components=umap_dims, metric='cosine', random_state=42)
    features_reduced = reducer.fit_transform(features_reduced)
    print("Using UMAP for dimensionality reduction")
except ImportError:
    print("UMAP not available, using PCA only")

final_dims = features_reduced.shape[1]
print(f"Final feature dimensions: {final_dims}")

# Fit KMeans once per candidate k and record its quality metrics.
print("Evaluating clustering...")
results = []

for k in K_VALUES:
    clusterer = KMeans(
        n_clusters=k,
        init='k-means++',
        n_init=20,
        max_iter=500,
        random_state=42,
    )
    labels = clusterer.fit_predict(features_reduced)

    # Silhouette and Davies-Bouldin need at least two clusters.
    if k > 1:
        silhouette = silhouette_score(features_reduced, labels)
        dbi = davies_bouldin_score(features_reduced, labels)
    else:
        silhouette = None
        dbi = None

    metrics = dict(
        K=k,
        Algorithm='kmeans',
        Inertia=clusterer.inertia_,
        Silhouette=silhouette,
        DBI=dbi,
    )
    results.append(metrics)

# One row per k.
results_df = pd.DataFrame(results)

# computing optimal k (but our final result is determined via visual observation / weighing of metrics against each other)
def find_optimal_k(results_df):
    """Combine several clustering metrics into one score and return the best k.

    Args:
        results_df: DataFrame with one row per candidate k and the columns
            'K', 'Silhouette', 'DBI' and 'Inertia'.

    Returns:
        The 'K' value of the row with the highest combined score.

    Raises:
        ValueError: if no row has K > 1.
    """
    def _min_max(series):
        # Scale to [0, 1]. A constant (or all-NaN) metric cannot separate
        # the candidates, so it contributes 0 instead of producing 0/0 = NaN.
        span = series.max() - series.min()
        if not span > 0:
            return pd.Series(0.0, index=series.index)
        return (series - series.min()) / span

    df = results_df[results_df['K'] > 1].copy()
    if df.empty:
        raise ValueError("results_df contains no rows with K > 1")

    # Higher silhouette is better; lower DBI and inertia are better, so
    # those two are inverted after scaling.
    df['Silhouette'] = _min_max(df['Silhouette'])
    for metric in ['DBI', 'Inertia']:
        df[metric] = 1 - _min_max(df[metric])

    # Weighted sum. The weights do not add up to 1, which is harmless
    # because only the ranking of the scores is used.
    df['Combined'] = 0.4*df['Silhouette'] + 0.2*df['DBI'] + 0.1*df['Inertia']

    return df.loc[df['Combined'].idxmax(), 'K']

# Pick k from the combined metric score.
optimal_k = int(find_optimal_k(results_df))
print(f"\nOptimal k determined: {optimal_k}")

# --- FINAL CLUSTERING ---
final_clusterer = KMeans(
    n_clusters=optimal_k,
    init='k-means++',
    n_init=20,
    max_iter=500,
    random_state=42,
)
final_labels = final_clusterer.fit_predict(features_reduced)

# 2-D t-SNE embedding, computed once and shared by every plot below.
print("\nGenerating t-SNE visualizations for all K values...")
tsne_perplexity = min(30, len(features_reduced)//4)
tsne = TSNE(n_components=2, perplexity=tsne_perplexity,
            metric='cosine', random_state=42)
features_tsne = tsne.fit_transform(features_reduced)

# Create a figure with one subplot per k value
plt.figure(figsize=(20, 20))
n_cols = 4
n_rows = int(np.ceil(len(K_VALUES)/n_cols))

# The nearest-neighbour index depends only on the data, not on k, so it is
# built once here instead of once per subplot.
nbrs = NearestNeighbors(n_neighbors=1).fit(features_reduced)

# Silhouette was already computed for every k during evaluation; reuse it
# rather than recomputing it for each subplot.
silhouette_by_k = results_df.groupby('K')['Silhouette'].mean()

for i, k in enumerate(K_VALUES):
    # Labels are not stored by the evaluation loop, so refit for this k
    # (same parameters and seed as the evaluation run).
    clusterer = KMeans(n_clusters=k, init='k-means++', n_init=20,
                       max_iter=500, random_state=42)
    labels = clusterer.fit_predict(features_reduced)

    plt.subplot(n_rows, n_cols, i+1)

    # Points coloured by cluster label in t-SNE space
    scatter = plt.scatter(features_tsne[:, 0], features_tsne[:, 1],
                          c=labels, cmap='Spectral', alpha=0.7, s=30)

    # Centroids have no t-SNE coordinates of their own, so mark the sample
    # closest to each centroid instead. Best effort: a failure only skips
    # the markers for this subplot.
    try:
        _, indices = nbrs.kneighbors(clusterer.cluster_centers_)
        indices = indices.ravel()
        plt.scatter(features_tsne[indices, 0], features_tsne[indices, 1],
                    c='black', marker='X', s=100, alpha=0.8)
    except Exception as e:
        print(f"Could not plot centers for k={k}: {str(e)}")

    # Fall back to computing the score if this k is missing from results_df.
    sil_score = silhouette_by_k.get(k)
    if sil_score is None or pd.isna(sil_score):
        sil_score = silhouette_score(features_reduced, labels) if k > 1 else 0

    plt.title(f'k={k}\nSilhouette: {sil_score:.2f}')
    plt.xticks([])
    plt.yticks([])

plt.tight_layout()
plt.suptitle('t-SNE Visualizations for Different Cluster Counts', y=1.02, fontsize=16)
plt.show()


print("\nGenerating clustering metrics analysis...")

# One figure, three panels: silhouette, inertia (elbow) and Davies-Bouldin.
plt.figure(figsize=(15, 10))

# Per-k mean of each metric.
silhouette_scores = results_df.groupby('K')['Silhouette'].mean()
inertia_scores = results_df.groupby('K')['Inertia'].mean()
db_scores = results_df.groupby('K')['DBI'].mean()

# (subplot position, series, title, y-axis label)
metric_panels = [
    (1, silhouette_scores, 'Silhouette Scores', 'Score'),
    (2, inertia_scores, 'Inertia', 'Inertia'),
    (3, db_scores, 'Davies-Bouldin Index', 'Score'),
]

for position, scores, panel_title, y_label in metric_panels:
    plt.subplot(2, 2, position)
    plt.plot(K_VALUES, scores, marker='o')
    plt.title(panel_title)
    plt.xlabel('Number of clusters (k)')
    plt.ylabel(y_label)
    plt.xticks(K_VALUES)
    plt.grid(True)

plt.tight_layout()
plt.show()



print("\nClustering Metrics Tables:")

# Summary table: mean of each metric per k
metrics_summary = results_df[['K', 'Silhouette', 'Inertia', 'DBI']]
metrics_summary = metrics_summary.groupby('K').mean().reset_index()

# Display copy: rounded scores, inertia with thousands separators
formatted_metrics = metrics_summary.copy()
formatted_metrics['Silhouette'] = formatted_metrics['Silhouette'].round(3)
formatted_metrics['Inertia'] = formatted_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
formatted_metrics['DBI'] = formatted_metrics['DBI'].round(3)

print("\nAll Metrics Summary Table:")
print(formatted_metrics.to_string(index=False))

# Individual detailed tables
print("\nDetailed Metrics by K Value:")
for k in K_VALUES:
    # Explicit .copy(): the formatting below must write to an independent
    # frame, not to a filtered slice of results_df (SettingWithCopyWarning).
    k_metrics = results_df.loc[results_df['K'] == k,
                               ['Algorithm', 'Silhouette', 'Inertia', 'DBI']].copy()
    k_metrics['Silhouette'] = k_metrics['Silhouette'].round(3)
    k_metrics['Inertia'] = k_metrics['Inertia'].apply(lambda x: f"{x:,.0f}")
    k_metrics['DBI'] = k_metrics['DBI'].round(3)

    print(f"\nMetrics for k={k}:")
    print(k_metrics.to_string(index=False))

# save the raw (unformatted) results under an fbq-specific name so this cell
# no longer overwrites the file written by the combined-data cell
results_df.to_csv('fbq_clustering_results_clip.csv', index=False)
print("\nSaved all clustering results to 'fbq_clustering_results_clip.csv'")

print("\nFinal Analysis:")
print(f"Optimal number of clusters: {optimal_k}")

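### actual cluster generation (excel output) <br>
Generates an Excel sheet of cluster assignments and a t-SNE visualization for the selected optimal k. This cell reuses `features_reduced`, `image_paths` and `features_tsne` from whichever image-clustering cell above was run last, so run the cell for the desired data group first.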

In [None]:

SPECIFIED_K = 11  # Change to desired number of clusters (the optimal k)

print(f"\nClustering with k={SPECIFIED_K}...")
specified_clusterer = KMeans(
    n_clusters=SPECIFIED_K,
    init='k-means++',
    n_init=20,
    max_iter=500,
    random_state=42,
)
specified_labels = specified_clusterer.fit_predict(features_reduced)

# Quality metrics for this single clustering
sil_score = silhouette_score(features_reduced, specified_labels)
db_score = davies_bouldin_score(features_reduced, specified_labels)
inertia = specified_clusterer.inertia_

metric_lines = [
    f"\nCluster Metrics for k={SPECIFIED_K}:",
    f"Silhouette Score: {sil_score:.4f}",
    f"Davies-Bouldin Index: {db_score:.4f}",
    f"Inertia: {inertia:,.2f}",
]
print("\n".join(metric_lines))

# Size of every cluster, as a count and as a share of all samples
unique, counts = np.unique(specified_labels, return_counts=True)
percentages = counts / len(specified_labels)

# Mean Euclidean distance from each cluster's members to its centroid
# (0 for a cluster that received no members).
avg_distances = []
for c, centroid in enumerate(specified_clusterer.cluster_centers_):
    members = features_reduced[specified_labels == c]
    if len(members) > 0:
        avg_distances.append(np.mean(np.linalg.norm(members - centroid, axis=1)))
    else:
        avg_distances.append(0)

cluster_table = pd.DataFrame({
    'Cluster': unique,
    'Count': counts,
    'Percentage': percentages,
    'Avg Distance to Centroid': [avg_distances[c] for c in unique],
})
cluster_table = cluster_table.sort_values('Cluster')

# Formatting for display
cluster_table['Percentage'] = cluster_table['Percentage'].map("{:.1%}".format)
cluster_table['Avg Distance to Centroid'] = cluster_table['Avg Distance to Centroid'].round(4)

print(f"\nCluster Distribution for k={SPECIFIED_K}:")
print(cluster_table.to_string(index=False))

# saving results to excel
def save_cluster_assignments(image_paths, labels, k_value, features_reduced, clusterer):
    """Build a per-image table of cluster assignments.

    Despite the name, nothing is written to disk here; the caller saves the
    returned frame.

    Args:
        image_paths: paths of the images that were clustered. May also
            contain images that failed feature extraction (see below).
        labels: cluster label of every successfully processed image, in the
            same order as the rows of ``features_reduced``.
        k_value: number of clusters. Unused; kept for interface compatibility.
        features_reduced: 2-D array of the features that were clustered.
        clusterer: fitted clusterer. If it exposes ``cluster_centers_`` a
            'DistanceToCentroid' column is added.

    Returns:
        DataFrame with 'ImageName', 'Cluster' and, when available,
        'DistanceToCentroid', sorted by cluster and then by distance.

    Raises:
        ValueError: if the images cannot be matched one-to-one with labels.
    """
    labels = np.asarray(labels)

    if len(image_paths) == len(labels):
        # One label per path: every image was processed, nothing to filter.
        valid_image_paths = list(image_paths)
    else:
        # Some images were dropped during feature extraction. Re-running the
        # extractor is the only way to find out which ones (slow).
        valid_image_paths = [img for img in image_paths
                             if extract_features(img) is not None]

    if len(valid_image_paths) != len(labels):
        raise ValueError(
            f"{len(valid_image_paths)} usable images but {len(labels)} labels; "
            "image_paths does not match the clustered features"
        )

    df = pd.DataFrame({
        'ImageName': [os.path.basename(img) for img in valid_image_paths],
        'Cluster': labels
    })

    if hasattr(clusterer, 'cluster_centers_'):
        # Look up each sample's own centroid and take row-wise Euclidean norms.
        own_centroids = clusterer.cluster_centers_[labels]
        df['DistanceToCentroid'] = np.linalg.norm(
            np.asarray(features_reduced) - own_centroids, axis=1)
        df = df.sort_values(['Cluster', 'DistanceToCentroid'])
    else:
        df = df.sort_values('Cluster')

    return df

# Build the assignment table and write it to an Excel file named after k
cluster_df = save_cluster_assignments(
    image_paths,
    specified_labels,
    SPECIFIED_K,
    features_reduced,
    specified_clusterer,
)

output_filename = f'cluster_assignments_k_{SPECIFIED_K}.xlsx'
cluster_df.to_excel(output_filename, index=False)

print(f"\nSaved Excel file: '{output_filename}' with columns:")
print("• ImageName\n• Cluster\n• DistanceToCentroid")

# t-SNE scatter of the chosen clustering
print("\nGenerating cluster visualization...")
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    features_tsne[:, 0],
    features_tsne[:, 1],
    c=specified_labels,
    cmap='Spectral',
    alpha=0.7,
    s=50,
    edgecolors='w',
    linewidth=0.5,
)

# Mark, for every centroid, the sample that lies closest to it
# (best effort: a failure is reported and the markers are skipped).
if hasattr(specified_clusterer, 'cluster_centers_'):
    try:
        nbrs = NearestNeighbors(n_neighbors=1).fit(features_reduced)
        _, indices = nbrs.kneighbors(specified_clusterer.cluster_centers_)
        plt.scatter(
            features_tsne[indices, 0],
            features_tsne[indices, 1],
            c='black',
            marker='X',
            s=200,
            alpha=0.8,
            label='Cluster Centers',
        )
        plt.legend()
    except Exception as e:
        print(f"Could not plot centers: {str(e)}")

plt.colorbar(scatter, label='Cluster')
plot_title = (f'Cluster Visualization (k={SPECIFIED_K})\n'
              f'Silhouette: {sil_score:.2f} | Avg Distance: {np.mean(avg_distances):.2f}')
plt.title(plot_title)
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.show()

# cluster analysis

### most representative text extraction (word clustering analysis)

Extracts the most representative texts from each text cluster, ranking the texts by the sum of their TF-IDF weights.

In [None]:
!pip install pandas numpy scikit-learn tqdm xlsxwriter nltk

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the clustered descriptions and check the expected columns exist
df = pd.read_excel("combined_text_clusters.xlsx")
required_columns = {"Image Name", "Description", "Cluster_4"}
assert required_columns.issubset(df.columns), "Excel missing required columns."

# cluster id -> list of that cluster's descriptions
descriptions_by_cluster = df.groupby("Cluster_4")["Description"]
clusters = descriptions_by_cluster.apply(list).to_dict()

# Function to select important texts using TF-IDF
def select_representative_texts(texts, num_samples=300):
    """Pick the texts with the highest total TF-IDF weight.

    Args:
        texts: list of text strings belonging to one cluster.
        num_samples: maximum number of texts to return.

    Returns:
        Up to ``num_samples`` texts, ordered from lowest to highest score
        among the selected ones. Empty when ``texts`` is empty or
        ``num_samples`` is not positive.
    """
    # Guard the two degenerate cases explicitly: TfidfVectorizer raises on an
    # empty corpus, and a slice of [-0:] would return every text, not none.
    if len(texts) == 0 or num_samples <= 0:
        return []

    vectorizer = TfidfVectorizer(max_features=500)  # Extract key features
    X = vectorizer.fit_transform(texts)
    # Score of a text = sum of the TF-IDF weights in its row.
    importance_scores = np.array(X.sum(axis=1)).flatten()

    # argsort is ascending, so the last num_samples indices are the top ones.
    selected_indices = np.argsort(importance_scores)[-num_samples:]
    return [texts[i] for i in selected_indices]

# cluster id -> its most representative texts (at most ~300 each)
cluster_representative_texts = {
    cluster: select_representative_texts(texts)
    for cluster, texts in clusters.items()
}

# One column per cluster: build with clusters as rows, then transpose.
representative_texts_df = pd.DataFrame.from_dict(
    cluster_representative_texts, orient="index"
).transpose()

# Save to an Excel file
representative_texts_df.to_excel("combined_cluster_representative_texts.xlsx", index=False)

print("Representative texts extracted and saved to Excel.")


## cross cluster analysis

the same code for all three data groups, just different input

In [None]:
!cd /content

In [None]:
!unzip combined_image_clusters.zip

In [None]:
!unzip fbq_image_clusters.zip

In [None]:
!unzip weibo_image_clusters.zip

#### weibo

In [None]:
import os
import pandas as pd
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


IMAGE_CLUSTERS_DIR = "weibo_image_clusters"

# Map every image filename to its cluster id. Each sub-folder is named after
# the cluster id and holds that cluster's images.
image_cluster_map = {}
for cluster_id in os.listdir(IMAGE_CLUSTERS_DIR):
    cluster_path = os.path.join(IMAGE_CLUSTERS_DIR, cluster_id)
    if not os.path.isdir(cluster_path):
        continue
    # Unzipped archives often carry extra folders (e.g. __MACOSX,
    # .ipynb_checkpoints); skip anything whose name is not a cluster id.
    try:
        cluster_label = int(cluster_id)
    except ValueError:
        print(f"Skipping non-cluster folder: {cluster_id}")
        continue
    for filename in os.listdir(cluster_path):
        image_cluster_map[filename] = cluster_label

if not image_cluster_map:
    raise RuntimeError(f"No cluster folders with images found in {IMAGE_CLUSTERS_DIR}")


# Load the word clusters and give the columns standardized names
word_df = pd.read_excel("weibo_text_clusters.xlsx")
column_names = {
    "Image Name": "meme_filename",
    "Detected Text": "text",
    "Cluster_5": "word_cluster",
}
word_df = word_df.rename(columns=column_names)

# Attach each meme's image cluster; memes without one are dropped
word_df["image_cluster"] = word_df["meme_filename"].map(image_cluster_map)
word_df = word_df.dropna(subset=["image_cluster"])

# Both cluster columns as integers
for cluster_column in ("image_cluster", "word_cluster"):
    word_df[cluster_column] = word_df[cluster_column].astype(int)

# Cross-tabulation: rows = image clusters, columns = word clusters
contingency_table = pd.crosstab(
    word_df["image_cluster"], word_df["word_cluster"], margins=False
)

# Chi-square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square p-value: {p:.20f}")
if p < 0.05:
    print("Significant relationship found!")
else:
    print("No significant relationship.")

# Lift = observed count / count expected under independence
lift_table = pd.DataFrame(
    contingency_table.values / expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)

# Report the (image cluster, word cluster) pairs with the extreme lifts
lift_by_pair = lift_table.stack()
max_lift = lift_by_pair.idxmax()
min_lift = lift_by_pair.idxmin()
print(f"\nStrongest Pair: Image Cluster {max_lift[0]} + Word Cluster {max_lift[1]} (Lift = {lift_table.loc[max_lift]:.2f})")
print(f"Weakest Pair: Image Cluster {min_lift[0]} + Word Cluster {min_lift[1]} (Lift = {lift_table.loc[min_lift]:.2f})")

# Heatmap of the lift table, colour scale centred on 1
plt.figure(figsize=(10, 6))
sns.heatmap(lift_table, annot=True, cmap="coolwarm", center=1, fmt=".2f", linewidths=0.5)
plt.title("Image Clusters vs. Word Clusters (Lift Analysis)")
plt.xlabel("Word Cluster")
plt.ylabel("Image Cluster")
plt.show()

#### combined

In [None]:
import os
import pandas as pd
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


IMAGE_CLUSTERS_DIR = "combined_image_clusters"

# Map every image filename to its cluster id. Each sub-folder is named after
# the cluster id and holds that cluster's images.
image_cluster_map = {}
for cluster_id in os.listdir(IMAGE_CLUSTERS_DIR):
    cluster_path = os.path.join(IMAGE_CLUSTERS_DIR, cluster_id)
    if not os.path.isdir(cluster_path):
        continue
    # Unzipped archives often carry extra folders (e.g. __MACOSX,
    # .ipynb_checkpoints); skip anything whose name is not a cluster id.
    try:
        cluster_label = int(cluster_id)
    except ValueError:
        print(f"Skipping non-cluster folder: {cluster_id}")
        continue
    for filename in os.listdir(cluster_path):
        image_cluster_map[filename] = cluster_label

if not image_cluster_map:
    raise RuntimeError(f"No cluster folders with images found in {IMAGE_CLUSTERS_DIR}")


# Load the word clusters and give the columns standardized names
word_df = pd.read_excel("combined_text_clusters.xlsx")
column_names = {
    "Image Name": "meme_filename",
    "Description": "text",
    "Cluster_4": "word_cluster",
}
word_df = word_df.rename(columns=column_names)

# Attach each meme's image cluster; memes without one are dropped
word_df["image_cluster"] = word_df["meme_filename"].map(image_cluster_map)
word_df = word_df.dropna(subset=["image_cluster"])

# Both cluster columns as integers
for cluster_column in ("image_cluster", "word_cluster"):
    word_df[cluster_column] = word_df[cluster_column].astype(int)

# Cross-tabulation: rows = image clusters, columns = word clusters
contingency_table = pd.crosstab(
    word_df["image_cluster"], word_df["word_cluster"], margins=False
)

# Chi-square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square p-value: {p:.20f}")
if p < 0.05:
    print("Significant relationship found!")
else:
    print("No significant relationship.")

# Lift = observed count / count expected under independence
lift_table = pd.DataFrame(
    contingency_table.values / expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)

# Report the (image cluster, word cluster) pairs with the extreme lifts
lift_by_pair = lift_table.stack()
max_lift = lift_by_pair.idxmax()
min_lift = lift_by_pair.idxmin()
print(f"\nStrongest Pair: Image Cluster {max_lift[0]} + Word Cluster {max_lift[1]} (Lift = {lift_table.loc[max_lift]:.2f})")
print(f"Weakest Pair: Image Cluster {min_lift[0]} + Word Cluster {min_lift[1]} (Lift = {lift_table.loc[min_lift]:.2f})")

# Heatmap of the lift table, colour scale centred on 1
plt.figure(figsize=(10, 6))
sns.heatmap(lift_table, annot=True, cmap="coolwarm", center=1, fmt=".2f", linewidths=0.5)
plt.title("Image Clusters vs. Word Clusters (Lift Analysis)")
plt.xlabel("Word Cluster")
plt.ylabel("Image Cluster")
plt.show()

#### fbq

In [None]:
import os
import pandas as pd
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


IMAGE_CLUSTERS_DIR = "fbq_image_clusters"

# Map every image filename to its cluster id. Each sub-folder is named after
# the cluster id and holds that cluster's images.
image_cluster_map = {}
for cluster_id in os.listdir(IMAGE_CLUSTERS_DIR):
    cluster_path = os.path.join(IMAGE_CLUSTERS_DIR, cluster_id)
    if not os.path.isdir(cluster_path):
        continue
    # Unzipped archives often carry extra folders (e.g. __MACOSX,
    # .ipynb_checkpoints); skip anything whose name is not a cluster id.
    try:
        cluster_label = int(cluster_id)
    except ValueError:
        print(f"Skipping non-cluster folder: {cluster_id}")
        continue
    for filename in os.listdir(cluster_path):
        image_cluster_map[filename] = cluster_label

if not image_cluster_map:
    raise RuntimeError(f"No cluster folders with images found in {IMAGE_CLUSTERS_DIR}")


# Load the word clusters and give the columns standardized names
word_df = pd.read_excel("fbq_text_clusters.xlsx")
column_names = {
    "Image Name": "meme_filename",
    "Description": "text",
    "Cluster_4": "word_cluster",
}
word_df = word_df.rename(columns=column_names)

# Attach each meme's image cluster; memes without one are dropped
word_df["image_cluster"] = word_df["meme_filename"].map(image_cluster_map)
word_df = word_df.dropna(subset=["image_cluster"])

# Both cluster columns as integers
for cluster_column in ("image_cluster", "word_cluster"):
    word_df[cluster_column] = word_df[cluster_column].astype(int)

# Cross-tabulation: rows = image clusters, columns = word clusters
contingency_table = pd.crosstab(
    word_df["image_cluster"], word_df["word_cluster"], margins=False
)

# Chi-square test of independence
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-Square p-value: {p:.20f}")
if p < 0.05:
    print("Significant relationship found!")
else:
    print("No significant relationship.")

# Lift = observed count / count expected under independence
lift_table = pd.DataFrame(
    contingency_table.values / expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)

# Report the (image cluster, word cluster) pairs with the extreme lifts
lift_by_pair = lift_table.stack()
max_lift = lift_by_pair.idxmax()
min_lift = lift_by_pair.idxmin()
print(f"\nStrongest Pair: Image Cluster {max_lift[0]} + Word Cluster {max_lift[1]} (Lift = {lift_table.loc[max_lift]:.2f})")
print(f"Weakest Pair: Image Cluster {min_lift[0]} + Word Cluster {min_lift[1]} (Lift = {lift_table.loc[min_lift]:.2f})")

# Heatmap of the lift table, colour scale centred on 1
plt.figure(figsize=(10, 6))
sns.heatmap(lift_table, annot=True, cmap="coolwarm", center=1, fmt=".2f", linewidths=0.5)
plt.title("Image Clusters vs. Word Clusters (Lift Analysis)")
plt.xlabel("Word Cluster")
plt.ylabel("Image Cluster")
plt.show()