<a href="https://colab.research.google.com/github/devin-p-quinn/adjacency_matrix/blob/main/embed_cluster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Embed and Cluster**

This is a working copy of the Altalytics Embed and Cluster Module. Use this in a Google Colab environment with a T4 GPU for best results. Run this using the pickle file output from twitter_search_tool.py.
Results are used to help identify key users and clusters of narratives.

Author: Devin Quinn

In [1]:
!pip install faiss-gpu-cu12



In [2]:
# Path of the pickled DataFrame that is loaded below with pd.read_pickle
# (per the notebook intro, the output of twitter_search_tool.py).
input_filename = "/content/tweets_crackerbarrel_OR_cracker_barrel_20251025_143833.pkl"
# Path the DataFrame is pickled to again once the cluster column has been added.
output_filename = "clustering_cb.pkl"

In [3]:
import pandas as pd

In [4]:
import sys
import numpy as np

# Compatibility shim (described by its author as keeping numpy and faiss
# compatible): register extra dotted module names in sys.modules so that
# imports of those names resolve to objects reachable from np.core.
# Each entry is (alias to register, attribute path walked from np.core);
# an empty path means np.core itself.
_NUMPY_ALIASES = (
    ('numpy.core.numeric', ('numeric',)),
    ('numpy._core', ()),
    ('numpy._core.numeric', ('numeric',)),
    ('numpy.core._multiarray_umath', ('_multiarray_umath',)),
)

# Resolve and register one alias at a time, in the order listed above.
for _alias, _attr_path in _NUMPY_ALIASES:
    _module = np.core
    for _attr in _attr_path:
        _module = getattr(_module, _attr)
    sys.modules[_alias] = _module

print("Numpy patched successfully")

Numpy patched successfully


In [5]:
# Load the pickled DataFrame. NOTE: unpickling can execute arbitrary code,
# so only load pickle files that come from a source you trust.
df = pd.read_pickle(input_filename)

In [6]:
# The 'text' column of the loaded frame; this is what gets embedded below.
all_text = df['text']

In [7]:
import faiss
import json
from sentence_transformers import SentenceTransformer

In [8]:
import torch

# Embedding configuration.
# Name of the sentence-transformers model to load.
model_name = "paraphrase-multilingual-mpnet-base-v2"
# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime where CUDA is not available (slower, same results pipeline).
device = "cuda" if torch.cuda.is_available() else "cpu"
# Number of texts passed to the model per encoding batch.
batch_size = 32

In [9]:
# Instantiate the sentence-embedding model selected by model_name on the
# configured device.
embedder = SentenceTransformer(model_name, device=device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
# Embed every post. A plain list is passed instead of the pandas Series:
# encode() picks items out of its input by integer position, and on a Series
# that is a label lookup, which only works when the index is exactly 0..n-1
# (it breaks or misorders rows after any filtering or re-indexing of df).
corpus_embeddings = embedder.encode(list(all_text), batch_size=batch_size, show_progress_bar=True)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

In [11]:
# --- K-means clustering of the sentence embeddings with FAISS ---

# Target average cluster size; the number of clusters is derived from it.
avg_posts_per_cluster = 50

# Work on a float32 copy of the embeddings (astype returns a new array).
corpus_embeddings = corpus_embeddings.astype('float32')
print(f"Embedding shape: {corpus_embeddings.shape}")

print("\nClustering...")
# Embedding width is the last axis of the array.
dimensions = corpus_embeddings.shape[-1]
# Corpus size integer-divided by the target cluster size, but never fewer
# than one cluster.
num_clusters = len(corpus_embeddings) // avg_posts_per_cluster
if num_clusters < 1:
    num_clusters = 1
print(f"Creating {num_clusters} clusters")

# Scale each row to unit L2 norm (done in place on corpus_embeddings),
# then fit k-means for 20 iterations.
faiss.normalize_L2(corpus_embeddings)
clusterer = faiss.Kmeans(dimensions, num_clusters, niter=20, verbose=True)
clusterer.train(corpus_embeddings)

# Look up the single nearest centroid for every embedding. The second element
# of the search result holds the centroid ids, one row per embedding.
res = clusterer.index.search(corpus_embeddings, 1)
_, nearest_centroid = res
cluster_assignments = nearest_centroid.flatten()

# Store the cluster id of each row alongside the original data.
df["cluster"] = cluster_assignments

Embedding shape: (1999, 768)

Clustering...
Creating 39 clusters


In [12]:
from contextlib import redirect_stdout

# Write the clustering report to output_cb.txt instead of the notebook output.
# redirect_stdout puts back whatever sys.stdout was on entry, even if a
# statement below raises. Resetting to sys.__stdout__ by hand would not restore
# the notebook's own output stream (a Jupyter kernel installs its own
# sys.stdout), so later cells would stop showing printed output.
# UTF-8 is set explicitly because the text can contain non-ASCII characters.
with open('output_cb.txt', 'w', encoding='utf-8') as f, redirect_stdout(f):
    # Statistics
    print("\n=== Clustering Results ===")
    print(f"Number of clusters: {num_clusters}")
    print(f"Cluster distribution:\n{df['cluster'].value_counts().sort_index()}")

    # Save the DataFrame (including the cluster column) for downstream use.
    df.to_pickle(output_filename)
    print(f"\nSaved to {output_filename}")

    # Sample tweets: first two texts of the three lowest-numbered clusters,
    # each truncated to 150 characters.
    print("\n=== Sample Tweets per Cluster (first 3 clusters) ===")
    for cluster_id in sorted(df['cluster'].unique())[:3]:
        # Filter once and reuse the result for both the count and the samples.
        members = df[df['cluster'] == cluster_id]
        print(f"\nCluster {cluster_id} ({len(members)} tweets):")
        samples = members['text'].head(2)
        for i, text in enumerate(samples, 1):
            print(f"  {i}. {text[:150]}...")

In [14]:
# Print a summary of the DataFrame as a final sanity check.
df.info()