In [1]:
import numpy as np
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import umap


In [None]:

# Assumes 'embeddings' (your pre-computed embedding matrix) is already defined in this session,
# and that 'df' holds the documents, one row per embedding row, with a 'title' column plus a text column (e.g. 'summary').

def perform_nmf_topic_modeling(embeddings, n_topics=10, random_state=42, alpha=0.1):
    """
    Perform NMF topic modeling on embedding matrix.

    NMF only accepts non-negative input. Dense embeddings that contain
    negative values are shifted by their global minimum (a warning is
    emitted) so the factorization can run; input that is already
    non-negative, and sparse input, is passed to NMF unchanged.

    Parameters:
        embeddings: numpy array of shape (n_documents, embedding_dim)
        n_topics: number of topics to extract
        random_state: random seed for reproducibility
        alpha: regularization strength (L2, because l1_ratio is left at its
            default of 0). Passed as ``alpha_W`` on scikit-learn >= 1.0 and
            as the legacy ``alpha`` on older releases.

    Returns:
        W: document-topic matrix
        H: topic-feature matrix
        model: trained NMF model
    """
    import warnings

    data = embeddings
    # Sparse matrices expose `tocsr`; leave them alone (a constant shift
    # would densify them) and let NMF validate them itself.
    if not hasattr(data, "tocsr"):
        data = np.asarray(data)
        if data.size and data.min() < 0:
            offset = -data.min()
            warnings.warn(
                "embeddings contain negative values; adding %r to every entry "
                "so that NMF can be applied. Apply the same offset before "
                "calling model.transform on new data." % (offset,),
                RuntimeWarning,
                stacklevel=2,
            )
            data = data + offset

    nmf_kwargs = dict(
        n_components=n_topics,
        init='nndsvd',  # Use Non-negative Double SVD for better initialization
        max_iter=500,
        random_state=random_state,
    )
    try:
        # `alpha` was deprecated in scikit-learn 1.0 and removed in 1.2 in
        # favour of alpha_W / alpha_H (alpha_H defaults to 'same').
        # NOTE: the newer penalty is scaled by the data dimensions, so the
        # same number is not numerically equivalent to the legacy `alpha`.
        model = NMF(alpha_W=alpha, **nmf_kwargs)
    except TypeError:
        # scikit-learn < 1.0 only knows the single `alpha` parameter.
        model = NMF(alpha=alpha, **nmf_kwargs)

    # W is document-topic matrix, H is topic-feature matrix
    W = model.fit_transform(data)
    H = model.components_

    return W, H, model

def get_dominant_topics(W):
    """Return, for each row of W, the index of the column with the largest value."""
    # Row-wise argmax: one topic index per document row.
    strongest_topic = np.argmax(W, axis=1)
    return strongest_topic

def evaluate_topic_coherence(W, topic_idx, df, text_column='text', top_n=10):
    """
    Return the documents that score highest on one topic, for manual coherence review.

    Parameters:
        W: document-topic matrix; column ``topic_idx`` holds the scores used for ranking
        topic_idx: column of W to rank by
        df: DataFrame indexed positionally like the rows of W; must contain
            'title' and ``text_column``
        text_column: name of the column holding the document text
        top_n: maximum number of documents to return

    Returns:
        Array of [title, text] rows, highest score first (at most top_n rows).
    """
    # Negating the scores turns argsort's ascending order into a descending ranking.
    ranking = np.argsort(-W[:, topic_idx])
    best_rows = df.iloc[ranking[:top_n]]
    return best_rows[['title', text_column]].values

def visualize_topics(embeddings, topic_assignments, output_file='topic_visualization.png'):
    """
    Project the embeddings to 2-D with UMAP and save a topic-coloured scatter plot.

    Parameters:
        embeddings: matrix handed to UMAP's fit_transform
        topic_assignments: per-point values used to colour the scatter
            (presumably one topic index per document -- confirm with caller)
        output_file: path the figure is written to

    Returns:
        None. The figure is written to ``output_file`` at 300 dpi and then closed.
    """
    # Fixed seed so repeated runs use the same UMAP random state.
    projector = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
    points_2d = projector.fit_transform(embeddings)
    xs, ys = points_2d[:, 0], points_2d[:, 1]

    plt.figure(figsize=(12, 10))
    markers = plt.scatter(xs, ys, c=topic_assignments, cmap='tab10', alpha=0.7, s=10)
    plt.colorbar(markers, label='Topic')

    # Axis and figure labelling
    plt.xlabel('UMAP Dimension 1')
    plt.ylabel('UMAP Dimension 2')
    plt.title('Topic Visualization')

    plt.tight_layout()
    plt.savefig(output_file, dpi=300)
    # Close the figure once it has been written out.
    plt.close()
    
def analyze_topic_distribution(W):
    """
    Summarise how many documents have each topic as their dominant topic.

    Parameters:
        W: document-topic matrix; W.shape[1] is taken as the number of topics

    Returns:
        DataFrame with columns 'Topic', 'Documents' (count) and 'Percentage'
        (share of all documents), sorted by 'Documents' in descending order.
    """
    assignments = get_dominant_topics(W)
    n_docs = len(assignments)

    # minlength keeps a row for topics that no document selected.
    per_topic = np.bincount(assignments, minlength=W.shape[1])
    share = 100 * per_topic / n_docs

    summary = pd.DataFrame({
        'Topic': range(len(per_topic)),
        'Documents': per_topic,
        'Percentage': share,
    })
    return summary.sort_values('Documents', ascending=False)

# Main execution
# NOTE: 'embeddings' is not defined in this cell; it must already exist in the
# session before these statements run.
n_topics = 10  # Adjust based on your dataset

# Perform NMF topic modeling
# W: document-topic matrix, H: topic-feature matrix, model: fitted NMF estimator
W, H, model = perform_nmf_topic_modeling(embeddings, n_topics=n_topics)

# Get dominant topic for each document
# (index of the highest-scoring column of W in each row)
dominant_topics = get_dominant_topics(W)

# Analyze topic distribution
# (per-topic document counts and percentages, largest topic first)
topic_distribution = analyze_topic_distribution(W)
print(topic_distribution)

# Visualize topics
# Writes the scatter plot to the default output file 'topic_visualization.png'.
visualize_topics(embeddings, dominant_topics)

# # If you want to extract topic keywords (assuming you have access to vocabulary)
# # For instance, if you had a CountVectorizer used before getting embeddings:
# def extract_topic_words(H, feature_names, n_top_words=10):
#     """Extract the top words for each topic."""
#     topics = []
#     for i, topic_vector in enumerate(H):
#         topic_words = [feature_names[j] for j in topic_vector.argsort()[:-n_top_words-1:-1]]
#         topics.append({f'Topic {i}': topic_words})
#     return topics
# 
# # feature_names = count_vectorizer.get_feature_names_out()
# # topic_words = extract_topic_words(H, feature_names)