In [None]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Set up logging: INFO level, each record prefixed with timestamp and level name
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by the functions defined below
logger = logging.getLogger(__name__)

# Load environment variables (python-dotenv) before reading them with os.getenv
load_dotenv()

# Get connection details from environment variables.
# os.getenv returns None when a variable is unset; the __main__ guard further
# down checks both values before any database work starts.
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def update_twitter_records():
    """
    Relabel 'twitter' records in the 'cluster' collection as 'socialmedia'.

    For every document whose ``data`` field equals 'twitter':
    1. Change data field from 'twitter' to 'socialmedia'
    2. Remove twitter_ids field from these records

    Both changes are applied with a single ``update_many`` call. Afterwards
    the collection is re-counted and the figures are logged as a verification
    step. Returns early (without writing) when no record matches.

    Returns:
        None.

    Raises:
        Exception: anything raised while connecting, counting or updating is
            logged and then re-raised unchanged.
    """
    # Bound before the try block so the cleanup in `finally` never touches an
    # unassigned name when MongoClient() itself raises.
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']

        logger.info("Connected to MongoDB successfully")

        # First, check how many records match our criteria
        count_before = collection.count_documents({"data": "twitter"})
        logger.info(f"Found {count_before} records with data = 'twitter'")

        if count_before == 0:
            logger.info("No records found with data = 'twitter'. Nothing to update.")
            return

        # Update the records: change data field and remove twitter_ids field
        update_result = collection.update_many(
            {"data": "twitter"},  # Filter: find records where data = "twitter"
            {
                "$set": {"data": "socialmedia"},  # Change data to "socialmedia"
                "$unset": {"twitter_ids": ""}     # Remove twitter_ids field
            }
        )

        logger.info("Update operation completed:")
        logger.info(f"- Matched documents: {update_result.matched_count}")
        logger.info(f"- Modified documents: {update_result.modified_count}")

        # Verify the changes
        count_after_twitter = collection.count_documents({"data": "twitter"})
        count_after_socialmedia = collection.count_documents({"data": "socialmedia"})

        logger.info("Verification:")
        logger.info(f"- Records with data = 'twitter' after update: {count_after_twitter}")
        logger.info(f"- Records with data = 'socialmedia' after update: {count_after_socialmedia}")

        # Check if twitter_ids field still exists in any socialmedia records
        socialmedia_with_twitter_ids = collection.count_documents({
            "data": "socialmedia",
            "twitter_ids": {"$exists": True}
        })
        logger.info(f"- Records with data = 'socialmedia' that still have twitter_ids field: {socialmedia_with_twitter_ids}")

        if update_result.modified_count > 0:
            logger.info("✅ Update completed successfully!")
        else:
            logger.warning("⚠️ No documents were modified. They might already be in the desired state.")

    except Exception as e:
        logger.error(f"❌ Error occurred during update: {str(e)}")
        raise
    finally:
        if client is not None:
            try:
                client.close()
                logger.info("MongoDB connection closed")
            except Exception as close_error:
                # Closing stays best-effort, but the failure is now visible
                # and KeyboardInterrupt/SystemExit are no longer swallowed.
                logger.warning(f"Could not close MongoDB connection: {close_error}")

def preview_changes():
    """
    Preview what changes will be made without actually updating the data.

    Logs the total number of 'cluster' documents whose ``data`` field equals
    'twitter' and, for up to five of them, the document id, the current
    ``data`` value and whether (and how large) a ``twitter_ids`` field is
    present. Only reads are issued against the collection.

    Returns:
        None.

    Raises:
        Exception: anything raised while connecting or reading is logged and
            then re-raised unchanged.
    """
    # Bound before the try block so the cleanup in `finally` never touches an
    # unassigned name when MongoClient() itself raises.
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']

        logger.info("Connected to MongoDB for preview")

        # Find records that will be affected
        twitter_records = list(collection.find(
            {"data": "twitter"},
            {"_id": 1, "data": 1, "twitter_ids": 1}  # Only fetch relevant fields
        ).limit(5))  # Limit to 5 for preview

        total_count = collection.count_documents({"data": "twitter"})

        logger.info("Preview of changes to be made:")
        logger.info(f"- Total records to be updated: {total_count}")
        logger.info("- Sample of records that will be affected (showing up to 5):")

        for i, record in enumerate(twitter_records, 1):
            has_twitter_ids = "twitter_ids" in record
            twitter_ids = record.get("twitter_ids")
            # A present-but-null (or otherwise unsized) field counts as 0
            # instead of aborting the preview with a TypeError.
            twitter_ids_count = len(twitter_ids) if hasattr(twitter_ids, "__len__") else 0

            logger.info(f"  {i}. ID: {record['_id']}")
            logger.info(f"     Current data: '{record['data']}'")
            logger.info(f"     Has twitter_ids field: {has_twitter_ids}")
            if has_twitter_ids:
                logger.info(f"     Twitter IDs count: {twitter_ids_count}")
            logger.info("     → Will change to: data = 'socialmedia', twitter_ids field removed")
            logger.info("")

    except Exception as e:
        logger.error(f"❌ Error occurred during preview: {str(e)}")
        raise
    finally:
        if client is not None:
            try:
                client.close()
                logger.info("MongoDB connection closed")
            except Exception as close_error:
                # Closing stays best-effort, but the failure is now visible
                # and KeyboardInterrupt/SystemExit are no longer swallowed.
                logger.warning(f"Could not close MongoDB connection: {close_error}")

if __name__ == "__main__":
    # sys.exit is used instead of the bare `exit` helper: `exit` is injected
    # by the `site` module for interactive sessions and is not guaranteed to
    # exist (or to behave the same) in every runtime.
    import sys

    try:
        # Check if environment variables are set
        if not mongo_connection_string:
            logger.error("❌ MONGO_CONNECTION_STRING environment variable is not set")
            sys.exit(1)

        if not mongo_database_name:
            logger.error("❌ MONGO_DATABASE_NAME environment variable is not set")
            sys.exit(1)

        logger.info("Starting MongoDB update process...")
        logger.info("=" * 50)

        # First, preview the changes (read-only)
        logger.info("🔍 PREVIEW MODE: Showing what will be changed...")
        preview_changes()

        # Ask for confirmation before anything is written
        print("\n" + "=" * 50)
        response = input("Do you want to proceed with the update? (yes/no): ").lower().strip()

        if response in ['yes', 'y']:
            logger.info("🚀 EXECUTING UPDATE...")
            update_twitter_records()
        else:
            logger.info("❌ Update cancelled by user")

    except KeyboardInterrupt:
        logger.info("❌ Operation cancelled by user (Ctrl+C)")
    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit(1) calls
        # above pass straight through this handler.
        logger.error(f"❌ Unexpected error: {str(e)}")
        sys.exit(1)

2025-09-08 16:30:25,223 - INFO - Starting MongoDB update process...
2025-09-08 16:30:25,226 - INFO - 🔍 PREVIEW MODE: Showing what will be changed...
2025-09-08 16:30:25,248 - INFO - Connected to MongoDB for preview
2025-09-08 16:30:27,294 - INFO - Preview of changes to be made:
2025-09-08 16:30:27,296 - INFO - - Total records to be updated: 15
2025-09-08 16:30:27,297 - INFO - - Sample of records that will be affected (showing up to 5):
2025-09-08 16:30:27,298 - INFO -   1. ID: 68b7320e2066e3027cd73e80
2025-09-08 16:30:27,300 - INFO -      Current data: 'twitter'
2025-09-08 16:30:27,301 - INFO -      Has twitter_ids field: True
2025-09-08 16:30:27,302 - INFO -      Twitter IDs count: 91
2025-09-08 16:30:27,304 - INFO -      → Will change to: data = 'socialmedia', twitter_ids field removed
2025-09-08 16:30:27,305 - INFO - 
2025-09-08 16:30:27,307 - INFO -   2. ID: 68b7320e2066e3027cd73e81
2025-09-08 16:30:27,310 - INFO -      Current data: 'twitter'
2025-09-08 16:30:27,311 - INFO -      




2025-09-08 16:30:32,676 - INFO - 🚀 EXECUTING UPDATE...
2025-09-08 16:30:32,681 - INFO - Connected to MongoDB successfully
2025-09-08 16:30:34,263 - INFO - Found 15 records with data = 'twitter'
2025-09-08 16:30:34,508 - INFO - Update operation completed:
2025-09-08 16:30:34,509 - INFO - - Matched documents: 15
2025-09-08 16:30:34,510 - INFO - - Modified documents: 15
2025-09-08 16:30:35,074 - INFO - Verification:
2025-09-08 16:30:35,075 - INFO - - Records with data = 'twitter' after update: 0
2025-09-08 16:30:35,076 - INFO - - Records with data = 'socialmedia' after update: 15
2025-09-08 16:30:35,323 - INFO - - Records with data = 'socialmedia' that still have twitter_ids field: 0
2025-09-08 16:30:35,325 - INFO - ✅ Update completed successfully!
2025-09-08 16:30:35,650 - INFO - MongoDB connection closed


In [6]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the MongoDB connection settings from the environment
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

# Fail fast: both settings are required by everything below
if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

# Module-level client and database handle shared by the functions in this cell
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections: 'cluster' holds the cluster documents that get updated,
# 'socialmedia' holds the posts that are matched against them
clusters_collection = db['cluster']
socialmedia_collection = db['socialmedia']

def normalize_text(text):
    """Normalize text for better matching.

    Lower-cases ``text``, strips surrounding whitespace and removes every
    character that is neither a word character nor whitespace.

    Args:
        text: String to normalize. ``None`` and ``""`` are accepted.

    Returns:
        The normalized string, or ``""`` when ``text`` is empty or ``None``.
    """
    # Guard against missing values so one null keyphrase/topic cannot abort a run
    if not text:
        return ""
    return re.sub(r'[^\w\s]', '', text.lower().strip())

def _find_keyphrase_match(normalized_topic, normalized_keyphrases):
    """Return ``(kind, keyphrase)`` for the first keyphrase matching the topic.

    All keyphrases are first tried as a substring match (either string
    contained in the other); only if none hits are they tried again for a
    shared whitespace-separated word. ``kind`` is ``"substring"``, ``"word"``
    or ``None`` (in which case ``keyphrase`` is ``None`` as well).
    """
    if not normalized_topic:
        return None, None

    for keyphrase in normalized_keyphrases:
        if keyphrase and (keyphrase in normalized_topic or normalized_topic in keyphrase):
            return "substring", keyphrase

    topic_words = set(normalized_topic.split())
    for keyphrase in normalized_keyphrases:
        if keyphrase and topic_words.intersection(keyphrase.split()):
            return "word", keyphrase

    return None, None


def match_socialmedia_to_clusters():
    """
    Match social media posts to clusters based on dominant_topic matching keyphrases
    and update cluster documents with socialmedia_ids nested structure
    ONLY for clusters where data: "socialmedia"

    For each such cluster, every post of every channel is read from the
    'socialmedia' collection and its normalized ``dominant_topic`` is compared
    with the cluster's normalized keyphrases (substring match first, then
    shared-word match). The ids of matching posts are stored, as strings, under
    ``socialmedia_ids[<channel>]`` on the cluster document. The field is
    written even when no post matched.

    Keyphrases and topics that are not strings are ignored rather than
    aborting the run part-way through.

    Returns:
        None. Progress and results are printed.
    """

    print("Fetching clusters where data='socialmedia'...")
    # Get only clusters where data field equals "socialmedia"
    clusters = list(clusters_collection.find({"data": "socialmedia"}))
    print(f"Found {len(clusters)} socialmedia clusters to process")

    if not clusters:
        print("❌ No clusters found with data='socialmedia'")
        return

    # Debug: show the shape of the first cluster document
    print(f"Sample cluster structure: {list(clusters[0].keys())}")
    print(f"Sample cluster_id: {clusters[0].get('cluster_id', 'NOT_FOUND')}")
    print(f"Sample data field: {clusters[0].get('data', 'NOT_FOUND')}")
    print()

    # Define the social media channels to process
    channels = ["Twitter", "Reddit", "Trustpilot", "App Store/Google Play"]

    # Process each cluster
    for cluster in clusters:
        cluster_id = cluster['cluster_id']
        # `or []` also covers a keyphrases field that is present but null
        keyphrases = cluster.get('keyphrases') or []

        print(f"Processing Cluster ID: {cluster_id}")
        print(f"Cluster Name: {cluster.get('cluster_name', 'N/A')}")
        print(f"Data Field: {cluster.get('data', 'N/A')}")
        print(f"Keyphrases: {keyphrases}")

        # Normalize keyphrases for matching; non-string entries cannot be
        # normalized and are skipped
        normalized_keyphrases = [
            normalize_text(phrase) for phrase in keyphrases if isinstance(phrase, str)
        ]
        print(f"Normalized keyphrases: {normalized_keyphrases}")

        # Initialize socialmedia_ids structure
        socialmedia_ids = {channel: [] for channel in channels}

        # Process each social media channel
        for channel in channels:
            print(f"  Processing channel: {channel}")

            # Get all social media posts for this channel
            print(f"    Searching through {channel} posts...")
            socialmedia_cursor = socialmedia_collection.find(
                {'channel': channel},
                {'_id': 1, 'dominant_topic': 1}
            )

            post_count = 0
            channel_matches = []

            # Debug: first few topics seen for this channel
            sample_topics = []

            for post in socialmedia_cursor:
                post_count += 1
                post_dominant_topic = post.get('dominant_topic', '')

                if post_count <= 5:  # Collect first 5 topics for debugging
                    sample_topics.append(post_dominant_topic)

                if post_count % 1000 == 0:
                    print(f"      Processed {post_count} {channel} posts...")

                # Posts without a usable (non-empty string) topic cannot match
                if not post_dominant_topic or not isinstance(post_dominant_topic, str):
                    continue

                match_kind, keyphrase = _find_keyphrase_match(
                    normalize_text(post_dominant_topic), normalized_keyphrases
                )
                if match_kind is None:
                    continue

                channel_matches.append(str(post['_id']))
                if match_kind == "substring":
                    print(f"      Match found: {post['_id']} - Topic: '{post_dominant_topic}' matches keyphrase: '{keyphrase}'")
                else:
                    print(f"      Word match found: {post['_id']} - Topic: '{post_dominant_topic}' shares words with keyphrase: '{keyphrase}'")

            # Debug: Show sample topics for this channel
            if sample_topics:
                print(f"    Sample {channel} topics: {sample_topics}")

            # Remove duplicates while keeping cursor order, so the stored
            # array is stable between runs (set() would scramble it)
            channel_matches = list(dict.fromkeys(channel_matches))
            socialmedia_ids[channel] = channel_matches

            print(f"    Finished processing {post_count} {channel} posts")
            print(f"    Found {len(channel_matches)} matches for {channel}")

        # Calculate total matches across all channels
        total_matches = sum(len(ids) for ids in socialmedia_ids.values())

        # Always update cluster with socialmedia_ids (even if empty) - ONLY for socialmedia clusters
        try:
            result = clusters_collection.update_one(
                {'cluster_id': cluster_id, 'data': 'socialmedia'},  # data filter for safety
                {'$set': {'socialmedia_ids': socialmedia_ids}}
            )
            if result.modified_count > 0:
                print(f"  ✓ Successfully updated cluster {cluster_id} with {total_matches} total social media IDs")
                for channel, ids in socialmedia_ids.items():
                    if ids:
                        print(f"    - {channel}: {len(ids)} IDs")
            else:
                print(f"  ⚠ No update performed for cluster {cluster_id} (may already have same data)")
        except Exception as e:
            print(f"  ❌ Error updating cluster {cluster_id}: {str(e)}")

        print(f"  Total social media posts matched: {total_matches}")
        print("-" * 50)

def verify_results():
    """
    Print every socialmedia cluster together with its matched post ids.

    Reads the clusters whose ``data`` field is "socialmedia", sorted by
    ``cluster_id`` ascending, and prints for each one its name, keyphrases,
    the total number of stored ids and, per channel, the id count plus the
    first three ids. Any exception is caught and reported with a single line.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("VERIFICATION RESULTS - SOCIALMEDIA CLUSTERS ONLY")
    print(divider)

    # Projection: only the fields that are printed below
    wanted_fields = {
        'cluster_id': 1,
        'cluster_name': 1,
        'data': 1,
        'keyphrases': 1,
        'socialmedia_ids': 1
    }

    try:
        cursor = clusters_collection.find({"data": "socialmedia"}, wanted_fields)
        docs = list(cursor.sort('cluster_id', 1))

        print(f"Found {len(docs)} socialmedia clusters")

        for doc in docs:
            ids_by_channel = doc.get('socialmedia_ids', {})
            # Computed before anything is printed for this cluster
            grand_total = sum(len(channel_ids) for channel_ids in ids_by_channel.values())

            print(f"\nCluster {doc['cluster_id']}: {doc.get('cluster_name', 'N/A')} (Data: {doc.get('data', 'N/A')})")
            print(f"  Keyphrases: {doc.get('keyphrases', [])}")
            print(f"  Total Social Media IDs: {grand_total}")

            for channel, channel_ids in ids_by_channel.items():
                if not channel_ids:
                    print(f"  {channel}: 0 IDs")
                    continue

                id_count = len(channel_ids)
                print(f"  {channel}: {id_count} IDs")
                print(f"    First 3 IDs: {channel_ids[:3]}")
                if id_count > 3:
                    print(f"    ... and {id_count - 3} more")

    except Exception as err:
        print(f"❌ Error during verification: {err!s}")

def get_summary_stats():
    """Get summary statistics for socialmedia clusters only.

    Prints the number of clusters overall, the number of socialmedia clusters,
    how many of those have at least one matched post id, the number of posts
    per channel in the 'socialmedia' collection and the number of matched ids
    per channel summed over all socialmedia clusters.

    "Clusters with matches" counts clusters whose ``socialmedia_ids`` arrays
    hold at least one id. Mere presence of the field is not counted, because
    the field is also written with empty lists.

    Returns:
        None. Any exception is caught and reported with a single line.
    """
    print("\n" + "=" * 60)
    print("SUMMARY STATISTICS - SOCIALMEDIA CLUSTERS ONLY")
    print("=" * 60)

    channels = ["Twitter", "Reddit", "Trustpilot", "App Store/Google Play"]

    try:
        total_clusters = clusters_collection.count_documents({})
        socialmedia_clusters = clusters_collection.count_documents({'data': 'socialmedia'})

        # Count total matches across all channels for socialmedia clusters only
        pipeline = [
            {'$match': {'data': 'socialmedia', 'socialmedia_ids': {'$exists': True}}},
            {'$project': {
                # $ifNull turns a missing/null channel array into [] so $size works
                'twitter_count': {'$size': {'$ifNull': ['$socialmedia_ids.Twitter', []]}},
                'reddit_count': {'$size': {'$ifNull': ['$socialmedia_ids.Reddit', []]}},
                'trustpilot_count': {'$size': {'$ifNull': ['$socialmedia_ids.Trustpilot', []]}},
                'appstore_count': {'$size': {'$ifNull': ['$socialmedia_ids.App Store/Google Play', []]}}
            }},
            {'$group': {
                '_id': None,
                'total_twitter': {'$sum': '$twitter_count'},
                'total_reddit': {'$sum': '$reddit_count'},
                'total_trustpilot': {'$sum': '$trustpilot_count'},
                'total_appstore': {'$sum': '$appstore_count'},
                # A cluster counts as "with matches" only if some channel is non-empty
                'clusters_with_matches': {'$sum': {'$cond': [
                    {'$gt': [
                        {'$add': ['$twitter_count', '$reddit_count', '$trustpilot_count', '$appstore_count']},
                        0
                    ]},
                    1,
                    0
                ]}}
            }}
        ]

        result = list(clusters_collection.aggregate(pipeline))
        # No aggregation row means no cluster carries socialmedia_ids at all
        stats = result[0] if result else {}

        matches_by_channel = {
            "Twitter": stats.get('total_twitter', 0),
            "Reddit": stats.get('total_reddit', 0),
            "Trustpilot": stats.get('total_trustpilot', 0),
            "App Store/Google Play": stats.get('total_appstore', 0),
        }
        total_matches = sum(matches_by_channel.values())
        socialmedia_clusters_with_matches = stats.get('clusters_with_matches', 0)

        # Get total counts for each channel in socialmedia collection
        posts_by_channel = {
            channel: socialmedia_collection.count_documents({'channel': channel})
            for channel in channels
        }
        total_socialmedia_posts = sum(posts_by_channel.values())

        print(f"Total clusters in database: {total_clusters}")
        print(f"Socialmedia clusters: {socialmedia_clusters}")
        print(f"Socialmedia clusters with matches: {socialmedia_clusters_with_matches}")
        print(f"Socialmedia clusters without matches: {socialmedia_clusters - socialmedia_clusters_with_matches}")
        print("\nSocial Media Posts in Database:")
        for channel in channels:
            print(f"  {channel}: {posts_by_channel[channel]}")
        print(f"  Total: {total_socialmedia_posts}")
        print("\nMatching Results for Socialmedia Clusters:")
        for channel in channels:
            print(f"  {channel} matches: {matches_by_channel[channel]}")
        print(f"  Total matches: {total_matches}")

        if total_socialmedia_posts > 0:
            # NOTE(review): total_matches sums ids over all clusters, so a post
            # stored on several clusters is counted once per cluster and this
            # ratio can exceed 100%.
            match_percentage = (total_matches / total_socialmedia_posts) * 100
            print(f"  Overall match percentage: {match_percentage:.2f}%")

    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

def force_update_socialmedia_clusters():
    """Reset ``socialmedia_ids`` on every socialmedia cluster to empty lists.

    Issues one ``update_many`` against the clusters whose ``data`` field is
    'socialmedia', overwriting ``socialmedia_ids`` with an empty list for each
    of the four channels, then prints the modified and matched counts. Any
    exception is caught and reported with a single line.
    """
    separator = "=" * 60
    print("\n" + separator)
    print("FORCE UPDATE SOCIALMEDIA CLUSTERS ONLY")
    print(separator)

    # One independent empty list per channel
    blank_ids = {
        "Twitter": [],
        "Reddit": [],
        "Trustpilot": [],
        "App Store/Google Play": [],
    }

    try:
        # Filter restricts the write to socialmedia clusters
        outcome = clusters_collection.update_many(
            {'data': 'socialmedia'},
            {'$set': {'socialmedia_ids': blank_ids}}
        )

        print(f"✓ Updated {outcome.modified_count} socialmedia clusters with empty socialmedia_ids structure")
        print(f"✓ Matched {outcome.matched_count} socialmedia clusters total")

    except Exception as err:
        print(f"❌ Error during force update: {err!s}")

def get_cluster_data_distribution():
    """Print how many cluster documents exist per value of the ``data`` field.

    Groups the 'cluster' collection by ``data``, sorts the groups by size in
    descending order and prints one line per group. A group key of ``None``
    is shown as 'null/undefined'. Any exception is caught and reported with a
    single line.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("CLUSTER DATA DISTRIBUTION")
    print(rule)

    group_stage = {'$group': {'_id': '$data', 'count': {'$sum': 1}}}
    sort_stage = {'$sort': {'count': -1}}

    try:
        buckets = list(clusters_collection.aggregate([group_stage, sort_stage]))

        if not buckets:
            print("No clusters found")
            return

        print("Distribution of clusters by data field:")
        for bucket in buckets:
            label = 'null/undefined' if bucket['_id'] is None else bucket['_id']
            size = bucket['count']
            print(f"  {label}: {size} clusters")

    except Exception as err:
        print(f"❌ Error getting cluster distribution: {err!s}")

def get_channel_counts():
    """Print the post count and up to three sample posts for each channel.

    For each of the four channels the 'socialmedia' collection is counted and
    the ``_id`` and ``dominant_topic`` of up to three posts are printed.
    Exceptions are not caught here.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("CHANNEL VERIFICATION")
    print(rule)

    for channel_name in ("Twitter", "Reddit", "Trustpilot", "App Store/Google Play"):
        n_posts = socialmedia_collection.count_documents({'channel': channel_name})
        print(f"{channel_name}: {n_posts} posts")

        # Small sample so the topics can be eyeballed
        preview_cursor = socialmedia_collection.find(
            {'channel': channel_name},
            {'_id': 1, 'dominant_topic': 1}
        )
        preview = list(preview_cursor.limit(3))

        if preview:
            print("  Sample posts:")
            for doc in preview:
                print(f"    ID: {doc['_id']}, Topic: '{doc.get('dominant_topic', 'N/A')}'")
        print()

# Main execution: run the whole matching pipeline when executed as a script
if __name__ == "__main__":
    try:
        print("🚀 Starting social media-cluster matching process...")
        print("ONLY processing clusters where data='socialmedia'")
        print("=" * 60)

        # Test database connection with one read per collection
        clusters_collection.find_one()
        socialmedia_collection.find_one()
        print("✓ Database connection successful\n")

        # Steps run strictly in this order:
        #   1. show the cluster distribution by data field
        #   2. show per-channel post counts
        #   3. reset socialmedia_ids on socialmedia clusters (ensures the field exists)
        #   4. match posts to socialmedia clusters
        #   5. print the updated clusters
        #   6. print summary statistics
        for _step in (
            get_cluster_data_distribution,
            get_channel_counts,
            force_update_socialmedia_clusters,
            match_socialmedia_to_clusters,
            verify_results,
            get_summary_stats,
        ):
            _step()

        print("\n" + "=" * 60)
        print("✅ Process completed successfully!")
        print("Only clusters with data='socialmedia' were updated with socialmedia_ids")
        print("=" * 60)

    except Exception as err:
        print(f"\n❌ Error during execution: {err!s}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection (only if the client was actually created)
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting social media-cluster matching process...
ONLY processing clusters where data='socialmedia'
✓ Database connection successful


CLUSTER DATA DISTRIBUTION
Distribution of clusters by data field:
  email: 33 clusters
  voice: 24 clusters
  tickets: 16 clusters
  socialmedia: 15 clusters
  chat-chunks: 10 clusters

CHANNEL VERIFICATION
Twitter: 282 posts
  Sample posts:
    ID: 68bb12eb35db675a8b09a223, Topic: 'Balance Update Delayed'
    ID: 68bb12eb35db675a8b09a229, Topic: 'Balance Update Delayed'
    ID: 68bb12eb35db675a8b09a22e, Topic: 'Balance Update Delayed'

Reddit: 157 posts
  Sample posts:
    ID: 68bb12eb35db675a8b09a220, Topic: 'Multi Currency Problems'
    ID: 68bb12eb35db675a8b09a222, Topic: 'Multi Currency Problems'
    ID: 68bb12eb35db675a8b09a224, Topic: 'Multi Currency Problems'

Trustpilot: 1147 posts
  Sample posts:
    ID: 68bb12eb35db675a8b09a217, Topic: 'Long Wait Times'
    ID: 68bb12eb35db675a8b09a218, Topic: 'Lo

In [7]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the MongoDB connection settings from the environment
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

# Fail fast: both settings are required by everything below
if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

# Module-level client and database handle shared by the functions in this cell
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections: 'cluster' is read for keyphrases/subclusters,
# 'socialmedia' holds the posts that receive the cluster fields
clusters_collection = db['cluster']
socialmedia_collection = db['socialmedia']

def normalize_text(text):
    """Return a lower-cased, punctuation-free form of ``text`` for matching.

    Falsy input (``None`` or ``""``) yields ``""``. Otherwise the text is
    lower-cased, stripped of surrounding whitespace, and every character that
    is neither a word character nor whitespace is removed.
    """
    if text:
        trimmed = text.lower().strip()
        return re.sub(r'[^\w\s]', '', trimmed)
    return ""

def find_matching_subcluster(keyphrases, subclusters, dominant_topic):
    """
    Find the best matching subcluster for a given dominant_topic.

    Each subcluster is scored against the normalized topic: a keyphrase that
    contains, or is contained in, the topic adds 10 points; otherwise every
    word shared between keyphrase and topic adds 1 point. The subcluster with
    the highest score wins; on a tie the one encountered first is kept.

    Args:
        keyphrases: Keyphrases of the parent cluster. Accepted for interface
            compatibility; not used in the scoring.
        subclusters: Mapping of subcluster_id -> dict that may carry
            'keyphrases' (iterable of strings) and 'label'. May be empty or None.
        dominant_topic: Topic text of the post being matched.

    Returns:
        (subcluster_id, subcluster_label, matched_phrases) for the best match,
        or (None, None, []) when no subcluster scores above zero.
    """
    if not subclusters:
        return None, None, []

    normalized_topic = normalize_text(dominant_topic)
    if not normalized_topic:
        # An empty topic can never score, so skip the scan entirely
        return None, None, []

    # Loop-invariant: the topic's word set is the same for every keyphrase
    topic_words = set(normalized_topic.split())

    best_match = None
    best_score = 0

    for subcluster_id, subcluster_data in subclusters.items():
        # Calculate match score for this subcluster
        score = 0
        matched_phrases = []

        # `or []` also covers a keyphrases field that is present but null
        for keyphrase in subcluster_data.get('keyphrases') or []:
            normalized_keyphrase = normalize_text(keyphrase)
            if not normalized_keyphrase:
                continue

            if normalized_keyphrase in normalized_topic or normalized_topic in normalized_keyphrase:
                # Exact substring match
                score += 10
                matched_phrases.append(keyphrase)
                continue

            # Word-level matching
            common_words = topic_words.intersection(normalized_keyphrase.split())
            if common_words:
                score += len(common_words)
                matched_phrases.append(keyphrase)

        # Strict '>' keeps the first subcluster on equal scores
        if score > best_score:
            best_score = score
            best_match = (subcluster_id, subcluster_data.get('label', ''), matched_phrases)

    if best_match is None:
        return None, None, []
    return best_match

def _score_keyphrases(normalized_topic, topic_words, keyphrase_pairs):
    """Score a topic against ``(raw, normalized)`` keyphrase pairs.

    A substring hit in either direction adds 10; otherwise each word shared
    with the topic adds 1. Returns ``(score, matched_raw_keyphrases)``.
    """
    score = 0
    matched_phrases = []
    for raw_keyphrase, normalized_keyphrase in keyphrase_pairs:
        if not normalized_keyphrase:
            continue
        if normalized_keyphrase in normalized_topic or normalized_topic in normalized_keyphrase:
            score += 10
            matched_phrases.append(raw_keyphrase)
            continue
        common_words = topic_words.intersection(normalized_keyphrase.split())
        if common_words:
            score += len(common_words)
            matched_phrases.append(raw_keyphrase)
    return score, matched_phrases


def match_and_update_socialmedia():
    """
    Match social media posts to clusters and update socialmedia collection
    with cluster information.

    Every post of every channel is scored against the keyphrases of each
    cluster whose ``data`` field is "socialmedia". The post is assigned to the
    highest-scoring cluster (first one wins on a tie) and receives
    ``kmeans_cluster_id`` and ``dominant_cluster_label``; when
    ``find_matching_subcluster`` finds a subcluster inside that cluster,
    ``subcluster_id`` and ``subcluster_label`` are written as well. Posts
    without a usable topic, or with no cluster scoring above zero, are left
    untouched.

    Returns:
        None. Progress and totals are printed.
    """

    print("Fetching clusters where data='socialmedia'...")
    # Get only clusters where data field equals "socialmedia"
    clusters = list(clusters_collection.find({"data": "socialmedia"}))
    print(f"Found {len(clusters)} socialmedia clusters to process")

    if not clusters:
        print("❌ No clusters found with data='socialmedia'")
        return

    # Create a lookup dictionary for faster cluster access. Keyphrases are
    # normalized once here instead of once per post.
    cluster_lookup = {}
    for cluster in clusters:
        # `or` also covers fields that are present but null
        keyphrases = cluster.get('keyphrases') or []
        cluster_lookup[cluster['cluster_id']] = {
            'keyphrases': keyphrases,
            'keyphrase_pairs': [
                (phrase, normalize_text(phrase))
                for phrase in keyphrases
                if isinstance(phrase, str)
            ],
            'dominant_label': cluster.get('dominant_label', ''),
            'subclusters': cluster.get('subclusters') or {},
        }

    print(f"Created lookup for {len(cluster_lookup)} clusters")
    print()

    # Define the social media channels to process
    channels = ["Twitter", "Reddit", "Trustpilot", "App Store/Google Play"]

    total_updated = 0
    total_processed = 0

    # Process each social media channel
    for channel in channels:
        print(f"Processing channel: {channel}")

        # Get all social media posts for this channel
        socialmedia_cursor = socialmedia_collection.find(
            {'channel': channel},
            {'_id': 1, 'dominant_topic': 1}
        )

        channel_updated = 0
        channel_processed = 0

        for post in socialmedia_cursor:
            channel_processed += 1
            total_processed += 1

            if channel_processed % 1000 == 0:
                print(f"  Processed {channel_processed} {channel} posts...")

            post_id = post['_id']
            dominant_topic = post.get('dominant_topic', '')

            # Skip posts without a usable (non-empty string) topic
            if not dominant_topic or not isinstance(dominant_topic, str):
                continue

            normalized_topic = normalize_text(dominant_topic)
            if not normalized_topic:
                # Nothing can score against an empty normalized topic
                continue
            topic_words = set(normalized_topic.split())

            # Try to match against each cluster; strict '>' keeps the first
            # cluster on equal scores
            best_score = 0
            best_cluster = None
            for cluster_id, cluster_data in cluster_lookup.items():
                score, matched_phrases = _score_keyphrases(
                    normalized_topic, topic_words, cluster_data['keyphrase_pairs']
                )
                if score > best_score:
                    best_score = score
                    best_cluster = (cluster_id, cluster_data, matched_phrases)

            if best_cluster is None:
                continue

            cluster_id, cluster_data, matched_phrases = best_cluster

            # Resolve the subcluster once, for the winning cluster only
            subcluster_id, subcluster_label, _ = find_matching_subcluster(
                cluster_data['keyphrases'], cluster_data['subclusters'], dominant_topic
            )

            # Prepare update data
            update_data = {
                'kmeans_cluster_id': cluster_id,
                'dominant_cluster_label': cluster_data['dominant_label']
            }

            # Only add subcluster info if found.
            # NOTE(review): subcluster fields written by an earlier run are not
            # cleared when no subcluster matches now - confirm whether stale
            # values should be $unset.
            if subcluster_id is not None:
                update_data['subcluster_id'] = subcluster_id
                update_data['subcluster_label'] = subcluster_label

            # Update the document
            try:
                result = socialmedia_collection.update_one(
                    {'_id': post_id},
                    {'$set': update_data}
                )
            except Exception as e:
                print(f"    ❌ Error updating {post_id}: {str(e)}")
                continue

            # modified_count is 0 when the stored values were already identical
            if result.modified_count > 0:
                channel_updated += 1
                total_updated += 1

                if channel_updated <= 5:  # Show first few matches for debugging
                    print(f"    ✓ Updated {post_id}: '{dominant_topic}' -> Cluster {cluster_id}")
                    print(f"      Matched phrases: {matched_phrases[:3]}")
                    if subcluster_id is not None:
                        print(f"      Subcluster: {subcluster_id} - {subcluster_label}")

        print(f"  Finished processing {channel_processed} {channel} posts")
        print(f"  Updated {channel_updated} posts with cluster information")
        print()

    print("✅ Process completed!")
    print(f"Total posts processed: {total_processed}")
    print(f"Total posts updated: {total_updated}")

def verify_results():
    """
    Print a small sample of clustered posts for every channel.

    For each channel in a fixed list, up to five documents that already have
    a ``kmeans_cluster_id`` are read from the module-level
    ``socialmedia_collection`` and their topic, cluster and subcluster fields
    are printed. A field missing from a document is printed as 'N/A'.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("VERIFICATION RESULTS")
    print(rule)

    # Only the fields that are printed below are requested.
    wanted_fields = {
        '_id': 1,
        'dominant_topic': 1,
        'kmeans_cluster_id': 1,
        'dominant_cluster_label': 1,
        'subcluster_id': 1,
        'subcluster_label': 1,
    }
    # (printed caption, document key) pairs, in display order.
    cluster_rows = (
        ("Cluster ID", 'kmeans_cluster_id'),
        ("Cluster Label", 'dominant_cluster_label'),
        ("Subcluster ID", 'subcluster_id'),
        ("Subcluster Label", 'subcluster_label'),
    )

    for channel in ("Twitter", "Reddit", "Trustpilot", "App Store/Google Play"):
        print(f"\n{channel} Posts with Cluster Information:")

        criteria = {
            'channel': channel,
            'kmeans_cluster_id': {'$exists': True},
        }
        sample = list(socialmedia_collection.find(criteria, wanted_fields).limit(5))

        if not sample:
            print(f"  No updated posts found for {channel}")
            continue

        for post in sample:
            print(f"  ID: {post['_id']}")
            print(f"  Topic: '{post.get('dominant_topic', 'N/A')}'")
            for caption, key in cluster_rows:
                print(f"  {caption}: {post.get(key, 'N/A')}")
            print("  ---")

def get_summary_stats():
    """
    Print cluster-coverage counts per channel and for the whole collection.

    For each channel in a fixed list this counts, in the module-level
    ``socialmedia_collection``: all posts, posts that have a
    ``kmeans_cluster_id``, and posts that have both a ``kmeans_cluster_id``
    and a ``subcluster_id``. The same three counts are then taken without a
    channel filter. Any exception is caught and reported with a printed
    message; nothing is returned.
    """
    print("\n" + "=" * 60)
    print("SUMMARY STATISTICS")
    print("=" * 60)

    def share(part, whole):
        # Percentage of ``whole`` represented by ``part``; 0 when ``whole`` is not positive.
        return part / whole * 100 if whole > 0 else 0

    # Filter fragments shared by the per-channel and the overall counts.
    has_cluster = {'kmeans_cluster_id': {'$exists': True}}
    has_subcluster = {**has_cluster, 'subcluster_id': {'$exists': True}}

    try:
        for channel in ("Twitter", "Reddit", "Trustpilot", "App Store/Google Play"):
            in_channel = {'channel': channel}
            total_posts = socialmedia_collection.count_documents(in_channel)
            updated_posts = socialmedia_collection.count_documents(
                {**in_channel, **has_cluster}
            )
            posts_with_subclusters = socialmedia_collection.count_documents(
                {**in_channel, **has_subcluster}
            )

            percentage = share(updated_posts, total_posts)
            # Subcluster coverage is relative to the clustered posts, not to all posts.
            subcluster_percentage = share(posts_with_subclusters, updated_posts)

            print(f"{channel}:")
            print(f"  Total posts: {total_posts}")
            print(f"  Posts with cluster info: {updated_posts} ({percentage:.2f}%)")
            print(f"  Posts with subcluster info: {posts_with_subclusters} ({subcluster_percentage:.2f}%)")
            print()

        # Collection-wide figures
        total_all_posts = socialmedia_collection.count_documents({})
        total_updated_posts = socialmedia_collection.count_documents(has_cluster)
        total_with_subclusters = socialmedia_collection.count_documents(has_subcluster)

        overall_percentage = share(total_updated_posts, total_all_posts)

        print("Overall Statistics:")
        print(f"  Total social media posts: {total_all_posts}")
        print(f"  Posts with cluster information: {total_updated_posts} ({overall_percentage:.2f}%)")
        print(f"  Posts with subcluster information: {total_with_subclusters}")

    except Exception as exc:
        print(f"❌ Error getting statistics: {exc!s}")

def clean_existing_cluster_fields():
    """
    Strip the four cluster-assignment fields from every social media post.

    Issues a single ``update_many`` with an empty filter against the
    module-level ``socialmedia_collection`` and prints how many documents were
    modified. Any exception is caught and reported with a printed message.
    """
    print("\n" + "=" * 60)
    print("CLEANING EXISTING CLUSTER FIELDS")
    print("=" * 60)

    cluster_fields = (
        'kmeans_cluster_id',
        'subcluster_id',
        'subcluster_label',
        'dominant_cluster_label',
    )

    try:
        # Each field is mapped to '' in the $unset spec.
        outcome = socialmedia_collection.update_many(
            {},
            {'$unset': dict.fromkeys(cluster_fields, '')},
        )
        print(f"✓ Cleaned cluster fields from {outcome.modified_count} documents")

    except Exception as exc:
        print(f"❌ Error during cleaning: {exc!s}")

# Script entry point: run the mapping, then report on what was written
if __name__ == "__main__":
    try:
        print(
            "🚀 Starting social media-cluster mapping process...",
            "Updating socialmedia collection with cluster information",
            "=" * 60,
            sep="\n",
        )

        # One read against each collection serves as the connection test
        # before the update step runs.
        clusters_collection.find_one()
        socialmedia_collection.find_one()
        print("✓ Database connection successful\n")

        # To start fresh, call clean_existing_cluster_fields() at this point.

        # Match posts to clusters, then show samples and coverage figures.
        match_and_update_socialmedia()
        verify_results()
        get_summary_stats()

        print(
            "\n" + "=" * 60,
            "✅ Process completed successfully!",
            "Social media posts have been updated with cluster information",
            "=" * 60,
            sep="\n",
        )

    except Exception as exc:
        print(
            f"\n❌ Error during execution: {exc!s}",
            "Please check your environment variables and database connection.",
            sep="\n",
        )
    finally:
        # At module level locals() is the module namespace, so the client is
        # closed only if a `client` name has been defined.
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting social media-cluster mapping process...
Updating socialmedia collection with cluster information
✓ Database connection successful

Fetching clusters where data='socialmedia'...
Found 15 socialmedia clusters to process
Created lookup for 15 clusters

Processing channel: Twitter
    ✓ Updated 68bb12eb35db675a8b09a223: 'Balance Update Delayed' -> Cluster 6
      Matched phrases: ['Balance Update Delayed']
      Subcluster: 1 - Software & Feature Errors
    ✓ Updated 68bb12eb35db675a8b09a229: 'Balance Update Delayed' -> Cluster 6
      Matched phrases: ['Balance Update Delayed']
      Subcluster: 1 - Software & Feature Errors
    ✓ Updated 68bb12eb35db675a8b09a22e: 'Balance Update Delayed' -> Cluster 6
      Matched phrases: ['Balance Update Delayed']
      Subcluster: 1 - Software & Feature Errors
    ✓ Updated 68bb12eb35db675a8b09a233: 'Balance Update Delayed' -> Cluster 6
      Matched phrases: ['Balance Update Delayed']
      Subcl

In [8]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read the MongoDB connection settings from the environment.
# No connection is opened here: each function below creates its own MongoClient.
# os.getenv returns None for an unset variable; the __main__ guard at the end of
# this cell checks both values before doing any work.
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

def add_kmeans_cluster_keyphrase():
    """
    Copy ``dominant_topic`` into ``kmeans_cluster_keyphrase`` on social media posts.

    Opens its own MongoDB connection, prints document counts, runs one
    pipeline-style ``update_many`` over every document whose
    ``dominant_topic`` exists and is not None, then prints up to five sample
    documents and a final count. Any exception is caught and printed; the
    function returns nothing and does not re-raise.
    """
    client = None  # remains None if the client could not be created
    try:
        # Connect and select the target collection
        client = MongoClient(MONGO_CONNECTION_STRING)
        collection = client[MONGO_DATABASE_NAME]['socialmedia']

        print("Connected to MongoDB successfully!")
        print(f"Database: {MONGO_DATABASE_NAME}")
        print("Collection: socialmedia")

        # Filters used more than once below
        has_topic = {"dominant_topic": {"$exists": True, "$ne": None}}
        has_keyphrase = {"kmeans_cluster_keyphrase": {"$exists": True}}

        total_documents = collection.count_documents({})
        print(f"Total documents in collection: {total_documents}")

        documents_with_dominant_topic = collection.count_documents(has_topic)
        print(f"Documents with dominant_topic field: {documents_with_dominant_topic}")

        # The update is given as a pipeline (a list) so that "$dominant_topic"
        # refers to each document's own field value.
        copy_topic = [{"$set": {"kmeans_cluster_keyphrase": "$dominant_topic"}}]
        update_result = collection.update_many(has_topic, copy_topic)

        print("\nUpdate completed!")
        print(f"Documents matched: {update_result.matched_count}")
        print(f"Documents modified: {update_result.modified_count}")

        # Spot-check a handful of updated documents
        print("\n--- Sample documents after update ---")
        shown_fields = ("_id", "dominant_topic", "kmeans_cluster_keyphrase")
        samples = collection.find(
            has_keyphrase,
            {"dominant_topic": 1, "kmeans_cluster_keyphrase": 1, "_id": 1},
        ).limit(5)

        for number, doc in enumerate(samples, start=1):
            print(f"\nSample {number}:")
            for field in shown_fields:
                print(f"  {field}: {doc.get(field)}")

        # Final count of documents that now carry the new field
        documents_with_new_field = collection.count_documents(has_keyphrase)
        print("\nFinal verification:")
        print(f"Documents with kmeans_cluster_keyphrase field: {documents_with_new_field}")

        client.close()
        print("\nConnection closed successfully!")

    except Exception as exc:
        print(f"An error occurred: {exc!s}")
        if client is not None:
            client.close()

def verify_update():
    """
    Report documents whose ``kmeans_cluster_keyphrase`` differs from ``dominant_topic``.

    Opens its own MongoDB connection and aggregates over the ``socialmedia``
    collection. Prints a warning with the first three mismatching documents,
    or a confirmation line when there are none. Any exception is caught and
    printed; nothing is returned.
    """
    client = None  # remains None if the client could not be created
    try:
        client = MongoClient(MONGO_CONNECTION_STRING)
        collection = client[MONGO_DATABASE_NAME]['socialmedia']

        # Stage 1: keep only documents that have both fields.
        both_present = {
            "$match": {
                "dominant_topic": {"$exists": True},
                "kmeans_cluster_keyphrase": {"$exists": True},
            }
        }
        # Stage 2: compute a flag telling whether the two values are equal.
        flag_equality = {
            "$project": {
                "dominant_topic": 1,
                "kmeans_cluster_keyphrase": 1,
                "is_match": {"$eq": ["$dominant_topic", "$kmeans_cluster_keyphrase"]},
            }
        }
        # Stage 3: keep only the documents where they differ.
        only_differing = {"$match": {"is_match": False}}

        mismatched_docs = list(
            collection.aggregate([both_present, flag_equality, only_differing])
        )

        if not mismatched_docs:
            print("✓ All documents have matching values between dominant_topic and kmeans_cluster_keyphrase")
        else:
            print(f"Warning: Found {len(mismatched_docs)} documents with mismatched values:")
            # Only the first three mismatches are listed.
            for doc in mismatched_docs[:3]:
                print(f"  ID: {doc['_id']}")
                print(f"    dominant_topic: {doc['dominant_topic']}")
                print(f"    kmeans_cluster_keyphrase: {doc['kmeans_cluster_keyphrase']}")

        client.close()

    except Exception as exc:
        print(f"Error during verification: {exc!s}")
        if client is not None:
            client.close()

if __name__ == "__main__":
    print(
        "Starting MongoDB field copy operation...",
        "=" * 50,
        sep="\n",
    )

    # Both settings must be present and non-empty before anything runs.
    if not (MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME):
        print(
            "Error: Missing required environment variables!",
            "Please ensure MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME are set in your .env file",
            sep="\n",
        )
        exit(1)

    # Copy dominant_topic into kmeans_cluster_keyphrase
    add_kmeans_cluster_keyphrase()

    # Then compare the two fields
    print(
        "\n" + "=" * 50,
        "Running verification...",
        sep="\n",
    )
    verify_update()

Starting MongoDB field copy operation...
Connected to MongoDB successfully!
Database: sparzaai
Collection: socialmedia
Total documents in collection: 2287
Documents with dominant_topic field: 2237

Update completed!
Documents matched: 2237
Documents modified: 2237

--- Sample documents after update ---

Sample 1:
  _id: 68bb12eb35db675a8b09a217
  dominant_topic: Long Wait Times
  kmeans_cluster_keyphrase: Long Wait Times

Sample 2:
  _id: 68bb12eb35db675a8b09a218
  dominant_topic: Long Wait Times
  kmeans_cluster_keyphrase: Long Wait Times

Sample 3:
  _id: 68bb12eb35db675a8b09a219
  dominant_topic: Long Wait Times
  kmeans_cluster_keyphrase: Long Wait Times

Sample 4:
  _id: 68bb12eb35db675a8b09a21a
  dominant_topic: Long Wait Times
  kmeans_cluster_keyphrase: Long Wait Times

Sample 5:
  _id: 68bb12eb35db675a8b09a21b
  dominant_topic: High Transaction Fees
  kmeans_cluster_keyphrase: High Transaction Fees

Final verification:
Documents with kmeans_cluster_keyphrase field: 2237

Conne

In [3]:
# Remove socialmedia_ids field from cluster collection where data = "socialmedia"
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
# (os.getenv returns None for a variable that is not set)
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Connect to MongoDB and select the 'cluster' collection
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
collection = db['cluster']

# Filter for the records this cell is about: data == "socialmedia" AND the
# socialmedia_ids field is present. Reused for the before and after counts.
query = {
    "data": "socialmedia",
    "socialmedia_ids": {"$exists": True}
}

count_before = collection.count_documents(query)
logger.info(f"Found {count_before} records with data='socialmedia' and socialmedia_ids field")

# Log up to two matching documents before the field is removed.
# "Deletion" in the log messages refers to the field; no document is deleted.
sample_docs = list(collection.find(query).limit(2))
logger.info("Sample documents before deletion:")
for doc in sample_docs:
    logger.info(f"Document ID: {doc['_id']}")
    logger.info(f"Cluster ID: {doc.get('cluster_id')}")
    logger.info(f"Data: {doc.get('data')}")
    # assumes socialmedia_ids is a mapping (.keys() is called on it) — TODO confirm schema
    logger.info(f"Socialmedia_ids keys: {list(doc.get('socialmedia_ids', {}).keys())}")
    logger.info("---")

# Remove the socialmedia_ids field from all records where data = "socialmedia".
# NOTE: this filter is broader than `query` (it has no $exists clause), so
# matched_count covers every data='socialmedia' record, including any that
# do not have the field.
result = collection.update_many(
    {"data": "socialmedia"},
    {"$unset": {"socialmedia_ids": ""}}
)

logger.info(f"Update operation completed:")
logger.info(f"Matched documents: {result.matched_count}")
logger.info(f"Modified documents: {result.modified_count}")

# Verify the removal: re-run the original filter to count records that still have the field
count_after = collection.count_documents(query)
logger.info(f"Records with socialmedia_ids field after deletion: {count_after}")

# Log up to two data='socialmedia' documents after the update and report
# whether each still contains the socialmedia_ids key
sample_after = list(collection.find({"data": "socialmedia"}).limit(2))
logger.info("Sample documents after deletion:")
for doc in sample_after:
    logger.info(f"Document ID: {doc['_id']}")
    logger.info(f"Cluster ID: {doc.get('cluster_id')}")
    logger.info(f"Data: {doc.get('data')}")
    logger.info(f"Has socialmedia_ids: {'socialmedia_ids' in doc}")
    logger.info("---")

# Logged unconditionally once the statements above have run; the client is not closed in this cell
logger.info("Operation completed successfully!")

2025-09-08 21:49:09,950 - INFO - Found 15 records with data='socialmedia' and socialmedia_ids field
2025-09-08 21:49:10,228 - INFO - Sample documents before deletion:
2025-09-08 21:49:10,230 - INFO - Document ID: 68b7320e2066e3027cd73e80
2025-09-08 21:49:10,233 - INFO - Cluster ID: 0
2025-09-08 21:49:10,234 - INFO - Data: socialmedia
2025-09-08 21:49:10,235 - INFO - Socialmedia_ids keys: ['App Store/Google Play', 'Trustpilot', 'Twitter', 'Reddit']
2025-09-08 21:49:10,236 - INFO - ---
2025-09-08 21:49:10,239 - INFO - Document ID: 68b7320e2066e3027cd73e81
2025-09-08 21:49:10,240 - INFO - Cluster ID: 1
2025-09-08 21:49:10,241 - INFO - Data: socialmedia
2025-09-08 21:49:10,242 - INFO - Socialmedia_ids keys: ['Trustpilot', 'Reddit']
2025-09-08 21:49:10,243 - INFO - ---
2025-09-08 21:49:10,491 - INFO - Update operation completed:
2025-09-08 21:49:10,493 - INFO - Matched documents: 15
2025-09-08 21:49:10,495 - INFO - Modified documents: 15
2025-09-08 21:49:10,758 - INFO - Records with socialm

In [4]:
# Add socialmedia_ids field to cluster collection by matching with socialmedia collection
# Save ObjectIDs as strings
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
from collections import defaultdict

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
# (os.getenv returns None for a variable that is not set)
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Connect to MongoDB and select the two collections used below:
# 'cluster' is written to, 'socialmedia' is only read
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
cluster_collection = db['cluster']
socialmedia_collection = db['socialmedia']

# Get all cluster IDs from cluster collection where data = "socialmedia".
# Only cluster_id and _id are fetched; indexing with ['cluster_id'] raises
# KeyError if a matching cluster document has no cluster_id.
cluster_query = {"data": "socialmedia"}
clusters = list(cluster_collection.find(cluster_query, {"cluster_id": 1, "_id": 1}))
cluster_ids = [cluster['cluster_id'] for cluster in clusters]

logger.info(f"Found {len(cluster_ids)} clusters with data='socialmedia'")
logger.info(f"Cluster IDs: {cluster_ids}")

# Process each cluster: collect the IDs of its posts, grouped by channel,
# and store that mapping on the cluster document
for cluster in clusters:
    cluster_id = cluster['cluster_id']
    cluster_doc_id = cluster['_id']
    
    logger.info(f"Processing cluster_id: {cluster_id}")
    
    # Find all socialmedia documents whose kmeans_cluster_id equals this cluster_id
    socialmedia_query = {"kmeans_cluster_id": cluster_id}
    socialmedia_docs = list(socialmedia_collection.find(socialmedia_query, {"_id": 1, "channel": 1}))
    
    logger.info(f"Found {len(socialmedia_docs)} socialmedia documents for cluster_id {cluster_id}")
    
    # Group ObjectIDs by channel and convert to strings
    socialmedia_ids_by_channel = defaultdict(list)
    
    for doc in socialmedia_docs:
        channel = doc.get('channel')
        # Documents with a missing or falsy channel are left out of the mapping
        if channel:
            # Convert ObjectId to string
            object_id_string = str(doc['_id'])
            socialmedia_ids_by_channel[channel].append(object_id_string)
    
    # Convert defaultdict to regular dict and show counts
    socialmedia_ids = dict(socialmedia_ids_by_channel)
    
    logger.info(f"Channel distribution for cluster_id {cluster_id}:")
    for channel, ids in socialmedia_ids.items():
        logger.info(f"  {channel}: {len(ids)} documents")
        # Show first few IDs as examples
        if ids:
            logger.info(f"    Sample IDs: {ids[:3]}...")
    
    # Update the cluster document with socialmedia_ids.
    # $set replaces any existing socialmedia_ids value; when no posts matched,
    # the stored value is an empty dict.
    update_result = cluster_collection.update_one(
        {"_id": cluster_doc_id},
        {"$set": {"socialmedia_ids": socialmedia_ids}}
    )
    
    # A modified_count of 0 is logged as "no changes", not treated as an error
    if update_result.modified_count > 0:
        logger.info(f"Successfully updated cluster_id {cluster_id} with socialmedia_ids")
    else:
        logger.info(f"No changes made to cluster_id {cluster_id}")

# Verify the updates by reading back up to three clusters that now have the field
logger.info("\nVerification - Sample clusters with socialmedia_ids:")
sample_clusters = list(cluster_collection.find(
    {"data": "socialmedia", "socialmedia_ids": {"$exists": True}}
).limit(3))

for cluster in sample_clusters:
    logger.info(f"Cluster ID: {cluster.get('cluster_id')}")
    logger.info(f"Dominant Label: {cluster.get('dominant_label')}")
    socialmedia_ids = cluster.get('socialmedia_ids', {})
    logger.info("Socialmedia_ids distribution:")
    for channel, ids in socialmedia_ids.items():
        logger.info(f"  {channel}: {len(ids)} string IDs")
        # Show first few string IDs as examples
        if ids:
            logger.info(f"    Sample: {ids[:2]}")
    logger.info("---")

# Logged unconditionally once the statements above have run; the client is not closed in this cell
logger.info("Operation completed successfully!")

2025-09-08 21:49:47,782 - INFO - Found 15 clusters with data='socialmedia'
2025-09-08 21:49:47,785 - INFO - Cluster IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
2025-09-08 21:49:47,786 - INFO - Processing cluster_id: 0
2025-09-08 21:49:48,043 - INFO - Found 99 socialmedia documents for cluster_id 0
2025-09-08 21:49:48,044 - INFO - Channel distribution for cluster_id 0:
2025-09-08 21:49:48,046 - INFO -   App Store/Google Play: 43 documents
2025-09-08 21:49:48,048 - INFO -     Sample IDs: ['68bb12eb35db675a8b09a3c5', '68bb12eb35db675a8b09a3d3', '68bb12eb35db675a8b09a3dd']...
2025-09-08 21:49:48,049 - INFO -   Trustpilot: 46 documents
2025-09-08 21:49:48,050 - INFO -     Sample IDs: ['68bb12eb35db675a8b09a51f', '68bb12eb35db675a8b09a522', '68bb12eb35db675a8b09a523']...
2025-09-08 21:49:48,052 - INFO -   Twitter: 8 documents
2025-09-08 21:49:48,053 - INFO -     Sample IDs: ['68bb12eb35db675a8b09a705', '68bb12eb35db675a8b09a708', '68bb12eb35db675a8b09a70d']...
2025-09-08 21:49:48