In [1]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']
twitter_collection = db['twitter']

def normalize_text(text):
    """Normalize text for better matching"""
    return re.sub(r'[^\w\s]', '', text.lower().strip())

def match_twitter_to_clusters():
    """
    Match twitter to clusters based on dominant_topic matching keyphrases
    and update cluster documents with twitter_ids array
    """
    
    print("Fetching clusters with data: 'twitter'...")
    # Get only clusters that have data field set to "twitter"
    clusters = list(clusters_collection.find({"data": "twitter"}))
    print(f"Found {len(clusters)} clusters with data='twitter' to process\n")
    
    # Process each cluster
    for cluster in clusters:
        cluster_id = cluster['cluster_id']
        keyphrases = cluster.get('keyphrases', [])
        
        print(f"Processing Cluster ID: {cluster_id}")
        print(f"Cluster Name: {cluster.get('cluster_name', 'N/A')}")
        print(f"Keyphrases: {keyphrases}")
        
        # Normalize keyphrases for matching
        normalized_keyphrases = [normalize_text(phrase) for phrase in keyphrases]
        print(f"Normalized keyphrases: {normalized_keyphrases}")
        
        # Find matching twitter
        matching_twitter_ids = []
        
        # Get all twitter - using cursor for better memory management
        print("  Searching through twitter...")
        twitter_cursor = twitter_collection.find({}, {
            '_id': 1, 
            'dominant_topic': 1
        })
        
        twitter_count = 0
        for twitter in twitter_cursor:
            twitter_count += 1
            if twitter_count % 1000 == 0:
                print(f"    Processed {twitter_count} twitter...")
                
            twitter_dominant_topic = twitter.get('dominant_topic', '')
            
            if twitter_dominant_topic:
                normalized_topic = normalize_text(twitter_dominant_topic)
                
                # Check if any keyphrase matches the dominant topic
                for keyphrase in normalized_keyphrases:
                    if keyphrase and normalized_topic:  # Ensure both are not empty
                        if keyphrase in normalized_topic or normalized_topic in keyphrase:
                            matching_twitter_ids.append(str(twitter['_id']))
                            print(f"    Match found: {twitter['_id']} - Topic: '{twitter_dominant_topic}' matches keyphrase: '{keyphrase}'")
                            break
        
        print(f"  Finished processing {twitter_count} twitter")
        
        # Remove duplicates (in case a twitter matches multiple times)
        matching_twitter_ids = list(set(matching_twitter_ids))
        
        # Update cluster with twitter_ids (only for clusters with data: "twitter")
        try:
            if matching_twitter_ids:
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'twitter'},
                    {'$set': {'twitter_ids': matching_twitter_ids}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Successfully updated cluster {cluster_id} with {len(matching_twitter_ids)} twitter IDs")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has same twitter data")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'twitter'")
            else:
                # Set empty array if no matches found
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'twitter'},
                    {'$set': {'twitter_ids': []}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Set empty twitter_ids array for cluster {cluster_id} (no matches found)")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has empty twitter_ids array")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'twitter'")
        except Exception as e:
            print(f"  ❌ Error updating cluster {cluster_id}: {str(e)}")
        
        print(f"  Total unique twitter matched: {len(matching_twitter_ids)}")
        print("-" * 50)

def verify_results():
    """
    Verify the results by displaying updated clusters
    """
    print("\n" + "=" * 60)
    print("VERIFICATION RESULTS")
    print("=" * 60)
    
    try:
        clusters = list(clusters_collection.find({"data": "twitter"}, {
            'cluster_id': 1, 
            'cluster_name': 1, 
            'keyphrases': 1, 
            'twitter_ids': 1,
            'data': 1
        }).sort('cluster_id', 1))
        
        for cluster in clusters:
            twitter_count = len(cluster.get('twitter_ids', []))
            print(f"\nCluster {cluster['cluster_id']}: {cluster.get('cluster_name', 'N/A')} (data: {cluster.get('data', 'N/A')})")
            print(f"  Keyphrases: {cluster.get('keyphrases', [])}")
            print(f"  twitter IDs count: {twitter_count}")
            if twitter_count > 0:
                print(f"  First 3 twitter IDs: {cluster['twitter_ids'][:3]}")
                if twitter_count > 3:
                    print(f"  ... and {twitter_count - 3} more")
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_summary_stats():
    """Get summary statistics"""
    print("\n" + "=" * 60)
    print("SUMMARY STATISTICS")
    print("=" * 60)
    
    try:
        total_clusters = clusters_collection.count_documents({"data": "twitter"})
        clusters_with_twitter = clusters_collection.count_documents({
            "data": "twitter",
            'twitter_ids': {'$exists': True, '$ne': []}
        })
        
        pipeline = [
            {'$match': {"data": "twitter", 'twitter_ids': {'$exists': True}}},
            {'$project': {'twitter_count': {'$size': '$twitter_ids'}}},
            {'$group': {'_id': None, 'total_twitter_matched': {'$sum': '$twitter_count'}}}
        ]
        
        result = list(clusters_collection.aggregate(pipeline))
        total_twitter_matched = result[0]['total_twitter_matched'] if result else 0
        
        total_twitter = twitter_collection.count_documents({})
        
        print(f"Total clusters with data='twitter': {total_clusters}")
        print(f"Clusters with matched twitter: {clusters_with_twitter}")
        print(f"Clusters without matches: {total_clusters - clusters_with_twitter}")
        print(f"Total twitter in database: {total_twitter}")
        print(f"Total twitter-cluster matches: {total_twitter_matched}")
        
        if total_twitter > 0:
            match_percentage = (total_twitter_matched / total_twitter) * 100
            print(f"Match percentage: {match_percentage:.2f}%")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🚀 Starting twitter-cluster matching process...")
        print("This will match twitter to clusters with data: 'twitter'")
        print("=" * 60)
        
        # Test database connection
        test_cluster = clusters_collection.find_one({"data": "twitter"})
        test_twitter = twitter_collection.find_one()
        
        if not test_cluster:
            print("⚠️  Warning: No clusters found with data: 'twitter'")
        if not test_twitter:
            print("⚠️  Warning: No twitter found in twitter collection")
            
        print("✓ Database connection successful\n")
        
        # Execute the matching process
        match_twitter_to_clusters()
        
        # Verify results
        verify_results()
        
        # Get summary statistics
        get_summary_stats()
        
        print("\n" + "=" * 60)
        print("✅ Process completed successfully!")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting twitter-cluster matching process...
This will match twitter to clusters with data: 'twitter'
✓ Database connection successful

Fetching clusters with data: 'twitter'...
Found 15 clusters with data='twitter' to process

Processing Cluster ID: 0
Cluster Name: N/A
Keyphrases: ['Excellent Call Support', 'Knowledgeable Agent Helped', 'Unhelpful Agent Encountered', 'Multilingual Support Appreciated', 'Language Barriers Experienced', 'Manager Unavailable Complaints', 'Call Support Adequate']
Normalized keyphrases: ['excellent call support', 'knowledgeable agent helped', 'unhelpful agent encountered', 'multilingual support appreciated', 'language barriers experienced', 'manager unavailable complaints', 'call support adequate']
  Searching through twitter...
    Match found: 68a8c9fb8b8398d82b9fe325 - Topic: 'Excellent Call Support' matches keyphrase: 'excellent call support'
    Match found: 68a8c9fb8b8398d82b9fe329 - Topic: 'Manager Unava

In [2]:
# Import required libraries
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']

def update_domains_to_twitter_support():
    """
    Update twitter cluster documents to change domains to ["twitter Support"]
    Only processes clusters with data: "twitter"
    """
    
    print("Starting domain update process for twitter clusters...")
    print("=" * 50)
    
    try:
        # Count total twitter clusters before update
        total_twitter_clusters = clusters_collection.count_documents({"data": "twitter"})
        print(f"Total twitter clusters in collection: {total_twitter_clusters}")
        
        if total_twitter_clusters == 0:
            print("⚠ No twitter clusters found (data: 'twitter')")
            return
        
        # Count twitter clusters that currently have ["EU bank"] or other domains
        eu_bank_count = clusters_collection.count_documents({
            "data": "twitter", 
            "domains": ["EU bank"]
        })
        print(f"twitter clusters with 'EU bank' domain: {eu_bank_count}")
        
        old_twitter_support_count = clusters_collection.count_documents({
            "data": "twitter", 
            "domains": ["twitter Support"]
        })
        print(f"twitter clusters with 'twitter Support' domain: {old_twitter_support_count}")
        
        # Count twitter clusters with other domains
        other_domains_count = clusters_collection.count_documents({
            "data": "twitter",
            "domains": {"$nin": [["EU bank"], ["twitter Support"]]}
        })
        print(f"twitter clusters with other domains: {other_domains_count}")
        
        print("\n" + "=" * 50)
        print("UPDATING twitter CLUSTER DOMAINS...")
        print("=" * 50)
        
        # Update only twitter clusters to have domains: ["twitter Support"]
        update_result = clusters_collection.update_many(
            {"data": "twitter"},  # Only update clusters with data: "twitter"
            {"$set": {"domains": ["twitter Support"]}}
        )
        
        print(f"✓ Successfully updated {update_result.modified_count} twitter clusters")
        print(f"  Matched twitter clusters: {update_result.matched_count}")
        
        # Verify the update
        verify_update()
        
    except Exception as e:
        print(f"❌ Error during twitter domain update: {str(e)}")

def verify_update():
    """
    Verify that all twitter clusters have been updated to ["twitter Support"]
    """
    print("\n" + "=" * 50)
    print("twitter CLUSTER VERIFICATION")
    print("=" * 50)
    
    try:
        # Count twitter clusters with different domain values
        twitter_support_count = clusters_collection.count_documents({
            "data": "twitter",
            "domains": ["twitter Support"]
        })
        eu_bank_count = clusters_collection.count_documents({
            "data": "twitter",
            "domains": ["EU bank"]
        })
        other_domains = clusters_collection.count_documents({
            "data": "twitter",
            "domains": {"$nin": [["twitter Support"], ["EU bank"]]}
        })
        
        total_twitter_clusters = clusters_collection.count_documents({"data": "twitter"})
        
        print(f"Total twitter clusters: {total_twitter_clusters}")
        print(f"twitter clusters with 'twitter Support' domain: {twitter_support_count}")
        print(f"twitter clusters with 'EU bank' domain: {eu_bank_count}")
        print(f"twitter clusters with other domains: {other_domains}")
        
        if twitter_support_count == total_twitter_clusters:
            print("\n✅ SUCCESS: All twitter clusters now have 'twitter Support' domain!")
        else:
            print(f"\n⚠ WARNING: {total_twitter_clusters - twitter_support_count} twitter clusters still have different domains")
        
        # Show sample of updated twitter cluster documents
        print(f"\nSample of updated twitter clusters:")
        samples = list(clusters_collection.find(
            {"data": "twitter"}, 
            {
                'cluster_id': 1, 
                'domains': 1, 
                'dominant_label': 1,
                'data': 1
            }
        ).limit(5).sort('cluster_id', 1))
        
        for sample in samples:
            cluster_id = sample.get('cluster_id', 'N/A')
            domains = sample.get('domains', [])
            label = sample.get('dominant_label', 'N/A')
            data_type = sample.get('data', 'N/A')
            print(f"  twitter Cluster {cluster_id}: data={data_type}, domains={domains}, label='{label}'")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_domain_statistics():
    """
    Get detailed statistics about domains in twitter clusters only
    """
    print("\n" + "=" * 50)
    print("twitter CLUSTER DOMAIN STATISTICS")
    print("=" * 50)
    
    try:
        # Aggregate to get all unique domain combinations for twitter clusters only
        pipeline = [
            {'$match': {"data": "twitter"}},
            {'$group': {'_id': '$domains', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]
        
        domain_stats = list(clusters_collection.aggregate(pipeline))
        
        print("Domain distribution for twitter clusters:")
        for stat in domain_stats:
            domains = stat['_id']
            count = stat['count']
            print(f"  {domains}: {count} twitter clusters")
            
        total_twitter_clusters = clusters_collection.count_documents({"data": "twitter"})
        if total_twitter_clusters > 0:
            twitter_support_percentage = (clusters_collection.count_documents({
                "data": "twitter",
                "domains": ["twitter Support"]
            }) / total_twitter_clusters) * 100
            print(f"\nPercentage of twitter clusters with 'twitter Support' domain: {twitter_support_percentage:.1f}%")
        
        # Show comparison with other data types
        print(f"\nComparison with other cluster types:")
        all_data_types = list(clusters_collection.aggregate([
            {'$group': {'_id': '$data', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))
        
        for data_type in all_data_types:
            data_value = data_type['_id']
            count = data_type['count']
            print(f"  Clusters with data='{data_value}': {count}")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

def show_twitter_cluster_summary():
    """
    Show summary of twitter cluster fields after domain update
    """
    print("\n" + "=" * 50)
    print("twitter CLUSTER SUMMARY")
    print("=" * 50)
    
    try:
        # Get twitter cluster statistics
        twitter_clusters_with_twitter_ids = clusters_collection.count_documents({
            "data": "twitter",
            "twitter_ids": {"$exists": True, "$ne": []}
        })
        
        # Get average twitter count per cluster
        twitter_pipeline = [
            {'$match': {"data": "twitter", "twitter_ids": {"$exists": True}}},
            {'$project': {'twitter_count': {'$size': '$twitter_ids'}}},
            {'$group': {
                '_id': None,
                'total_twitter': {'$sum': '$twitter_count'},
                'avg_twitter_per_cluster': {'$avg': '$twitter_count'},
                'max_twitter_per_cluster': {'$max': '$twitter_count'}
            }}
        ]
        
        twitter_result = list(clusters_collection.aggregate(twitter_pipeline))
        
        total_twitter_clusters = clusters_collection.count_documents({"data": "twitter"})
        
        print(f"Total twitter clusters: {total_twitter_clusters}")
        print(f"twitter clusters with assigned twitter: {twitter_clusters_with_twitter_ids}")
        
        if twitter_result:
            result = twitter_result[0]
            print(f"Total twitter assigned to clusters: {result['total_twitter']}")
            print(f"Average twitter per cluster: {result['avg_twitter_per_cluster']:.2f}")
            print(f"Maximum twitter in a cluster: {result['max_twitter_per_cluster']}")
            
    except Exception as e:
        print(f"❌ Error getting twitter summary: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🎫 Starting domains update to 'twitter Support' for twitter clusters...")
        print("This will only update clusters with data: 'twitter'")
        print("=" * 60)
        
        # Test database connection
        test_doc = clusters_collection.find_one({"data": "twitter"})
        if test_doc:
            print("✓ Database connection successful")
            current_domains = test_doc.get('domains', 'N/A')
            data_type = test_doc.get('data', 'N/A')
            print(f"Sample twitter cluster - data: {data_type}, domains: {current_domains}\n")
        else:
            print("⚠ No twitter clusters found (data: 'twitter') in clusters collection")
            print("Please ensure you have clusters with data: 'twitter' before running this script")
            exit(1)
        
        # Execute the domain update for twitter clusters
        update_domains_to_twitter_support()
        
        # Get detailed statistics
        get_domain_statistics()
        
        # Show twitter cluster summary
        show_twitter_cluster_summary()
        
        print("\n" + "=" * 60)
        print("✅ twitter cluster domain update process completed successfully!")
        print("All twitter clusters now have domains: ['twitter Support']")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🎫 Starting domains update to 'twitter Support' for twitter clusters...
This will only update clusters with data: 'twitter'
✓ Database connection successful
Sample twitter cluster - data: twitter, domains: ['Ticket Support']

Starting domain update process for twitter clusters...
Total twitter clusters in collection: 15
twitter clusters with 'EU bank' domain: 0
twitter clusters with 'twitter Support' domain: 0
twitter clusters with other domains: 15

UPDATING twitter CLUSTER DOMAINS...
✓ Successfully updated 15 twitter clusters
  Matched twitter clusters: 15

twitter CLUSTER VERIFICATION
Total twitter clusters: 15
twitter clusters with 'twitter Support' domain: 15
twitter clusters with 'EU bank' domain: 0
twitter clusters with other domains: 0

✅ SUCCESS: All twitter clusters now have 'twitter Support' domain!

Sample of updated twitter clusters:
  twitter Cluster 0: data=twitter, domains=['twitter Support'], label='Call Center & Agent Support

In [3]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional, Set
import os
from dotenv import load_dotenv
from collections import defaultdict
import threading
from concurrent.futures import ThreadPoolExecutor
import time

# Load environment variables
load_dotenv()

class OptimizedtwitterClusterMatcher:
    def __init__(self, connection_string: str, database_name: str):
        """
        Initialize the matcher with MongoDB connection
        """
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.twitter_collection = self.db['twitter']
        self.clusters_collection = self.db['cluster']
        
        # Cache for cluster data - this is the key optimization
        self._cluster_cache = None
        self._subcluster_cache = None
        self._load_cluster_cache()
    
    def _load_cluster_cache(self):
        """
        Load all cluster data into memory for fast lookups
        Only load clusters where data equals "twitter"
        """
        print("Loading twitter cluster data into cache...")
        start_time = time.time()
        
        # Dictionary mapping keyphrase -> cluster info
        self._cluster_cache = {}
        # Dictionary mapping keyphrase -> subcluster info
        self._subcluster_cache = {}
        
        # Only get clusters where data = "twitter"
        clusters = list(self.clusters_collection.find({"data": "twitter"}))
        print(f"Found {len(clusters)} twitter clusters to cache")
        
        for cluster in clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            keyphrases = cluster.get('keyphrases', [])
            subclusters = cluster.get('subclusters', {})
            
            # Cache cluster keyphrases
            for keyphrase in keyphrases:
                self._cluster_cache[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'subclusters': subclusters
                }
            
            # Cache subcluster keyphrases
            for subcluster_id, subcluster_data in subclusters.items():
                if not isinstance(subcluster_data, dict):
                    continue
                    
                subcluster_keyphrases = subcluster_data.get('keyphrases', [])
                for keyphrase in subcluster_keyphrases:
                    self._subcluster_cache[keyphrase] = {
                        'cluster_id': cluster_id,
                        'dominant_label': dominant_label,
                        'subcluster_id': int(subcluster_id),
                        'subcluster_label': subcluster_data.get('label')
                    }
        
        cache_time = time.time() - start_time
        print(f"Cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._cluster_cache)} cluster keyphrases")
        print(f"Cached {len(self._subcluster_cache)} subcluster keyphrases")
    
    def find_matching_cluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast cluster lookup using cached data
        """
        return self._cluster_cache.get(dominant_topic)
    
    def find_matching_subcluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast subcluster lookup using cached data
        """
        return self._subcluster_cache.get(dominant_topic)
    
    def find_unmatched_twitter(self, limit: int = None) -> List[Dict]:
        """
        Find twitter that don't match any cluster or subcluster
        """
        unmatched = []
        
        # Get all twitter with dominant_topic
        query = {"dominant_topic": {"$exists": True, "$ne": None}}
        cursor = self.twitter_collection.find(query, {"dominant_topic": 1})
        
        if limit:
            cursor = cursor.limit(limit)
        
        for twitter in cursor:
            dominant_topic = twitter.get('dominant_topic')
            if not dominant_topic:
                continue
                
            # Check if it matches any cluster or subcluster
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            if not cluster_match and not subcluster_match:
                unmatched.append({
                    'twitter_id': str(twitter['_id']),
                    'dominant_topic': dominant_topic
                })
        
        return unmatched
    
    def get_unique_dominant_topics(self) -> Dict:
        """
        Get all unique dominant_topic values and their counts from twitter
        """
        pipeline = [
            {"$match": {"dominant_topic": {"$exists": True, "$ne": None}}},
            {"$group": {"_id": "$dominant_topic", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]
        
        result = list(self.twitter_collection.aggregate(pipeline))
        
        topics_info = {
            'total_unique_topics': len(result),
            'topics': result
        }
        
        return topics_info
    
    def analyze_matching_gaps(self) -> Dict:
        """
        Analyze what dominant_topics exist but don't match any twitter clusters
        """
        print("Analyzing matching gaps...")
        
        # Get all unique dominant topics
        topics_info = self.get_unique_dominant_topics()
        print(f"Found {topics_info['total_unique_topics']} unique dominant topics in twitter")
        
        # Check which ones don't match
        unmatched_topics = {}
        matched_topics = {}
        
        for topic_data in topics_info['topics']:
            topic = topic_data['_id']
            count = topic_data['count']
            
            cluster_match = self.find_matching_cluster_fast(topic)
            subcluster_match = self.find_matching_subcluster_fast(topic)
            
            if cluster_match or subcluster_match:
                matched_topics[topic] = {
                    'count': count,
                    'cluster_match': bool(cluster_match),
                    'subcluster_match': bool(subcluster_match)
                }
            else:
                unmatched_topics[topic] = count
        
        return {
            'total_topics': topics_info['total_unique_topics'],
            'matched_topics': len(matched_topics),
            'unmatched_topics': len(unmatched_topics),
            'unmatched_details': unmatched_topics,
            'matched_details': matched_topics,
            'unmatched_twitter_count': sum(unmatched_topics.values()),
            'matched_twitter_count': sum([data['count'] for data in matched_topics.values()])
        }
    
    def create_fallback_cluster_entry(self, unmatched_topics: List[str]) -> Dict:
        """
        Create a fallback cluster entry for unmatched topics (for twitter)
        """
        fallback_cluster = {
            'cluster_id': 999,  # Use a high number to avoid conflicts
            'dominant_label': 'Unclassified twitter Topics',
            'keyphrases': unmatched_topics,
            'data': 'twitter',  # Specify that this is for twitter
            'subclusters': {
                '0': {
                    'label': 'Miscellaneous twitter',
                    'keyphrases': unmatched_topics
                }
            }
        }
        return fallback_cluster
    
    def add_fallback_cluster_to_cache(self, unmatched_topics: List[str]) -> None:
        """
        Add unmatched topics to cache as a fallback cluster
        """
        print(f"Adding {len(unmatched_topics)} unmatched topics to fallback cluster...")
        
        for topic in unmatched_topics:
            # Add to cluster cache
            self._cluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified twitter Topics',
                'subclusters': {'0': {'label': 'Miscellaneous twitter', 'keyphrases': unmatched_topics}}
            }
            
            # Add to subcluster cache
            self._subcluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified twitter Topics',
                'subcluster_id': 0,
                'subcluster_label': 'Miscellaneous twitter'
            }
        
        print(f"✓ Added fallback cluster. Cache now has:")
        print(f"  - Cluster keyphrases: {len(self._cluster_cache)}")
        print(f"  - Subcluster keyphrases: {len(self._subcluster_cache)}")
    
    def process_twitter_batch(self, twitter: List[Dict]) -> List:
        """
        Process a batch of twitter and return bulk operations in correct PyMongo format
        """
        bulk_operations = []
        
        for twitter in twitter:
            dominant_topic = twitter.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Fast cluster lookup
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            update_data = {}
            
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                # Use PyMongo's UpdateOne class instead of dict
                bulk_operations.append(
                    UpdateOne(
                        {'_id': twitter['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def process_twitter_optimized(self, batch_size: int = 5000, max_workers: int = 4, dry_run: bool = False) -> Dict:
        """
        Optimized twitter processing with larger batches and optional threading
        """
        start_time = time.time()
        
        # Get total count more efficiently
        total_twitter = self.twitter_collection.estimated_document_count()
        processed = 0
        matched_clusters = 0
        matched_subclusters = 0
        total_updates = 0
        
        print(f"Processing ~{total_twitter} twitter in batches of {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index on dominant_topic if it doesn't exist (for faster queries)
        try:
            self.twitter_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic created/verified")
        except Exception as e:
            print(f"Index creation note: {e}")
        
        # Process twitter in larger batches
        cursor = self.twitter_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},  # Only get twitter with dominant_topic
            projection={'dominant_topic': 1}  # Only fetch the field we need
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for twitter in cursor:
            batch.append(twitter)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing batch {batch_count} ({len(batch)} twitter) ---")
                
                # Process batch
                bulk_operations = self.process_twitter_batch(batch)
                print(f"Generated {len(bulk_operations)} update operations")
                
                # Count matches for statistics
                batch_cluster_matches = 0
                batch_subcluster_matches = 0
                for twitter in batch:
                    dominant_topic = twitter.get('dominant_topic')
                    if dominant_topic:
                        if self.find_matching_cluster_fast(dominant_topic):
                            matched_clusters += 1
                            batch_cluster_matches += 1
                        if self.find_matching_subcluster_fast(dominant_topic):
                            matched_subclusters += 1
                            batch_subcluster_matches += 1
                
                print(f"Batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        print("Executing bulk write...")
                        result = self.twitter_collection.bulk_write(
                            bulk_operations, 
                            ordered=False  # Faster unordered operations
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} documents in batch {batch_count}")
                        
                        # Verify some updates
                        if result.modified_count > 0:
                            sample_updated = list(self.twitter_collection.find(
                                {"kmeans_cluster_id": {"$exists": True}},
                                {"dominant_topic": 1, "kmeans_cluster_id": 1, "subcluster_id": 1}
                            ).limit(3))
                            print(f"Sample updated documents: {len(sample_updated)} found with cluster IDs")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in batch {batch_count}: {e}")
                        print(f"Error type: {type(e).__name__}")
                        # Show sample operation for debugging in readable format
                        if bulk_operations:
                            sample_op = bulk_operations[0]
                            print(f"Sample operation: Update {sample_op._filter} with {sample_op._doc}")
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would update {len(bulk_operations)} documents")
                    # Show sample operations in readable format
                    for i, op in enumerate(bulk_operations[:3]):
                        print(f"Sample operation {i+1}: Update {op._filter} with {op._doc}")
                else:
                    print("No operations to execute (no matches found)")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed} twitter processed ({rate:.1f} twitter/sec)")
        
        # Process remaining twitter in the last batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final batch {batch_count} ({len(batch)} twitter) ---")
            
            bulk_operations = self.process_twitter_batch(batch)
            print(f"Generated {len(bulk_operations)} update operations")
            
            # Count matches for final batch
            batch_cluster_matches = 0
            batch_subcluster_matches = 0
            for twitter in batch:
                dominant_topic = twitter.get('dominant_topic')
                if dominant_topic:
                    if self.find_matching_cluster_fast(dominant_topic):
                        matched_clusters += 1
                        batch_cluster_matches += 1
                    if self.find_matching_subcluster_fast(dominant_topic):
                        matched_subclusters += 1
                        batch_subcluster_matches += 1
            
            print(f"Final batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
            
            if bulk_operations and not dry_run:
                try:
                    print("Executing final bulk write...")
                    result = self.twitter_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} documents in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final batch: {e}")
                    print(f"Error type: {type(e).__name__}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would update {len(bulk_operations)} documents")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            updated_count = self.twitter_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
            print(f"Total twitter documents with kmeans_cluster_id: {updated_count}")
            
            subcluster_count = self.twitter_collection.count_documents({"subcluster_id": {"$exists": True}})
            print(f"Total twitter documents with subcluster_id: {subcluster_count}")
        
        stats = {
            'total_twitter': processed,
            'matched_clusters': matched_clusters,
            'matched_subclusters': matched_subclusters,
            'total_updates': total_updates,
            'processing_time': total_time,
            'twitter_per_second': processed / total_time if total_time > 0 else 0,
            'cluster_match_rate': (matched_clusters / processed * 100) if processed > 0 else 0,
            'subcluster_match_rate': (matched_subclusters / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def process_with_fallback(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """
        Process twitter with automatic fallback cluster for unmatched topics
        """
        print("=== PROCESSING twitter WITH FALLBACK CLUSTER ===")
        
        # First, analyze gaps
        gaps = self.analyze_matching_gaps()
        
        if gaps['unmatched_twitter_count'] > 0:
            print(f"Found {gaps['unmatched_twitter_count']} unmatched twitter")
            print(f"Unmatched topics: {list(gaps['unmatched_details'].keys())}")
            
            # Add fallback cluster to cache
            unmatched_topic_list = list(gaps['unmatched_details'].keys())
            self.add_fallback_cluster_to_cache(unmatched_topic_list)
            
            # Optionally save fallback cluster to database
            save_choice = input("Save fallback cluster to database permanently? (y/n): ")
            if save_choice.lower() == 'y':
                fallback_cluster = self.create_fallback_cluster_entry(unmatched_topic_list)
                try:
                    self.clusters_collection.insert_one(fallback_cluster)
                    print("✓ Fallback cluster saved to database")
                except Exception as e:
                    print(f"⚠️  Could not save fallback cluster: {e}")
        
        # Now process all twitter (should be 100% match rate)
        return self.process_twitter_optimized(batch_size=batch_size, dry_run=dry_run)
    
    def get_performance_stats(self) -> Dict:
        """
        Get database performance statistics
        """
        stats = {}
        
        # Collection sizes
        stats['total_twitter'] = self.twitter_collection.estimated_document_count()
        stats['twitter_with_topic'] = self.twitter_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        stats['total_twitter_clusters'] = self.clusters_collection.count_documents({"data": "twitter"})
        stats['total_all_clusters'] = self.clusters_collection.estimated_document_count()
        
        # Cache statistics
        stats['cached_cluster_keyphrases'] = len(self._cluster_cache) if self._cluster_cache else 0
        stats['cached_subcluster_keyphrases'] = len(self._subcluster_cache) if self._subcluster_cache else 0
        
        return stats
    
    def debug_matching_process(self, limit: int = 5) -> None:
        """
        Debug the matching process to see what's happening with twitter
        """
        print("\n=== DEBUGGING twitter MATCHING PROCESS ===")
        
        # Check if we have any cluster data
        if not self._cluster_cache and not self._subcluster_cache:
            print("❌ NO twitter CLUSTER CACHE DATA! This is why updates are failing.")
            return
        
        print(f"✓ twitter cluster cache has {len(self._cluster_cache)} entries")
        print(f"✓ twitter subcluster cache has {len(self._subcluster_cache)} entries")
        
        # Sample some cluster keyphrases
        print(f"\nSample twitter cluster keyphrases:")
        for i, keyphrase in enumerate(list(self._cluster_cache.keys())[:10]):
            cluster_info = self._cluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {cluster_info['cluster_id']}")
        
        # Sample some subcluster keyphrases  
        print(f"\nSample twitter subcluster keyphrases:")
        for i, keyphrase in enumerate(list(self._subcluster_cache.keys())[:10]):
            subcluster_info = self._subcluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {subcluster_info['cluster_id']}, Subcluster {subcluster_info['subcluster_id']}")
        
        # Check some actual twitter
        print(f"\n=== TESTING {limit} twitter ===")
        twitter = list(self.twitter_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not twitter:
            print("❌ NO twitter with dominant_topic found!")
            return
        
        for i, twitter in enumerate(twitter, 1):
            dominant_topic = twitter.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- twitter {i} ---")
            print(f"twitter ID: {twitter['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test cluster matching
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            if cluster_match:
                print(f"✓ CLUSTER MATCH: ID={cluster_match['cluster_id']}, Label='{cluster_match['dominant_label']}'")
            else:
                print(f"❌ No cluster match for '{dominant_topic}'")
            
            # Test subcluster matching  
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            if subcluster_match:
                print(f"✓ SUBCLUSTER MATCH: Cluster={subcluster_match['cluster_id']}, Subcluster={subcluster_match['subcluster_id']}, Label='{subcluster_match['subcluster_label']}'")
            else:
                print(f"❌ No subcluster match for '{dominant_topic}'")
            
            # Show what the update operation would look like
            update_data = {}
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                print(f"UPDATE OPERATION: {update_data}")
            else:
                print("NO UPDATE OPERATION (no matches)")
        
        print(f"\n=== DATABASE STATE CHECK ===")
        # Check existing updates
        existing_with_cluster = self.twitter_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
        existing_with_subcluster = self.twitter_collection.count_documents({"subcluster_id": {"$exists": True}})
        twitter_with_topic = self.twitter_collection.count_documents({"dominant_topic": {"$exists": True, "$ne": None}})
        
        print(f"twitter with dominant_topic: {twitter_with_topic}")
        print(f"twitter already with kmeans_cluster_id: {existing_with_cluster}")
        print(f"twitter already with subcluster_id: {existing_with_subcluster}")
        
        if twitter_with_topic == 0:
            print("❌ PROBLEM: No twitter have 'dominant_topic' field!")
        elif existing_with_cluster == twitter_with_topic:
            print("✓ All twitter already processed!")
        else:
            print(f"📝 {twitter_with_topic - existing_with_cluster} twitter need processing")
    
    def get_preview(self, limit: int = 10) -> List[Dict]:
        """
        Get a preview of twitter-cluster matches for testing
        """
        twitter = list(self.twitter_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        preview = []
        
        for twitter in twitter:
            dominant_topic = twitter.get('dominant_topic')
            if not dominant_topic:
                continue
            
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            preview.append({
                'twitter_id': str(twitter['_id']),
                'dominant_topic': dominant_topic,
                'cluster_match': cluster_match,
                'subcluster_match': subcluster_match
            })
        
        return preview
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

# Usage example
def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    
    # Initialize optimized twitter matcher
    matcher = OptimizedtwitterClusterMatcher(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show performance stats
        print("\n--- Database Statistics ---")
        perf_stats = matcher.get_performance_stats()
        for key, value in perf_stats.items():
            print(f"{key}: {value:,}")
        
        # Analyze matching gaps
        print("\n--- Gap Analysis ---")
        gap_choice = input("Analyze which twitter aren't matching? (y/n): ")
        if gap_choice.lower() == 'y':
            gaps = matcher.analyze_matching_gaps()
            print(f"\n=== MATCHING GAP ANALYSIS ===")
            print(f"Total unique topics: {gaps['total_topics']}")
            print(f"Matched topics: {gaps['matched_topics']}")
            print(f"Unmatched topics: {gaps['unmatched_topics']}")
            print(f"Matched twitter: {gaps['matched_twitter_count']}")
            print(f"Unmatched twitter: {gaps['unmatched_twitter_count']}")
            
            if gaps['unmatched_details']:
                print(f"\n--- UNMATCHED DOMINANT TOPICS ---")
                for topic, count in list(gaps['unmatched_details'].items())[:10]:
                    print(f"'{topic}' - {count} twitter")
                
                if len(gaps['unmatched_details']) > 10:
                    print(f"... and {len(gaps['unmatched_details']) - 10} more")
                
                print(f"\n💡 To get 100% matches, you need to:")
                print(f"1. Add these topics to your twitter cluster keyphrases, OR")
                print(f"2. Create a 'catch-all' cluster for unmatched twitter topics")
        
        # Debug the matching process first
        print("\n--- Debugging Mode ---")
        debug_choice = input("Run debug mode to see why DB isn't updating? (y/n): ")
        if debug_choice.lower() == 'y':
            matcher.debug_matching_process()
        
        # Get a preview first
        print("\n--- Preview of twitter Matches ---")
        preview = matcher.get_preview(limit=5)
        
        for i, item in enumerate(preview, 1):
            print(f"\n--- twitter {i} ---")
            print(f"Dominant Topic: {item['dominant_topic']}")
            
            if item['subcluster_match']:
                print(f"✓ Subcluster Match: Cluster ID={item['subcluster_match']['cluster_id']}, "
                      f"Subcluster ID={item['subcluster_match']['subcluster_id']}, "
                      f"Label={item['subcluster_match']['subcluster_label']}")
            elif item['cluster_match']:
                print(f"✓ Cluster Match: ID={item['cluster_match']['cluster_id']}, "
                      f"Label={item['cluster_match']['dominant_label']}")
            else:
                print("✗ No match found")
        
        # Process all twitter
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated without changing DB)")
        print("2. Full processing (actually update the database)")
        print("3. Process with fallback cluster (100% match guarantee)")
        choice = input("Choose option (1, 2, or 3): ")
        
        if choice in ['1', '2', '3']:
            if choice == '3':
                # Use fallback processing
                dry_run = False
                fallback_choice = input("Dry run with fallback first? (y/n): ")
                if fallback_choice.lower() == 'y':
                    dry_run = True
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE'} processing with fallback...")
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                stats = matcher.process_with_fallback(batch_size=batch_size, dry_run=dry_run)
            else:
                # Regular processing
                dry_run = (choice == '1')
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'}...")
                
                # Use larger batch size for better performance
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                
                stats = matcher.process_twitter_optimized(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total twitter processed: {stats['total_twitter']:,}")
            print(f"Total updates made: {stats['total_updates']:,}")
            print(f"Cluster matches: {stats['matched_clusters']:,} ({stats['cluster_match_rate']:.1f}%)")
            print(f"Subcluster matches: {stats['matched_subclusters']:,} ({stats['subcluster_match_rate']:.1f}%)")
            print(f"Processing time: {stats['processing_time']:.2f} seconds")
            print(f"Processing rate: {stats['twitter_per_second']:.1f} twitter/second")
        
    finally:
        matcher.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Loading twitter cluster data into cache...
Found 15 twitter clusters to cache
Cache loaded in 2.21 seconds
Cached 167 cluster keyphrases
Cached 167 subcluster keyphrases

--- Database Statistics ---
total_twitter: 2,000
twitter_with_topic: 2,000
total_twitter_clusters: 15
total_all_clusters: 74
cached_cluster_keyphrases: 167
cached_subcluster_keyphrases: 167

--- Gap Analysis ---
Analyzing matching gaps...
Found 167 unique dominant topics in twitter

=== MATCHING GAP ANALYSIS ===
Total unique topics: 167
Matched topics: 167
Unmatched topics: 0
Matched twitter: 2000
Unmatched twitter: 0

--- Debugging Mode ---

=== DEBUGGING twitter MATCHING PROCESS ===
✓ twitter cluster cache has 167 entries
✓ twitter subcluster cache has 167 entries

Sample twitter cluster keyphrases:
  1. 'Excellent Call Support' -> Cluster 0
  2. 'Knowledgeable Agent Helped' -> Cluster 0
  3. 'Unhelpful Agent Encountered' -> Cluster 0
  4. 'Multilingual Support Appreciated' -> Cluste

In [4]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
twitter_collection = db['twitter']

def rename_twitter_fields():
    """
    Rename fields in twitter documents:
    - is_urgent -> urgency
    - dominant_label -> dominant_cluster_label
    """
    
    print("🎫 Starting twitter field rename process...")
    print("Fields to rename:")
    print("  - is_urgent → urgency")
    print("  - dominant_label → dominant_cluster_label")
    print("-" * 50)
    
    try:
        # Check current state before rename
        total_twitter = twitter_collection.count_documents({})
        print(f"Total twitter in collection: {total_twitter}")
        
        if total_twitter == 0:
            print("⚠ No twitter found in collection")
            return
        
        # Count existing fields before rename
        is_urgent_count = twitter_collection.count_documents({"is_urgent": {"$exists": True}})
        dominant_label_count = twitter_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"twitter with 'is_urgent' field: {is_urgent_count}")
        print(f"twitter with 'dominant_label' field: {dominant_label_count}")
        
        # Count already renamed fields
        urgency_count = twitter_collection.count_documents({"urgency": {"$exists": True}})
        dominant_cluster_label_count = twitter_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        print(f"twitter already with 'urgency' field: {urgency_count}")
        print(f"twitter already with 'dominant_cluster_label' field: {dominant_cluster_label_count}")
        
        print("\n" + "=" * 50)
        print("RENAMING twitter FIELDS...")
        print("=" * 50)
        
        # Rename both fields in a single operation for all twitter
        result = twitter_collection.update_many(
            {},  # Empty filter to match all twitter documents
            {
                "$rename": {
                    "is_urgent": "urgency",
                    "dominant_label": "dominant_cluster_label"
                }
            }
        )
        
        # Print results
        print(f"✓ Field rename operation completed:")
        print(f"  Matched twitter: {result.matched_count}")
        print(f"  Modified twitter: {result.modified_count}")
        print(f"  Operation acknowledged: {result.acknowledged}")
        
        # Verify the changes
        verify_rename_changes()
        
    except Exception as e:
        print(f"❌ Error during twitter field rename: {str(e)}")

def verify_rename_changes():
    """
    Verify that the field rename was successful
    """
    print("\n" + "=" * 50)
    print("VERIFICATION OF twitter FIELD RENAME")
    print("=" * 50)
    
    try:
        # Verify the changes by checking a sample twitter document
        sample_twitter = twitter_collection.find_one()
        if sample_twitter:
            print("Sample twitter document after rename:")
            print(f"  twitter ID: {sample_twitter.get('_id')}")
            print(f"  Has 'urgency' field: {'urgency' in sample_twitter}")
            print(f"  Has 'dominant_cluster_label' field: {'dominant_cluster_label' in sample_twitter}")
            print(f"  Has old 'is_urgent' field: {'is_urgent' in sample_twitter}")
            print(f"  Has old 'dominant_label' field: {'dominant_label' in sample_twitter}")
            
            # Show sample values if they exist
            if 'urgency' in sample_twitter:
                print(f"  Sample 'urgency' value: {sample_twitter['urgency']}")
            if 'dominant_cluster_label' in sample_twitter:
                print(f"  Sample 'dominant_cluster_label' value: {sample_twitter['dominant_cluster_label']}")
        else:
            print("⚠ No twitter documents found in the collection")
        
        # Count twitter with the new field names
        urgency_count = twitter_collection.count_documents({"urgency": {"$exists": True}})
        cluster_label_count = twitter_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        # Count twitter with old field names (should be 0 after rename)
        old_is_urgent_count = twitter_collection.count_documents({"is_urgent": {"$exists": True}})
        old_dominant_label_count = twitter_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"\nField counts after rename:")
        print(f"  twitter with 'urgency' field: {urgency_count}")
        print(f"  twitter with 'dominant_cluster_label' field: {cluster_label_count}")
        print(f"  twitter with old 'is_urgent' field: {old_is_urgent_count}")
        print(f"  twitter with old 'dominant_label' field: {old_dominant_label_count}")
        
        # Success check
        if old_is_urgent_count == 0 and old_dominant_label_count == 0:
            print("\n✅ SUCCESS: All twitter fields have been renamed successfully!")
        else:
            print(f"\n⚠ WARNING: Some twitter still have old field names")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_twitter_field_statistics():
    """
    Get detailed statistics about twitter fields after rename
    """
    print("\n" + "=" * 50)
    print("twitter FIELD STATISTICS")
    print("=" * 50)
    
    try:
        total_twitter = twitter_collection.count_documents({})
        
        # Get statistics for new field names
        urgency_stats = list(twitter_collection.aggregate([
            {"$match": {"urgency": {"$exists": True}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        cluster_label_stats = list(twitter_collection.aggregate([
            {"$match": {"dominant_cluster_label": {"$exists": True}}},
            {"$group": {"_id": "$dominant_cluster_label", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10}  # Show top 10 cluster labels
        ]))
        
        print(f"Total twitter: {total_twitter}")
        
        if urgency_stats:
            print(f"\nUrgency field distribution:")
            for stat in urgency_stats:
                urgency_value = stat['_id']
                count = stat['count']
                percentage = (count / total_twitter) * 100 if total_twitter > 0 else 0
                print(f"  '{urgency_value}': {count} twitter ({percentage:.1f}%)")
        else:
            print(f"\nNo twitter found with 'urgency' field")
        
        if cluster_label_stats:
            print(f"\nTop 10 dominant cluster labels:")
            for i, stat in enumerate(cluster_label_stats, 1):
                label = stat['_id'] if stat['_id'] is not None else 'null'
                count = stat['count']
                percentage = (count / total_twitter) * 100 if total_twitter > 0 else 0
                print(f"  {i}. '{label}': {count} twitter ({percentage:.1f}%)")
        else:
            print(f"\nNo twitter found with 'dominant_cluster_label' field")
            
    except Exception as e:
        print(f"❌ Error getting twitter statistics: {str(e)}")

def show_sample_twitter():
    """
    Show sample twitter documents with renamed fields
    """
    print("\n" + "=" * 50)
    print("SAMPLE twitter DOCUMENTS")
    print("=" * 50)
    
    try:
        # Get sample twitter with both new fields
        sample_twitter = list(twitter_collection.find(
            {
                "urgency": {"$exists": True},
                "dominant_cluster_label": {"$exists": True}
            },
            {
                "_id": 1,
                "urgency": 1,
                "dominant_cluster_label": 1,
                "subject": 1,  # Include subject if it exists
                "priority": 1   # Include priority if it exists
            }
        ).limit(3))
        
        if sample_twitter:
            print("Sample twitter with renamed fields:")
            for i, twitter in enumerate(sample_twitter, 1):
                print(f"\ntwitter {i}:")
                print(f"  ID: {twitter.get('_id')}")
                print(f"  Urgency: {twitter.get('urgency', 'N/A')}")
                print(f"  Dominant Cluster Label: {twitter.get('dominant_cluster_label', 'N/A')}")
                if 'subject' in twitter:
                    print(f"  Subject: {twitter.get('subject', 'N/A')}")
                if 'priority' in twitter:
                    print(f"  Priority: {twitter.get('priority', 'N/A')}")
        else:
            print("No twitter found with both renamed fields")
            
    except Exception as e:
        print(f"❌ Error showing sample twitter: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_twitter = twitter_collection.find_one()
        if test_twitter:
            print("✓ Database connection successful")
            print(f"Sample twitter fields: {list(test_twitter.keys())}\n")
        else:
            print("⚠ No twitter found in twitter collection")
            print("Please ensure you have twitter documents before running this script")
            exit(1)
        
        # Execute the field rename for twitter
        rename_twitter_fields()
        
        # Get detailed statistics
        get_twitter_field_statistics()
        
        # Show sample twitter
        show_sample_twitter()
        
        print("\n" + "=" * 60)
        print("✅ twitter field rename process completed successfully!")
        print("Fields renamed:")
        print("  - is_urgent → urgency")
        print("  - dominant_label → dominant_cluster_label")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
✓ Database connection successful
Sample twitter fields: ['_id', 'tweet_id', 'created_at', 'user_id', 'username', 'email_id', 'dominant_topic', 'subtopics', 'hashtags', 'like_count', 'priority', 'quote_count', 'reply_count', 'retweet_count', 'sentiment', 'text', 'urgency', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain', 'dominant_label']

🎫 Starting twitter field rename process...
Fields to rename:
  - is_urgent → urgency
  - dominant_label → dominant_cluster_label
--------------------------------------------------
Total twitter in collection: 2000
twitter with 'is_urgent' field: 0
twitter with 'dominant_label' field: 2000
twitter already with 'urgency' field: 2000
twitter already with 'dominant_cluster_label' field: 2000

RENAMING twitter FIELDS...
✓ Field rename operation completed:
  Matched twitter: 2000
  Modified twitter: 2000
  Operation acknowledged

In [5]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

class twitterClusterKeyphraseUpdater:
    def __init__(self, connection_string: str, database_name: str):
        """Initialize the updater with MongoDB connection"""
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.twitter_collection = self.db['twitter']
        self.clusters_collection = self.db['cluster']
        
        # Cache for keyphrase -> cluster mapping (ONLY twitter cluster level)
        self._keyphrase_to_cluster = {}
        self._load_keyphrase_cache()
    
    def _load_keyphrase_cache(self):
        """Load only twitter cluster-level keyphrases into memory for fast lookups"""
        print("Loading twitter cluster keyphrase cache...")
        start_time = time.time()
        
        # Only load clusters with data: "twitter"
        twitter_clusters = list(self.clusters_collection.find({"data": "twitter"}))
        print(f"Found {len(twitter_clusters)} twitter clusters to process")
        
        for cluster in twitter_clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            cluster_keyphrases = cluster.get('keyphrases', [])
            
            # Cache ONLY twitter cluster-level keyphrases
            for keyphrase in cluster_keyphrases:
                self._keyphrase_to_cluster[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'matched_keyphrase': keyphrase
                }
        
        cache_time = time.time() - start_time
        print(f"twitter cluster cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._keyphrase_to_cluster)} twitter cluster keyphrases")
    
    def find_matching_keyphrase(self, dominant_topic: str) -> Optional[Dict]:
        """Find the matching keyphrase for a dominant topic (twitter cluster level only)"""
        return self._keyphrase_to_cluster.get(dominant_topic)
    
    def process_twitter_batch(self, twitter: List[Dict]) -> List:
        """Process a batch of twitter and return bulk operations"""
        bulk_operations = []
        
        for twitter in twitter:
            dominant_topic = twitter.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Find matching keyphrase from twitter clusters
            match_info = self.find_matching_keyphrase(dominant_topic)
            
            if match_info:
                update_data = {
                    'kmeans_cluster_keyphrase': match_info['matched_keyphrase']
                }
                
                bulk_operations.append(
                    UpdateOne(
                        {'_id': twitter['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def add_keyphrase_field(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """Add kmeans_cluster_keyphrase field to all matching twitter"""
        start_time = time.time()
        
        # Get total count
        total_twitter = self.twitter_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        processed = 0
        matched = 0
        total_updates = 0
        
        print(f"Processing {total_twitter} twitter with dominant_topic")
        print(f"Batch size: {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index for faster queries
        try:
            self.twitter_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic verified for twitter collection")
        except Exception as e:
            print(f"Index note: {e}")
        
        # Process in batches
        cursor = self.twitter_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},
            projection={'dominant_topic': 1}
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for twitter in cursor:
            batch.append(twitter)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing twitter batch {batch_count} ({len(batch)} twitter) ---")
                
                # Process batch
                bulk_operations = self.process_twitter_batch(batch)
                batch_matched = len(bulk_operations)
                matched += batch_matched
                
                print(f"Generated {batch_matched} keyphrase updates for this twitter batch")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        result = self.twitter_collection.bulk_write(
                            bulk_operations, 
                            ordered=False
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} twitter with keyphrase field")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in twitter batch {batch_count}: {e}")
                
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would add keyphrase field to {batch_matched} twitter")
                    # Show sample operations
                    for i, op in enumerate(bulk_operations[:3]):
                        keyphrase = op._doc['$set']['kmeans_cluster_keyphrase']
                        print(f"  Sample {i+1}: Would set keyphrase='{keyphrase}'")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed}/{total_twitter} ({rate:.1f} twitter/sec)")
        
        # Process remaining twitter in final batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final twitter batch {batch_count} ({len(batch)} twitter) ---")
            
            bulk_operations = self.process_twitter_batch(batch)
            batch_matched = len(bulk_operations)
            matched += batch_matched
            
            print(f"Generated {batch_matched} keyphrase updates for final twitter batch")
            
            if bulk_operations and not dry_run:
                try:
                    result = self.twitter_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} twitter in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final twitter batch: {e}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would add keyphrase field to {batch_matched} twitter")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            keyphrase_count = self.twitter_collection.count_documents({
                "kmeans_cluster_keyphrase": {"$exists": True}
            })
            print(f"Total twitter with kmeans_cluster_keyphrase: {keyphrase_count}")
            
            # Show some sample results
            samples = list(self.twitter_collection.find(
                {"kmeans_cluster_keyphrase": {"$exists": True}},
                {"dominant_topic": 1, "kmeans_cluster_keyphrase": 1}
            ).limit(5))
            
            print(f"\nSample twitter results:")
            for i, sample in enumerate(samples, 1):
                print(f"  {i}. Topic: '{sample.get('dominant_topic')}' -> "
                      f"Keyphrase: '{sample.get('kmeans_cluster_keyphrase')}'")
        
        stats = {
            'total_twitter_processed': processed,
            'twitter_matched': matched,
            'total_updates': total_updates,
            'processing_time': total_time,
            'twitter_per_second': processed / total_time if total_time > 0 else 0,
            'match_rate': (matched / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def debug_keyphrase_matching(self, limit: int = 10) -> None:
        """Debug the keyphrase matching process for twitter"""
        print("\n=== DEBUGGING twitter CLUSTER KEYPHRASE MATCHING ===")
        
        # Check cache
        if not self._keyphrase_to_cluster:
            print("❌ NO twitter CLUSTER KEYPHRASE CACHE DATA!")
            return
        
        print(f"✓ twitter cluster keyphrase cache: {len(self._keyphrase_to_cluster)} entries")
        
        # Show sample keyphrases
        print(f"\nSample twitter cluster keyphrases:")
        for i, (keyphrase, info) in enumerate(list(self._keyphrase_to_cluster.items())[:10]):
            print(f"  {i+1}. '{keyphrase}' -> twitter Cluster {info['cluster_id']} ({info['dominant_label']})")
        
        # Test with actual twitter
        print(f"\n=== TESTING {limit} twitter ===")
        twitter = list(self.twitter_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not twitter:
            print("❌ NO twitter with dominant_topic found!")
            return
        
        for i, twitter in enumerate(twitter, 1):
            dominant_topic = twitter.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- twitter {i} ---")
            print(f"twitter ID: {twitter['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test keyphrase matching
            match_info = self.find_matching_keyphrase(dominant_topic)
            if match_info:
                print(f"✓ twitter CLUSTER KEYPHRASE MATCH: '{match_info['matched_keyphrase']}'")
                print(f"  Cluster ID: {match_info['cluster_id']}")
                print(f"  Cluster Label: {match_info['dominant_label']}")
            else:
                print(f"❌ No twitter cluster keyphrase match for '{dominant_topic}'")
    
    def get_keyphrase_stats(self) -> Dict:
        """Get statistics about keyphrase matching for twitter"""
        # Count twitter with dominant_topic
        twitter_with_topic = self.twitter_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        # Count twitter already with keyphrase field
        twitter_with_keyphrase = self.twitter_collection.count_documents({
            "kmeans_cluster_keyphrase": {"$exists": True}
        })
        
        # Get unique dominant topics and check match rates
        unique_topics = self.twitter_collection.distinct("dominant_topic")
        unique_topics = [topic for topic in unique_topics if topic is not None]
        
        matchable_topics = 0
        for topic in unique_topics:
            if self.find_matching_keyphrase(topic):
                matchable_topics += 1
        
        # Count twitter clusters
        total_twitter_clusters = self.clusters_collection.count_documents({"data": "twitter"})
        
        return {
            'total_twitter_with_topic': twitter_with_topic,
            'twitter_with_keyphrase_field': twitter_with_keyphrase,
            'unique_dominant_topics': len(unique_topics),
            'matchable_topics': matchable_topics,
            'topic_match_rate': (matchable_topics / len(unique_topics) * 100) if unique_topics else 0,
            'cached_twitter_cluster_keyphrases': len(self._keyphrase_to_cluster),
            'total_twitter_clusters': total_twitter_clusters
        }
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    print("Processing twitter with twitter clusters (data: 'twitter')")
    
    # Initialize twitter keyphrase updater
    updater = twitterClusterKeyphraseUpdater(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show current statistics
        print("\n--- Current twitter Statistics ---")
        stats = updater.get_keyphrase_stats()
        for key, value in stats.items():
            if isinstance(value, float):
                print(f"{key}: {value:.1f}")
            else:
                print(f"{key}: {value:,}")
        
        # Check if we have twitter clusters
        if stats['total_twitter_clusters'] == 0:
            print("\n❌ No twitter clusters found (data: 'twitter')!")
            print("Please ensure you have clusters with data: 'twitter' before running this script.")
            return
        
        # Debug keyphrase matching
        print("\n--- Debug Mode ---")
        debug_choice = input("Run debug mode to see twitter keyphrase matching? (y/n): ")
        if debug_choice.lower() == 'y':
            updater.debug_keyphrase_matching()
        
        # Choose processing mode
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated)")
        print("2. Live processing (actually add keyphrase field)")
        choice = input("Choose option (1 or 2): ")
        
        if choice in ['1', '2']:
            dry_run = (choice == '1')
            
            print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'} for twitter...")
            
            # Get batch size
            batch_size = int(input("Enter batch size (recommended: 5000): ") or "5000")
            
            # Process twitter
            results = updater.add_keyphrase_field(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total twitter processed: {results['total_twitter_processed']:,}")
            print(f"twitter with matching keyphrases: {results['twitter_matched']:,}")
            print(f"Total updates made: {results['total_updates']:,}")
            print(f"Match rate: {results['match_rate']:.1f}%")
            print(f"Processing time: {results['processing_time']:.2f} seconds")
            print(f"Processing rate: {results['twitter_per_second']:.1f} twitter/second")
            
            if not dry_run and results['total_updates'] > 0:
                print(f"\n✅ Successfully added kmeans_cluster_keyphrase field to {results['total_updates']:,} twitter!")
            elif dry_run:
                print(f"\nDRY RUN COMPLETE: Would add keyphrase field to {results['twitter_matched']:,} twitter")
        
    finally:
        updater.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Processing twitter with twitter clusters (data: 'twitter')
Loading twitter cluster keyphrase cache...
Found 15 twitter clusters to process
twitter cluster cache loaded in 2.35 seconds
Cached 167 twitter cluster keyphrases

--- Current twitter Statistics ---
total_twitter_with_topic: 2,000
twitter_with_keyphrase_field: 2,000
unique_dominant_topics: 167
matchable_topics: 167
topic_match_rate: 100.0
cached_twitter_cluster_keyphrases: 167
total_twitter_clusters: 15

--- Debug Mode ---

=== DEBUGGING twitter CLUSTER KEYPHRASE MATCHING ===
✓ twitter cluster keyphrase cache: 167 entries

Sample twitter cluster keyphrases:
  1. 'Excellent Call Support' -> twitter Cluster 0 (Call Center & Agent Support)
  2. 'Knowledgeable Agent Helped' -> twitter Cluster 0 (Call Center & Agent Support)
  3. 'Unhelpful Agent Encountered' -> twitter Cluster 0 (Call Center & Agent Support)
  4. 'Multilingual Support Appreciated' -> twitter Cluster 0 (Call Center & Agent Support)
 

In [6]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get connection details from environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("Please set MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME in your environment variables")

# Connect to MongoDB
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]
collection = db['twitter']

try:
    # Add domain field to all documents
    result = collection.update_many(
        {},  # Empty filter to match all documents
        {"$set": {"domain": "banking"}}
    )
    
    print(f"Matched documents: {result.matched_count}")
    print(f"Modified documents: {result.modified_count}")
    
    if result.matched_count == 2004:
        print("Successfully updated all 2004 documents!")
    else:
        print(f"Expected 2004 documents, but found {result.matched_count}")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the connection
    client.close()

# Alternative: Add domain field only to documents that don't already have it
def add_domain_conditionally():
    try:
        result = collection.update_many(
            {"domain": {"$exists": False}},  # Only documents without 'domain' field
            {"$set": {"domain": "banking"}}
        )
        
        print(f"Documents without domain field: {result.matched_count}")
        print(f"Modified documents: {result.modified_count}")
        
    except Exception as e:
        print(f"An error occurred: {e}")

# Uncomment the line below if you want to run the conditional update instead
# add_domain_conditionally()

Matched documents: 2000
Modified documents: 0
Expected 2004 documents, but found 2000


In [7]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: twitter")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
twitter_collection = db['twitter']

def analyze_urgency_values():
    """Analyze current urgency field values"""
    print("Analyzing current urgency values...")
    print("-" * 40)
    
    try:
        # Get all unique urgency values
        urgency_values = twitter_collection.distinct("urgency")
        print(f"Unique urgency values found: {urgency_values}")
        
        # Count each urgency value
        urgency_stats = list(twitter_collection.aggregate([
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        total_twitter = twitter_collection.count_documents({})
        print(f"\nTotal twitter: {total_twitter}")
        print(f"Urgency distribution:")
        
        for stat in urgency_stats:
            value = stat['_id'] if stat['_id'] is not None else 'null/missing'
            count = stat['count']
            percentage = (count / total_twitter) * 100 if total_twitter > 0 else 0
            print(f"  '{value}': {count} twitter ({percentage:.1f}%)")
            
        return urgency_stats
        
    except Exception as e:
        print(f"Error analyzing urgency values: {str(e)}")
        return []

def update_urgency_to_boolean():
    """Update urgency field from string to boolean"""
    print("\nStarting urgency field update...")
    print("Conversion rules:")
    print("  'Critical' → true")
    print("  'High' → false")
    print("-" * 40)
    
    try:
        # Update Critical to true
        critical_result = twitter_collection.update_many(
            {"urgency": "Critical"},
            {"$set": {"urgency": True}}
        )
        
        print(f"✓ Updated 'Critical' urgency:")
        print(f"  Matched: {critical_result.matched_count}")
        print(f"  Modified: {critical_result.modified_count}")
        
        # Update High to false
        high_result = twitter_collection.update_many(
            {"urgency": "High"},
            {"$set": {"urgency": False}}
        )
        
        print(f"✓ Updated 'High' urgency:")
        print(f"  Matched: {high_result.matched_count}")
        print(f"  Modified: {high_result.modified_count}")
        
        total_updated = critical_result.modified_count + high_result.modified_count
        print(f"\nTotal twitter updated: {total_updated}")
        
        return {
            'critical_updated': critical_result.modified_count,
            'high_updated': high_result.modified_count,
            'total_updated': total_updated
        }
        
    except Exception as e:
        print(f"Error updating urgency values: {str(e)}")
        return None

def verify_boolean_conversion():
    """Verify that urgency values are now boolean"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF URGENCY CONVERSION")
    print("=" * 50)
    
    try:
        # Count boolean urgency values
        true_count = twitter_collection.count_documents({"urgency": True})
        false_count = twitter_collection.count_documents({"urgency": False})
        
        # Count any remaining string values
        critical_count = twitter_collection.count_documents({"urgency": "Critical"})
        high_count = twitter_collection.count_documents({"urgency": "High"})
        
        # Count other values
        other_urgency = list(twitter_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Boolean urgency values:")
        print(f"  urgency: true → {true_count} twitter")
        print(f"  urgency: false → {false_count} twitter")
        
        print(f"\nRemaining string values:")
        print(f"  urgency: 'Critical' → {critical_count} twitter")
        print(f"  urgency: 'High' → {high_count} twitter")
        
        if other_urgency:
            print(f"\nOther urgency values:")
            for other in other_urgency:
                value = other['_id'] if other['_id'] is not None else 'null/missing'
                count = other['count']
                print(f"  urgency: '{value}' → {count} twitter")
        
        # Show sample twitter with boolean urgency
        print(f"\nSample twitter with boolean urgency:")
        samples = list(twitter_collection.find(
            {"urgency": {"$in": [True, False]}},
            {
                "twitter_number": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            twitter_num = sample.get('twitter_number', 'N/A')
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {twitter_num}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
        
        # Success check
        if critical_count == 0 and high_count == 0:
            print(f"\n✅ SUCCESS: All 'Critical' and 'High' urgency values converted to boolean!")
            print(f"Summary: {true_count} critical (true) + {false_count} high (false) = {true_count + false_count} total")
        else:
            print(f"\n⚠ WARNING: Some string urgency values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_urgency_values():
    """Check and optionally handle other urgency values"""
    print("\n" + "=" * 50)
    print("HANDLING OTHER URGENCY VALUES")
    print("=" * 50)
    
    try:
        # Find twitter with urgency values other than True/False/Critical/High
        other_urgency = list(twitter_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        if not other_urgency:
            print("No other urgency values found - all twitter have been processed!")
            return
        
        print("Found twitter with other urgency values:")
        for other in other_urgency:
            value = other['_id'] if other['_id'] is not None else 'null/missing'
            count = other['count']
            print(f"  '{value}': {count} twitter")
        
        print(f"\nSample twitter with other urgency values:")
        samples = list(twitter_collection.find(
            {"urgency": {"$nin": [True, False, "Critical", "High"]}},
            {
                "twitter_number": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(3))
        
        for i, sample in enumerate(samples, 1):
            twitter_num = sample.get('twitter_number', 'N/A')
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {twitter_num}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error handling other urgency values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_twitter = twitter_collection.find_one()
        if test_twitter:
            print("✓ Database connection successful")
            print(f"Sample twitter fields: {list(test_twitter.keys())}\n")
        else:
            print("⚠ No twitter found in twitter collection")
            exit(1)
        
        # Analyze current urgency values
        analyze_urgency_values()
        
        # Confirm before proceeding
        print(f"\nThis will update urgency values:")
        print(f"  'Critical' → true")
        print(f"  'High' → false")
        
        confirm = input(f"\nProceed with urgency conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the urgency update
        update_result = update_urgency_to_boolean()
        
        if update_result:
            # Verify the conversion
            verify_boolean_conversion()
            
            # Handle other urgency values
            handle_other_urgency_values()
            
            print(f"\n" + "=" * 60)
            print("✅ URGENCY CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Critical twitter (now true): {update_result['critical_updated']}")
            print(f"  High twitter (now false): {update_result['high_updated']}")
            print(f"  Total twitter updated: {update_result['total_updated']}")
            print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: twitter
✓ Database connection successful
Sample twitter fields: ['_id', 'tweet_id', 'created_at', 'user_id', 'username', 'email_id', 'dominant_topic', 'subtopics', 'hashtags', 'like_count', 'priority', 'quote_count', 'reply_count', 'retweet_count', 'sentiment', 'text', 'urgency', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current urgency values...
----------------------------------------
Unique urgency values found: [False, True]

Total twitter: 2000
Urgency distribution:
  'False': 1667 twitter (83.4%)
  'True': 333 twitter (16.7%)

This will update urgency values:
  'Critical' → true
  'High' → false

Starting urgency field update...
Conversion rules:
  'Critical' → true
  'High' → false
----------------------------------------
✓ Updated 'Critical' urgency:
  Matched: 0
  Modified: 0
✓ Updated 'High' urgency:
  Matched: 0
  Mo

In [None]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: twitter")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
twitter_collection = db['twitter']

def analyze_subcluster_id_values():
    """Analyze current subcluster_id field values"""
    print("Analyzing current subcluster_id values...")
    print("-" * 40)
    
    try:
        # Get all unique subcluster_id values
        subcluster_values = twitter_collection.distinct("subcluster_id")
        print(f"Unique subcluster_id values found: {subcluster_values}")
        
        # Count each subcluster_id value type
        subcluster_stats = list(twitter_collection.aggregate([
            {"$group": {
                "_id": {"value": "$subcluster_id", "type": {"$type": "$subcluster_id"}}, 
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}}
        ]))
        
        total_twitter = twitter_collection.count_documents({})
        print(f"\nTotal twitter: {total_twitter}")
        print(f"Subcluster_id distribution:")
        
        for stat in subcluster_stats:
            value = stat['_id']['value'] if stat['_id']['value'] is not None else 'null/missing'
            data_type = stat['_id']['type']
            count = stat['count']
            percentage = (count / total_twitter) * 100 if total_twitter > 0 else 0
            print(f"  '{value}' (type: {data_type}): {count} twitter ({percentage:.1f}%)")
            
        return subcluster_stats
        
    except Exception as e:
        print(f"Error analyzing subcluster_id values: {str(e)}")
        return []

def update_subcluster_id_to_string():
    """Update subcluster_id field from integer to string"""
    print("\nStarting subcluster_id field update...")
    print("Conversion rule: All integer values → string values")
    print("-" * 40)
    
    try:
        # Find all twitter with integer subcluster_id
        integer_twitter = list(twitter_collection.find(
            {"subcluster_id": {"$type": "int"}},
            {"_id": 1, "subcluster_id": 1}
        ))
        
        print(f"Found {len(integer_twitter)} twitter with integer subcluster_id")
        
        updated_count = 0
        
        # Update each twitter individually to convert integer to string
        for twitter in integer_twitter:
            old_value = twitter['subcluster_id']
            new_value = str(old_value)
            
            result = twitter_collection.update_one(
                {"_id": twitter['_id']},
                {"$set": {"subcluster_id": new_value}}
            )
            
            if result.modified_count > 0:
                updated_count += 1
        
        print(f"✓ Updated subcluster_id from integer to string:")
        print(f"  Total processed: {len(integer_twitter)}")
        print(f"  Successfully updated: {updated_count}")
        
        return updated_count
        
    except Exception as e:
        print(f"Error updating subcluster_id values: {str(e)}")
        return 0

def verify_string_conversion():
    """Verify that subcluster_id values are now strings"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF SUBCLUSTER_ID CONVERSION")
    print("=" * 50)
    
    try:
        # Count string subcluster_id values
        string_count = twitter_collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        # Count any remaining integer values
        integer_count = twitter_collection.count_documents({"subcluster_id": {"$type": "int"}})
        
        # Count other data types
        other_types = list(twitter_collection.aggregate([
            {"$match": {"subcluster_id": {"$nin": [None]}}},
            {"$group": {"_id": {"$type": "$subcluster_id"}, "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Subcluster_id by data type:")
        for type_stat in other_types:
            data_type = type_stat['_id']
            count = type_stat['count']
            print(f"  {data_type}: {count} twitter")
        
        # Count null/missing values
        null_count = twitter_collection.count_documents({"subcluster_id": None})
        missing_count = twitter_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"  null: {null_count} twitter")
        if missing_count > 0:
            print(f"  missing: {missing_count} twitter")
        
        # Show sample twitter with string subcluster_id
        print(f"\nSample twitter with string subcluster_id:")
        samples = list(twitter_collection.find(
            {"subcluster_id": {"$type": "string"}},
            {
                "twitter_number": 1,
                "subcluster_id": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            twitter_num = sample.get('twitter_number', 'N/A')
            subcluster_id = sample.get('subcluster_id')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {twitter_num}: subcluster_id=\"{subcluster_id}\"")
            print(f"     Title: {title}")
        
        # Success check
        if integer_count == 0:
            print(f"\n✅ SUCCESS: All integer subcluster_id values converted to strings!")
            print(f"Summary: {string_count} twitter now have string subcluster_id")
        else:
            print(f"\n⚠ WARNING: {integer_count} integer subcluster_id values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_subcluster_id_values():
    """Check for any unusual subcluster_id values"""
    print("\n" + "=" * 50)
    print("CHECKING FOR OTHER SUBCLUSTER_ID VALUES")
    print("=" * 50)
    
    try:
        # Find twitter with null or missing subcluster_id
        null_count = twitter_collection.count_documents({"subcluster_id": None})
        missing_count = twitter_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"Found {null_count} twitter with null subcluster_id")
            
        if missing_count > 0:
            print(f"Found {missing_count} twitter with missing subcluster_id field")
            
        if null_count == 0 and missing_count == 0:
            print("All twitter have valid subcluster_id values!")
            
        # Show sample of any problematic twitter
        if null_count > 0 or missing_count > 0:
            print(f"\nSample twitter with null/missing subcluster_id:")
            samples = list(twitter_collection.find(
                {"$or": [
                    {"subcluster_id": None},
                    {"subcluster_id": {"$exists": False}}
                ]},
                {
                    "twitter_number": 1,
                    "subcluster_id": 1,
                    "title": 1
                }
            ).limit(3))
            
            for i, sample in enumerate(samples, 1):
                twitter_num = sample.get('twitter_number', 'N/A')
                subcluster_id = sample.get('subcluster_id', 'MISSING_FIELD')
                title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
                
                print(f"  {i}. {twitter_num}: subcluster_id={subcluster_id}")
                print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error checking other subcluster_id values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_twitter = twitter_collection.find_one()
        if test_twitter:
            print("✓ Database connection successful")
            print(f"Sample twitter fields: {list(test_twitter.keys())}\n")
        else:
            print("⚠ No twitter found in twitter collection")
            exit(1)
        
        # Analyze current subcluster_id values
        analyze_subcluster_id_values()
        
        # Confirm before proceeding
        print(f"\nThis will convert all integer subcluster_id values to strings")
        print(f"Example: subcluster_id: 1 → subcluster_id: \"1\"")
        
        confirm = input(f"\nProceed with subcluster_id conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the subcluster_id update
        updated_count = update_subcluster_id_to_string()
        
        if updated_count > 0:
            # Verify the conversion
            verify_string_conversion()
            
            # Handle other subcluster_id values
            handle_other_subcluster_id_values()
            
            print(f"\n" + "=" * 60)
            print("✅ SUBCLUSTER_ID CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Total twitter updated: {updated_count}")
            print(f"  All integer subcluster_id values converted to strings")
            print("=" * 60)
        else:
            print(f"\n⚠ No updates were made. Check if subcluster_id fields are already strings or if there are no integer values.")
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: twitter
✓ Database connection successful
Sample twitter fields: ['_id', 'tweet_id', 'created_at', 'user_id', 'username', 'email_id', 'dominant_topic', 'subtopics', 'hashtags', 'like_count', 'priority', 'quote_count', 'reply_count', 'retweet_count', 'sentiment', 'text', 'urgency', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current subcluster_id values...
----------------------------------------
Unique subcluster_id values found: [0, 1, 2, 3]

Total twitter: 2000
Subcluster_id distribution:
  '0' (type: int): 966 twitter (48.3%)
  '1' (type: int): 689 twitter (34.4%)
  '2' (type: int): 281 twitter (14.1%)
  '3' (type: int): 64 twitter (3.2%)

This will convert all integer subcluster_id values to strings
Example: subcluster_id: 1 → subcluster_id: "1"
Operation cancelled.

Starting subcluster_id field update...
Conversion rule: All 

: 

In [1]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def rename_field_in_cluster_collection():
    """
    Rename field 'twitter_ids' to 'twitter_ids' in documents where data="twitter"
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Define the filter for documents that have data="twitter" and twitter_ids field exists
        filter_query = {
            "data": "twitter",
            "twitter_ids": {"$exists": True}
        }
        
        # Count documents that match the criteria before update
        count_before = cluster_collection.count_documents(filter_query)
        print(f"Found {count_before} documents matching criteria (data='twitter' and twitter_ids exists)")
        
        if count_before == 0:
            print("No documents found to update.")
            return
        
        # Use $rename operator to rename the field
        update_operation = {
            "$rename": {
                "twitter_ids": "twitter_ids"
            }
        }
        
        # Perform the update operation
        result = cluster_collection.update_many(filter_query, update_operation)
        
        print(f"Successfully updated {result.modified_count} documents")
        print(f"Matched {result.matched_count} documents")
        
        # Verify the update by counting documents with the new field name
        verification_query = {
            "data": "twitter",
            "twitter_ids": {"$exists": True}
        }
        count_after = cluster_collection.count_documents(verification_query)
        print(f"Verification: {count_after} documents now have 'twitter_ids' field")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

if __name__ == "__main__":
    rename_field_in_cluster_collection()

Found 15 documents matching criteria (data='twitter' and twitter_ids exists)
Error occurred: The source and target field for $rename must differ: twitter_ids: "twitter_ids", full error: {'index': 0, 'code': 2, 'errmsg': 'The source and target field for $rename must differ: twitter_ids: "twitter_ids"'}
MongoDB connection closed


In [2]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def validate_keyphrases_in_subclusters():
    """
    Cross-check keyphrases field with subclusters keyphrases.
    Find any keyphrases that exist in main keyphrases but are missing from all subclusters.
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Find all documents that have both keyphrases and subclusters fields
        query = {
            "keyphrases": {"$exists": True, "$ne": None},
            "subclusters": {"$exists": True, "$ne": None}
        }
        
        documents = cluster_collection.find(query)
        
        missing_keyphrases = []
        total_documents_checked = 0
        
        for doc in documents:
            total_documents_checked += 1
            cluster_id = doc.get('cluster_id')
            main_keyphrases = doc.get('keyphrases', [])
            subclusters = doc.get('subclusters', {})
            
            # Collect all keyphrases from all subclusters
            subcluster_keyphrases = set()
            
            # subclusters is an object with keys like "0", "1", "2", etc.
            for subcluster_key, subcluster_data in subclusters.items():
                if isinstance(subcluster_data, dict) and 'keyphrases' in subcluster_data:
                    subcluster_keyphrase_list = subcluster_data.get('keyphrases', [])
                    if isinstance(subcluster_keyphrase_list, list):
                        subcluster_keyphrases.update(subcluster_keyphrase_list)
            
            # Check each main keyphrase against subcluster keyphrases
            for keyphrase in main_keyphrases:
                if keyphrase not in subcluster_keyphrases:
                    missing_keyphrases.append({
                        'cluster_id': cluster_id,
                        'missing_keyphrase': keyphrase,
                        'total_main_keyphrases': len(main_keyphrases),
                        'total_subcluster_keyphrases': len(subcluster_keyphrases)
                    })
        
        # Display results
        print(f"Total documents checked: {total_documents_checked}")
        print(f"Total missing keyphrases found: {len(missing_keyphrases)}")
        print("-" * 80)
        
        if missing_keyphrases:
            print("MISSING KEYPHRASES REPORT:")
            print("-" * 80)
            
            # Group by cluster_id for better readability
            cluster_groups = {}
            for item in missing_keyphrases:
                cluster_id = item['cluster_id']
                if cluster_id not in cluster_groups:
                    cluster_groups[cluster_id] = []
                cluster_groups[cluster_id].append(item)
            
            for cluster_id, missing_items in cluster_groups.items():
                print(f"Cluster ID: {cluster_id}")
                print(f"Missing keyphrases ({len(missing_items)}):")
                for item in missing_items:
                    print(f"  - '{item['missing_keyphrase']}'")
                print(f"Total main keyphrases: {missing_items[0]['total_main_keyphrases']}")
                print(f"Total subcluster keyphrases: {missing_items[0]['total_subcluster_keyphrases']}")
                print("-" * 40)
                
        else:
            print("✅ All keyphrases from main field are present in subclusters!")
            
        # Summary statistics
        if missing_keyphrases:
            clusters_with_issues = len(set(item['cluster_id'] for item in missing_keyphrases))
            print(f"\nSUMMARY:")
            print(f"Clusters with missing keyphrases: {clusters_with_issues}")
            print(f"Total missing keyphrase instances: {len(missing_keyphrases)}")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("\nMongoDB connection closed")

def get_detailed_analysis():
    """
    Get more detailed analysis including sample data structure
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Get a sample document to understand structure
        sample_doc = cluster_collection.find_one({
            "keyphrases": {"$exists": True},
            "subclusters": {"$exists": True}
        })
        
        if sample_doc:
            print("SAMPLE DOCUMENT STRUCTURE:")
            print("-" * 40)
            print(f"Cluster ID: {sample_doc.get('cluster_id')}")
            print(f"Cluster Name: {sample_doc.get('cluster_name', 'N/A')}")
            print(f"Main keyphrases count: {len(sample_doc.get('keyphrases', []))}")
            
            subclusters = sample_doc.get('subclusters', {})
            print(f"Subclusters count: {len(subclusters)}")
            
            if sample_doc.get('keyphrases'):
                print(f"Sample main keyphrases: {sample_doc['keyphrases'][:3]}...")
            
            if subclusters:
                print("Subcluster structure:")
                for key, subcluster in list(subclusters.items())[:2]:  # Show first 2 subclusters
                    if isinstance(subcluster, dict):
                        label = subcluster.get('label', 'No label')
                        keyphrases_count = len(subcluster.get('keyphrases', []))
                        print(f"  {key}: '{label}' ({keyphrases_count} keyphrases)")
                        if subcluster.get('keyphrases'):
                            print(f"    Sample keyphrases: {subcluster['keyphrases'][:2]}...")
            print("-" * 40)
    
    except Exception as e:
        print(f"Error in detailed analysis: {str(e)}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("Starting keyphrase validation...")
    print("=" * 80)
    
    # First, get structure analysis
    get_detailed_analysis()
    
    # Then run validation
    validate_keyphrases_in_subclusters()

Starting keyphrase validation...
SAMPLE DOCUMENT STRUCTURE:
----------------------------------------
Cluster ID: 0
Cluster Name: N/A
Main keyphrases count: 6
Subclusters count: 2
Sample main keyphrases: ['SEPA Payment Failure', 'SEPA Processing Error', 'SEPA Instant Failure']...
Subcluster structure:
  0: 'SEPA Failures & Errors' (3 keyphrases)
    Sample keyphrases: ['SEPA Payment Failure', 'SEPA Processing Error']...
  1: 'Clearing & Settlement Issues' (3 keyphrases)
    Sample keyphrases: ['SEPA Payment Status', 'TARGET2 Settlement Issue']...
----------------------------------------
Total documents checked: 74
Total missing keyphrases found: 0
--------------------------------------------------------------------------------
✅ All keyphrases from main field are present in subclusters!

MongoDB connection closed


In [3]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def convert_subcluster_id_to_string():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['twitter']
        
        # Find all documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query))
        
        print(f"Found {len(documents_to_update)} documents with integer subcluster_id")
        
        if len(documents_to_update) == 0:
            print("No documents found with integer subcluster_id")
            return
        
        # Update each document
        updated_count = 0
        for doc in documents_to_update:
            try:
                # Convert the integer subcluster_id to string
                new_subcluster_id = str(doc['subcluster_id'])
                
                # Update the document
                result = collection.update_one(
                    {"_id": doc["_id"]},
                    {"$set": {"subcluster_id": new_subcluster_id}}
                )
                
                if result.modified_count > 0:
                    updated_count += 1
                    print(f"Updated document {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{new_subcluster_id}'")
                
            except Exception as e:
                print(f"Error updating document {doc['_id']}: {e}")
        
        print(f"\nSummary: Successfully updated {updated_count} out of {len(documents_to_update)} documents")
        
        # Verify the changes
        remaining_int_docs = collection.count_documents({"subcluster_id": {"$type": "number"}})
        string_docs = collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        print(f"Verification:")
        print(f"- Documents with integer subcluster_id: {remaining_int_docs}")
        print(f"- Documents with string subcluster_id: {string_docs}")
        
    except Exception as e:
        print(f"Error connecting to MongoDB or updating documents: {e}")
    finally:
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

def preview_changes():
    """Preview what changes will be made without actually updating"""
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['twitter']
        
        # Find documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query, {"_id": 1, "subcluster_id": 1}))
        
        print("PREVIEW MODE - No changes will be made")
        print(f"Found {len(documents_to_update)} documents that would be updated:")
        
        for i, doc in enumerate(documents_to_update[:10]):  # Show first 10
            print(f"  Document {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{str(doc['subcluster_id'])}'")
        
        if len(documents_to_update) > 10:
            print(f"  ... and {len(documents_to_update) - 10} more documents")
        
    except Exception as e:
        print(f"Error during preview: {e}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("MongoDB subcluster_id Converter")
    print("=" * 40)
    
    # First, preview the changes
    print("\n1. PREVIEW CHANGES:")
    preview_changes()
    
    # Ask for confirmation
    print("\n2. CONFIRMATION:")
    response = input("Do you want to proceed with the conversion? (yes/no): ").lower().strip()
    
    if response == 'yes':
        print("\n3. EXECUTING CONVERSION:")
        convert_subcluster_id_to_string()
    else:
        print("Conversion cancelled.")

MongoDB subcluster_id Converter

1. PREVIEW CHANGES:
PREVIEW MODE - No changes will be made
Found 0 documents that would be updated:

2. CONFIRMATION:
Conversion cancelled.


In [4]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def copy_processed_at_field():
    """
    Copy processed_at field from 'sample twitter' collection to 'twitter' collection
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        
        # Get collections
        sample_twitter_collection = db['sample twitter']
        twitter_collection = db['twitter']
        
        logger.info("Connected to MongoDB successfully")
        
        # Get all documents from sample twitter collection with processed_at field
        sample_twitter = list(sample_twitter_collection.find(
            {"processed_at": {"$exists": True}},
            {"_id": 1, "processed_at": 1}
        ))
        
        logger.info(f"Found {len(sample_twitter)} documents with processed_at field in 'sample twitter' collection")
        
        if not sample_twitter:
            logger.warning("No documents found with processed_at field in 'sample twitter' collection")
            return
        
        # Create a mapping of _id to processed_at value
        processed_at_mapping = {doc['_id']: doc['processed_at'] for doc in sample_twitter}
        
        # Update twitter collection
        updated_count = 0
        failed_count = 0
        
        for doc_id, processed_at_value in processed_at_mapping.items():
            try:
                # Update the document in twitter collection
                result = twitter_collection.update_one(
                    {"_id": doc_id},
                    {"$set": {"processed_at": processed_at_value}},
                    upsert=False  # Don't create new documents if they don't exist
                )
                
                if result.matched_count > 0:
                    updated_count += 1
                    if updated_count % 100 == 0:  # Log progress every 100 updates
                        logger.info(f"Updated {updated_count} documents so far...")
                else:
                    logger.warning(f"Document with _id {doc_id} not found in twitter collection")
                    failed_count += 1
                    
            except Exception as e:
                logger.error(f"Failed to update document {doc_id}: {str(e)}")
                failed_count += 1
        
        logger.info(f"Operation completed:")
        logger.info(f"- Successfully updated: {updated_count} documents")
        logger.info(f"- Failed/Not found: {failed_count} documents")
        
        # Verify the operation
        verify_count = twitter_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'twitter' collection now have 'processed_at' field")
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # Close the connection
        try:
            client.close()
            logger.info("MongoDB connection closed")
        except:
            pass

def copy_all_processed_at_regardless_of_id():
    """
    Alternative approach: Copy processed_at values based on document order/position
    Use this if documents don't have matching _ids between collections
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        
        # Get collections
        sample_twitter_collection = db['sample twitter']
        twitter_collection = db['twitter']
        
        logger.info("Connected to MongoDB successfully")
        
        # Get all documents from both collections
        sample_twitter = list(sample_twitter_collection.find().sort("_id", 1))
        twitter = list(twitter_collection.find().sort("_id", 1))
        
        logger.info(f"Sample twitter collection has {len(sample_twitter)} documents")
        logger.info(f"twitter collection has {len(twitter)} documents")
        
        # Ensure both collections have the same number of documents
        min_count = min(len(sample_twitter), len(twitter))
        
        if len(sample_twitter) != len(twitter):
            logger.warning(f"Collections have different sizes. Will process {min_count} documents")
        
        updated_count = 0
        
        # Update twitter with processed_at values from sample twitter
        for i in range(min_count):
            sample_doc = sample_twitter[i]
            twitter_doc = twitter[i]
            
            # Check if sample document has processed_at field
            if 'processed_at' in sample_doc:
                try:
                    # Update the corresponding twitter document
                    result = twitter_collection.update_one(
                        {"_id": twitter_doc['_id']},
                        {"$set": {"processed_at": sample_doc['processed_at']}}
                    )
                    
                    if result.modified_count > 0:
                        updated_count += 1
                        if updated_count % 100 == 0:
                            logger.info(f"Updated {updated_count} documents so far...")
                            
                except Exception as e:
                    logger.error(f"Failed to update document at index {i}: {str(e)}")
        
        logger.info(f"Successfully updated {updated_count} documents with processed_at field")
        
        # Verify the operation
        verify_count = twitter_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'twitter' collection now have 'processed_at' field")
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # Close the connection
        try:
            client.close()
            logger.info("MongoDB connection closed")
        except:
            pass

if __name__ == "__main__":
    # Check if environment variables are set
    if not mongo_connection_string:
        logger.error("MONGO_CONNECTION_STRING environment variable is not set")
        exit(1)
    
    if not mongo_database_name:
        logger.error("MONGO_DATABASE_NAME environment variable is not set")
        exit(1)
    
    print("Choose the method to copy processed_at field:")
    print("1. Copy based on matching document _id (recommended)")
    print("2. Copy based on document order/position")
    
    choice = input("Enter your choice (1 or 2): ").strip()
    
    if choice == "1":
        logger.info("Starting copy operation based on matching _id...")
        copy_processed_at_field()
    elif choice == "2":
        logger.info("Starting copy operation based on document order...")
        copy_all_processed_at_regardless_of_id()
    else:
        logger.error("Invalid choice. Please run the script again and choose 1 or 2.")

Choose the method to copy processed_at field:
1. Copy based on matching document _id (recommended)
2. Copy based on document order/position


2025-09-03 00:20:21,793 - INFO - Starting copy operation based on document order...
2025-09-03 00:20:21,800 - INFO - Connected to MongoDB successfully
2025-09-03 00:21:11,587 - INFO - MongoDB connection closed


KeyboardInterrupt: 

In [2]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def rename_domain_value():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']
        
        logger.info("Connected to MongoDB successfully")
        
        # First, let's check how many documents have "twitter Support" in domains array
        count_before = collection.count_documents({"domains": "twitter Support"})
        logger.info(f"Found {count_before} documents with 'twitter Support' in domains array")
        
        if count_before == 0:
            logger.info("No documents found with 'twitter Support' domain. Checking for other variations...")
            # Check for case variations or similar patterns
            variations = ["Twitter Support", "twitter support", "TWITTER SUPPORT", "Twitter support"]
            for variation in variations:
                count = collection.count_documents({"domains": variation})
                if count > 0:
                    logger.info(f"Found {count} documents with '{variation}' domain")
        
        # Update documents where domains array contains "twitter Support"
        # This will replace "twitter Support" with "banking" in the domains array
        result = collection.update_many(
            {"domains": "twitter Support"},
            {"$set": {"domains.$[elem]": "banking"}},
            array_filters=[{"elem": "twitter Support"}]
        )
        
        logger.info(f"Updated {result.modified_count} documents")
        
        # Also handle case variations if they exist
        variations_to_update = ["Twitter Support", "twitter support", "TWITTER SUPPORT", "Twitter support"]
        total_updated = result.modified_count
        
        for variation in variations_to_update:
            result_var = collection.update_many(
                {"domains": variation},
                {"$set": {"domains.$[elem]": "banking"}},
                array_filters=[{"elem": variation}]
            )
            if result_var.modified_count > 0:
                logger.info(f"Updated {result_var.modified_count} documents with '{variation}' domain")
                total_updated += result_var.modified_count
        
        # Verify the changes
        count_after = collection.count_documents({"domains": "banking"})
        logger.info(f"After update: {count_after} documents have 'banking' in domains array")
        
        # Show a sample of updated documents
        logger.info("Sample of updated documents:")
        sample_docs = collection.find({"domains": "banking"}).limit(3)
        for doc in sample_docs:
            logger.info(f"Document ID: {doc.get('_id')}, Domains: {doc.get('domains')}")
        
        logger.info(f"Operation completed. Total documents updated: {total_updated}")
        
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        client.close()
        logger.info("MongoDB connection closed")

def rollback_domain_value():
    """
    Function to rollback the changes if needed
    This will change "banking" back to "twitter Support"
    """
    try:
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['twitter']
        
        logger.info("Starting rollback operation...")
        
        # Update documents where domains array contains "banking"
        result = collection.update_many(
            {"domains": "banking"},
            {"$set": {"domains.$[elem]": "twitter Support"}},
            array_filters=[{"elem": "banking"}]
        )
        
        logger.info(f"Rollback completed. Updated {result.modified_count} documents")
        
    except Exception as e:
        logger.error(f"Rollback error: {str(e)}")
    finally:
        client.close()
        logger.info("MongoDB connection closed")

if __name__ == "__main__":
    # Run the domain rename operation
    rename_domain_value()
    
    # Uncomment the line below if you need to rollback the changes
    # rollback_domain_value()

2025-09-03 10:56:18,262 - INFO - Connected to MongoDB successfully
2025-09-03 10:56:19,758 - INFO - Found 15 documents with 'twitter Support' in domains array
2025-09-03 10:56:20,009 - INFO - Updated 15 documents
2025-09-03 10:56:21,748 - INFO - After update: 72 documents have 'banking' in domains array
2025-09-03 10:56:21,750 - INFO - Sample of updated documents:
2025-09-03 10:56:21,999 - INFO - Document ID: 68aacadc05037130937cbae2, Domains: ['banking']
2025-09-03 10:56:22,001 - INFO - Document ID: 68aacadc05037130937cbae3, Domains: ['banking']
2025-09-03 10:56:22,002 - INFO - Document ID: 68aacadc05037130937cbae4, Domains: ['banking']
2025-09-03 10:56:22,003 - INFO - Operation completed. Total documents updated: 15
2025-09-03 10:56:22,250 - INFO - MongoDB connection closed


In [2]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def get_simple_tweet_stats():
    """
    Simple analysis to count:
    - Tweets incomplete with hashtags (like example 2)
    - Tweets that are genuinely incomplete (like example 1)
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['twitter']
        
        logger.info("Connected to MongoDB successfully")
        
        # Simple counters
        total_tweets = 0
        incomplete_with_hashtags = 0  # Like example 2: ends with incomplete hashtag
        genuinely_incomplete = 0      # Like example 1: ends with ...
        
        # Simple patterns
        hashtag_pattern = re.compile(r'#\w*$')      # Ends with #something
        ellipsis_pattern = re.compile(r'\.{3}$')    # Ends with exactly ...
        
        # Process all documents
        cursor = collection.find({}, {"text": 1})
        
        for doc in cursor:
            total_tweets += 1
            
            text = doc.get('text', '').strip()
            
            # Skip empty tweets
            if not text:
                continue
            
            # Check if ends with incomplete hashtag (like example 2)
            if hashtag_pattern.search(text):
                incomplete_with_hashtags += 1
            
            # Check if ends with ... (like example 1)
            elif ellipsis_pattern.search(text):
                genuinely_incomplete += 1
        
        # Print simple results
        print("\n" + "="*50)
        print("SIMPLE TWEET STATS")
        print("="*50)
        print(f"Total tweets: {total_tweets}")
        print(f"")
        print(f"Incomplete with hashtags (like example 2): {incomplete_with_hashtags}")
        print(f"Genuinely incomplete (like example 1): {genuinely_incomplete}")
        print("="*50)
        
        client.close()
        
        return {
            'total_tweets': total_tweets,
            'incomplete_with_hashtags': incomplete_with_hashtags,
            'genuinely_incomplete': genuinely_incomplete
        }
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    # Validate environment variables
    if not mongo_connection_string:
        logger.error("MONGO_CONNECTION_STRING environment variable is not set")
        exit(1)
    
    if not mongo_database_name:
        logger.error("MONGO_DATABASE_NAME environment variable is not set")
        exit(1)
    
    try:
        logger.info("Getting simple tweet statistics...")
        
        results = get_simple_tweet_stats()
        
        logger.info("Analysis completed!")
        
    except Exception as e:
        logger.error(f"Analysis failed: {str(e)}")
        exit(1)

2025-09-04 17:37:56,156 - INFO - Getting simple tweet statistics...
2025-09-04 17:37:56,162 - INFO - Connected to MongoDB successfully



SIMPLE TWEET STATS
Total tweets: 2000

Incomplete with hashtags (like example 2): 49
Genuinely incomplete (like example 1): 1935


2025-09-04 17:37:59,428 - INFO - Analysis completed!


In [1]:
import pandas as pd
import numpy as np

# Read the CSV file
df = pd.read_csv('swedbank-perfect-varied.csv')

# Display basic info about the dataset
print("Dataset Info:")
print(f"Total rows: {len(df)}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())

# Calculate statistics for each Channel Distribution category
print("\n" + "="*60)
print("CHANNEL DISTRIBUTION STATISTICS")
print("="*60)

# Group by Channel Distribution and sum the counts
channel_stats = df.groupby('Channel Distribution')['count'].agg(['sum', 'count']).reset_index()
channel_stats.columns = ['Channel', 'Total_Count', 'Number_of_Records']

# Calculate percentages
total_count = channel_stats['Total_Count'].sum()
channel_stats['Percentage'] = (channel_stats['Total_Count'] / total_count * 100).round(2)

# Sort by total count in descending order
channel_stats = channel_stats.sort_values('Total_Count', ascending=False)

# Display the results
print("\nChannel Distribution Summary:")
print("-" * 50)
for _, row in channel_stats.iterrows():
    print(f"{row['Channel']:<25} | Count: {row['Total_Count']:>6} | Records: {row['Number_of_Records']:>3} | Percentage: {row['Percentage']:>6.2f}%")

print("\n" + "="*60)
print("OVERALL STATISTICS")
print("="*60)
print(f"Total Count (All Channels): {total_count:,}")
print(f"Total Records: {len(df):,}")
print(f"Average Count per Record: {total_count/len(df):.2f}")

# Create a summary table similar to your example
print("\n" + "="*60)
print("SUMMARY TABLE (Count | Channel)")
print("="*60)
for _, row in channel_stats.iterrows():
    print(f"{row['Total_Count']:>6} | {row['Channel']}")

# Optional: Create a simple bar chart visualization
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
bars = plt.bar(channel_stats['Channel'], channel_stats['Total_Count'], 
               color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
plt.title('Channel Distribution - Total Counts', fontsize=14, fontweight='bold')
plt.xlabel('Channel', fontsize=12)
plt.ylabel('Total Count', fontsize=12)
plt.xticks(rotation=45, ha='right')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height + 5,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Display detailed breakdown
print("\n" + "="*60)
print("DETAILED BREAKDOWN BY CHANNEL")
print("="*60)
for channel in df['Channel Distribution'].unique():
    channel_data = df[df['Channel Distribution'] == channel]
    print(f"\n{channel} ({len(channel_data)} records):")
    print("-" * 40)
    # Show top 5 topics for this channel
    top_topics = channel_data.nlargest(5, 'count')[['dominant topic', 'count']]
    for _, topic in top_topics.iterrows():
        print(f"  {topic['count']:>3} | {topic['dominant topic']}")
    if len(channel_data) > 5:
        print(f"  ... and {len(channel_data) - 5} more topics")


Dataset Info:
Total rows: 168
Columns: ['dominant topic', 'count', 'Channel Distribution']

First few rows:
              dominant topic  count   Channel Distribution
0            Long Wait Times     52             Trustpilot
1    Call Center Overwhelmed     48             Trustpilot
2         Mobile App Crashes     45  App Store/Google Play
3     Call Center Complaints     45             Trustpilot
4  Call Disconnection Issues     43             Trustpilot

CHANNEL DISTRIBUTION STATISTICS

Channel Distribution Summary:
--------------------------------------------------
Reddit                    | Count:    740 | Records:  75 | Percentage:  33.08%
Trustpilot                | Count:    695 | Records:  34 | Percentage:  31.07%
App Store/Google Play     | Count:    444 | Records:  32 | Percentage:  19.85%
Twitter/X                 | Count:    358 | Records:  27 | Percentage:  16.00%

OVERALL STATISTICS
Total Count (All Channels): 2,237
Total Records: 168
Average Count per Record: 13.32

S

ModuleNotFoundError: No module named 'matplotlib'

In [2]:
# Split data based on target percentages
print("="*60)
print("SPLITTING DATA BY TARGET PERCENTAGES")
print("="*60)

# Target percentages for each channel
target_percentages = {
    'Trustpilot': 38,
    'App Store/Google Play': 32,
    'Twitter/X': 18,
    'Reddit': 12
}

# Calculate target counts based on total
total_records = len(df)
target_counts = {}
for channel, percentage in target_percentages.items():
    target_counts[channel] = int(total_records * percentage / 100)

print("Target distribution:")
for channel, count in target_counts.items():
    print(f"{channel}: {count} records ({target_percentages[channel]}%)")

# Create a new dataframe with split data
split_data = []

# Get all records and sort by count (descending) to prioritize high-impact topics
sorted_df = df.sort_values('count', ascending=False).reset_index(drop=True)

# Track how many records we've assigned to each channel
assigned_counts = {channel: 0 for channel in target_percentages.keys()}

# Assign records to channels based on target distribution
for idx, row in sorted_df.iterrows():
    # Find the channel that needs more records and is closest to target
    remaining_needed = {k: v - assigned_counts[k] for k, v in target_counts.items()}
    
    # If all channels are filled, assign to the one with least records
    if all(count <= 0 for count in remaining_needed.values()):
        channel = min(assigned_counts.keys(), key=lambda x: assigned_counts[x])
    else:
        # Find channel that still needs records
        available_channels = [k for k, v in remaining_needed.items() if v > 0]
        if available_channels:
            # Assign to the channel that needs the most records
            channel = max(available_channels, key=lambda x: remaining_needed[x])
        else:
            # Fallback to original channel if no space
            channel = row['Channel Distribution']
    
    # Create new record with assigned channel
    new_record = {
        'dominant topic': row['dominant topic'],
        'count': row['count'],
        'Channel Distribution': channel
    }
    split_data.append(new_record)
    assigned_counts[channel] += 1

# Create new dataframe
split_df = pd.DataFrame(split_data)

# Verify the split
print("\n" + "="*60)
print("ACTUAL SPLIT RESULTS")
print("="*60)

split_stats = split_df.groupby('Channel Distribution')['count'].agg(['sum', 'count']).reset_index()
split_stats.columns = ['Channel', 'Total_Count', 'Number_of_Records']
split_stats['Percentage'] = (split_stats['Number_of_Records'] / len(split_df) * 100).round(2)
split_stats = split_stats.sort_values('Total_Count', ascending=False)

print("\nChannel Distribution Summary:")
print("-" * 50)
for _, row in split_stats.iterrows():
    print(f"{row['Channel']:<25} | Count: {row['Total_Count']:>6} | Records: {row['Number_of_Records']:>3} | Percentage: {row['Percentage']:>6.2f}%")

# Save to new CSV file
output_filename = 'swedbank-split-by-percentages.csv'
split_df.to_csv(output_filename, index=False)

print(f"\n" + "="*60)
print(f"SAVED TO: {output_filename}")
print("="*60)
print(f"Total records: {len(split_df)}")
print(f"File saved successfully!")

# Show sample of the split data
print("\nSample of split data:")
print(split_df.head(10))

# Create visualization of the split
plt.figure(figsize=(14, 8))

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Original distribution
original_stats = df.groupby('Channel Distribution')['count'].sum().sort_values(ascending=False)
bars1 = ax1.bar(original_stats.index, original_stats.values, 
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax1.set_title('Original Distribution', fontsize=14, fontweight='bold')
ax1.set_ylabel('Total Count')
ax1.tick_params(axis='x', rotation=45)

# Add value labels
for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 10,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

# Split distribution
split_stats_vis = split_df.groupby('Channel Distribution')['count'].sum().sort_values(ascending=False)
bars2 = ax2.bar(split_stats_vis.index, split_stats_vis.values,
                color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax2.set_title('Split Distribution (by Target %)', fontsize=14, fontweight='bold')
ax2.set_ylabel('Total Count')
ax2.tick_params(axis='x', rotation=45)

# Add value labels
for bar in bars2:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + 10,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Show detailed breakdown by channel
print("\n" + "="*60)
print("DETAILED BREAKDOWN BY CHANNEL (SPLIT DATA)")
print("="*60)
for channel in split_df['Channel Distribution'].unique():
    channel_data = split_df[split_df['Channel Distribution'] == channel]
    print(f"\n{channel} ({len(channel_data)} records):")
    print("-" * 40)
    # Show top 5 topics for this channel
    top_topics = channel_data.nlargest(5, 'count')[['dominant topic', 'count']]
    for _, topic in top_topics.iterrows():
        print(f"  {topic['count']:>3} | {topic['dominant topic']}")
    if len(channel_data) > 5:
        print(f"  ... and {len(channel_data) - 5} more topics")


SPLITTING DATA BY TARGET PERCENTAGES
Target distribution:
Trustpilot: 63 records (38%)
App Store/Google Play: 53 records (32%)
Twitter/X: 30 records (18%)
Reddit: 20 records (12%)

ACTUAL SPLIT RESULTS

Channel Distribution Summary:
--------------------------------------------------
Trustpilot                | Count:   1147 | Records:  63 | Percentage:  37.50%
App Store/Google Play     | Count:    701 | Records:  53 | Percentage:  31.55%
Twitter/X                 | Count:    257 | Records:  30 | Percentage:  17.86%
Reddit                    | Count:    132 | Records:  22 | Percentage:  13.10%

SAVED TO: swedbank-split-by-percentages.csv
Total records: 168
File saved successfully!

Sample of split data:
                    dominant topic  count Channel Distribution
0                  Long Wait Times     52           Trustpilot
1          Call Center Overwhelmed     48           Trustpilot
2               Mobile App Crashes     45           Trustpilot
3           Call Center Complaints  

NameError: name 'plt' is not defined

In [4]:
def clean_text_file(input_file, output_file):
    """
    Reads a text file, removes text before '-' symbol, and saves cleaned data to new file.
    
    Args:
        input_file (str): Path to input .txt file
        output_file (str): Path to output .txt file
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        
        cleaned_lines = []
        
        for line in lines:
            # Strip whitespace from the line
            line = line.strip()
            
            # Skip empty lines
            if not line:
                continue
            
            # Find the dash and extract text after it
            if ' - ' in line:
                # Split by ' - ' and take everything after the first occurrence
                parts = line.split(' - ', 1)  # Split only on first occurrence
                if len(parts) > 1:
                    cleaned_text = parts[1].strip()
                    if cleaned_text:  # Only add non-empty cleaned text
                        cleaned_lines.append(cleaned_text)
            else:
                # If no dash found, you can choose to skip or keep the line
                # Currently skipping lines without dash
                print(f"Warning: No dash found in line: {line}")
        
        # Write cleaned lines to output file
        with open(output_file, 'w', encoding='utf-8') as file:
            for line in cleaned_lines:
                file.write(line + '\n')
        
        print(f"Successfully processed {len(cleaned_lines)} lines.")
        print(f"Cleaned data saved to: {output_file}")
        
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except Exception as e:
        print(f"Error processing file: {str(e)}")

# Example usage
if __name__ == "__main__":
    # Replace with your actual file paths
    input_filename = "seperate data.txt"  # Your original file
    output_filename = "cleaned_output.txt"  # New cleaned file
    
    # Call the function
    clean_text_file(input_filename, output_filename)
    
    # Optional: Display the cleaned content
    try:
        print("\nCleaned content:")
        print("-" * 50)
        with open(output_filename, 'r', encoding='utf-8') as f:
            content = f.read()
            print(content)
    except FileNotFoundError:
        print("Output file not created yet.")

Successfully processed 168 lines.
Cleaned data saved to: cleaned_output.txt

Cleaned content:
--------------------------------------------------
Queue Length Extended, Hold Time Excessive, Service Delay Impact, Customer Patience Testing, Peak Hour Congestion, Staff Shortage Impact, Call Volume Surge, Resource Allocation Issues, Agent Availability Limited, System Processing Slow, Customer Abandonment Rate, Service Level Agreement Breach, Wait Time Communication Poor, Queue Management Inefficient, Call Back Options Missing
Staff Shortage Critical, Call Volume Surge Unmanaged, Resource Allocation Inadequate, Agent Burnout Issues, Training Capacity Limited, Escalation Backlog Growing, System Capacity Exceeded, Peak Time Management Poor, Workforce Planning Insufficient, Call Routing Inefficient, Overtime Dependency High, Service Quality Degraded, Customer Satisfaction Declining, Management Response Delayed, Infrastructure Scaling Needed
Application Freezing Frequent, System Crashes Recurrin