In [1]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']
tickets_collection = db['tickets']

def normalize_text(text):
    """Normalize text for better matching"""
    return re.sub(r'[^\w\s]', '', text.lower().strip())

def match_tickets_to_clusters():
    """
    Match tickets to clusters based on dominant_topic matching keyphrases
    and update cluster documents with ticket_ids array
    """
    
    print("Fetching clusters with data: 'tickets'...")
    # Get only clusters that have data field set to "tickets"
    clusters = list(clusters_collection.find({"data": "tickets"}))
    print(f"Found {len(clusters)} clusters with data='tickets' to process\n")
    
    # Process each cluster
    for cluster in clusters:
        cluster_id = cluster['cluster_id']
        keyphrases = cluster.get('keyphrases', [])
        
        print(f"Processing Cluster ID: {cluster_id}")
        print(f"Cluster Name: {cluster.get('cluster_name', 'N/A')}")
        print(f"Keyphrases: {keyphrases}")
        
        # Normalize keyphrases for matching
        normalized_keyphrases = [normalize_text(phrase) for phrase in keyphrases]
        print(f"Normalized keyphrases: {normalized_keyphrases}")
        
        # Find matching tickets
        matching_ticket_ids = []
        
        # Get all tickets - using cursor for better memory management
        print("  Searching through tickets...")
        tickets_cursor = tickets_collection.find({}, {
            '_id': 1, 
            'dominant_topic': 1
        })
        
        ticket_count = 0
        for ticket in tickets_cursor:
            ticket_count += 1
            if ticket_count % 1000 == 0:
                print(f"    Processed {ticket_count} tickets...")
                
            ticket_dominant_topic = ticket.get('dominant_topic', '')
            
            if ticket_dominant_topic:
                normalized_topic = normalize_text(ticket_dominant_topic)
                
                # Check if any keyphrase matches the dominant topic
                for keyphrase in normalized_keyphrases:
                    if keyphrase and normalized_topic:  # Ensure both are not empty
                        if keyphrase in normalized_topic or normalized_topic in keyphrase:
                            matching_ticket_ids.append(str(ticket['_id']))
                            print(f"    Match found: {ticket['_id']} - Topic: '{ticket_dominant_topic}' matches keyphrase: '{keyphrase}'")
                            break
        
        print(f"  Finished processing {ticket_count} tickets")
        
        # Remove duplicates (in case a ticket matches multiple times)
        matching_ticket_ids = list(set(matching_ticket_ids))
        
        # Update cluster with ticket_ids (only for clusters with data: "tickets")
        try:
            if matching_ticket_ids:
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'tickets'},
                    {'$set': {'ticket_ids': matching_ticket_ids}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Successfully updated cluster {cluster_id} with {len(matching_ticket_ids)} ticket IDs")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has same ticket data")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'tickets'")
            else:
                # Set empty array if no matches found
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'tickets'},
                    {'$set': {'ticket_ids': []}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Set empty ticket_ids array for cluster {cluster_id} (no matches found)")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has empty ticket_ids array")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'tickets'")
        except Exception as e:
            print(f"  ❌ Error updating cluster {cluster_id}: {str(e)}")
        
        print(f"  Total unique tickets matched: {len(matching_ticket_ids)}")
        print("-" * 50)

def verify_results():
    """
    Verify the results by displaying updated clusters
    """
    print("\n" + "=" * 60)
    print("VERIFICATION RESULTS")
    print("=" * 60)
    
    try:
        clusters = list(clusters_collection.find({"data": "tickets"}, {
            'cluster_id': 1, 
            'cluster_name': 1, 
            'keyphrases': 1, 
            'ticket_ids': 1,
            'data': 1
        }).sort('cluster_id', 1))
        
        for cluster in clusters:
            ticket_count = len(cluster.get('ticket_ids', []))
            print(f"\nCluster {cluster['cluster_id']}: {cluster.get('cluster_name', 'N/A')} (data: {cluster.get('data', 'N/A')})")
            print(f"  Keyphrases: {cluster.get('keyphrases', [])}")
            print(f"  Ticket IDs count: {ticket_count}")
            if ticket_count > 0:
                print(f"  First 3 Ticket IDs: {cluster['ticket_ids'][:3]}")
                if ticket_count > 3:
                    print(f"  ... and {ticket_count - 3} more")
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_summary_stats():
    """Get summary statistics"""
    print("\n" + "=" * 60)
    print("SUMMARY STATISTICS")
    print("=" * 60)
    
    try:
        total_clusters = clusters_collection.count_documents({"data": "tickets"})
        clusters_with_tickets = clusters_collection.count_documents({
            "data": "tickets",
            'ticket_ids': {'$exists': True, '$ne': []}
        })
        
        pipeline = [
            {'$match': {"data": "tickets", 'ticket_ids': {'$exists': True}}},
            {'$project': {'ticket_count': {'$size': '$ticket_ids'}}},
            {'$group': {'_id': None, 'total_tickets_matched': {'$sum': '$ticket_count'}}}
        ]
        
        result = list(clusters_collection.aggregate(pipeline))
        total_tickets_matched = result[0]['total_tickets_matched'] if result else 0
        
        total_tickets = tickets_collection.count_documents({})
        
        print(f"Total clusters with data='tickets': {total_clusters}")
        print(f"Clusters with matched tickets: {clusters_with_tickets}")
        print(f"Clusters without matches: {total_clusters - clusters_with_tickets}")
        print(f"Total tickets in database: {total_tickets}")
        print(f"Total ticket-cluster matches: {total_tickets_matched}")
        
        if total_tickets > 0:
            match_percentage = (total_tickets_matched / total_tickets) * 100
            print(f"Match percentage: {match_percentage:.2f}%")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🚀 Starting ticket-cluster matching process...")
        print("This will match tickets to clusters with data: 'tickets'")
        print("=" * 60)
        
        # Test database connection
        test_cluster = clusters_collection.find_one({"data": "tickets"})
        test_ticket = tickets_collection.find_one()
        
        if not test_cluster:
            print("⚠️  Warning: No clusters found with data: 'tickets'")
        if not test_ticket:
            print("⚠️  Warning: No tickets found in tickets collection")
            
        print("✓ Database connection successful\n")
        
        # Execute the matching process
        match_tickets_to_clusters()
        
        # Verify results
        verify_results()
        
        # Get summary statistics
        get_summary_stats()
        
        print("\n" + "=" * 60)
        print("✅ Process completed successfully!")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting ticket-cluster matching process...
This will match tickets to clusters with data: 'tickets'
✓ Database connection successful

Fetching clusters with data: 'tickets'...
Found 16 clusters with data='tickets' to process

Processing Cluster ID: 0
Cluster Name: Account & User Administration
Keyphrases: ['Password Reset Failed', 'Account Lockout Issue', 'Profile Modification Request', 'Contact Details Update', 'Notification Setup Error', 'Account Closure Request', 'Signatory Update Required', 'Account Mandate Change', 'Profile Data Incorrect', 'Email Notification Failed', 'Account Termination Request', 'Corporate Setup Delayed', 'User Permission Error', 'Push Notification Failed', 'Closure Documentation Missing', 'User Access Approval']
Normalized keyphrases: ['password reset failed', 'account lockout issue', 'profile modification request', 'contact details update', 'notification setup error', 'account closure request', 'signatory update

In [2]:
# Import required libraries
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']

def update_domains_to_ticket_support():
    """
    Update ticket cluster documents to change domains to ["Ticket Support"]
    Only processes clusters with data: "tickets"
    """
    
    print("Starting domain update process for ticket clusters...")
    print("=" * 50)
    
    try:
        # Count total ticket clusters before update
        total_ticket_clusters = clusters_collection.count_documents({"data": "tickets"})
        print(f"Total ticket clusters in collection: {total_ticket_clusters}")
        
        if total_ticket_clusters == 0:
            print("⚠ No ticket clusters found (data: 'tickets')")
            return
        
        # Count ticket clusters that currently have ["EU bank"] or other domains
        eu_bank_count = clusters_collection.count_documents({
            "data": "tickets", 
            "domains": ["EU bank"]
        })
        print(f"Ticket clusters with 'EU bank' domain: {eu_bank_count}")
        
        old_ticket_support_count = clusters_collection.count_documents({
            "data": "tickets", 
            "domains": ["Ticket Support"]
        })
        print(f"Ticket clusters with 'Ticket Support' domain: {old_ticket_support_count}")
        
        # Count ticket clusters with other domains
        other_domains_count = clusters_collection.count_documents({
            "data": "tickets",
            "domains": {"$nin": [["EU bank"], ["Ticket Support"]]}
        })
        print(f"Ticket clusters with other domains: {other_domains_count}")
        
        print("\n" + "=" * 50)
        print("UPDATING TICKET CLUSTER DOMAINS...")
        print("=" * 50)
        
        # Update only ticket clusters to have domains: ["Ticket Support"]
        update_result = clusters_collection.update_many(
            {"data": "tickets"},  # Only update clusters with data: "tickets"
            {"$set": {"domains": ["Ticket Support"]}}
        )
        
        print(f"✓ Successfully updated {update_result.modified_count} ticket clusters")
        print(f"  Matched ticket clusters: {update_result.matched_count}")
        
        # Verify the update
        verify_update()
        
    except Exception as e:
        print(f"❌ Error during ticket domain update: {str(e)}")

def verify_update():
    """
    Verify that all ticket clusters have been updated to ["Ticket Support"]
    """
    print("\n" + "=" * 50)
    print("TICKET CLUSTER VERIFICATION")
    print("=" * 50)
    
    try:
        # Count ticket clusters with different domain values
        ticket_support_count = clusters_collection.count_documents({
            "data": "tickets",
            "domains": ["Ticket Support"]
        })
        eu_bank_count = clusters_collection.count_documents({
            "data": "tickets",
            "domains": ["EU bank"]
        })
        other_domains = clusters_collection.count_documents({
            "data": "tickets",
            "domains": {"$nin": [["Ticket Support"], ["EU bank"]]}
        })
        
        total_ticket_clusters = clusters_collection.count_documents({"data": "tickets"})
        
        print(f"Total ticket clusters: {total_ticket_clusters}")
        print(f"Ticket clusters with 'Ticket Support' domain: {ticket_support_count}")
        print(f"Ticket clusters with 'EU bank' domain: {eu_bank_count}")
        print(f"Ticket clusters with other domains: {other_domains}")
        
        if ticket_support_count == total_ticket_clusters:
            print("\n✅ SUCCESS: All ticket clusters now have 'Ticket Support' domain!")
        else:
            print(f"\n⚠ WARNING: {total_ticket_clusters - ticket_support_count} ticket clusters still have different domains")
        
        # Show sample of updated ticket cluster documents
        print(f"\nSample of updated ticket clusters:")
        samples = list(clusters_collection.find(
            {"data": "tickets"}, 
            {
                'cluster_id': 1, 
                'domains': 1, 
                'dominant_label': 1,
                'data': 1
            }
        ).limit(5).sort('cluster_id', 1))
        
        for sample in samples:
            cluster_id = sample.get('cluster_id', 'N/A')
            domains = sample.get('domains', [])
            label = sample.get('dominant_label', 'N/A')
            data_type = sample.get('data', 'N/A')
            print(f"  Ticket Cluster {cluster_id}: data={data_type}, domains={domains}, label='{label}'")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_domain_statistics():
    """
    Get detailed statistics about domains in ticket clusters only
    """
    print("\n" + "=" * 50)
    print("TICKET CLUSTER DOMAIN STATISTICS")
    print("=" * 50)
    
    try:
        # Aggregate to get all unique domain combinations for ticket clusters only
        pipeline = [
            {'$match': {"data": "tickets"}},
            {'$group': {'_id': '$domains', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]
        
        domain_stats = list(clusters_collection.aggregate(pipeline))
        
        print("Domain distribution for ticket clusters:")
        for stat in domain_stats:
            domains = stat['_id']
            count = stat['count']
            print(f"  {domains}: {count} ticket clusters")
            
        total_ticket_clusters = clusters_collection.count_documents({"data": "tickets"})
        if total_ticket_clusters > 0:
            ticket_support_percentage = (clusters_collection.count_documents({
                "data": "tickets",
                "domains": ["Ticket Support"]
            }) / total_ticket_clusters) * 100
            print(f"\nPercentage of ticket clusters with 'Ticket Support' domain: {ticket_support_percentage:.1f}%")
        
        # Show comparison with other data types
        print(f"\nComparison with other cluster types:")
        all_data_types = list(clusters_collection.aggregate([
            {'$group': {'_id': '$data', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))
        
        for data_type in all_data_types:
            data_value = data_type['_id']
            count = data_type['count']
            print(f"  Clusters with data='{data_value}': {count}")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

def show_ticket_cluster_summary():
    """
    Show summary of ticket cluster fields after domain update
    """
    print("\n" + "=" * 50)
    print("TICKET CLUSTER SUMMARY")
    print("=" * 50)
    
    try:
        # Get ticket cluster statistics
        ticket_clusters_with_ticket_ids = clusters_collection.count_documents({
            "data": "tickets",
            "ticket_ids": {"$exists": True, "$ne": []}
        })
        
        # Get average ticket count per cluster
        ticket_pipeline = [
            {'$match': {"data": "tickets", "ticket_ids": {"$exists": True}}},
            {'$project': {'ticket_count': {'$size': '$ticket_ids'}}},
            {'$group': {
                '_id': None,
                'total_tickets': {'$sum': '$ticket_count'},
                'avg_tickets_per_cluster': {'$avg': '$ticket_count'},
                'max_tickets_per_cluster': {'$max': '$ticket_count'}
            }}
        ]
        
        ticket_result = list(clusters_collection.aggregate(ticket_pipeline))
        
        total_ticket_clusters = clusters_collection.count_documents({"data": "tickets"})
        
        print(f"Total ticket clusters: {total_ticket_clusters}")
        print(f"Ticket clusters with assigned tickets: {ticket_clusters_with_ticket_ids}")
        
        if ticket_result:
            result = ticket_result[0]
            print(f"Total tickets assigned to clusters: {result['total_tickets']}")
            print(f"Average tickets per cluster: {result['avg_tickets_per_cluster']:.2f}")
            print(f"Maximum tickets in a cluster: {result['max_tickets_per_cluster']}")
            
    except Exception as e:
        print(f"❌ Error getting ticket summary: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🎫 Starting domains update to 'Ticket Support' for ticket clusters...")
        print("This will only update clusters with data: 'tickets'")
        print("=" * 60)
        
        # Test database connection
        test_doc = clusters_collection.find_one({"data": "tickets"})
        if test_doc:
            print("✓ Database connection successful")
            current_domains = test_doc.get('domains', 'N/A')
            data_type = test_doc.get('data', 'N/A')
            print(f"Sample ticket cluster - data: {data_type}, domains: {current_domains}\n")
        else:
            print("⚠ No ticket clusters found (data: 'tickets') in clusters collection")
            print("Please ensure you have clusters with data: 'tickets' before running this script")
            exit(1)
        
        # Execute the domain update for ticket clusters
        update_domains_to_ticket_support()
        
        # Get detailed statistics
        get_domain_statistics()
        
        # Show ticket cluster summary
        show_ticket_cluster_summary()
        
        print("\n" + "=" * 60)
        print("✅ Ticket cluster domain update process completed successfully!")
        print("All ticket clusters now have domains: ['Ticket Support']")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🎫 Starting domains update to 'Ticket Support' for ticket clusters...
This will only update clusters with data: 'tickets'
✓ Database connection successful
Sample ticket cluster - data: tickets, domains: ['banking']

Starting domain update process for ticket clusters...
Total ticket clusters in collection: 16
Ticket clusters with 'EU bank' domain: 0
Ticket clusters with 'Ticket Support' domain: 2
Ticket clusters with other domains: 14

UPDATING TICKET CLUSTER DOMAINS...
✓ Successfully updated 14 ticket clusters
  Matched ticket clusters: 16

TICKET CLUSTER VERIFICATION
Total ticket clusters: 16
Ticket clusters with 'Ticket Support' domain: 16
Ticket clusters with 'EU bank' domain: 0
Ticket clusters with other domains: 0

✅ SUCCESS: All ticket clusters now have 'Ticket Support' domain!

Sample of updated ticket clusters:
  Ticket Cluster 0: data=tickets, domains=['Ticket Support'], label='User Access & Account Management'
  Ticket Cluster 1: dat

In [3]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional, Set
import os
from dotenv import load_dotenv
from collections import defaultdict
import threading
from concurrent.futures import ThreadPoolExecutor
import time

# Load environment variables
load_dotenv()

class OptimizedTicketClusterMatcher:
    def __init__(self, connection_string: str, database_name: str):
        """
        Initialize the matcher with MongoDB connection
        """
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.tickets_collection = self.db['tickets']
        self.clusters_collection = self.db['cluster']
        
        # Cache for cluster data - this is the key optimization
        self._cluster_cache = None
        self._subcluster_cache = None
        self._load_cluster_cache()
    
    def _load_cluster_cache(self):
        """
        Load all cluster data into memory for fast lookups
        Only load clusters where data equals "tickets"
        """
        print("Loading ticket cluster data into cache...")
        start_time = time.time()
        
        # Dictionary mapping keyphrase -> cluster info
        self._cluster_cache = {}
        # Dictionary mapping keyphrase -> subcluster info
        self._subcluster_cache = {}
        
        # Only get clusters where data = "tickets"
        clusters = list(self.clusters_collection.find({"data": "tickets"}))
        print(f"Found {len(clusters)} ticket clusters to cache")
        
        for cluster in clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            keyphrases = cluster.get('keyphrases', [])
            subclusters = cluster.get('subclusters', {})
            
            # Cache cluster keyphrases
            for keyphrase in keyphrases:
                self._cluster_cache[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'subclusters': subclusters
                }
            
            # Cache subcluster keyphrases
            for subcluster_id, subcluster_data in subclusters.items():
                if not isinstance(subcluster_data, dict):
                    continue
                    
                subcluster_keyphrases = subcluster_data.get('keyphrases', [])
                for keyphrase in subcluster_keyphrases:
                    self._subcluster_cache[keyphrase] = {
                        'cluster_id': cluster_id,
                        'dominant_label': dominant_label,
                        'subcluster_id': int(subcluster_id),
                        'subcluster_label': subcluster_data.get('label')
                    }
        
        cache_time = time.time() - start_time
        print(f"Cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._cluster_cache)} cluster keyphrases")
        print(f"Cached {len(self._subcluster_cache)} subcluster keyphrases")
    
    def find_matching_cluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast cluster lookup using cached data
        """
        return self._cluster_cache.get(dominant_topic)
    
    def find_matching_subcluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast subcluster lookup using cached data
        """
        return self._subcluster_cache.get(dominant_topic)
    
    def find_unmatched_tickets(self, limit: int = None) -> List[Dict]:
        """
        Find tickets that don't match any cluster or subcluster
        """
        unmatched = []
        
        # Get all tickets with dominant_topic
        query = {"dominant_topic": {"$exists": True, "$ne": None}}
        cursor = self.tickets_collection.find(query, {"dominant_topic": 1})
        
        if limit:
            cursor = cursor.limit(limit)
        
        for ticket in cursor:
            dominant_topic = ticket.get('dominant_topic')
            if not dominant_topic:
                continue
                
            # Check if it matches any cluster or subcluster
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            if not cluster_match and not subcluster_match:
                unmatched.append({
                    'ticket_id': str(ticket['_id']),
                    'dominant_topic': dominant_topic
                })
        
        return unmatched
    
    def get_unique_dominant_topics(self) -> Dict:
        """
        Get all unique dominant_topic values and their counts from tickets
        """
        pipeline = [
            {"$match": {"dominant_topic": {"$exists": True, "$ne": None}}},
            {"$group": {"_id": "$dominant_topic", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]
        
        result = list(self.tickets_collection.aggregate(pipeline))
        
        topics_info = {
            'total_unique_topics': len(result),
            'topics': result
        }
        
        return topics_info
    
    def analyze_matching_gaps(self) -> Dict:
        """
        Analyze what dominant_topics exist but don't match any ticket clusters
        """
        print("Analyzing matching gaps...")
        
        # Get all unique dominant topics
        topics_info = self.get_unique_dominant_topics()
        print(f"Found {topics_info['total_unique_topics']} unique dominant topics in tickets")
        
        # Check which ones don't match
        unmatched_topics = {}
        matched_topics = {}
        
        for topic_data in topics_info['topics']:
            topic = topic_data['_id']
            count = topic_data['count']
            
            cluster_match = self.find_matching_cluster_fast(topic)
            subcluster_match = self.find_matching_subcluster_fast(topic)
            
            if cluster_match or subcluster_match:
                matched_topics[topic] = {
                    'count': count,
                    'cluster_match': bool(cluster_match),
                    'subcluster_match': bool(subcluster_match)
                }
            else:
                unmatched_topics[topic] = count
        
        return {
            'total_topics': topics_info['total_unique_topics'],
            'matched_topics': len(matched_topics),
            'unmatched_topics': len(unmatched_topics),
            'unmatched_details': unmatched_topics,
            'matched_details': matched_topics,
            'unmatched_ticket_count': sum(unmatched_topics.values()),
            'matched_ticket_count': sum([data['count'] for data in matched_topics.values()])
        }
    
    def create_fallback_cluster_entry(self, unmatched_topics: List[str]) -> Dict:
        """
        Create a fallback cluster entry for unmatched topics (for tickets)
        """
        fallback_cluster = {
            'cluster_id': 999,  # Use a high number to avoid conflicts
            'dominant_label': 'Unclassified Ticket Topics',
            'keyphrases': unmatched_topics,
            'data': 'tickets',  # Specify that this is for tickets
            'subclusters': {
                '0': {
                    'label': 'Miscellaneous Tickets',
                    'keyphrases': unmatched_topics
                }
            }
        }
        return fallback_cluster
    
    def add_fallback_cluster_to_cache(self, unmatched_topics: List[str]) -> None:
        """
        Add unmatched topics to cache as a fallback cluster
        """
        print(f"Adding {len(unmatched_topics)} unmatched topics to fallback cluster...")
        
        for topic in unmatched_topics:
            # Add to cluster cache
            self._cluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Ticket Topics',
                'subclusters': {'0': {'label': 'Miscellaneous Tickets', 'keyphrases': unmatched_topics}}
            }
            
            # Add to subcluster cache
            self._subcluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Ticket Topics',
                'subcluster_id': 0,
                'subcluster_label': 'Miscellaneous Tickets'
            }
        
        print(f"✓ Added fallback cluster. Cache now has:")
        print(f"  - Cluster keyphrases: {len(self._cluster_cache)}")
        print(f"  - Subcluster keyphrases: {len(self._subcluster_cache)}")
    
    def process_tickets_batch(self, tickets: List[Dict]) -> List:
        """
        Process a batch of tickets and return bulk operations in correct PyMongo format
        """
        bulk_operations = []
        
        for ticket in tickets:
            dominant_topic = ticket.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Fast cluster lookup
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            update_data = {}
            
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                # Use PyMongo's UpdateOne class instead of dict
                bulk_operations.append(
                    UpdateOne(
                        {'_id': ticket['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def process_tickets_optimized(self, batch_size: int = 5000, max_workers: int = 4, dry_run: bool = False) -> Dict:
        """
        Optimized ticket processing with larger batches and optional threading
        """
        start_time = time.time()
        
        # Get total count more efficiently
        total_tickets = self.tickets_collection.estimated_document_count()
        processed = 0
        matched_clusters = 0
        matched_subclusters = 0
        total_updates = 0
        
        print(f"Processing ~{total_tickets} tickets in batches of {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index on dominant_topic if it doesn't exist (for faster queries)
        try:
            self.tickets_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic created/verified")
        except Exception as e:
            print(f"Index creation note: {e}")
        
        # Process tickets in larger batches
        cursor = self.tickets_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},  # Only get tickets with dominant_topic
            projection={'dominant_topic': 1}  # Only fetch the field we need
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for ticket in cursor:
            batch.append(ticket)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing batch {batch_count} ({len(batch)} tickets) ---")
                
                # Process batch
                bulk_operations = self.process_tickets_batch(batch)
                print(f"Generated {len(bulk_operations)} update operations")
                
                # Count matches for statistics
                batch_cluster_matches = 0
                batch_subcluster_matches = 0
                for ticket in batch:
                    dominant_topic = ticket.get('dominant_topic')
                    if dominant_topic:
                        if self.find_matching_cluster_fast(dominant_topic):
                            matched_clusters += 1
                            batch_cluster_matches += 1
                        if self.find_matching_subcluster_fast(dominant_topic):
                            matched_subclusters += 1
                            batch_subcluster_matches += 1
                
                print(f"Batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        print("Executing bulk write...")
                        result = self.tickets_collection.bulk_write(
                            bulk_operations, 
                            ordered=False  # Faster unordered operations
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} documents in batch {batch_count}")
                        
                        # Verify some updates
                        if result.modified_count > 0:
                            sample_updated = list(self.tickets_collection.find(
                                {"kmeans_cluster_id": {"$exists": True}},
                                {"dominant_topic": 1, "kmeans_cluster_id": 1, "subcluster_id": 1}
                            ).limit(3))
                            print(f"Sample updated documents: {len(sample_updated)} found with cluster IDs")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in batch {batch_count}: {e}")
                        print(f"Error type: {type(e).__name__}")
                        # Show sample operation for debugging in readable format
                        if bulk_operations:
                            sample_op = bulk_operations[0]
                            print(f"Sample operation: Update {sample_op._filter} with {sample_op._doc}")
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would update {len(bulk_operations)} documents")
                    # Show sample operations in readable format
                    for i, op in enumerate(bulk_operations[:3]):
                        print(f"Sample operation {i+1}: Update {op._filter} with {op._doc}")
                else:
                    print("No operations to execute (no matches found)")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed} tickets processed ({rate:.1f} tickets/sec)")
        
        # Process remaining tickets in the last batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final batch {batch_count} ({len(batch)} tickets) ---")
            
            bulk_operations = self.process_tickets_batch(batch)
            print(f"Generated {len(bulk_operations)} update operations")
            
            # Count matches for final batch
            batch_cluster_matches = 0
            batch_subcluster_matches = 0
            for ticket in batch:
                dominant_topic = ticket.get('dominant_topic')
                if dominant_topic:
                    if self.find_matching_cluster_fast(dominant_topic):
                        matched_clusters += 1
                        batch_cluster_matches += 1
                    if self.find_matching_subcluster_fast(dominant_topic):
                        matched_subclusters += 1
                        batch_subcluster_matches += 1
            
            print(f"Final batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
            
            if bulk_operations and not dry_run:
                try:
                    print("Executing final bulk write...")
                    result = self.tickets_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} documents in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final batch: {e}")
                    print(f"Error type: {type(e).__name__}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would update {len(bulk_operations)} documents")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            updated_count = self.tickets_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
            print(f"Total ticket documents with kmeans_cluster_id: {updated_count}")
            
            subcluster_count = self.tickets_collection.count_documents({"subcluster_id": {"$exists": True}})
            print(f"Total ticket documents with subcluster_id: {subcluster_count}")
        
        stats = {
            'total_tickets': processed,
            'matched_clusters': matched_clusters,
            'matched_subclusters': matched_subclusters,
            'total_updates': total_updates,
            'processing_time': total_time,
            'tickets_per_second': processed / total_time if total_time > 0 else 0,
            'cluster_match_rate': (matched_clusters / processed * 100) if processed > 0 else 0,
            'subcluster_match_rate': (matched_subclusters / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def process_with_fallback(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """
        Process tickets with automatic fallback cluster for unmatched topics
        """
        print("=== PROCESSING TICKETS WITH FALLBACK CLUSTER ===")
        
        # First, analyze gaps
        gaps = self.analyze_matching_gaps()
        
        if gaps['unmatched_ticket_count'] > 0:
            print(f"Found {gaps['unmatched_ticket_count']} unmatched tickets")
            print(f"Unmatched topics: {list(gaps['unmatched_details'].keys())}")
            
            # Add fallback cluster to cache
            unmatched_topic_list = list(gaps['unmatched_details'].keys())
            self.add_fallback_cluster_to_cache(unmatched_topic_list)
            
            # Optionally save fallback cluster to database
            save_choice = input("Save fallback cluster to database permanently? (y/n): ")
            if save_choice.lower() == 'y':
                fallback_cluster = self.create_fallback_cluster_entry(unmatched_topic_list)
                try:
                    self.clusters_collection.insert_one(fallback_cluster)
                    print("✓ Fallback cluster saved to database")
                except Exception as e:
                    print(f"⚠️  Could not save fallback cluster: {e}")
        
        # Now process all tickets (should be 100% match rate)
        return self.process_tickets_optimized(batch_size=batch_size, dry_run=dry_run)
    
    def get_performance_stats(self) -> Dict:
        """
        Get database performance statistics
        """
        stats = {}
        
        # Collection sizes
        stats['total_tickets'] = self.tickets_collection.estimated_document_count()
        stats['tickets_with_topic'] = self.tickets_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        stats['total_ticket_clusters'] = self.clusters_collection.count_documents({"data": "tickets"})
        stats['total_all_clusters'] = self.clusters_collection.estimated_document_count()
        
        # Cache statistics
        stats['cached_cluster_keyphrases'] = len(self._cluster_cache) if self._cluster_cache else 0
        stats['cached_subcluster_keyphrases'] = len(self._subcluster_cache) if self._subcluster_cache else 0
        
        return stats
    
    def debug_matching_process(self, limit: int = 5) -> None:
        """
        Debug the matching process to see what's happening with tickets
        """
        print("\n=== DEBUGGING TICKET MATCHING PROCESS ===")
        
        # Check if we have any cluster data
        if not self._cluster_cache and not self._subcluster_cache:
            print("❌ NO TICKET CLUSTER CACHE DATA! This is why updates are failing.")
            return
        
        print(f"✓ Ticket cluster cache has {len(self._cluster_cache)} entries")
        print(f"✓ Ticket subcluster cache has {len(self._subcluster_cache)} entries")
        
        # Sample some cluster keyphrases
        print(f"\nSample ticket cluster keyphrases:")
        for i, keyphrase in enumerate(list(self._cluster_cache.keys())[:10]):
            cluster_info = self._cluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {cluster_info['cluster_id']}")
        
        # Sample some subcluster keyphrases  
        print(f"\nSample ticket subcluster keyphrases:")
        for i, keyphrase in enumerate(list(self._subcluster_cache.keys())[:10]):
            subcluster_info = self._subcluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {subcluster_info['cluster_id']}, Subcluster {subcluster_info['subcluster_id']}")
        
        # Check some actual tickets
        print(f"\n=== TESTING {limit} TICKETS ===")
        tickets = list(self.tickets_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not tickets:
            print("❌ NO TICKETS with dominant_topic found!")
            return
        
        for i, ticket in enumerate(tickets, 1):
            dominant_topic = ticket.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Ticket {i} ---")
            print(f"Ticket ID: {ticket['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test cluster matching
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            if cluster_match:
                print(f"✓ CLUSTER MATCH: ID={cluster_match['cluster_id']}, Label='{cluster_match['dominant_label']}'")
            else:
                print(f"❌ No cluster match for '{dominant_topic}'")
            
            # Test subcluster matching  
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            if subcluster_match:
                print(f"✓ SUBCLUSTER MATCH: Cluster={subcluster_match['cluster_id']}, Subcluster={subcluster_match['subcluster_id']}, Label='{subcluster_match['subcluster_label']}'")
            else:
                print(f"❌ No subcluster match for '{dominant_topic}'")
            
            # Show what the update operation would look like
            update_data = {}
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                print(f"UPDATE OPERATION: {update_data}")
            else:
                print("NO UPDATE OPERATION (no matches)")
        
        print(f"\n=== DATABASE STATE CHECK ===")
        # Check existing updates
        existing_with_cluster = self.tickets_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
        existing_with_subcluster = self.tickets_collection.count_documents({"subcluster_id": {"$exists": True}})
        tickets_with_topic = self.tickets_collection.count_documents({"dominant_topic": {"$exists": True, "$ne": None}})
        
        print(f"Tickets with dominant_topic: {tickets_with_topic}")
        print(f"Tickets already with kmeans_cluster_id: {existing_with_cluster}")
        print(f"Tickets already with subcluster_id: {existing_with_subcluster}")
        
        if tickets_with_topic == 0:
            print("❌ PROBLEM: No tickets have 'dominant_topic' field!")
        elif existing_with_cluster == tickets_with_topic:
            print("✓ All tickets already processed!")
        else:
            print(f"📝 {tickets_with_topic - existing_with_cluster} tickets need processing")
    
    def get_preview(self, limit: int = 10) -> List[Dict]:
        """
        Get a preview of ticket-cluster matches for testing
        """
        tickets = list(self.tickets_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        preview = []
        
        for ticket in tickets:
            dominant_topic = ticket.get('dominant_topic')
            if not dominant_topic:
                continue
            
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            preview.append({
                'ticket_id': str(ticket['_id']),
                'dominant_topic': dominant_topic,
                'cluster_match': cluster_match,
                'subcluster_match': subcluster_match
            })
        
        return preview
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

# Usage example
def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    
    # Initialize optimized ticket matcher
    matcher = OptimizedTicketClusterMatcher(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show performance stats
        print("\n--- Database Statistics ---")
        perf_stats = matcher.get_performance_stats()
        for key, value in perf_stats.items():
            print(f"{key}: {value:,}")
        
        # Analyze matching gaps
        print("\n--- Gap Analysis ---")
        gap_choice = input("Analyze which tickets aren't matching? (y/n): ")
        if gap_choice.lower() == 'y':
            gaps = matcher.analyze_matching_gaps()
            print(f"\n=== MATCHING GAP ANALYSIS ===")
            print(f"Total unique topics: {gaps['total_topics']}")
            print(f"Matched topics: {gaps['matched_topics']}")
            print(f"Unmatched topics: {gaps['unmatched_topics']}")
            print(f"Matched tickets: {gaps['matched_ticket_count']}")
            print(f"Unmatched tickets: {gaps['unmatched_ticket_count']}")
            
            if gaps['unmatched_details']:
                print(f"\n--- UNMATCHED DOMINANT TOPICS ---")
                for topic, count in list(gaps['unmatched_details'].items())[:10]:
                    print(f"'{topic}' - {count} tickets")
                
                if len(gaps['unmatched_details']) > 10:
                    print(f"... and {len(gaps['unmatched_details']) - 10} more")
                
                print(f"\n💡 To get 100% matches, you need to:")
                print(f"1. Add these topics to your ticket cluster keyphrases, OR")
                print(f"2. Create a 'catch-all' cluster for unmatched ticket topics")
        
        # Debug the matching process first
        print("\n--- Debugging Mode ---")
        debug_choice = input("Run debug mode to see why DB isn't updating? (y/n): ")
        if debug_choice.lower() == 'y':
            matcher.debug_matching_process()
        
        # Get a preview first
        print("\n--- Preview of Ticket Matches ---")
        preview = matcher.get_preview(limit=5)
        
        for i, item in enumerate(preview, 1):
            print(f"\n--- Ticket {i} ---")
            print(f"Dominant Topic: {item['dominant_topic']}")
            
            if item['subcluster_match']:
                print(f"✓ Subcluster Match: Cluster ID={item['subcluster_match']['cluster_id']}, "
                      f"Subcluster ID={item['subcluster_match']['subcluster_id']}, "
                      f"Label={item['subcluster_match']['subcluster_label']}")
            elif item['cluster_match']:
                print(f"✓ Cluster Match: ID={item['cluster_match']['cluster_id']}, "
                      f"Label={item['cluster_match']['dominant_label']}")
            else:
                print("✗ No match found")
        
        # Process all tickets
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated without changing DB)")
        print("2. Full processing (actually update the database)")
        print("3. Process with fallback cluster (100% match guarantee)")
        choice = input("Choose option (1, 2, or 3): ")
        
        if choice in ['1', '2', '3']:
            if choice == '3':
                # Use fallback processing
                dry_run = False
                fallback_choice = input("Dry run with fallback first? (y/n): ")
                if fallback_choice.lower() == 'y':
                    dry_run = True
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE'} processing with fallback...")
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                stats = matcher.process_with_fallback(batch_size=batch_size, dry_run=dry_run)
            else:
                # Regular processing
                dry_run = (choice == '1')
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'}...")
                
                # Use larger batch size for better performance
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                
                stats = matcher.process_tickets_optimized(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total tickets processed: {stats['total_tickets']:,}")
            print(f"Total updates made: {stats['total_updates']:,}")
            print(f"Cluster matches: {stats['matched_clusters']:,} ({stats['cluster_match_rate']:.1f}%)")
            print(f"Subcluster matches: {stats['matched_subclusters']:,} ({stats['subcluster_match_rate']:.1f}%)")
            print(f"Processing time: {stats['processing_time']:.2f} seconds")
            print(f"Processing rate: {stats['tickets_per_second']:.1f} tickets/second")
        
    finally:
        matcher.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Loading ticket cluster data into cache...
Found 16 ticket clusters to cache
Cache loaded in 2.28 seconds
Cached 172 cluster keyphrases
Cached 173 subcluster keyphrases

--- Database Statistics ---
total_tickets: 2,000
tickets_with_topic: 2,000
total_ticket_clusters: 16
total_all_clusters: 98
cached_cluster_keyphrases: 172
cached_subcluster_keyphrases: 173

--- Gap Analysis ---
Analyzing matching gaps...
Found 167 unique dominant topics in tickets

=== MATCHING GAP ANALYSIS ===
Total unique topics: 167
Matched topics: 167
Unmatched topics: 0
Matched tickets: 2000
Unmatched tickets: 0

--- Debugging Mode ---

--- Preview of Ticket Matches ---

--- Ticket 1 ---
Dominant Topic: AML Alert Triggered
✓ Subcluster Match: Cluster ID=6, Subcluster ID=0, Label=AML & KYC Alerts

--- Ticket 2 ---
Dominant Topic: AML Alert Triggered
✓ Subcluster Match: Cluster ID=6, Subcluster ID=0, Label=AML & KYC Alerts

--- Ticket 3 ---
Dominant Topic: AML Alert Triggered
✓ Subclu

In [4]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
tickets_collection = db['tickets']

def rename_ticket_fields():
    """
    Rename fields in ticket documents:
    - is_urgent -> urgency
    - dominant_label -> dominant_cluster_label
    """
    
    print("🎫 Starting ticket field rename process...")
    print("Fields to rename:")
    print("  - is_urgent → urgency")
    print("  - dominant_label → dominant_cluster_label")
    print("-" * 50)
    
    try:
        # Check current state before rename
        total_tickets = tickets_collection.count_documents({})
        print(f"Total tickets in collection: {total_tickets}")
        
        if total_tickets == 0:
            print("⚠ No tickets found in collection")
            return
        
        # Count existing fields before rename
        is_urgent_count = tickets_collection.count_documents({"is_urgent": {"$exists": True}})
        dominant_label_count = tickets_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"Tickets with 'is_urgent' field: {is_urgent_count}")
        print(f"Tickets with 'dominant_label' field: {dominant_label_count}")
        
        # Count already renamed fields
        urgency_count = tickets_collection.count_documents({"urgency": {"$exists": True}})
        dominant_cluster_label_count = tickets_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        print(f"Tickets already with 'urgency' field: {urgency_count}")
        print(f"Tickets already with 'dominant_cluster_label' field: {dominant_cluster_label_count}")
        
        print("\n" + "=" * 50)
        print("RENAMING TICKET FIELDS...")
        print("=" * 50)
        
        # Rename both fields in a single operation for all tickets
        result = tickets_collection.update_many(
            {},  # Empty filter to match all ticket documents
            {
                "$rename": {
                    "is_urgent": "urgency",
                    "dominant_label": "dominant_cluster_label"
                }
            }
        )
        
        # Print results
        print(f"✓ Field rename operation completed:")
        print(f"  Matched tickets: {result.matched_count}")
        print(f"  Modified tickets: {result.modified_count}")
        print(f"  Operation acknowledged: {result.acknowledged}")
        
        # Verify the changes
        verify_rename_changes()
        
    except Exception as e:
        print(f"❌ Error during ticket field rename: {str(e)}")

def verify_rename_changes():
    """
    Verify that the field rename was successful
    """
    print("\n" + "=" * 50)
    print("VERIFICATION OF TICKET FIELD RENAME")
    print("=" * 50)
    
    try:
        # Verify the changes by checking a sample ticket document
        sample_ticket = tickets_collection.find_one()
        if sample_ticket:
            print("Sample ticket document after rename:")
            print(f"  Ticket ID: {sample_ticket.get('_id')}")
            print(f"  Has 'urgency' field: {'urgency' in sample_ticket}")
            print(f"  Has 'dominant_cluster_label' field: {'dominant_cluster_label' in sample_ticket}")
            print(f"  Has old 'is_urgent' field: {'is_urgent' in sample_ticket}")
            print(f"  Has old 'dominant_label' field: {'dominant_label' in sample_ticket}")
            
            # Show sample values if they exist
            if 'urgency' in sample_ticket:
                print(f"  Sample 'urgency' value: {sample_ticket['urgency']}")
            if 'dominant_cluster_label' in sample_ticket:
                print(f"  Sample 'dominant_cluster_label' value: {sample_ticket['dominant_cluster_label']}")
        else:
            print("⚠ No ticket documents found in the collection")
        
        # Count tickets with the new field names
        urgency_count = tickets_collection.count_documents({"urgency": {"$exists": True}})
        cluster_label_count = tickets_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        # Count tickets with old field names (should be 0 after rename)
        old_is_urgent_count = tickets_collection.count_documents({"is_urgent": {"$exists": True}})
        old_dominant_label_count = tickets_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"\nField counts after rename:")
        print(f"  Tickets with 'urgency' field: {urgency_count}")
        print(f"  Tickets with 'dominant_cluster_label' field: {cluster_label_count}")
        print(f"  Tickets with old 'is_urgent' field: {old_is_urgent_count}")
        print(f"  Tickets with old 'dominant_label' field: {old_dominant_label_count}")
        
        # Success check
        if old_is_urgent_count == 0 and old_dominant_label_count == 0:
            print("\n✅ SUCCESS: All ticket fields have been renamed successfully!")
        else:
            print(f"\n⚠ WARNING: Some tickets still have old field names")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_ticket_field_statistics():
    """
    Get detailed statistics about ticket fields after rename
    """
    print("\n" + "=" * 50)
    print("TICKET FIELD STATISTICS")
    print("=" * 50)
    
    try:
        total_tickets = tickets_collection.count_documents({})
        
        # Get statistics for new field names
        urgency_stats = list(tickets_collection.aggregate([
            {"$match": {"urgency": {"$exists": True}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        cluster_label_stats = list(tickets_collection.aggregate([
            {"$match": {"dominant_cluster_label": {"$exists": True}}},
            {"$group": {"_id": "$dominant_cluster_label", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10}  # Show top 10 cluster labels
        ]))
        
        print(f"Total tickets: {total_tickets}")
        
        if urgency_stats:
            print(f"\nUrgency field distribution:")
            for stat in urgency_stats:
                urgency_value = stat['_id']
                count = stat['count']
                percentage = (count / total_tickets) * 100 if total_tickets > 0 else 0
                print(f"  '{urgency_value}': {count} tickets ({percentage:.1f}%)")
        else:
            print(f"\nNo tickets found with 'urgency' field")
        
        if cluster_label_stats:
            print(f"\nTop 10 dominant cluster labels:")
            for i, stat in enumerate(cluster_label_stats, 1):
                label = stat['_id'] if stat['_id'] is not None else 'null'
                count = stat['count']
                percentage = (count / total_tickets) * 100 if total_tickets > 0 else 0
                print(f"  {i}. '{label}': {count} tickets ({percentage:.1f}%)")
        else:
            print(f"\nNo tickets found with 'dominant_cluster_label' field")
            
    except Exception as e:
        print(f"❌ Error getting ticket statistics: {str(e)}")

def show_sample_tickets():
    """
    Show sample ticket documents with renamed fields
    """
    print("\n" + "=" * 50)
    print("SAMPLE TICKET DOCUMENTS")
    print("=" * 50)
    
    try:
        # Get sample tickets with both new fields
        sample_tickets = list(tickets_collection.find(
            {
                "urgency": {"$exists": True},
                "dominant_cluster_label": {"$exists": True}
            },
            {
                "_id": 1,
                "urgency": 1,
                "dominant_cluster_label": 1,
                "subject": 1,  # Include subject if it exists
                "priority": 1   # Include priority if it exists
            }
        ).limit(3))
        
        if sample_tickets:
            print("Sample tickets with renamed fields:")
            for i, ticket in enumerate(sample_tickets, 1):
                print(f"\nTicket {i}:")
                print(f"  ID: {ticket.get('_id')}")
                print(f"  Urgency: {ticket.get('urgency', 'N/A')}")
                print(f"  Dominant Cluster Label: {ticket.get('dominant_cluster_label', 'N/A')}")
                if 'subject' in ticket:
                    print(f"  Subject: {ticket.get('subject', 'N/A')}")
                if 'priority' in ticket:
                    print(f"  Priority: {ticket.get('priority', 'N/A')}")
        else:
            print("No tickets found with both renamed fields")
            
    except Exception as e:
        print(f"❌ Error showing sample tickets: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_ticket = tickets_collection.find_one()
        if test_ticket:
            print("✓ Database connection successful")
            print(f"Sample ticket fields: {list(test_ticket.keys())}\n")
        else:
            print("⚠ No tickets found in tickets collection")
            print("Please ensure you have ticket documents before running this script")
            exit(1)
        
        # Execute the field rename for tickets
        rename_ticket_fields()
        
        # Get detailed statistics
        get_ticket_field_statistics()
        
        # Show sample tickets
        show_sample_tickets()
        
        print("\n" + "=" * 60)
        print("✅ Ticket field rename process completed successfully!")
        print("Fields renamed:")
        print("  - is_urgent → urgency")
        print("  - dominant_label → dominant_cluster_label")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
✓ Database connection successful
Sample ticket fields: ['_id', 'ticket_number', 'dominant_topic', 'subtopics', 'description', 'priority', 'urgency', 'title', 'embeddings', 'dominant_label', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label']

🎫 Starting ticket field rename process...
Fields to rename:
  - is_urgent → urgency
  - dominant_label → dominant_cluster_label
--------------------------------------------------
Total tickets in collection: 2000
Tickets with 'is_urgent' field: 0
Tickets with 'dominant_label' field: 2000
Tickets already with 'urgency' field: 2000
Tickets already with 'dominant_cluster_label' field: 0

RENAMING TICKET FIELDS...
✓ Field rename operation completed:
  Matched tickets: 2000
  Modified tickets: 2000
  Operation acknowledged: True

VERIFICATION OF TICKET FIELD RENAME
Sample ticket document after rename:
  Ticket ID: 6889ba7ca4f4718f70978ff5
  Has 'urgency' field: True
  Has 'dominant_cluster_label' field:

In [5]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

class TicketClusterKeyphraseUpdater:
    def __init__(self, connection_string: str, database_name: str):
        """Initialize the updater with MongoDB connection"""
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.tickets_collection = self.db['tickets']
        self.clusters_collection = self.db['cluster']
        
        # Cache for keyphrase -> cluster mapping (ONLY ticket cluster level)
        self._keyphrase_to_cluster = {}
        self._load_keyphrase_cache()
    
    def _load_keyphrase_cache(self):
        """Load only ticket cluster-level keyphrases into memory for fast lookups"""
        print("Loading ticket cluster keyphrase cache...")
        start_time = time.time()
        
        # Only load clusters with data: "tickets"
        ticket_clusters = list(self.clusters_collection.find({"data": "tickets"}))
        print(f"Found {len(ticket_clusters)} ticket clusters to process")
        
        for cluster in ticket_clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            cluster_keyphrases = cluster.get('keyphrases', [])
            
            # Cache ONLY ticket cluster-level keyphrases
            for keyphrase in cluster_keyphrases:
                self._keyphrase_to_cluster[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'matched_keyphrase': keyphrase
                }
        
        cache_time = time.time() - start_time
        print(f"Ticket cluster cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._keyphrase_to_cluster)} ticket cluster keyphrases")
    
    def find_matching_keyphrase(self, dominant_topic: str) -> Optional[Dict]:
        """Find the matching keyphrase for a dominant topic (ticket cluster level only)"""
        return self._keyphrase_to_cluster.get(dominant_topic)
    
    def process_tickets_batch(self, tickets: List[Dict]) -> List:
        """Process a batch of tickets and return bulk operations"""
        bulk_operations = []
        
        for ticket in tickets:
            dominant_topic = ticket.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Find matching keyphrase from ticket clusters
            match_info = self.find_matching_keyphrase(dominant_topic)
            
            if match_info:
                update_data = {
                    'kmeans_cluster_keyphrase': match_info['matched_keyphrase']
                }
                
                bulk_operations.append(
                    UpdateOne(
                        {'_id': ticket['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def add_keyphrase_field(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """Add kmeans_cluster_keyphrase field to all matching tickets"""
        start_time = time.time()
        
        # Get total count
        total_tickets = self.tickets_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        processed = 0
        matched = 0
        total_updates = 0
        
        print(f"Processing {total_tickets} tickets with dominant_topic")
        print(f"Batch size: {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index for faster queries
        try:
            self.tickets_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic verified for tickets collection")
        except Exception as e:
            print(f"Index note: {e}")
        
        # Process in batches
        cursor = self.tickets_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},
            projection={'dominant_topic': 1}
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for ticket in cursor:
            batch.append(ticket)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing ticket batch {batch_count} ({len(batch)} tickets) ---")
                
                # Process batch
                bulk_operations = self.process_tickets_batch(batch)
                batch_matched = len(bulk_operations)
                matched += batch_matched
                
                print(f"Generated {batch_matched} keyphrase updates for this ticket batch")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        result = self.tickets_collection.bulk_write(
                            bulk_operations, 
                            ordered=False
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} tickets with keyphrase field")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in ticket batch {batch_count}: {e}")
                
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would add keyphrase field to {batch_matched} tickets")
                    # Show sample operations
                    for i, op in enumerate(bulk_operations[:3]):
                        keyphrase = op._doc['$set']['kmeans_cluster_keyphrase']
                        print(f"  Sample {i+1}: Would set keyphrase='{keyphrase}'")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed}/{total_tickets} ({rate:.1f} tickets/sec)")
        
        # Process remaining tickets in final batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final ticket batch {batch_count} ({len(batch)} tickets) ---")
            
            bulk_operations = self.process_tickets_batch(batch)
            batch_matched = len(bulk_operations)
            matched += batch_matched
            
            print(f"Generated {batch_matched} keyphrase updates for final ticket batch")
            
            if bulk_operations and not dry_run:
                try:
                    result = self.tickets_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} tickets in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final ticket batch: {e}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would add keyphrase field to {batch_matched} tickets")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            keyphrase_count = self.tickets_collection.count_documents({
                "kmeans_cluster_keyphrase": {"$exists": True}
            })
            print(f"Total tickets with kmeans_cluster_keyphrase: {keyphrase_count}")
            
            # Show some sample results
            samples = list(self.tickets_collection.find(
                {"kmeans_cluster_keyphrase": {"$exists": True}},
                {"dominant_topic": 1, "kmeans_cluster_keyphrase": 1}
            ).limit(5))
            
            print(f"\nSample ticket results:")
            for i, sample in enumerate(samples, 1):
                print(f"  {i}. Topic: '{sample.get('dominant_topic')}' -> "
                      f"Keyphrase: '{sample.get('kmeans_cluster_keyphrase')}'")
        
        stats = {
            'total_tickets_processed': processed,
            'tickets_matched': matched,
            'total_updates': total_updates,
            'processing_time': total_time,
            'tickets_per_second': processed / total_time if total_time > 0 else 0,
            'match_rate': (matched / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def debug_keyphrase_matching(self, limit: int = 10) -> None:
        """Debug the keyphrase matching process for tickets"""
        print("\n=== DEBUGGING TICKET CLUSTER KEYPHRASE MATCHING ===")
        
        # Check cache
        if not self._keyphrase_to_cluster:
            print("❌ NO TICKET CLUSTER KEYPHRASE CACHE DATA!")
            return
        
        print(f"✓ Ticket cluster keyphrase cache: {len(self._keyphrase_to_cluster)} entries")
        
        # Show sample keyphrases
        print(f"\nSample ticket cluster keyphrases:")
        for i, (keyphrase, info) in enumerate(list(self._keyphrase_to_cluster.items())[:10]):
            print(f"  {i+1}. '{keyphrase}' -> Ticket Cluster {info['cluster_id']} ({info['dominant_label']})")
        
        # Test with actual tickets
        print(f"\n=== TESTING {limit} TICKETS ===")
        tickets = list(self.tickets_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not tickets:
            print("❌ NO TICKETS with dominant_topic found!")
            return
        
        for i, ticket in enumerate(tickets, 1):
            dominant_topic = ticket.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Ticket {i} ---")
            print(f"Ticket ID: {ticket['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test keyphrase matching
            match_info = self.find_matching_keyphrase(dominant_topic)
            if match_info:
                print(f"✓ TICKET CLUSTER KEYPHRASE MATCH: '{match_info['matched_keyphrase']}'")
                print(f"  Cluster ID: {match_info['cluster_id']}")
                print(f"  Cluster Label: {match_info['dominant_label']}")
            else:
                print(f"❌ No ticket cluster keyphrase match for '{dominant_topic}'")
    
    def get_keyphrase_stats(self) -> Dict:
        """Get statistics about keyphrase matching for tickets"""
        # Count tickets with dominant_topic
        tickets_with_topic = self.tickets_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        # Count tickets already with keyphrase field
        tickets_with_keyphrase = self.tickets_collection.count_documents({
            "kmeans_cluster_keyphrase": {"$exists": True}
        })
        
        # Get unique dominant topics and check match rates
        unique_topics = self.tickets_collection.distinct("dominant_topic")
        unique_topics = [topic for topic in unique_topics if topic is not None]
        
        matchable_topics = 0
        for topic in unique_topics:
            if self.find_matching_keyphrase(topic):
                matchable_topics += 1
        
        # Count ticket clusters
        total_ticket_clusters = self.clusters_collection.count_documents({"data": "tickets"})
        
        return {
            'total_tickets_with_topic': tickets_with_topic,
            'tickets_with_keyphrase_field': tickets_with_keyphrase,
            'unique_dominant_topics': len(unique_topics),
            'matchable_topics': matchable_topics,
            'topic_match_rate': (matchable_topics / len(unique_topics) * 100) if unique_topics else 0,
            'cached_ticket_cluster_keyphrases': len(self._keyphrase_to_cluster),
            'total_ticket_clusters': total_ticket_clusters
        }
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    print("Processing tickets with ticket clusters (data: 'tickets')")
    
    # Initialize ticket keyphrase updater
    updater = TicketClusterKeyphraseUpdater(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show current statistics
        print("\n--- Current Ticket Statistics ---")
        stats = updater.get_keyphrase_stats()
        for key, value in stats.items():
            if isinstance(value, float):
                print(f"{key}: {value:.1f}")
            else:
                print(f"{key}: {value:,}")
        
        # Check if we have ticket clusters
        if stats['total_ticket_clusters'] == 0:
            print("\n❌ No ticket clusters found (data: 'tickets')!")
            print("Please ensure you have clusters with data: 'tickets' before running this script.")
            return
        
        # Debug keyphrase matching
        print("\n--- Debug Mode ---")
        debug_choice = input("Run debug mode to see ticket keyphrase matching? (y/n): ")
        if debug_choice.lower() == 'y':
            updater.debug_keyphrase_matching()
        
        # Choose processing mode
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated)")
        print("2. Live processing (actually add keyphrase field)")
        choice = input("Choose option (1 or 2): ")
        
        if choice in ['1', '2']:
            dry_run = (choice == '1')
            
            print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'} for tickets...")
            
            # Get batch size
            batch_size = int(input("Enter batch size (recommended: 5000): ") or "5000")
            
            # Process tickets
            results = updater.add_keyphrase_field(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total tickets processed: {results['total_tickets_processed']:,}")
            print(f"Tickets with matching keyphrases: {results['tickets_matched']:,}")
            print(f"Total updates made: {results['total_updates']:,}")
            print(f"Match rate: {results['match_rate']:.1f}%")
            print(f"Processing time: {results['processing_time']:.2f} seconds")
            print(f"Processing rate: {results['tickets_per_second']:.1f} tickets/second")
            
            if not dry_run and results['total_updates'] > 0:
                print(f"\n✅ Successfully added kmeans_cluster_keyphrase field to {results['total_updates']:,} tickets!")
            elif dry_run:
                print(f"\nDRY RUN COMPLETE: Would add keyphrase field to {results['tickets_matched']:,} tickets")
        
    finally:
        updater.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Processing tickets with ticket clusters (data: 'tickets')
Loading ticket cluster keyphrase cache...
Found 16 ticket clusters to process
Ticket cluster cache loaded in 2.27 seconds
Cached 172 ticket cluster keyphrases

--- Current Ticket Statistics ---
total_tickets_with_topic: 2,000
tickets_with_keyphrase_field: 0
unique_dominant_topics: 167
matchable_topics: 167
topic_match_rate: 100.0
cached_ticket_cluster_keyphrases: 172
total_ticket_clusters: 16

--- Debug Mode ---

=== DEBUGGING TICKET CLUSTER KEYPHRASE MATCHING ===
✓ Ticket cluster keyphrase cache: 172 entries

Sample ticket cluster keyphrases:
  1. 'Password Reset Failed' -> Ticket Cluster 0 (User Access & Account Management)
  2. 'Account Lockout Issue' -> Ticket Cluster 0 (User Access & Account Management)
  3. 'Profile Modification Request' -> Ticket Cluster 0 (User Access & Account Management)
  4. 'Contact Details Update' -> Ticket Cluster 0 (User Access & Account Management)
  5. 'Notificat

In [6]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get connection details from environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("Please set MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME in your environment variables")

# Connect to MongoDB
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]
collection = db['tickets']

try:
    # Add domain field to all documents
    result = collection.update_many(
        {},  # Empty filter to match all documents
        {"$set": {"domain": "banking"}}
    )
    
    print(f"Matched documents: {result.matched_count}")
    print(f"Modified documents: {result.modified_count}")
    
    if result.matched_count == 2004:
        print("Successfully updated all 2004 documents!")
    else:
        print(f"Expected 2004 documents, but found {result.matched_count}")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the connection
    client.close()

# Alternative: Add domain field only to documents that don't already have it
def add_domain_conditionally():
    try:
        result = collection.update_many(
            {"domain": {"$exists": False}},  # Only documents without 'domain' field
            {"$set": {"domain": "banking"}}
        )
        
        print(f"Documents without domain field: {result.matched_count}")
        print(f"Modified documents: {result.modified_count}")
        
    except Exception as e:
        print(f"An error occurred: {e}")

# Uncomment the line below if you want to run the conditional update instead
# add_domain_conditionally()

Matched documents: 2000
Modified documents: 2000
Expected 2004 documents, but found 2000


In [7]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: tickets")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
tickets_collection = db['tickets']

def analyze_urgency_values():
    """Analyze current urgency field values"""
    print("Analyzing current urgency values...")
    print("-" * 40)
    
    try:
        # Get all unique urgency values
        urgency_values = tickets_collection.distinct("urgency")
        print(f"Unique urgency values found: {urgency_values}")
        
        # Count each urgency value
        urgency_stats = list(tickets_collection.aggregate([
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        total_tickets = tickets_collection.count_documents({})
        print(f"\nTotal tickets: {total_tickets}")
        print(f"Urgency distribution:")
        
        for stat in urgency_stats:
            value = stat['_id'] if stat['_id'] is not None else 'null/missing'
            count = stat['count']
            percentage = (count / total_tickets) * 100 if total_tickets > 0 else 0
            print(f"  '{value}': {count} tickets ({percentage:.1f}%)")
            
        return urgency_stats
        
    except Exception as e:
        print(f"Error analyzing urgency values: {str(e)}")
        return []

def update_urgency_to_boolean():
    """Update urgency field from string to boolean"""
    print("\nStarting urgency field update...")
    print("Conversion rules:")
    print("  'Critical' → true")
    print("  'High' → false")
    print("-" * 40)
    
    try:
        # Update Critical to true
        critical_result = tickets_collection.update_many(
            {"urgency": "Critical"},
            {"$set": {"urgency": True}}
        )
        
        print(f"✓ Updated 'Critical' urgency:")
        print(f"  Matched: {critical_result.matched_count}")
        print(f"  Modified: {critical_result.modified_count}")
        
        # Update High to false
        high_result = tickets_collection.update_many(
            {"urgency": "High"},
            {"$set": {"urgency": False}}
        )
        
        print(f"✓ Updated 'High' urgency:")
        print(f"  Matched: {high_result.matched_count}")
        print(f"  Modified: {high_result.modified_count}")
        
        total_updated = critical_result.modified_count + high_result.modified_count
        print(f"\nTotal tickets updated: {total_updated}")
        
        return {
            'critical_updated': critical_result.modified_count,
            'high_updated': high_result.modified_count,
            'total_updated': total_updated
        }
        
    except Exception as e:
        print(f"Error updating urgency values: {str(e)}")
        return None

def verify_boolean_conversion():
    """Verify that urgency values are now boolean"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF URGENCY CONVERSION")
    print("=" * 50)
    
    try:
        # Count boolean urgency values
        true_count = tickets_collection.count_documents({"urgency": True})
        false_count = tickets_collection.count_documents({"urgency": False})
        
        # Count any remaining string values
        critical_count = tickets_collection.count_documents({"urgency": "Critical"})
        high_count = tickets_collection.count_documents({"urgency": "High"})
        
        # Count other values
        other_urgency = list(tickets_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Boolean urgency values:")
        print(f"  urgency: true → {true_count} tickets")
        print(f"  urgency: false → {false_count} tickets")
        
        print(f"\nRemaining string values:")
        print(f"  urgency: 'Critical' → {critical_count} tickets")
        print(f"  urgency: 'High' → {high_count} tickets")
        
        if other_urgency:
            print(f"\nOther urgency values:")
            for other in other_urgency:
                value = other['_id'] if other['_id'] is not None else 'null/missing'
                count = other['count']
                print(f"  urgency: '{value}' → {count} tickets")
        
        # Show sample tickets with boolean urgency
        print(f"\nSample tickets with boolean urgency:")
        samples = list(tickets_collection.find(
            {"urgency": {"$in": [True, False]}},
            {
                "ticket_number": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            ticket_num = sample.get('ticket_number', 'N/A')
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {ticket_num}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
        
        # Success check
        if critical_count == 0 and high_count == 0:
            print(f"\n✅ SUCCESS: All 'Critical' and 'High' urgency values converted to boolean!")
            print(f"Summary: {true_count} critical (true) + {false_count} high (false) = {true_count + false_count} total")
        else:
            print(f"\n⚠ WARNING: Some string urgency values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_urgency_values():
    """Check and optionally handle other urgency values"""
    print("\n" + "=" * 50)
    print("HANDLING OTHER URGENCY VALUES")
    print("=" * 50)
    
    try:
        # Find tickets with urgency values other than True/False/Critical/High
        other_urgency = list(tickets_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        if not other_urgency:
            print("No other urgency values found - all tickets have been processed!")
            return
        
        print("Found tickets with other urgency values:")
        for other in other_urgency:
            value = other['_id'] if other['_id'] is not None else 'null/missing'
            count = other['count']
            print(f"  '{value}': {count} tickets")
        
        print(f"\nSample tickets with other urgency values:")
        samples = list(tickets_collection.find(
            {"urgency": {"$nin": [True, False, "Critical", "High"]}},
            {
                "ticket_number": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(3))
        
        for i, sample in enumerate(samples, 1):
            ticket_num = sample.get('ticket_number', 'N/A')
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {ticket_num}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error handling other urgency values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_ticket = tickets_collection.find_one()
        if test_ticket:
            print("✓ Database connection successful")
            print(f"Sample ticket fields: {list(test_ticket.keys())}\n")
        else:
            print("⚠ No tickets found in tickets collection")
            exit(1)
        
        # Analyze current urgency values
        analyze_urgency_values()
        
        # Confirm before proceeding
        print(f"\nThis will update urgency values:")
        print(f"  'Critical' → true")
        print(f"  'High' → false")
        
        confirm = input(f"\nProceed with urgency conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the urgency update
        update_result = update_urgency_to_boolean()
        
        if update_result:
            # Verify the conversion
            verify_boolean_conversion()
            
            # Handle other urgency values
            handle_other_urgency_values()
            
            print(f"\n" + "=" * 60)
            print("✅ URGENCY CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Critical tickets (now true): {update_result['critical_updated']}")
            print(f"  High tickets (now false): {update_result['high_updated']}")
            print(f"  Total tickets updated: {update_result['total_updated']}")
            print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: tickets
✓ Database connection successful
Sample ticket fields: ['_id', 'ticket_number', 'dominant_topic', 'subtopics', 'description', 'priority', 'urgency', 'title', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current urgency values...
----------------------------------------
Unique urgency values found: ['Critical', 'High']

Total tickets: 2000
Urgency distribution:
  'Critical': 1778 tickets (88.9%)
  'High': 222 tickets (11.1%)

This will update urgency values:
  'Critical' → true
  'High' → false

Starting urgency field update...
Conversion rules:
  'Critical' → true
  'High' → false
----------------------------------------
✓ Updated 'Critical' urgency:
  Matched: 1778
  Modified: 1778
✓ Updated 'High' urgency:
  Matched: 222
  Modified: 222

Total tickets updated: 2000

VERIFICATION OF URGENCY CONVERSION
Boolean urgency valu

In [8]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: tickets")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
tickets_collection = db['tickets']

def analyze_subcluster_id_values():
    """Analyze current subcluster_id field values"""
    print("Analyzing current subcluster_id values...")
    print("-" * 40)
    
    try:
        # Get all unique subcluster_id values
        subcluster_values = tickets_collection.distinct("subcluster_id")
        print(f"Unique subcluster_id values found: {subcluster_values}")
        
        # Count each subcluster_id value type
        subcluster_stats = list(tickets_collection.aggregate([
            {"$group": {
                "_id": {"value": "$subcluster_id", "type": {"$type": "$subcluster_id"}}, 
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}}
        ]))
        
        total_tickets = tickets_collection.count_documents({})
        print(f"\nTotal tickets: {total_tickets}")
        print(f"Subcluster_id distribution:")
        
        for stat in subcluster_stats:
            value = stat['_id']['value'] if stat['_id']['value'] is not None else 'null/missing'
            data_type = stat['_id']['type']
            count = stat['count']
            percentage = (count / total_tickets) * 100 if total_tickets > 0 else 0
            print(f"  '{value}' (type: {data_type}): {count} tickets ({percentage:.1f}%)")
            
        return subcluster_stats
        
    except Exception as e:
        print(f"Error analyzing subcluster_id values: {str(e)}")
        return []

def update_subcluster_id_to_string():
    """Update subcluster_id field from integer to string"""
    print("\nStarting subcluster_id field update...")
    print("Conversion rule: All integer values → string values")
    print("-" * 40)
    
    try:
        # Find all tickets with integer subcluster_id
        integer_tickets = list(tickets_collection.find(
            {"subcluster_id": {"$type": "int"}},
            {"_id": 1, "subcluster_id": 1}
        ))
        
        print(f"Found {len(integer_tickets)} tickets with integer subcluster_id")
        
        updated_count = 0
        
        # Update each ticket individually to convert integer to string
        for ticket in integer_tickets:
            old_value = ticket['subcluster_id']
            new_value = str(old_value)
            
            result = tickets_collection.update_one(
                {"_id": ticket['_id']},
                {"$set": {"subcluster_id": new_value}}
            )
            
            if result.modified_count > 0:
                updated_count += 1
        
        print(f"✓ Updated subcluster_id from integer to string:")
        print(f"  Total processed: {len(integer_tickets)}")
        print(f"  Successfully updated: {updated_count}")
        
        return updated_count
        
    except Exception as e:
        print(f"Error updating subcluster_id values: {str(e)}")
        return 0

def verify_string_conversion():
    """Verify that subcluster_id values are now strings"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF SUBCLUSTER_ID CONVERSION")
    print("=" * 50)
    
    try:
        # Count string subcluster_id values
        string_count = tickets_collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        # Count any remaining integer values
        integer_count = tickets_collection.count_documents({"subcluster_id": {"$type": "int"}})
        
        # Count other data types
        other_types = list(tickets_collection.aggregate([
            {"$match": {"subcluster_id": {"$nin": [None]}}},
            {"$group": {"_id": {"$type": "$subcluster_id"}, "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Subcluster_id by data type:")
        for type_stat in other_types:
            data_type = type_stat['_id']
            count = type_stat['count']
            print(f"  {data_type}: {count} tickets")
        
        # Count null/missing values
        null_count = tickets_collection.count_documents({"subcluster_id": None})
        missing_count = tickets_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"  null: {null_count} tickets")
        if missing_count > 0:
            print(f"  missing: {missing_count} tickets")
        
        # Show sample tickets with string subcluster_id
        print(f"\nSample tickets with string subcluster_id:")
        samples = list(tickets_collection.find(
            {"subcluster_id": {"$type": "string"}},
            {
                "ticket_number": 1,
                "subcluster_id": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            ticket_num = sample.get('ticket_number', 'N/A')
            subcluster_id = sample.get('subcluster_id')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {ticket_num}: subcluster_id=\"{subcluster_id}\"")
            print(f"     Title: {title}")
        
        # Success check
        if integer_count == 0:
            print(f"\n✅ SUCCESS: All integer subcluster_id values converted to strings!")
            print(f"Summary: {string_count} tickets now have string subcluster_id")
        else:
            print(f"\n⚠ WARNING: {integer_count} integer subcluster_id values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_subcluster_id_values():
    """Check for any unusual subcluster_id values"""
    print("\n" + "=" * 50)
    print("CHECKING FOR OTHER SUBCLUSTER_ID VALUES")
    print("=" * 50)
    
    try:
        # Find tickets with null or missing subcluster_id
        null_count = tickets_collection.count_documents({"subcluster_id": None})
        missing_count = tickets_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"Found {null_count} tickets with null subcluster_id")
            
        if missing_count > 0:
            print(f"Found {missing_count} tickets with missing subcluster_id field")
            
        if null_count == 0 and missing_count == 0:
            print("All tickets have valid subcluster_id values!")
            
        # Show sample of any problematic tickets
        if null_count > 0 or missing_count > 0:
            print(f"\nSample tickets with null/missing subcluster_id:")
            samples = list(tickets_collection.find(
                {"$or": [
                    {"subcluster_id": None},
                    {"subcluster_id": {"$exists": False}}
                ]},
                {
                    "ticket_number": 1,
                    "subcluster_id": 1,
                    "title": 1
                }
            ).limit(3))
            
            for i, sample in enumerate(samples, 1):
                ticket_num = sample.get('ticket_number', 'N/A')
                subcluster_id = sample.get('subcluster_id', 'MISSING_FIELD')
                title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
                
                print(f"  {i}. {ticket_num}: subcluster_id={subcluster_id}")
                print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error checking other subcluster_id values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_ticket = tickets_collection.find_one()
        if test_ticket:
            print("✓ Database connection successful")
            print(f"Sample ticket fields: {list(test_ticket.keys())}\n")
        else:
            print("⚠ No tickets found in tickets collection")
            exit(1)
        
        # Analyze current subcluster_id values
        analyze_subcluster_id_values()
        
        # Confirm before proceeding
        print(f"\nThis will convert all integer subcluster_id values to strings")
        print(f"Example: subcluster_id: 1 → subcluster_id: \"1\"")
        
        confirm = input(f"\nProceed with subcluster_id conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the subcluster_id update
        updated_count = update_subcluster_id_to_string()
        
        if updated_count > 0:
            # Verify the conversion
            verify_string_conversion()
            
            # Handle other subcluster_id values
            handle_other_subcluster_id_values()
            
            print(f"\n" + "=" * 60)
            print("✅ SUBCLUSTER_ID CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Total tickets updated: {updated_count}")
            print(f"  All integer subcluster_id values converted to strings")
            print("=" * 60)
        else:
            print(f"\n⚠ No updates were made. Check if subcluster_id fields are already strings or if there are no integer values.")
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: tickets
✓ Database connection successful
Sample ticket fields: ['_id', 'ticket_number', 'dominant_topic', 'subtopics', 'description', 'priority', 'urgency', 'title', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current subcluster_id values...
----------------------------------------
Unique subcluster_id values found: [0, 1, 2, 3, 4]

Total tickets: 2000
Subcluster_id distribution:
  '1' (type: int): 694 tickets (34.7%)
  '0' (type: int): 661 tickets (33.1%)
  '2' (type: int): 399 tickets (20.0%)
  '3' (type: int): 177 tickets (8.8%)
  '4' (type: int): 69 tickets (3.5%)

This will convert all integer subcluster_id values to strings
Example: subcluster_id: 1 → subcluster_id: "1"

Starting subcluster_id field update...
Conversion rule: All integer values → string values
----------------------------------------
Found 2000 tickets with

In [9]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def rename_field_in_cluster_collection():
    """
    Rename field 'ticket_ids' to 'tickets_ids' in documents where data="tickets"
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Define the filter for documents that have data="tickets" and ticket_ids field exists
        filter_query = {
            "data": "tickets",
            "ticket_ids": {"$exists": True}
        }
        
        # Count documents that match the criteria before update
        count_before = cluster_collection.count_documents(filter_query)
        print(f"Found {count_before} documents matching criteria (data='tickets' and ticket_ids exists)")
        
        if count_before == 0:
            print("No documents found to update.")
            return
        
        # Use $rename operator to rename the field
        update_operation = {
            "$rename": {
                "ticket_ids": "tickets_ids"
            }
        }
        
        # Perform the update operation
        result = cluster_collection.update_many(filter_query, update_operation)
        
        print(f"Successfully updated {result.modified_count} documents")
        print(f"Matched {result.matched_count} documents")
        
        # Verify the update by counting documents with the new field name
        verification_query = {
            "data": "tickets",
            "tickets_ids": {"$exists": True}
        }
        count_after = cluster_collection.count_documents(verification_query)
        print(f"Verification: {count_after} documents now have 'tickets_ids' field")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

if __name__ == "__main__":
    rename_field_in_cluster_collection()

Found 16 documents matching criteria (data='tickets' and ticket_ids exists)
Successfully updated 16 documents
Matched 16 documents
Verification: 16 documents now have 'tickets_ids' field
MongoDB connection closed


In [10]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def validate_keyphrases_in_subclusters():
    """
    Cross-check keyphrases field with subclusters keyphrases.
    Find any keyphrases that exist in main keyphrases but are missing from all subclusters.
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Find all documents that have both keyphrases and subclusters fields
        query = {
            "keyphrases": {"$exists": True, "$ne": None},
            "subclusters": {"$exists": True, "$ne": None}
        }
        
        documents = cluster_collection.find(query)
        
        missing_keyphrases = []
        total_documents_checked = 0
        
        for doc in documents:
            total_documents_checked += 1
            cluster_id = doc.get('cluster_id')
            main_keyphrases = doc.get('keyphrases', [])
            subclusters = doc.get('subclusters', {})
            
            # Collect all keyphrases from all subclusters
            subcluster_keyphrases = set()
            
            # subclusters is an object with keys like "0", "1", "2", etc.
            for subcluster_key, subcluster_data in subclusters.items():
                if isinstance(subcluster_data, dict) and 'keyphrases' in subcluster_data:
                    subcluster_keyphrase_list = subcluster_data.get('keyphrases', [])
                    if isinstance(subcluster_keyphrase_list, list):
                        subcluster_keyphrases.update(subcluster_keyphrase_list)
            
            # Check each main keyphrase against subcluster keyphrases
            for keyphrase in main_keyphrases:
                if keyphrase not in subcluster_keyphrases:
                    missing_keyphrases.append({
                        'cluster_id': cluster_id,
                        'missing_keyphrase': keyphrase,
                        'total_main_keyphrases': len(main_keyphrases),
                        'total_subcluster_keyphrases': len(subcluster_keyphrases)
                    })
        
        # Display results
        print(f"Total documents checked: {total_documents_checked}")
        print(f"Total missing keyphrases found: {len(missing_keyphrases)}")
        print("-" * 80)
        
        if missing_keyphrases:
            print("MISSING KEYPHRASES REPORT:")
            print("-" * 80)
            
            # Group by cluster_id for better readability
            cluster_groups = {}
            for item in missing_keyphrases:
                cluster_id = item['cluster_id']
                if cluster_id not in cluster_groups:
                    cluster_groups[cluster_id] = []
                cluster_groups[cluster_id].append(item)
            
            for cluster_id, missing_items in cluster_groups.items():
                print(f"Cluster ID: {cluster_id}")
                print(f"Missing keyphrases ({len(missing_items)}):")
                for item in missing_items:
                    print(f"  - '{item['missing_keyphrase']}'")
                print(f"Total main keyphrases: {missing_items[0]['total_main_keyphrases']}")
                print(f"Total subcluster keyphrases: {missing_items[0]['total_subcluster_keyphrases']}")
                print("-" * 40)
                
        else:
            print("✅ All keyphrases from main field are present in subclusters!")
            
        # Summary statistics
        if missing_keyphrases:
            clusters_with_issues = len(set(item['cluster_id'] for item in missing_keyphrases))
            print(f"\nSUMMARY:")
            print(f"Clusters with missing keyphrases: {clusters_with_issues}")
            print(f"Total missing keyphrase instances: {len(missing_keyphrases)}")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("\nMongoDB connection closed")

def get_detailed_analysis():
    """
    Get more detailed analysis including sample data structure
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Get a sample document to understand structure
        sample_doc = cluster_collection.find_one({
            "keyphrases": {"$exists": True},
            "subclusters": {"$exists": True}
        })
        
        if sample_doc:
            print("SAMPLE DOCUMENT STRUCTURE:")
            print("-" * 40)
            print(f"Cluster ID: {sample_doc.get('cluster_id')}")
            print(f"Cluster Name: {sample_doc.get('cluster_name', 'N/A')}")
            print(f"Main keyphrases count: {len(sample_doc.get('keyphrases', []))}")
            
            subclusters = sample_doc.get('subclusters', {})
            print(f"Subclusters count: {len(subclusters)}")
            
            if sample_doc.get('keyphrases'):
                print(f"Sample main keyphrases: {sample_doc['keyphrases'][:3]}...")
            
            if subclusters:
                print("Subcluster structure:")
                for key, subcluster in list(subclusters.items())[:2]:  # Show first 2 subclusters
                    if isinstance(subcluster, dict):
                        label = subcluster.get('label', 'No label')
                        keyphrases_count = len(subcluster.get('keyphrases', []))
                        print(f"  {key}: '{label}' ({keyphrases_count} keyphrases)")
                        if subcluster.get('keyphrases'):
                            print(f"    Sample keyphrases: {subcluster['keyphrases'][:2]}...")
            print("-" * 40)
    
    except Exception as e:
        print(f"Error in detailed analysis: {str(e)}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("Starting keyphrase validation...")
    print("=" * 80)
    
    # First, get structure analysis
    get_detailed_analysis()
    
    # Then run validation
    validate_keyphrases_in_subclusters()

Starting keyphrase validation...
SAMPLE DOCUMENT STRUCTURE:
----------------------------------------
Cluster ID: 0
Cluster Name: N/A
Main keyphrases count: 6
Subclusters count: 2
Sample main keyphrases: ['SEPA Payment Failure', 'SEPA Processing Error', 'SEPA Instant Failure']...
Subcluster structure:
  0: 'SEPA Failures & Errors' (3 keyphrases)
    Sample keyphrases: ['SEPA Payment Failure', 'SEPA Processing Error']...
  1: 'Clearing & Settlement Issues' (3 keyphrases)
    Sample keyphrases: ['SEPA Payment Status', 'TARGET2 Settlement Issue']...
----------------------------------------
Total documents checked: 98
Total missing keyphrases found: 0
--------------------------------------------------------------------------------
✅ All keyphrases from main field are present in subclusters!

MongoDB connection closed


In [11]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def convert_subcluster_id_to_string():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['tickets']
        
        # Find all documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query))
        
        print(f"Found {len(documents_to_update)} documents with integer subcluster_id")
        
        if len(documents_to_update) == 0:
            print("No documents found with integer subcluster_id")
            return
        
        # Update each document
        updated_count = 0
        for doc in documents_to_update:
            try:
                # Convert the integer subcluster_id to string
                new_subcluster_id = str(doc['subcluster_id'])
                
                # Update the document
                result = collection.update_one(
                    {"_id": doc["_id"]},
                    {"$set": {"subcluster_id": new_subcluster_id}}
                )
                
                if result.modified_count > 0:
                    updated_count += 1
                    print(f"Updated document {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{new_subcluster_id}'")
                
            except Exception as e:
                print(f"Error updating document {doc['_id']}: {e}")
        
        print(f"\nSummary: Successfully updated {updated_count} out of {len(documents_to_update)} documents")
        
        # Verify the changes
        remaining_int_docs = collection.count_documents({"subcluster_id": {"$type": "number"}})
        string_docs = collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        print(f"Verification:")
        print(f"- Documents with integer subcluster_id: {remaining_int_docs}")
        print(f"- Documents with string subcluster_id: {string_docs}")
        
    except Exception as e:
        print(f"Error connecting to MongoDB or updating documents: {e}")
    finally:
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

def preview_changes():
    """Preview what changes will be made without actually updating"""
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['tickets']
        
        # Find documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query, {"_id": 1, "subcluster_id": 1}))
        
        print("PREVIEW MODE - No changes will be made")
        print(f"Found {len(documents_to_update)} documents that would be updated:")
        
        for i, doc in enumerate(documents_to_update[:10]):  # Show first 10
            print(f"  Document {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{str(doc['subcluster_id'])}'")
        
        if len(documents_to_update) > 10:
            print(f"  ... and {len(documents_to_update) - 10} more documents")
        
    except Exception as e:
        print(f"Error during preview: {e}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("MongoDB subcluster_id Converter")
    print("=" * 40)
    
    # First, preview the changes
    print("\n1. PREVIEW CHANGES:")
    preview_changes()
    
    # Ask for confirmation
    print("\n2. CONFIRMATION:")
    response = input("Do you want to proceed with the conversion? (yes/no): ").lower().strip()
    
    if response == 'yes':
        print("\n3. EXECUTING CONVERSION:")
        convert_subcluster_id_to_string()
    else:
        print("Conversion cancelled.")

MongoDB subcluster_id Converter

1. PREVIEW CHANGES:
PREVIEW MODE - No changes will be made
Found 0 documents that would be updated:

2. CONFIRMATION:
Conversion cancelled.


In [13]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def copy_processed_at_field():
    """
    Copy processed_at field from 'sample ticket' collection to 'tickets' collection
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        
        # Get collections
        sample_ticket_collection = db['sample ticket']
        tickets_collection = db['tickets']
        
        logger.info("Connected to MongoDB successfully")
        
        # Get all documents from sample ticket collection with processed_at field
        sample_tickets = list(sample_ticket_collection.find(
            {"processed_at": {"$exists": True}},
            {"_id": 1, "processed_at": 1}
        ))
        
        logger.info(f"Found {len(sample_tickets)} documents with processed_at field in 'sample ticket' collection")
        
        if not sample_tickets:
            logger.warning("No documents found with processed_at field in 'sample ticket' collection")
            return
        
        # Create a mapping of _id to processed_at value
        processed_at_mapping = {doc['_id']: doc['processed_at'] for doc in sample_tickets}
        
        # Update tickets collection
        updated_count = 0
        failed_count = 0
        
        for doc_id, processed_at_value in processed_at_mapping.items():
            try:
                # Update the document in tickets collection
                result = tickets_collection.update_one(
                    {"_id": doc_id},
                    {"$set": {"processed_at": processed_at_value}},
                    upsert=False  # Don't create new documents if they don't exist
                )
                
                if result.matched_count > 0:
                    updated_count += 1
                    if updated_count % 100 == 0:  # Log progress every 100 updates
                        logger.info(f"Updated {updated_count} documents so far...")
                else:
                    logger.warning(f"Document with _id {doc_id} not found in tickets collection")
                    failed_count += 1
                    
            except Exception as e:
                logger.error(f"Failed to update document {doc_id}: {str(e)}")
                failed_count += 1
        
        logger.info(f"Operation completed:")
        logger.info(f"- Successfully updated: {updated_count} documents")
        logger.info(f"- Failed/Not found: {failed_count} documents")
        
        # Verify the operation
        verify_count = tickets_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'tickets' collection now have 'processed_at' field")
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # Close the connection
        try:
            client.close()
            logger.info("MongoDB connection closed")
        except:
            pass

def copy_all_processed_at_regardless_of_id():
    """
    Alternative approach: Copy processed_at values based on document order/position
    Use this if documents don't have matching _ids between collections
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        
        # Get collections
        sample_ticket_collection = db['sample ticket']
        tickets_collection = db['tickets']
        
        logger.info("Connected to MongoDB successfully")
        
        # Get all documents from both collections
        sample_tickets = list(sample_ticket_collection.find().sort("_id", 1))
        tickets = list(tickets_collection.find().sort("_id", 1))
        
        logger.info(f"Sample ticket collection has {len(sample_tickets)} documents")
        logger.info(f"Tickets collection has {len(tickets)} documents")
        
        # Ensure both collections have the same number of documents
        min_count = min(len(sample_tickets), len(tickets))
        
        if len(sample_tickets) != len(tickets):
            logger.warning(f"Collections have different sizes. Will process {min_count} documents")
        
        updated_count = 0
        
        # Update tickets with processed_at values from sample tickets
        for i in range(min_count):
            sample_doc = sample_tickets[i]
            ticket_doc = tickets[i]
            
            # Check if sample document has processed_at field
            if 'processed_at' in sample_doc:
                try:
                    # Update the corresponding ticket document
                    result = tickets_collection.update_one(
                        {"_id": ticket_doc['_id']},
                        {"$set": {"processed_at": sample_doc['processed_at']}}
                    )
                    
                    if result.modified_count > 0:
                        updated_count += 1
                        if updated_count % 100 == 0:
                            logger.info(f"Updated {updated_count} documents so far...")
                            
                except Exception as e:
                    logger.error(f"Failed to update document at index {i}: {str(e)}")
        
        logger.info(f"Successfully updated {updated_count} documents with processed_at field")
        
        # Verify the operation
        verify_count = tickets_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'tickets' collection now have 'processed_at' field")
        
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # Close the connection
        try:
            client.close()
            logger.info("MongoDB connection closed")
        except:
            pass

if __name__ == "__main__":
    # Check if environment variables are set
    if not mongo_connection_string:
        logger.error("MONGO_CONNECTION_STRING environment variable is not set")
        exit(1)
    
    if not mongo_database_name:
        logger.error("MONGO_DATABASE_NAME environment variable is not set")
        exit(1)
    
    print("Choose the method to copy processed_at field:")
    print("1. Copy based on matching document _id (recommended)")
    print("2. Copy based on document order/position")
    
    choice = input("Enter your choice (1 or 2): ").strip()
    
    if choice == "1":
        logger.info("Starting copy operation based on matching _id...")
        copy_processed_at_field()
    elif choice == "2":
        logger.info("Starting copy operation based on document order...")
        copy_all_processed_at_regardless_of_id()
    else:
        logger.error("Invalid choice. Please run the script again and choose 1 or 2.")

Choose the method to copy processed_at field:
1. Copy based on matching document _id (recommended)
2. Copy based on document order/position


2025-09-03 16:22:12,801 - INFO - Starting copy operation based on document order...
2025-09-03 16:22:12,806 - INFO - Connected to MongoDB successfully
2025-09-03 16:22:41,174 - INFO - Sample ticket collection has 2000 documents
2025-09-03 16:22:41,176 - INFO - Tickets collection has 2000 documents
2025-09-03 16:23:05,839 - INFO - Updated 100 documents so far...
2025-09-03 16:23:30,478 - INFO - Updated 200 documents so far...
2025-09-03 16:23:55,209 - INFO - Updated 300 documents so far...
2025-09-03 16:24:19,694 - INFO - Updated 400 documents so far...
2025-09-03 16:24:44,189 - INFO - Updated 500 documents so far...
2025-09-03 16:25:08,719 - INFO - Updated 600 documents so far...
2025-09-03 16:25:33,542 - INFO - Updated 700 documents so far...
2025-09-03 16:25:58,474 - INFO - Updated 800 documents so far...
2025-09-03 16:26:23,470 - INFO - Updated 900 documents so far...
2025-09-03 16:26:47,995 - INFO - Updated 1000 documents so far...
2025-09-03 16:27:12,549 - INFO - Updated 1100 docu

2025-09-03 16:35:48,716 - INFO - Connected to MongoDB successfully
2025-09-03 16:35:51,292 - INFO - Found 0 documents with 'voice Support' in domains array
2025-09-03 16:35:51,294 - INFO - No documents found with 'voice Support' domain. Checking for other variations...
2025-09-03 16:35:52,686 - INFO - Updated 0 documents
2025-09-03 16:35:54,119 - INFO - After update: 16 documents have 'Ticket Support' in domains array
2025-09-03 16:35:54,121 - INFO - Sample of updated documents:
2025-09-03 16:35:54,939 - INFO - Document ID: 68aca16519b5f4bea43cfd77, Domains: ['Ticket Support']
2025-09-03 16:35:54,941 - INFO - Document ID: 68aca16519b5f4bea43cfd78, Domains: ['Ticket Support']
2025-09-03 16:35:54,942 - INFO - Document ID: 68aca16519b5f4bea43cfd79, Domains: ['Ticket Support']
2025-09-03 16:35:54,945 - INFO - Operation completed. Total documents updated: 0
2025-09-03 16:35:55,188 - INFO - MongoDB connection closed


In [15]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def rename_domain_value():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']
        
        logger.info("Connected to MongoDB successfully")
        
        # First, let's check how many documents have "voice Support" in domains array
        count_before = collection.count_documents({"domains": "voice Support"})
        logger.info(f"Found {count_before} documents with 'voice Support' in domains array")
        
        if count_before == 0:
            logger.info("No documents found with 'voice Support' domain. Checking for other variations...")
            # Check for case variations or similar patterns
            variations = ["Voice Support", "voice support", "VOICE SUPPORT", "Voice support"]
            for variation in variations:
                count = collection.count_documents({"domains": variation})
                if count > 0:
                    logger.info(f"Found {count} documents with '{variation}' domain")
        
        # Update documents where domains array contains "voice Support"
        # This will replace "voice Support" with "Ticket Support" in the domains array
        result = collection.update_many(
            {"domains": "voice Support"},
            {"$set": {"domains.$[elem]": "Ticket Support"}},
            array_filters=[{"elem": "voice Support"}]
        )
        
        logger.info(f"Updated {result.modified_count} documents")
        
        # Also handle case variations if they exist
        variations_to_update = ["Voice Support", "voice support", "VOICE SUPPORT", "Voice support"]
        total_updated = result.modified_count
        
        for variation in variations_to_update:
            result_var = collection.update_many(
                {"domains": variation},
                {"$set": {"domains.$[elem]": "Ticket Support"}},
                array_filters=[{"elem": variation}]
            )
            if result_var.modified_count > 0:
                logger.info(f"Updated {result_var.modified_count} documents with '{variation}' domain")
                total_updated += result_var.modified_count
        
        # Verify the changes
        count_after = collection.count_documents({"domains": "Ticket Support"})
        logger.info(f"After update: {count_after} documents have 'Ticket Support' in domains array")
        
        # Show a sample of updated documents
        logger.info("Sample of updated documents:")
        sample_docs = collection.find({"domains": "Ticket Support"}).limit(3)
        for doc in sample_docs:
            logger.info(f"Document ID: {doc.get('_id')}, Domains: {doc.get('domains')}")
        
        logger.info(f"Operation completed. Total documents updated: {total_updated}")
        
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        client.close()
        logger.info("MongoDB connection closed")

def rollback_domain_value():
    """
    Function to rollback the changes if needed
    This will change "banking" back to "Ticket Support"
    """
    try:
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']
        
        logger.info("Starting rollback operation...")
        
        # Update documents where domains array contains "banking"
        result = collection.update_many(
            {"domains": "banking"},
            {"$set": {"domains.$[elem]": "Ticket Support"}},
            array_filters=[{"elem": "banking"}]
        )
        
        logger.info(f"Rollback completed. Updated {result.modified_count} documents")
        
    except Exception as e:
        logger.error(f"Rollback error: {str(e)}")
    finally:
        client.close()
        logger.info("MongoDB connection closed")

if __name__ == "__main__":
    # Run the domain rename operation
    rename_domain_value()
    
    # Uncomment the line below if you need to rollback the changes
    # rollback_domain_value()

2025-09-03 16:38:58,316 - INFO - Connected to MongoDB successfully
2025-09-03 16:39:00,179 - INFO - Found 0 documents with 'voice Support' in domains array
2025-09-03 16:39:00,182 - INFO - No documents found with 'voice Support' domain. Checking for other variations...
2025-09-03 16:39:01,653 - INFO - Updated 0 documents
2025-09-03 16:39:02,982 - INFO - After update: 16 documents have 'Ticket Support' in domains array
2025-09-03 16:39:02,984 - INFO - Sample of updated documents:
2025-09-03 16:39:03,503 - INFO - Document ID: 68aca16519b5f4bea43cfd77, Domains: ['Ticket Support']
2025-09-03 16:39:03,506 - INFO - Document ID: 68aca16519b5f4bea43cfd78, Domains: ['Ticket Support']
2025-09-03 16:39:03,507 - INFO - Document ID: 68aca16519b5f4bea43cfd79, Domains: ['Ticket Support']
2025-09-03 16:39:03,509 - INFO - Operation completed. Total documents updated: 0
2025-09-03 16:39:03,764 - INFO - MongoDB connection closed
