In [1]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']
voice_collection = db['voice']

def normalize_text(text):
    """Normalize text for better matching"""
    return re.sub(r'[^\w\s]', '', text.lower().strip())

def match_voice_to_clusters():
    """
    Match voice records to clusters based on dominant_topic matching keyphrases
    and update cluster documents with voice_ids array
    """
    
    print("Fetching clusters with data: 'voice'...")
    # Get only clusters that have data field set to "voice"
    clusters = list(clusters_collection.find({"data": "voice"}))
    print(f"Found {len(clusters)} clusters with data='voice' to process\n")
    
    # Process each cluster
    for cluster in clusters:
        cluster_id = cluster['cluster_id']
        keyphrases = cluster.get('keyphrases', [])
        
        print(f"Processing Cluster ID: {cluster_id}")
        print(f"Cluster Name: {cluster.get('cluster_name', 'N/A')}")
        print(f"Keyphrases: {keyphrases}")
        
        # Normalize keyphrases for matching
        normalized_keyphrases = [normalize_text(phrase) for phrase in keyphrases]
        print(f"Normalized keyphrases: {normalized_keyphrases}")
        
        # Find matching voice records
        matching_voice_ids = []
        
        # Get all voice records - using cursor for better memory management
        print("  Searching through voice records...")
        voice_cursor = voice_collection.find({}, {
            '_id': 1, 
            'dominant_topic': 1
        })
        
        voice_count = 0
        for voice in voice_cursor:
            voice_count += 1
            if voice_count % 1000 == 0:
                print(f"    Processed {voice_count} voice records...")
                
            voice_dominant_topic = voice.get('dominant_topic', '')
            
            if voice_dominant_topic:
                normalized_topic = normalize_text(voice_dominant_topic)
                
                # Check if any keyphrase matches the dominant topic
                for keyphrase in normalized_keyphrases:
                    if keyphrase and normalized_topic:  # Ensure both are not empty
                        if keyphrase in normalized_topic or normalized_topic in keyphrase:
                            matching_voice_ids.append(str(voice['_id']))
                            print(f"    Match found: {voice['_id']} - Topic: '{voice_dominant_topic}' matches keyphrase: '{keyphrase}'")
                            break
        
        print(f"  Finished processing {voice_count} voice records")
        
        # Remove duplicates (in case a voice record matches multiple times)
        matching_voice_ids = list(set(matching_voice_ids))
        
        # Update cluster with voice_ids (only for clusters with data: "voice")
        try:
            if matching_voice_ids:
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'voice'},
                    {'$set': {'voice_ids': matching_voice_ids}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Successfully updated cluster {cluster_id} with {len(matching_voice_ids)} voice IDs")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has same voice data")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'voice'")
            else:
                # Set empty array if no matches found
                result = clusters_collection.update_one(
                    {'cluster_id': cluster_id, 'data': 'voice'},
                    {'$set': {'voice_ids': []}}
                )
                if result.modified_count > 0:
                    print(f"  ✓ Set empty voice_ids array for cluster {cluster_id} (no matches found)")
                elif result.matched_count > 0:
                    print(f"  ⚠ Cluster {cluster_id} already has empty voice_ids array")
                else:
                    print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'voice'")
        except Exception as e:
            print(f"  ❌ Error updating cluster {cluster_id}: {str(e)}")
        
        print(f"  Total unique voice records matched: {len(matching_voice_ids)}")
        print("-" * 50)

def verify_results():
    """
    Verify the results by displaying updated clusters
    """
    print("\n" + "=" * 60)
    print("VERIFICATION RESULTS")
    print("=" * 60)
    
    try:
        clusters = list(clusters_collection.find({"data": "voice"}, {
            'cluster_id': 1, 
            'cluster_name': 1, 
            'keyphrases': 1, 
            'voice_ids': 1,
            'data': 1
        }).sort('cluster_id', 1))
        
        for cluster in clusters:
            voice_count = len(cluster.get('voice_ids', []))
            print(f"\nCluster {cluster['cluster_id']}: {cluster.get('cluster_name', 'N/A')} (data: {cluster.get('data', 'N/A')})")
            print(f"  Keyphrases: {cluster.get('keyphrases', [])}")
            print(f"  Voice IDs count: {voice_count}")
            if voice_count > 0:
                print(f"  First 3 Voice IDs: {cluster['voice_ids'][:3]}")
                if voice_count > 3:
                    print(f"  ... and {voice_count - 3} more")
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_summary_stats():
    """Get summary statistics"""
    print("\n" + "=" * 60)
    print("SUMMARY STATISTICS")
    print("=" * 60)
    
    try:
        total_clusters = clusters_collection.count_documents({"data": "voice"})
        clusters_with_voice = clusters_collection.count_documents({
            "data": "voice",
            'voice_ids': {'$exists': True, '$ne': []}
        })
        
        pipeline = [
            {'$match': {"data": "voice", 'voice_ids': {'$exists': True}}},
            {'$project': {'voice_count': {'$size': '$voice_ids'}}},
            {'$group': {'_id': None, 'total_voice_matched': {'$sum': '$voice_count'}}}
        ]
        
        result = list(clusters_collection.aggregate(pipeline))
        total_voice_matched = result[0]['total_voice_matched'] if result else 0
        
        total_voice = voice_collection.count_documents({})
        
        print(f"Total clusters with data='voice': {total_clusters}")
        print(f"Clusters with matched voice records: {clusters_with_voice}")
        print(f"Clusters without matches: {total_clusters - clusters_with_voice}")
        print(f"Total voice records in database: {total_voice}")
        print(f"Total voice-cluster matches: {total_voice_matched}")
        
        if total_voice > 0:
            match_percentage = (total_voice_matched / total_voice) * 100
            print(f"Match percentage: {match_percentage:.2f}%")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🚀 Starting voice-cluster matching process...")
        print("This will match voice records to clusters with data: 'voice'")
        print("=" * 60)
        
        # Test database connection
        test_cluster = clusters_collection.find_one({"data": "voice"})
        test_voice = voice_collection.find_one()
        
        if not test_cluster:
            print("⚠️  Warning: No clusters found with data: 'voice'")
        if not test_voice:
            print("⚠️  Warning: No voice records found in voice collection")
            
        print("✓ Database connection successful\n")
        
        # Execute the matching process
        match_voice_to_clusters()
        
        # Verify results
        verify_results()
        
        # Get summary statistics
        get_summary_stats()
        
        print("\n" + "=" * 60)
        print("✅ Process completed successfully!")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting voice-cluster matching process...
This will match voice records to clusters with data: 'voice'
✓ Database connection successful

Fetching clusters with data: 'voice'...
Found 24 clusters with data='voice' to process

Processing Cluster ID: 0
Cluster Name: N/A
Keyphrases: ['Balance Inquiry Call', 'Account Statement Request', 'Transaction History Inquiry', 'Account Reconciliation Discussion', 'Account Verification Call']
Normalized keyphrases: ['balance inquiry call', 'account statement request', 'transaction history inquiry', 'account reconciliation discussion', 'account verification call']
  Searching through voice records...
    Match found: 68a89e5db21812ca3cfb362c - Topic: 'Account Reconciliation Discussion' matches keyphrase: 'account reconciliation discussion'
    Match found: 68a89e5db21812ca3cfb362e - Topic: 'Account Statement Request' matches keyphrase: 'account statement request'
    Match found: 68a89e5db21812ca3cfb3638 -

In [2]:
# Import required libraries
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
clusters_collection = db['cluster']

def update_domains_to_voice_support():
    """
    Update voice cluster documents to change domains to ["Voice Support"]
    Only processes clusters with data: "voice"
    """
    
    print("Starting domain update process for voice clusters...")
    print("=" * 50)
    
    try:
        # Count total voice clusters before update
        total_voice_clusters = clusters_collection.count_documents({"data": "voice"})
        print(f"Total voice clusters in collection: {total_voice_clusters}")
        
        if total_voice_clusters == 0:
            print("⚠ No voice clusters found (data: 'voice')")
            return
        
        # Count voice clusters that currently have ["EU bank"] or other domains
        eu_bank_count = clusters_collection.count_documents({
            "data": "voice", 
            "domains": ["EU bank"]
        })
        print(f"Voice clusters with 'EU bank' domain: {eu_bank_count}")
        
        old_voice_support_count = clusters_collection.count_documents({
            "data": "voice", 
            "domains": ["Voice Support"]
        })
        print(f"Voice clusters with 'Voice Support' domain: {old_voice_support_count}")
        
        # Count voice clusters with other domains
        other_domains_count = clusters_collection.count_documents({
            "data": "voice",
            "domains": {"$nin": [["EU bank"], ["Voice Support"]]}
        })
        print(f"Voice clusters with other domains: {other_domains_count}")
        
        print("\n" + "=" * 50)
        print("UPDATING VOICE CLUSTER DOMAINS...")
        print("=" * 50)
        
        # Update only voice clusters to have domains: ["Voice Support"]
        update_result = clusters_collection.update_many(
            {"data": "voice"},  # Only update clusters with data: "voice"
            {"$set": {"domains": ["Voice Support"]}}
        )
        
        print(f"✓ Successfully updated {update_result.modified_count} voice clusters")
        print(f"  Matched voice clusters: {update_result.matched_count}")
        
        # Verify the update
        verify_update()
        
    except Exception as e:
        print(f"❌ Error during voice domain update: {str(e)}")

def verify_update():
    """
    Verify that all voice clusters have been updated to ["Voice Support"]
    """
    print("\n" + "=" * 50)
    print("VOICE CLUSTER VERIFICATION")
    print("=" * 50)
    
    try:
        # Count voice clusters with different domain values
        voice_support_count = clusters_collection.count_documents({
            "data": "voice",
            "domains": ["Voice Support"]
        })
        eu_bank_count = clusters_collection.count_documents({
            "data": "voice",
            "domains": ["EU bank"]
        })
        other_domains = clusters_collection.count_documents({
            "data": "voice",
            "domains": {"$nin": [["Voice Support"], ["EU bank"]]}
        })
        
        total_voice_clusters = clusters_collection.count_documents({"data": "voice"})
        
        print(f"Total voice clusters: {total_voice_clusters}")
        print(f"Voice clusters with 'Voice Support' domain: {voice_support_count}")
        print(f"Voice clusters with 'EU bank' domain: {eu_bank_count}")
        print(f"Voice clusters with other domains: {other_domains}")
        
        if voice_support_count == total_voice_clusters:
            print("\n✅ SUCCESS: All voice clusters now have 'Voice Support' domain!")
        else:
            print(f"\n⚠ WARNING: {total_voice_clusters - voice_support_count} voice clusters still have different domains")
        
        # Show sample of updated voice cluster documents
        print(f"\nSample of updated voice clusters:")
        samples = list(clusters_collection.find(
            {"data": "voice"}, 
            {
                'cluster_id': 1, 
                'domains': 1, 
                'dominant_label': 1,
                'data': 1
            }
        ).limit(5).sort('cluster_id', 1))
        
        for sample in samples:
            cluster_id = sample.get('cluster_id', 'N/A')
            domains = sample.get('domains', [])
            label = sample.get('dominant_label', 'N/A')
            data_type = sample.get('data', 'N/A')
            print(f"  Voice Cluster {cluster_id}: data={data_type}, domains={domains}, label='{label}'")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_domain_statistics():
    """
    Get detailed statistics about domains in voice clusters only
    """
    print("\n" + "=" * 50)
    print("VOICE CLUSTER DOMAIN STATISTICS")
    print("=" * 50)
    
    try:
        # Aggregate to get all unique domain combinations for voice clusters only
        pipeline = [
            {'$match': {"data": "voice"}},
            {'$group': {'_id': '$domains', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]
        
        domain_stats = list(clusters_collection.aggregate(pipeline))
        
        print("Domain distribution for voice clusters:")
        for stat in domain_stats:
            domains = stat['_id']
            count = stat['count']
            print(f"  {domains}: {count} voice clusters")
            
        total_voice_clusters = clusters_collection.count_documents({"data": "voice"})
        if total_voice_clusters > 0:
            voice_support_percentage = (clusters_collection.count_documents({
                "data": "voice",
                "domains": ["Voice Support"]
            }) / total_voice_clusters) * 100
            print(f"\nPercentage of voice clusters with 'Voice Support' domain: {voice_support_percentage:.1f}%")
        
        # Show comparison with other data types
        print(f"\nComparison with other cluster types:")
        all_data_types = list(clusters_collection.aggregate([
            {'$group': {'_id': '$data', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))
        
        for data_type in all_data_types:
            data_value = data_type['_id']
            count = data_type['count']
            print(f"  Clusters with data='{data_value}': {count}")
            
    except Exception as e:
        print(f"❌ Error getting statistics: {str(e)}")

def show_voice_cluster_summary():
    """
    Show summary of voice cluster fields after domain update
    """
    print("\n" + "=" * 50)
    print("VOICE CLUSTER SUMMARY")
    print("=" * 50)
    
    try:
        # Get voice cluster statistics
        voice_clusters_with_voice_ids = clusters_collection.count_documents({
            "data": "voice",
            "voice_ids": {"$exists": True, "$ne": []}
        })
        
        # Get average voice count per cluster
        voice_pipeline = [
            {'$match': {"data": "voice", "voice_ids": {"$exists": True}}},
            {'$project': {'voice_count': {'$size': '$voice_ids'}}},
            {'$group': {
                '_id': None,
                'total_voice_records': {'$sum': '$voice_count'},
                'avg_voice_per_cluster': {'$avg': '$voice_count'},
                'max_voice_per_cluster': {'$max': '$voice_count'}
            }}
        ]
        
        voice_result = list(clusters_collection.aggregate(voice_pipeline))
        
        total_voice_clusters = clusters_collection.count_documents({"data": "voice"})
        
        print(f"Total voice clusters: {total_voice_clusters}")
        print(f"Voice clusters with assigned voice records: {voice_clusters_with_voice_ids}")
        
        if voice_result:
            result = voice_result[0]
            print(f"Total voice records assigned to clusters: {result['total_voice_records']}")
            print(f"Average voice records per cluster: {result['avg_voice_per_cluster']:.2f}")
            print(f"Maximum voice records in a cluster: {result['max_voice_per_cluster']}")
            
    except Exception as e:
        print(f"❌ Error getting voice summary: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        print("🎙️ Starting domains update to 'Voice Support' for voice clusters...")
        print("This will only update clusters with data: 'voice'")
        print("=" * 60)
        
        # Test database connection
        test_doc = clusters_collection.find_one({"data": "voice"})
        if test_doc:
            print("✓ Database connection successful")
            current_domains = test_doc.get('domains', 'N/A')
            data_type = test_doc.get('data', 'N/A')
            print(f"Sample voice cluster - data: {data_type}, domains: {current_domains}\n")
        else:
            print("⚠ No voice clusters found (data: 'voice') in clusters collection")
            print("Please ensure you have clusters with data: 'voice' before running this script")
            exit(1)
        
        # Execute the domain update for voice clusters
        update_domains_to_voice_support()
        
        # Get detailed statistics
        get_domain_statistics()
        
        # Show voice cluster summary
        show_voice_cluster_summary()
        
        print("\n" + "=" * 60)
        print("✅ Voice cluster domain update process completed successfully!")
        print("All voice clusters now have domains: ['Voice Support']")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🎙️ Starting domains update to 'Voice Support' for voice clusters...
This will only update clusters with data: 'voice'
✓ Database connection successful
Sample voice cluster - data: voice, domains: ['banking']

Starting domain update process for voice clusters...
Total voice clusters in collection: 24
Voice clusters with 'EU bank' domain: 0
Voice clusters with 'Voice Support' domain: 0
Voice clusters with other domains: 24

UPDATING VOICE CLUSTER DOMAINS...
✓ Successfully updated 24 voice clusters
  Matched voice clusters: 24

VOICE CLUSTER VERIFICATION
Total voice clusters: 24
Voice clusters with 'Voice Support' domain: 24
Voice clusters with 'EU bank' domain: 0
Voice clusters with other domains: 0

✅ SUCCESS: All voice clusters now have 'Voice Support' domain!

Sample of updated voice clusters:
  Voice Cluster 0: data=voice, domains=['Voice Support'], label='Basic Account Information & Services'
  Voice Cluster 1: data=voice, domains=['Voice 

In [3]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional, Set
import os
from dotenv import load_dotenv
from collections import defaultdict
import threading
from concurrent.futures import ThreadPoolExecutor
import time

# Load environment variables
load_dotenv()

class OptimizedVoiceClusterMatcher:
    def __init__(self, connection_string: str, database_name: str):
        """
        Initialize the matcher with MongoDB connection
        """
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.voice_collection = self.db['voice']
        self.clusters_collection = self.db['cluster']
        
        # Cache for cluster data - this is the key optimization
        self._cluster_cache = None
        self._subcluster_cache = None
        self._load_cluster_cache()
    
    def _load_cluster_cache(self):
        """
        Load all cluster data into memory for fast lookups
        Only load clusters where data equals "voice"
        """
        print("Loading voice cluster data into cache...")
        start_time = time.time()
        
        # Dictionary mapping keyphrase -> cluster info
        self._cluster_cache = {}
        # Dictionary mapping keyphrase -> subcluster info
        self._subcluster_cache = {}
        
        # Only get clusters where data = "voice"
        clusters = list(self.clusters_collection.find({"data": "voice"}))
        print(f"Found {len(clusters)} voice clusters to cache")
        
        for cluster in clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            keyphrases = cluster.get('keyphrases', [])
            subclusters = cluster.get('subclusters', {})
            
            # Cache cluster keyphrases
            for keyphrase in keyphrases:
                self._cluster_cache[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'subclusters': subclusters
                }
            
            # Cache subcluster keyphrases
            for subcluster_id, subcluster_data in subclusters.items():
                if not isinstance(subcluster_data, dict):
                    continue
                    
                subcluster_keyphrases = subcluster_data.get('keyphrases', [])
                for keyphrase in subcluster_keyphrases:
                    self._subcluster_cache[keyphrase] = {
                        'cluster_id': cluster_id,
                        'dominant_label': dominant_label,
                        'subcluster_id': int(subcluster_id),
                        'subcluster_label': subcluster_data.get('label')
                    }
        
        cache_time = time.time() - start_time
        print(f"Cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._cluster_cache)} cluster keyphrases")
        print(f"Cached {len(self._subcluster_cache)} subcluster keyphrases")
    
    def find_matching_cluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast cluster lookup using cached data
        """
        return self._cluster_cache.get(dominant_topic)
    
    def find_matching_subcluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast subcluster lookup using cached data
        """
        return self._subcluster_cache.get(dominant_topic)
    
    def find_unmatched_voice_records(self, limit: int = None) -> List[Dict]:
        """
        Find voice records that don't match any cluster or subcluster
        """
        unmatched = []
        
        # Get all voice records with dominant_topic
        query = {"dominant_topic": {"$exists": True, "$ne": None}}
        cursor = self.voice_collection.find(query, {"dominant_topic": 1})
        
        if limit:
            cursor = cursor.limit(limit)
        
        for voice_record in cursor:
            dominant_topic = voice_record.get('dominant_topic')
            if not dominant_topic:
                continue
                
            # Check if it matches any cluster or subcluster
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            if not cluster_match and not subcluster_match:
                unmatched.append({
                    'voice_id': str(voice_record['_id']),
                    'dominant_topic': dominant_topic
                })
        
        return unmatched
    
    def get_unique_dominant_topics(self) -> Dict:
        """
        Get all unique dominant_topic values and their counts from voice records
        """
        pipeline = [
            {"$match": {"dominant_topic": {"$exists": True, "$ne": None}}},
            {"$group": {"_id": "$dominant_topic", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]
        
        result = list(self.voice_collection.aggregate(pipeline))
        
        topics_info = {
            'total_unique_topics': len(result),
            'topics': result
        }
        
        return topics_info
    
    def analyze_matching_gaps(self) -> Dict:
        """
        Analyze what dominant_topics exist but don't match any voice clusters
        """
        print("Analyzing matching gaps...")
        
        # Get all unique dominant topics
        topics_info = self.get_unique_dominant_topics()
        print(f"Found {topics_info['total_unique_topics']} unique dominant topics in voice records")
        
        # Check which ones don't match
        unmatched_topics = {}
        matched_topics = {}
        
        for topic_data in topics_info['topics']:
            topic = topic_data['_id']
            count = topic_data['count']
            
            cluster_match = self.find_matching_cluster_fast(topic)
            subcluster_match = self.find_matching_subcluster_fast(topic)
            
            if cluster_match or subcluster_match:
                matched_topics[topic] = {
                    'count': count,
                    'cluster_match': bool(cluster_match),
                    'subcluster_match': bool(subcluster_match)
                }
            else:
                unmatched_topics[topic] = count
        
        return {
            'total_topics': topics_info['total_unique_topics'],
            'matched_topics': len(matched_topics),
            'unmatched_topics': len(unmatched_topics),
            'unmatched_details': unmatched_topics,
            'matched_details': matched_topics,
            'unmatched_voice_count': sum(unmatched_topics.values()),
            'matched_voice_count': sum([data['count'] for data in matched_topics.values()])
        }
    
    def create_fallback_cluster_entry(self, unmatched_topics: List[str]) -> Dict:
        """
        Create a fallback cluster entry for unmatched topics (for voice)
        """
        fallback_cluster = {
            'cluster_id': 999,  # Use a high number to avoid conflicts
            'dominant_label': 'Unclassified Voice Topics',
            'keyphrases': unmatched_topics,
            'data': 'voice',  # Specify that this is for voice
            'subclusters': {
                '0': {
                    'label': 'Miscellaneous Voice',
                    'keyphrases': unmatched_topics
                }
            }
        }
        return fallback_cluster
    
    def add_fallback_cluster_to_cache(self, unmatched_topics: List[str]) -> None:
        """
        Add unmatched topics to cache as a fallback cluster
        """
        print(f"Adding {len(unmatched_topics)} unmatched topics to fallback cluster...")
        
        for topic in unmatched_topics:
            # Add to cluster cache
            self._cluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Voice Topics',
                'subclusters': {'0': {'label': 'Miscellaneous Voice', 'keyphrases': unmatched_topics}}
            }
            
            # Add to subcluster cache
            self._subcluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Voice Topics',
                'subcluster_id': 0,
                'subcluster_label': 'Miscellaneous Voice'
            }
        
        print(f"✓ Added fallback cluster. Cache now has:")
        print(f"  - Cluster keyphrases: {len(self._cluster_cache)}")
        print(f"  - Subcluster keyphrases: {len(self._subcluster_cache)}")
    
    def process_voice_batch(self, voice_records: List[Dict]) -> List:
        """
        Process a batch of voice records and return bulk operations in correct PyMongo format
        """
        bulk_operations = []
        
        for voice_record in voice_records:
            dominant_topic = voice_record.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Fast cluster lookup
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            update_data = {}
            
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                # Use PyMongo's UpdateOne class instead of dict
                bulk_operations.append(
                    UpdateOne(
                        {'_id': voice_record['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def process_voice_optimized(self, batch_size: int = 5000, max_workers: int = 4, dry_run: bool = False) -> Dict:
        """
        Optimized voice processing with larger batches and optional threading
        """
        start_time = time.time()
        
        # Get total count more efficiently
        total_voice_records = self.voice_collection.estimated_document_count()
        processed = 0
        matched_clusters = 0
        matched_subclusters = 0
        total_updates = 0
        
        print(f"Processing ~{total_voice_records} voice records in batches of {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index on dominant_topic if it doesn't exist (for faster queries)
        try:
            self.voice_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic created/verified")
        except Exception as e:
            print(f"Index creation note: {e}")
        
        # Process voice records in larger batches
        cursor = self.voice_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},  # Only get voice records with dominant_topic
            projection={'dominant_topic': 1}  # Only fetch the field we need
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for voice_record in cursor:
            batch.append(voice_record)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing batch {batch_count} ({len(batch)} voice records) ---")
                
                # Process batch
                bulk_operations = self.process_voice_batch(batch)
                print(f"Generated {len(bulk_operations)} update operations")
                
                # Count matches for statistics
                batch_cluster_matches = 0
                batch_subcluster_matches = 0
                for voice_record in batch:
                    dominant_topic = voice_record.get('dominant_topic')
                    if dominant_topic:
                        if self.find_matching_cluster_fast(dominant_topic):
                            matched_clusters += 1
                            batch_cluster_matches += 1
                        if self.find_matching_subcluster_fast(dominant_topic):
                            matched_subclusters += 1
                            batch_subcluster_matches += 1
                
                print(f"Batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        print("Executing bulk write...")
                        result = self.voice_collection.bulk_write(
                            bulk_operations, 
                            ordered=False  # Faster unordered operations
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} documents in batch {batch_count}")
                        
                        # Verify some updates
                        if result.modified_count > 0:
                            sample_updated = list(self.voice_collection.find(
                                {"kmeans_cluster_id": {"$exists": True}},
                                {"dominant_topic": 1, "kmeans_cluster_id": 1, "subcluster_id": 1}
                            ).limit(3))
                            print(f"Sample updated documents: {len(sample_updated)} found with cluster IDs")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in batch {batch_count}: {e}")
                        print(f"Error type: {type(e).__name__}")
                        # Show sample operation for debugging in readable format
                        if bulk_operations:
                            sample_op = bulk_operations[0]
                            print(f"Sample operation: Update {sample_op._filter} with {sample_op._doc}")
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would update {len(bulk_operations)} documents")
                    # Show sample operations in readable format
                    for i, op in enumerate(bulk_operations[:3]):
                        print(f"Sample operation {i+1}: Update {op._filter} with {op._doc}")
                else:
                    print("No operations to execute (no matches found)")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed} voice records processed ({rate:.1f} records/sec)")
        
        # Process remaining voice records in the last batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final batch {batch_count} ({len(batch)} voice records) ---")
            
            bulk_operations = self.process_voice_batch(batch)
            print(f"Generated {len(bulk_operations)} update operations")
            
            # Count matches for final batch
            batch_cluster_matches = 0
            batch_subcluster_matches = 0
            for voice_record in batch:
                dominant_topic = voice_record.get('dominant_topic')
                if dominant_topic:
                    if self.find_matching_cluster_fast(dominant_topic):
                        matched_clusters += 1
                        batch_cluster_matches += 1
                    if self.find_matching_subcluster_fast(dominant_topic):
                        matched_subclusters += 1
                        batch_subcluster_matches += 1
            
            print(f"Final batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
            
            if bulk_operations and not dry_run:
                try:
                    print("Executing final bulk write...")
                    result = self.voice_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} documents in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final batch: {e}")
                    print(f"Error type: {type(e).__name__}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would update {len(bulk_operations)} documents")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            updated_count = self.voice_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
            print(f"Total voice documents with kmeans_cluster_id: {updated_count}")
            
            subcluster_count = self.voice_collection.count_documents({"subcluster_id": {"$exists": True}})
            print(f"Total voice documents with subcluster_id: {subcluster_count}")
        
        stats = {
            'total_voice_records': processed,
            'matched_clusters': matched_clusters,
            'matched_subclusters': matched_subclusters,
            'total_updates': total_updates,
            'processing_time': total_time,
            'records_per_second': processed / total_time if total_time > 0 else 0,
            'cluster_match_rate': (matched_clusters / processed * 100) if processed > 0 else 0,
            'subcluster_match_rate': (matched_subclusters / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def process_with_fallback(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """
        Process voice records with automatic fallback cluster for unmatched topics
        """
        print("=== PROCESSING VOICE RECORDS WITH FALLBACK CLUSTER ===")
        
        # First, analyze gaps
        gaps = self.analyze_matching_gaps()
        
        if gaps['unmatched_voice_count'] > 0:
            print(f"Found {gaps['unmatched_voice_count']} unmatched voice records")
            print(f"Unmatched topics: {list(gaps['unmatched_details'].keys())}")
            
            # Add fallback cluster to cache
            unmatched_topic_list = list(gaps['unmatched_details'].keys())
            self.add_fallback_cluster_to_cache(unmatched_topic_list)
            
            # Optionally save fallback cluster to database
            save_choice = input("Save fallback cluster to database permanently? (y/n): ")
            if save_choice.lower() == 'y':
                fallback_cluster = self.create_fallback_cluster_entry(unmatched_topic_list)
                try:
                    self.clusters_collection.insert_one(fallback_cluster)
                    print("✓ Fallback cluster saved to database")
                except Exception as e:
                    print(f"⚠️  Could not save fallback cluster: {e}")
        
        # Now process all voice records (should be 100% match rate)
        return self.process_voice_optimized(batch_size=batch_size, dry_run=dry_run)
    
    def get_performance_stats(self) -> Dict:
        """
        Get database performance statistics
        """
        stats = {}
        
        # Collection sizes
        stats['total_voice_records'] = self.voice_collection.estimated_document_count()
        stats['voice_with_topic'] = self.voice_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        stats['total_voice_clusters'] = self.clusters_collection.count_documents({"data": "voice"})
        stats['total_all_clusters'] = self.clusters_collection.estimated_document_count()
        
        # Cache statistics
        stats['cached_cluster_keyphrases'] = len(self._cluster_cache) if self._cluster_cache else 0
        stats['cached_subcluster_keyphrases'] = len(self._subcluster_cache) if self._subcluster_cache else 0
        
        return stats
    
    def debug_matching_process(self, limit: int = 5) -> None:
        """
        Debug the matching process to see what's happening with voice records
        """
        print("\n=== DEBUGGING VOICE MATCHING PROCESS ===")
        
        # Check if we have any cluster data
        if not self._cluster_cache and not self._subcluster_cache:
            print("❌ NO VOICE CLUSTER CACHE DATA! This is why updates are failing.")
            return
        
        print(f"✓ Voice cluster cache has {len(self._cluster_cache)} entries")
        print(f"✓ Voice subcluster cache has {len(self._subcluster_cache)} entries")
        
        # Sample some cluster keyphrases
        print(f"\nSample voice cluster keyphrases:")
        for i, keyphrase in enumerate(list(self._cluster_cache.keys())[:10]):
            cluster_info = self._cluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {cluster_info['cluster_id']}")
        
        # Sample some subcluster keyphrases  
        print(f"\nSample voice subcluster keyphrases:")
        for i, keyphrase in enumerate(list(self._subcluster_cache.keys())[:10]):
            subcluster_info = self._subcluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {subcluster_info['cluster_id']}, Subcluster {subcluster_info['subcluster_id']}")
        
        # Check some actual voice records
        print(f"\n=== TESTING {limit} VOICE RECORDS ===")
        voice_records = list(self.voice_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not voice_records:
            print("❌ NO VOICE RECORDS with dominant_topic found!")
            return
        
        for i, voice_record in enumerate(voice_records, 1):
            dominant_topic = voice_record.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Voice Record {i} ---")
            print(f"Voice ID: {voice_record['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test cluster matching
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            if cluster_match:
                print(f"✓ CLUSTER MATCH: ID={cluster_match['cluster_id']}, Label='{cluster_match['dominant_label']}'")
            else:
                print(f"❌ No cluster match for '{dominant_topic}'")
            
            # Test subcluster matching  
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            if subcluster_match:
                print(f"✓ SUBCLUSTER MATCH: Cluster={subcluster_match['cluster_id']}, Subcluster={subcluster_match['subcluster_id']}, Label='{subcluster_match['subcluster_label']}'")
            else:
                print(f"❌ No subcluster match for '{dominant_topic}'")
            
            # Show what the update operation would look like
            update_data = {}
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                print(f"UPDATE OPERATION: {update_data}")
            else:
                print("NO UPDATE OPERATION (no matches)")
        
        print(f"\n=== DATABASE STATE CHECK ===")
        # Check existing updates
        existing_with_cluster = self.voice_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
        existing_with_subcluster = self.voice_collection.count_documents({"subcluster_id": {"$exists": True}})
        voice_with_topic = self.voice_collection.count_documents({"dominant_topic": {"$exists": True, "$ne": None}})
        
        print(f"Voice records with dominant_topic: {voice_with_topic}")
        print(f"Voice records already with kmeans_cluster_id: {existing_with_cluster}")
        print(f"Voice records already with subcluster_id: {existing_with_subcluster}")
        
        if voice_with_topic == 0:
            print("❌ PROBLEM: No voice records have 'dominant_topic' field!")
        elif existing_with_cluster == voice_with_topic:
            print("✓ All voice records already processed!")
        else:
            print(f"📝 {voice_with_topic - existing_with_cluster} voice records need processing")
    
    def get_preview(self, limit: int = 10) -> List[Dict]:
        """
        Get a preview of voice-cluster matches for testing
        """
        voice_records = list(self.voice_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        preview = []
        
        for voice_record in voice_records:
            dominant_topic = voice_record.get('dominant_topic')
            if not dominant_topic:
                continue
            
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            preview.append({
                'voice_id': str(voice_record['_id']),
                'dominant_topic': dominant_topic,
                'cluster_match': cluster_match,
                'subcluster_match': subcluster_match
            })
        
        return preview
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

# Usage example
def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    
    # Initialize optimized voice matcher
    matcher = OptimizedVoiceClusterMatcher(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show performance stats
        print("\n--- Database Statistics ---")
        perf_stats = matcher.get_performance_stats()
        for key, value in perf_stats.items():
            print(f"{key}: {value:,}")
        
        # Analyze matching gaps
        print("\n--- Gap Analysis ---")
        gap_choice = input("Analyze which voice records aren't matching? (y/n): ")
        if gap_choice.lower() == 'y':
            gaps = matcher.analyze_matching_gaps()
            print(f"\n=== MATCHING GAP ANALYSIS ===")
            print(f"Total unique topics: {gaps['total_topics']}")
            print(f"Matched topics: {gaps['matched_topics']}")
            print(f"Unmatched topics: {gaps['unmatched_topics']}")
            print(f"Matched voice records: {gaps['matched_voice_count']}")
            print(f"Unmatched voice records: {gaps['unmatched_voice_count']}")
            
            if gaps['unmatched_details']:
                print(f"\n--- UNMATCHED DOMINANT TOPICS ---")
                for topic, count in list(gaps['unmatched_details'].items())[:10]:
                    print(f"'{topic}' - {count} voice records")
                
                if len(gaps['unmatched_details']) > 10:
                    print(f"... and {len(gaps['unmatched_details']) - 10} more")
                
                print(f"\n💡 To get 100% matches, you need to:")
                print(f"1. Add these topics to your voice cluster keyphrases, OR")
                print(f"2. Create a 'catch-all' cluster for unmatched voice topics")
        
        # Debug the matching process first
        print("\n--- Debugging Mode ---")
        debug_choice = input("Run debug mode to see why DB isn't updating? (y/n): ")
        if debug_choice.lower() == 'y':
            matcher.debug_matching_process()
        
        # Get a preview first
        print("\n--- Preview of Voice Matches ---")
        preview = matcher.get_preview(limit=5)
        
        for i, item in enumerate(preview, 1):
            print(f"\n--- Voice Record {i} ---")
            print(f"Dominant Topic: {item['dominant_topic']}")
            
            if item['subcluster_match']:
                print(f"✓ Subcluster Match: Cluster ID={item['subcluster_match']['cluster_id']}, "
                      f"Subcluster ID={item['subcluster_match']['subcluster_id']}, "
                      f"Label={item['subcluster_match']['subcluster_label']}")
            elif item['cluster_match']:
                print(f"✓ Cluster Match: ID={item['cluster_match']['cluster_id']}, "
                      f"Label={item['cluster_match']['dominant_label']}")
            else:
                print("✗ No match found")
        
        # Process all voice records
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated without changing DB)")
        print("2. Full processing (actually update the database)")
        print("3. Process with fallback cluster (100% match guarantee)")
        choice = input("Choose option (1, 2, or 3): ")
        
        if choice in ['1', '2', '3']:
            if choice == '3':
                # Use fallback processing
                dry_run = False
                fallback_choice = input("Dry run with fallback first? (y/n): ")
                if fallback_choice.lower() == 'y':
                    dry_run = True
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE'} processing with fallback...")
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                stats = matcher.process_with_fallback(batch_size=batch_size, dry_run=dry_run)
            else:
                # Regular processing
                dry_run = (choice == '1')
                
                print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'}...")
                
                # Use larger batch size for better performance
                batch_size = int(input("Enter batch size (recommended: 5000-10000): ") or "5000")
                
                stats = matcher.process_voice_optimized(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total voice records processed: {stats['total_voice_records']:,}")
            print(f"Total updates made: {stats['total_updates']:,}")
            print(f"Cluster matches: {stats['matched_clusters']:,} ({stats['cluster_match_rate']:.1f}%)")
            print(f"Subcluster matches: {stats['matched_subclusters']:,} ({stats['subcluster_match_rate']:.1f}%)")
            print(f"Processing time: {stats['processing_time']:.2f} seconds")
            print(f"Processing rate: {stats['records_per_second']:.1f} records/second")
        
    finally:
        matcher.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Loading voice cluster data into cache...
Found 24 voice clusters to cache
Cache loaded in 1.95 seconds
Cached 155 cluster keyphrases
Cached 155 subcluster keyphrases

--- Database Statistics ---
total_voice_records: 2,040
voice_with_topic: 2,040
total_voice_clusters: 24
total_all_clusters: 98
cached_cluster_keyphrases: 155
cached_subcluster_keyphrases: 155

--- Gap Analysis ---
Analyzing matching gaps...
Found 155 unique dominant topics in voice records

=== MATCHING GAP ANALYSIS ===
Total unique topics: 155
Matched topics: 155
Unmatched topics: 0
Matched voice records: 2040
Unmatched voice records: 0

--- Debugging Mode ---

--- Preview of Voice Matches ---

--- Voice Record 1 ---
✓ Subcluster Match: Cluster ID=11, Subcluster ID=1, Label=Fraud Detection & Prevention

--- Voice Record 2 ---
Dominant Topic: Standing Order Setup
✓ Subcluster Match: Cluster ID=5, Subcluster ID=1, Label=Automated Payment Setup

--- Voice Record 3 ---
Dominant Topic: API Int

In [4]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
voice_collection = db['voice']

def rename_voice_fields():
    """
    Rename fields in voice documents:
    - is_urgent -> urgency
    - dominant_label -> dominant_cluster_label
    """
    
    print("🎤 Starting voice field rename process...")
    print("Fields to rename:")
    print("  - is_urgent → urgency")
    print("  - dominant_label → dominant_cluster_label")
    print("-" * 50)
    
    try:
        # Check current state before rename
        total_voice_records = voice_collection.count_documents({})
        print(f"Total voice records in collection: {total_voice_records}")
        
        if total_voice_records == 0:
            print("⚠ No voice records found in collection")
            return
        
        # Count existing fields before rename
        is_urgent_count = voice_collection.count_documents({"is_urgent": {"$exists": True}})
        dominant_label_count = voice_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"Voice records with 'is_urgent' field: {is_urgent_count}")
        print(f"Voice records with 'dominant_label' field: {dominant_label_count}")
        
        # Count already renamed fields
        urgency_count = voice_collection.count_documents({"urgency": {"$exists": True}})
        dominant_cluster_label_count = voice_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        print(f"Voice records already with 'urgency' field: {urgency_count}")
        print(f"Voice records already with 'dominant_cluster_label' field: {dominant_cluster_label_count}")
        
        print("\n" + "=" * 50)
        print("RENAMING VOICE FIELDS...")
        print("=" * 50)
        
        # Rename both fields in a single operation for all voice records
        result = voice_collection.update_many(
            {},  # Empty filter to match all voice documents
            {
                "$rename": {
                    "is_urgent": "urgency",
                    "dominant_label": "dominant_cluster_label"
                }
            }
        )
        
        # Print results
        print(f"✓ Field rename operation completed:")
        print(f"  Matched voice records: {result.matched_count}")
        print(f"  Modified voice records: {result.modified_count}")
        print(f"  Operation acknowledged: {result.acknowledged}")
        
        # Verify the changes
        verify_rename_changes()
        
    except Exception as e:
        print(f"❌ Error during voice field rename: {str(e)}")

def verify_rename_changes():
    """
    Verify that the field rename was successful
    """
    print("\n" + "=" * 50)
    print("VERIFICATION OF VOICE FIELD RENAME")
    print("=" * 50)
    
    try:
        # Verify the changes by checking a sample voice document
        sample_voice = voice_collection.find_one()
        if sample_voice:
            print("Sample voice document after rename:")
            print(f"  Voice ID: {sample_voice.get('_id')}")
            print(f"  Has 'urgency' field: {'urgency' in sample_voice}")
            print(f"  Has 'dominant_cluster_label' field: {'dominant_cluster_label' in sample_voice}")
            print(f"  Has old 'is_urgent' field: {'is_urgent' in sample_voice}")
            print(f"  Has old 'dominant_label' field: {'dominant_label' in sample_voice}")
            
            # Show sample values if they exist
            if 'urgency' in sample_voice:
                print(f"  Sample 'urgency' value: {sample_voice['urgency']}")
            if 'dominant_cluster_label' in sample_voice:
                print(f"  Sample 'dominant_cluster_label' value: {sample_voice['dominant_cluster_label']}")
        else:
            print("⚠ No voice documents found in the collection")
        
        # Count voice records with the new field names
        urgency_count = voice_collection.count_documents({"urgency": {"$exists": True}})
        cluster_label_count = voice_collection.count_documents({"dominant_cluster_label": {"$exists": True}})
        
        # Count voice records with old field names (should be 0 after rename)
        old_is_urgent_count = voice_collection.count_documents({"is_urgent": {"$exists": True}})
        old_dominant_label_count = voice_collection.count_documents({"dominant_label": {"$exists": True}})
        
        print(f"\nField counts after rename:")
        print(f"  Voice records with 'urgency' field: {urgency_count}")
        print(f"  Voice records with 'dominant_cluster_label' field: {cluster_label_count}")
        print(f"  Voice records with old 'is_urgent' field: {old_is_urgent_count}")
        print(f"  Voice records with old 'dominant_label' field: {old_dominant_label_count}")
        
        # Success check
        if old_is_urgent_count == 0 and old_dominant_label_count == 0:
            print("\n✅ SUCCESS: All voice fields have been renamed successfully!")
        else:
            print(f"\n⚠ WARNING: Some voice records still have old field names")
            
    except Exception as e:
        print(f"❌ Error during verification: {str(e)}")

def get_voice_field_statistics():
    """
    Get detailed statistics about voice fields after rename
    """
    print("\n" + "=" * 50)
    print("VOICE FIELD STATISTICS")
    print("=" * 50)
    
    try:
        total_voice_records = voice_collection.count_documents({})
        
        # Get statistics for new field names
        urgency_stats = list(voice_collection.aggregate([
            {"$match": {"urgency": {"$exists": True}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        cluster_label_stats = list(voice_collection.aggregate([
            {"$match": {"dominant_cluster_label": {"$exists": True}}},
            {"$group": {"_id": "$dominant_cluster_label", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10}  # Show top 10 cluster labels
        ]))
        
        print(f"Total voice records: {total_voice_records}")
        
        if urgency_stats:
            print(f"\nUrgency field distribution:")
            for stat in urgency_stats:
                urgency_value = stat['_id']
                count = stat['count']
                percentage = (count / total_voice_records) * 100 if total_voice_records > 0 else 0
                print(f"  '{urgency_value}': {count} voice records ({percentage:.1f}%)")
        else:
            print(f"\nNo voice records found with 'urgency' field")
        
        if cluster_label_stats:
            print(f"\nTop 10 dominant cluster labels:")
            for i, stat in enumerate(cluster_label_stats, 1):
                label = stat['_id'] if stat['_id'] is not None else 'null'
                count = stat['count']
                percentage = (count / total_voice_records) * 100 if total_voice_records > 0 else 0
                print(f"  {i}. '{label}': {count} voice records ({percentage:.1f}%)")
        else:
            print(f"\nNo voice records found with 'dominant_cluster_label' field")
            
    except Exception as e:
        print(f"❌ Error getting voice statistics: {str(e)}")

def show_sample_voice_records():
    """
    Show sample voice documents with renamed fields
    """
    print("\n" + "=" * 50)
    print("SAMPLE VOICE DOCUMENTS")
    print("=" * 50)
    
    try:
        # Get sample voice records with both new fields
        sample_voice_records = list(voice_collection.find(
            {
                "urgency": {"$exists": True},
                "dominant_cluster_label": {"$exists": True}
            },
            {
                "_id": 1,
                "urgency": 1,
                "dominant_cluster_label": 1,
                "transcript": 1,  # Include transcript if it exists
                "duration": 1,    # Include duration if it exists
                "caller_id": 1    # Include caller_id if it exists
            }
        ).limit(3))
        
        if sample_voice_records:
            print("Sample voice records with renamed fields:")
            for i, voice_record in enumerate(sample_voice_records, 1):
                print(f"\nVoice Record {i}:")
                print(f"  ID: {voice_record.get('_id')}")
                print(f"  Urgency: {voice_record.get('urgency', 'N/A')}")
                print(f"  Dominant Cluster Label: {voice_record.get('dominant_cluster_label', 'N/A')}")
                if 'transcript' in voice_record:
                    transcript = voice_record.get('transcript', 'N/A')
                    # Truncate transcript if too long
                    if len(str(transcript)) > 100:
                        transcript = str(transcript)[:100] + "..."
                    print(f"  Transcript: {transcript}")
                if 'duration' in voice_record:
                    print(f"  Duration: {voice_record.get('duration', 'N/A')}")
                if 'caller_id' in voice_record:
                    print(f"  Caller ID: {voice_record.get('caller_id', 'N/A')}")
        else:
            print("No voice records found with both renamed fields")
            
    except Exception as e:
        print(f"❌ Error showing sample voice records: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_voice_record = voice_collection.find_one()
        if test_voice_record:
            print("✓ Database connection successful")
            print(f"Sample voice record fields: {list(test_voice_record.keys())}\n")
        else:
            print("⚠ No voice records found in voice collection")
            print("Please ensure you have voice documents before running this script")
            exit(1)
        
        # Execute the field rename for voice records
        rename_voice_fields()
        
        # Get detailed statistics
        get_voice_field_statistics()
        
        # Show sample voice records
        show_sample_voice_records()
        
        print("\n" + "=" * 60)
        print("✅ Voice field rename process completed successfully!")
        print("Fields renamed:")
        print("  - is_urgent → urgency")
        print("  - dominant_label → dominant_cluster_label")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
✓ Database connection successful
Sample voice record fields: ['_id', 'call_id', 'timestamp', 'customer_name', 'customer_id', 'email', 'dominant_topic', 'subtopics', 'call_purpose', 'conversation', 'priority', 'resolution_status', 'sentiment', 'urgency', 'embeddings', 'dominant_label', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label']

🎤 Starting voice field rename process...
Fields to rename:
  - is_urgent → urgency
  - dominant_label → dominant_cluster_label
--------------------------------------------------
Total voice records in collection: 2040
Voice records with 'is_urgent' field: 0
Voice records with 'dominant_label' field: 2040
Voice records already with 'urgency' field: 2040
Voice records already with 'dominant_cluster_label' field: 0

RENAMING VOICE FIELDS...
✓ Field rename operation completed:
  Matched voice records: 2040
  Modified voice records: 2040
  Operation acknowledged: True

VERIFICATION OF VOICE FIELD RENAME
Sampl

In [5]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

class VoiceClusterKeyphraseUpdater:
    def __init__(self, connection_string: str, database_name: str):
        """Initialize the updater with MongoDB connection"""
        self.client = MongoClient(connection_string)
        self.db = self.client[database_name]
        self.voice_collection = self.db['voice']
        self.clusters_collection = self.db['cluster']
        
        # Cache for keyphrase -> cluster mapping (ONLY voice cluster level)
        self._keyphrase_to_cluster = {}
        self._load_keyphrase_cache()
    
    def _load_keyphrase_cache(self):
        """Load only voice cluster-level keyphrases into memory for fast lookups"""
        print("Loading voice cluster keyphrase cache...")
        start_time = time.time()
        
        # Only load clusters with data: "voice"
        voice_clusters = list(self.clusters_collection.find({"data": "voice"}))
        print(f"Found {len(voice_clusters)} voice clusters to process")
        
        for cluster in voice_clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            cluster_keyphrases = cluster.get('keyphrases', [])
            
            # Cache ONLY voice cluster-level keyphrases
            for keyphrase in cluster_keyphrases:
                self._keyphrase_to_cluster[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'matched_keyphrase': keyphrase
                }
        
        cache_time = time.time() - start_time
        print(f"Voice cluster cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._keyphrase_to_cluster)} voice cluster keyphrases")
    
    def find_matching_keyphrase(self, dominant_topic: str) -> Optional[Dict]:
        """Find the matching keyphrase for a dominant topic (voice cluster level only)"""
        return self._keyphrase_to_cluster.get(dominant_topic)
    
    def process_voice_batch(self, voice_records: List[Dict]) -> List:
        """Process a batch of voice records and return bulk operations"""
        bulk_operations = []
        
        for voice_record in voice_records:
            dominant_topic = voice_record.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Find matching keyphrase from voice clusters
            match_info = self.find_matching_keyphrase(dominant_topic)
            
            if match_info:
                update_data = {
                    'kmeans_cluster_keyphrase': match_info['matched_keyphrase']
                }
                
                bulk_operations.append(
                    UpdateOne(
                        {'_id': voice_record['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def add_keyphrase_field(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """Add kmeans_cluster_keyphrase field to all matching voice records"""
        start_time = time.time()
        
        # Get total count
        total_voice = self.voice_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        processed = 0
        matched = 0
        total_updates = 0
        
        print(f"Processing {total_voice} voice records with dominant_topic")
        print(f"Batch size: {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index for faster queries
        try:
            self.voice_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic verified for voice collection")
        except Exception as e:
            print(f"Index note: {e}")
        
        # Process in batches
        cursor = self.voice_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},
            projection={'dominant_topic': 1}
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for voice_record in cursor:
            batch.append(voice_record)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing voice batch {batch_count} ({len(batch)} voice records) ---")
                
                # Process batch
                bulk_operations = self.process_voice_batch(batch)
                batch_matched = len(bulk_operations)
                matched += batch_matched
                
                print(f"Generated {batch_matched} keyphrase updates for this voice batch")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        result = self.voice_collection.bulk_write(
                            bulk_operations, 
                            ordered=False
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} voice records with keyphrase field")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in voice batch {batch_count}: {e}")
                
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would add keyphrase field to {batch_matched} voice records")
                    # Show sample operations
                    for i, op in enumerate(bulk_operations[:3]):
                        keyphrase = op._doc['$set']['kmeans_cluster_keyphrase']
                        print(f"  Sample {i+1}: Would set keyphrase='{keyphrase}'")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed}/{total_voice} ({rate:.1f} voice records/sec)")
        
        # Process remaining voice records in final batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final voice batch {batch_count} ({len(batch)} voice records) ---")
            
            bulk_operations = self.process_voice_batch(batch)
            batch_matched = len(bulk_operations)
            matched += batch_matched
            
            print(f"Generated {batch_matched} keyphrase updates for final voice batch")
            
            if bulk_operations and not dry_run:
                try:
                    result = self.voice_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} voice records in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final voice batch: {e}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would add keyphrase field to {batch_matched} voice records")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            keyphrase_count = self.voice_collection.count_documents({
                "kmeans_cluster_keyphrase": {"$exists": True}
            })
            print(f"Total voice records with kmeans_cluster_keyphrase: {keyphrase_count}")
            
            # Show some sample results
            samples = list(self.voice_collection.find(
                {"kmeans_cluster_keyphrase": {"$exists": True}},
                {"dominant_topic": 1, "kmeans_cluster_keyphrase": 1}
            ).limit(5))
            
            print(f"\nSample voice results:")
            for i, sample in enumerate(samples, 1):
                print(f"  {i}. Topic: '{sample.get('dominant_topic')}' -> "
                      f"Keyphrase: '{sample.get('kmeans_cluster_keyphrase')}'")
        
        stats = {
            'total_voice_processed': processed,
            'voice_matched': matched,
            'total_updates': total_updates,
            'processing_time': total_time,
            'voice_per_second': processed / total_time if total_time > 0 else 0,
            'match_rate': (matched / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def debug_keyphrase_matching(self, limit: int = 10) -> None:
        """Debug the keyphrase matching process for voice records"""
        print("\n=== DEBUGGING VOICE CLUSTER KEYPHRASE MATCHING ===")
        
        # Check cache
        if not self._keyphrase_to_cluster:
            print("❌ NO VOICE CLUSTER KEYPHRASE CACHE DATA!")
            return
        
        print(f"✓ Voice cluster keyphrase cache: {len(self._keyphrase_to_cluster)} entries")
        
        # Show sample keyphrases
        print(f"\nSample voice cluster keyphrases:")
        for i, (keyphrase, info) in enumerate(list(self._keyphrase_to_cluster.items())[:10]):
            print(f"  {i+1}. '{keyphrase}' -> Voice Cluster {info['cluster_id']} ({info['dominant_label']})")
        
        # Test with actual voice records
        print(f"\n=== TESTING {limit} VOICE RECORDS ===")
        voice_records = list(self.voice_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not voice_records:
            print("❌ NO VOICE RECORDS with dominant_topic found!")
            return
        
        for i, voice_record in enumerate(voice_records, 1):
            dominant_topic = voice_record.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Voice Record {i} ---")
            print(f"Voice Record ID: {voice_record['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test keyphrase matching
            match_info = self.find_matching_keyphrase(dominant_topic)
            if match_info:
                print(f"✓ VOICE CLUSTER KEYPHRASE MATCH: '{match_info['matched_keyphrase']}'")
                print(f"  Cluster ID: {match_info['cluster_id']}")
                print(f"  Cluster Label: {match_info['dominant_label']}")
            else:
                print(f"❌ No voice cluster keyphrase match for '{dominant_topic}'")
    
    def get_keyphrase_stats(self) -> Dict:
        """Get statistics about keyphrase matching for voice records"""
        # Count voice records with dominant_topic
        voice_with_topic = self.voice_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        
        # Count voice records already with keyphrase field
        voice_with_keyphrase = self.voice_collection.count_documents({
            "kmeans_cluster_keyphrase": {"$exists": True}
        })
        
        # Get unique dominant topics and check match rates
        unique_topics = self.voice_collection.distinct("dominant_topic")
        unique_topics = [topic for topic in unique_topics if topic is not None]
        
        matchable_topics = 0
        for topic in unique_topics:
            if self.find_matching_keyphrase(topic):
                matchable_topics += 1
        
        # Count voice clusters
        total_voice_clusters = self.clusters_collection.count_documents({"data": "voice"})
        
        return {
            'total_voice_with_topic': voice_with_topic,
            'voice_with_keyphrase_field': voice_with_keyphrase,
            'unique_dominant_topics': len(unique_topics),
            'matchable_topics': matchable_topics,
            'topic_match_rate': (matchable_topics / len(unique_topics) * 100) if unique_topics else 0,
            'cached_voice_cluster_keyphrases': len(self._keyphrase_to_cluster),
            'total_voice_clusters': total_voice_clusters
        }
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

def main():
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    print("Processing voice records with voice clusters (data: 'voice')")
    
    # Initialize voice keyphrase updater
    updater = VoiceClusterKeyphraseUpdater(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show current statistics
        print("\n--- Current Voice Statistics ---")
        stats = updater.get_keyphrase_stats()
        for key, value in stats.items():
            if isinstance(value, float):
                print(f"{key}: {value:.1f}")
            else:
                print(f"{key}: {value:,}")
        
        # Check if we have voice clusters
        if stats['total_voice_clusters'] == 0:
            print("\n❌ No voice clusters found (data: 'voice')!")
            print("Please ensure you have clusters with data: 'voice' before running this script.")
            return
        
        # Debug keyphrase matching
        print("\n--- Debug Mode ---")
        debug_choice = input("Run debug mode to see voice keyphrase matching? (y/n): ")
        if debug_choice.lower() == 'y':
            updater.debug_keyphrase_matching()
        
        # Choose processing mode
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated)")
        print("2. Live processing (actually add keyphrase field)")
        choice = input("Choose option (1 or 2): ")
        
        if choice in ['1', '2']:
            dry_run = (choice == '1')
            
            print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'} for voice records...")
            
            # Get batch size
            batch_size = int(input("Enter batch size (recommended: 5000): ") or "5000")
            
            # Process voice records
            results = updater.add_keyphrase_field(batch_size=batch_size, dry_run=dry_run)
            
            print("\n--- Final Results ---")
            print(f"Total voice records processed: {results['total_voice_processed']:,}")
            print(f"Voice records with matching keyphrases: {results['voice_matched']:,}")
            print(f"Total updates made: {results['total_updates']:,}")
            print(f"Match rate: {results['match_rate']:.1f}%")
            print(f"Processing time: {results['processing_time']:.2f} seconds")
            print(f"Processing rate: {results['voice_per_second']:.1f} voice records/second")
            
            if not dry_run and results['total_updates'] > 0:
                print(f"\n✅ Successfully added kmeans_cluster_keyphrase field to {results['total_updates']:,} voice records!")
            elif dry_run:
                print(f"\nDRY RUN COMPLETE: Would add keyphrase field to {results['voice_matched']:,} voice records")
        
    finally:
        updater.close_connection()

if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Processing voice records with voice clusters (data: 'voice')
Loading voice cluster keyphrase cache...
Found 24 voice clusters to process
Voice cluster cache loaded in 2.07 seconds
Cached 155 voice cluster keyphrases

--- Current Voice Statistics ---
total_voice_with_topic: 2,040
voice_with_keyphrase_field: 0
unique_dominant_topics: 155
matchable_topics: 155
topic_match_rate: 100.0
cached_voice_cluster_keyphrases: 155
total_voice_clusters: 24

--- Debug Mode ---

=== DEBUGGING VOICE CLUSTER KEYPHRASE MATCHING ===
✓ Voice cluster keyphrase cache: 155 entries

Sample voice cluster keyphrases:
  1. 'Balance Inquiry Call' -> Voice Cluster 0 (Basic Account Information & Services)
  2. 'Account Statement Request' -> Voice Cluster 0 (Basic Account Information & Services)
  3. 'Transaction History Inquiry' -> Voice Cluster 0 (Basic Account Information & Services)
  4. 'Account Reconciliation Discussion' -> Voice Cluster 0 (Basic Account Information & Services)
 

In [9]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get connection details from environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("Please set MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME in your environment variables")

# Connect to MongoDB
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]
collection = db['voice']

try:
    # Add domain field to all documents
    result = collection.update_many(
        {},  # Empty filter to match all documents
        {"$set": {"domain": "voice"}}
    )
    
    print(f"Matched documents: {result.matched_count}")
    print(f"Modified documents: {result.modified_count}")
    
    if result.matched_count == 2004:
        print("Successfully updated all 2004 documents to voice domain!")
    else:
        print(f"Expected 2004 documents, but found {result.matched_count}")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Close the connection
    client.close()

# Alternative: Add domain field only to documents that don't already have it
def add_domain_conditionally():
    try:
        result = collection.update_many(
            {"domain": {"$exists": False}},  # Only documents without 'domain' field
            {"$set": {"domain": "voice"}}
        )
        
        print(f"Documents without domain field: {result.matched_count}")
        print(f"Modified documents: {result.modified_count}")
        
    except Exception as e:
        print(f"An error occurred: {e}")

# Alternative: Update only documents that currently have domain set to "banking"
def update_banking_to_voice():
    try:
        result = collection.update_many(
            {"domain": "banking"},  # Only documents with domain set to "banking"
            {"$set": {"domain": "voice"}}
        )
        
        print(f"Documents with banking domain: {result.matched_count}")
        print(f"Modified documents: {result.modified_count}")
        
    except Exception as e:
        print(f"An error occurred: {e}")

# Uncomment the line below if you want to run the conditional update instead
# add_domain_conditionally()

# Uncomment the line below if you want to update only banking domains to voice
# update_banking_to_voice()

Matched documents: 2040
Modified documents: 0
Expected 2004 documents, but found 2040


In [1]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: voice")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
voice_collection = db['voice']

def analyze_urgency_values():
    """Analyze current urgency field values"""
    print("Analyzing current urgency values...")
    print("-" * 40)
    
    try:
        # Get all unique urgency values
        urgency_values = voice_collection.distinct("urgency")
        print(f"Unique urgency values found: {urgency_values}")
        
        # Count each urgency value
        urgency_stats = list(voice_collection.aggregate([
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        total_voices = voice_collection.count_documents({})
        print(f"\nTotal voice records: {total_voices}")
        print(f"Urgency distribution:")
        
        for stat in urgency_stats:
            value = stat['_id'] if stat['_id'] is not None else 'null/missing'
            count = stat['count']
            percentage = (count / total_voices) * 100 if total_voices > 0 else 0
            print(f"  '{value}': {count} voice records ({percentage:.1f}%)")
            
        return urgency_stats
        
    except Exception as e:
        print(f"Error analyzing urgency values: {str(e)}")
        return []

def update_urgency_to_boolean():
    """Update urgency field from string to boolean"""
    print("\nStarting urgency field update...")
    print("Conversion rules:")
    print("  'Critical' → true")
    print("  'High' → false")
    print("-" * 40)
    
    try:
        # Update Critical to true
        critical_result = voice_collection.update_many(
            {"urgency": "Critical"},
            {"$set": {"urgency": True}}
        )
        
        print(f"✓ Updated 'Critical' urgency:")
        print(f"  Matched: {critical_result.matched_count}")
        print(f"  Modified: {critical_result.modified_count}")
        
        # Update High to false
        high_result = voice_collection.update_many(
            {"urgency": "High"},
            {"$set": {"urgency": False}}
        )
        
        print(f"✓ Updated 'High' urgency:")
        print(f"  Matched: {high_result.matched_count}")
        print(f"  Modified: {high_result.modified_count}")
        
        total_updated = critical_result.modified_count + high_result.modified_count
        print(f"\nTotal voice records updated: {total_updated}")
        
        return {
            'critical_updated': critical_result.modified_count,
            'high_updated': high_result.modified_count,
            'total_updated': total_updated
        }
        
    except Exception as e:
        print(f"Error updating urgency values: {str(e)}")
        return None

def verify_boolean_conversion():
    """Verify that urgency values are now boolean"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF URGENCY CONVERSION")
    print("=" * 50)
    
    try:
        # Count boolean urgency values
        true_count = voice_collection.count_documents({"urgency": True})
        false_count = voice_collection.count_documents({"urgency": False})
        
        # Count any remaining string values
        critical_count = voice_collection.count_documents({"urgency": "Critical"})
        high_count = voice_collection.count_documents({"urgency": "High"})
        
        # Count other values
        other_urgency = list(voice_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Boolean urgency values:")
        print(f"  urgency: true → {true_count} voice records")
        print(f"  urgency: false → {false_count} voice records")
        
        print(f"\nRemaining string values:")
        print(f"  urgency: 'Critical' → {critical_count} voice records")
        print(f"  urgency: 'High' → {high_count} voice records")
        
        if other_urgency:
            print(f"\nOther urgency values:")
            for other in other_urgency:
                value = other['_id'] if other['_id'] is not None else 'null/missing'
                count = other['count']
                print(f"  urgency: '{value}' → {count} voice records")
        
        # Show sample voice records with boolean urgency
        print(f"\nSample voice records with boolean urgency:")
        samples = list(voice_collection.find(
            {"urgency": {"$in": [True, False]}},
            {
                "voice_id": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            voice_id = sample.get('voice_id', sample.get('_id', 'N/A'))
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {voice_id}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
        
        # Success check
        if critical_count == 0 and high_count == 0:
            print(f"\n✅ SUCCESS: All 'Critical' and 'High' urgency values converted to boolean!")
            print(f"Summary: {true_count} critical (true) + {false_count} high (false) = {true_count + false_count} total")
        else:
            print(f"\n⚠ WARNING: Some string urgency values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_urgency_values():
    """Check and optionally handle other urgency values"""
    print("\n" + "=" * 50)
    print("HANDLING OTHER URGENCY VALUES")
    print("=" * 50)
    
    try:
        # Find voice records with urgency values other than True/False/Critical/High
        other_urgency = list(voice_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        if not other_urgency:
            print("No other urgency values found - all voice records have been processed!")
            return
        
        print("Found voice records with other urgency values:")
        for other in other_urgency:
            value = other['_id'] if other['_id'] is not None else 'null/missing'
            count = other['count']
            print(f"  '{value}': {count} voice records")
        
        print(f"\nSample voice records with other urgency values:")
        samples = list(voice_collection.find(
            {"urgency": {"$nin": [True, False, "Critical", "High"]}},
            {
                "voice_id": 1,
                "urgency": 1,
                "priority": 1,
                "title": 1
            }
        ).limit(3))
        
        for i, sample in enumerate(samples, 1):
            voice_id = sample.get('voice_id', sample.get('_id', 'N/A'))
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {voice_id}: urgency={urgency}, priority='{priority}'")
            print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error handling other urgency values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_voice = voice_collection.find_one()
        if test_voice:
            print("✓ Database connection successful")
            print(f"Sample voice record fields: {list(test_voice.keys())}\n")
        else:
            print("⚠ No voice records found in voice collection")
            exit(1)
        
        # Analyze current urgency values
        analyze_urgency_values()
        
        # Confirm before proceeding
        print(f"\nThis will update urgency values:")
        print(f"  'Critical' → true")
        print(f"  'High' → false")
        
        confirm = input(f"\nProceed with urgency conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the urgency update
        update_result = update_urgency_to_boolean()
        
        if update_result:
            # Verify the conversion
            verify_boolean_conversion()
            
            # Handle other urgency values
            handle_other_urgency_values()
            
            print(f"\n" + "=" * 60)
            print("✅ URGENCY CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Critical voice records (now true): {update_result['critical_updated']}")
            print(f"  High voice records (now false): {update_result['high_updated']}")
            print(f"  Total voice records updated: {update_result['total_updated']}")
            print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: voice
✓ Database connection successful
Sample voice record fields: ['_id', 'call_id', 'timestamp', 'customer_name', 'customer_id', 'email', 'dominant_topic', 'subtopics', 'call_purpose', 'conversation', 'priority', 'resolution_status', 'sentiment', 'urgency', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current urgency values...
----------------------------------------
Unique urgency values found: [False, True]

Total voice records: 2040
Urgency distribution:
  'False': 1789 voice records (87.7%)
  'True': 251 voice records (12.3%)

This will update urgency values:
  'Critical' → true
  'High' → false

Starting urgency field update...
Conversion rules:
  'Critical' → true
  'High' → false
----------------------------------------
✓ Updated 'Critical' urgency:
  Matched: 0
  Modified: 0
✓ Updated 'High' urgency:
  Matched: 0
  Modif

In [2]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: voice")
print("=" * 60)

# Connect to MongoDB
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
voice_collection = db['voice']

def analyze_subcluster_id_values():
    """Analyze current subcluster_id field values"""
    print("Analyzing current subcluster_id values...")
    print("-" * 40)
    
    try:
        # Get all unique subcluster_id values
        subcluster_values = voice_collection.distinct("subcluster_id")
        print(f"Unique subcluster_id values found: {subcluster_values}")
        
        # Count each subcluster_id value type
        subcluster_stats = list(voice_collection.aggregate([
            {"$group": {
                "_id": {"value": "$subcluster_id", "type": {"$type": "$subcluster_id"}}, 
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}}
        ]))
        
        total_voices = voice_collection.count_documents({})
        print(f"\nTotal voice records: {total_voices}")
        print(f"Subcluster_id distribution:")
        
        for stat in subcluster_stats:
            value = stat['_id']['value'] if stat['_id']['value'] is not None else 'null/missing'
            data_type = stat['_id']['type']
            count = stat['count']
            percentage = (count / total_voices) * 100 if total_voices > 0 else 0
            print(f"  '{value}' (type: {data_type}): {count} voice records ({percentage:.1f}%)")
            
        return subcluster_stats
        
    except Exception as e:
        print(f"Error analyzing subcluster_id values: {str(e)}")
        return []

def update_subcluster_id_to_string():
    """Update subcluster_id field from integer to string"""
    print("\nStarting subcluster_id field update...")
    print("Conversion rule: All integer values → string values")
    print("-" * 40)
    
    try:
        # Find all voice records with integer subcluster_id
        integer_voices = list(voice_collection.find(
            {"subcluster_id": {"$type": "int"}},
            {"_id": 1, "subcluster_id": 1}
        ))
        
        print(f"Found {len(integer_voices)} voice records with integer subcluster_id")
        
        updated_count = 0
        
        # Update each voice record individually to convert integer to string
        for voice in integer_voices:
            old_value = voice['subcluster_id']
            new_value = str(old_value)
            
            result = voice_collection.update_one(
                {"_id": voice['_id']},
                {"$set": {"subcluster_id": new_value}}
            )
            
            if result.modified_count > 0:
                updated_count += 1
        
        print(f"✓ Updated subcluster_id from integer to string:")
        print(f"  Total processed: {len(integer_voices)}")
        print(f"  Successfully updated: {updated_count}")
        
        return updated_count
        
    except Exception as e:
        print(f"Error updating subcluster_id values: {str(e)}")
        return 0

def verify_string_conversion():
    """Verify that subcluster_id values are now strings"""
    print("\n" + "=" * 50)
    print("VERIFICATION OF SUBCLUSTER_ID CONVERSION")
    print("=" * 50)
    
    try:
        # Count string subcluster_id values
        string_count = voice_collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        # Count any remaining integer values
        integer_count = voice_collection.count_documents({"subcluster_id": {"$type": "int"}})
        
        # Count other data types
        other_types = list(voice_collection.aggregate([
            {"$match": {"subcluster_id": {"$nin": [None]}}},
            {"$group": {"_id": {"$type": "$subcluster_id"}, "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))
        
        print(f"Subcluster_id by data type:")
        for type_stat in other_types:
            data_type = type_stat['_id']
            count = type_stat['count']
            print(f"  {data_type}: {count} voice records")
        
        # Count null/missing values
        null_count = voice_collection.count_documents({"subcluster_id": None})
        missing_count = voice_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"  null: {null_count} voice records")
        if missing_count > 0:
            print(f"  missing: {missing_count} voice records")
        
        # Show sample voice records with string subcluster_id
        print(f"\nSample voice records with string subcluster_id:")
        samples = list(voice_collection.find(
            {"subcluster_id": {"$type": "string"}},
            {
                "voice_id": 1,
                "subcluster_id": 1,
                "title": 1
            }
        ).limit(5))
        
        for i, sample in enumerate(samples, 1):
            voice_id = sample.get('voice_id', sample.get('_id', 'N/A'))
            subcluster_id = sample.get('subcluster_id')
            title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
            
            print(f"  {i}. {voice_id}: subcluster_id=\"{subcluster_id}\"")
            print(f"     Title: {title}")
        
        # Success check
        if integer_count == 0:
            print(f"\n✅ SUCCESS: All integer subcluster_id values converted to strings!")
            print(f"Summary: {string_count} voice records now have string subcluster_id")
        else:
            print(f"\n⚠ WARNING: {integer_count} integer subcluster_id values remain")
            
    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_subcluster_id_values():
    """Check for any unusual subcluster_id values"""
    print("\n" + "=" * 50)
    print("CHECKING FOR OTHER SUBCLUSTER_ID VALUES")
    print("=" * 50)
    
    try:
        # Find voice records with null or missing subcluster_id
        null_count = voice_collection.count_documents({"subcluster_id": None})
        missing_count = voice_collection.count_documents({"subcluster_id": {"$exists": False}})
        
        if null_count > 0:
            print(f"Found {null_count} voice records with null subcluster_id")
            
        if missing_count > 0:
            print(f"Found {missing_count} voice records with missing subcluster_id field")
            
        if null_count == 0 and missing_count == 0:
            print("All voice records have valid subcluster_id values!")
            
        # Show sample of any problematic voice records
        if null_count > 0 or missing_count > 0:
            print(f"\nSample voice records with null/missing subcluster_id:")
            samples = list(voice_collection.find(
                {"$or": [
                    {"subcluster_id": None},
                    {"subcluster_id": {"$exists": False}}
                ]},
                {
                    "voice_id": 1,
                    "subcluster_id": 1,
                    "title": 1
                }
            ).limit(3))
            
            for i, sample in enumerate(samples, 1):
                voice_id = sample.get('voice_id', sample.get('_id', 'N/A'))
                subcluster_id = sample.get('subcluster_id', 'MISSING_FIELD')
                title = sample.get('title', 'N/A')[:50] + '...' if len(sample.get('title', '')) > 50 else sample.get('title', 'N/A')
                
                print(f"  {i}. {voice_id}: subcluster_id={subcluster_id}")
                print(f"     Title: {title}")
            
    except Exception as e:
        print(f"Error checking other subcluster_id values: {str(e)}")

# Main execution
if __name__ == "__main__":
    try:
        # Test database connection
        test_voice = voice_collection.find_one()
        if test_voice:
            print("✓ Database connection successful")
            print(f"Sample voice record fields: {list(test_voice.keys())}\n")
        else:
            print("⚠ No voice records found in voice collection")
            exit(1)
        
        # Analyze current subcluster_id values
        analyze_subcluster_id_values()
        
        # Confirm before proceeding
        print(f"\nThis will convert all integer subcluster_id values to strings")
        print(f"Example: subcluster_id: 1 → subcluster_id: \"1\"")
        
        confirm = input(f"\nProceed with subcluster_id conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)
        
        # Execute the subcluster_id update
        updated_count = update_subcluster_id_to_string()
        
        if updated_count > 0:
            # Verify the conversion
            verify_string_conversion()
            
            # Handle other subcluster_id values
            handle_other_subcluster_id_values()
            
            print(f"\n" + "=" * 60)
            print("✅ SUBCLUSTER_ID CONVERSION COMPLETED SUCCESSFULLY!")
            print(f"Summary:")
            print(f"  Total voice records updated: {updated_count}")
            print(f"  All integer subcluster_id values converted to strings")
            print("=" * 60)
        else:
            print(f"\n⚠ No updates were made. Check if subcluster_id fields are already strings or if there are no integer values.")
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: voice
✓ Database connection successful
Sample voice record fields: ['_id', 'call_id', 'timestamp', 'customer_name', 'customer_id', 'email', 'dominant_topic', 'subtopics', 'call_purpose', 'conversation', 'priority', 'resolution_status', 'sentiment', 'urgency', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current subcluster_id values...
----------------------------------------
Unique subcluster_id values found: [0, 1, 2]

Total voice records: 2040
Subcluster_id distribution:
  '1' (type: int): 1018 voice records (49.9%)
  '0' (type: int): 898 voice records (44.0%)
  '2' (type: int): 124 voice records (6.1%)

This will convert all integer subcluster_id values to strings
Example: subcluster_id: 1 → subcluster_id: "1"

Starting subcluster_id field update...
Conversion rule: All integer values → string values
---------------------------

In [3]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def rename_field_in_cluster_collection():
    """
    Rename field 'voice_ids' to 'voices_ids' in documents where data="voice"
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Define the filter for documents that have data="voice" and voice_ids field exists
        filter_query = {
            "data": "voice",
            "voice_ids": {"$exists": True}
        }
        
        # Count documents that match the criteria before update
        count_before = cluster_collection.count_documents(filter_query)
        print(f"Found {count_before} documents matching criteria (data='voice' and voice_ids exists)")
        
        if count_before == 0:
            print("No documents found to update.")
            return
        
        # Use $rename operator to rename the field
        update_operation = {
            "$rename": {
                "voice_ids": "voices_ids"
            }
        }
        
        # Perform the update operation
        result = cluster_collection.update_many(filter_query, update_operation)
        
        print(f"Successfully updated {result.modified_count} documents")
        print(f"Matched {result.matched_count} documents")
        
        # Verify the update by counting documents with the new field name
        verification_query = {
            "data": "voice",
            "voices_ids": {"$exists": True}
        }
        count_after = cluster_collection.count_documents(verification_query)
        print(f"Verification: {count_after} documents now have 'voices_ids' field")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

if __name__ == "__main__":
    rename_field_in_cluster_collection()

Found 24 documents matching criteria (data='voice' and voice_ids exists)
Successfully updated 24 documents
Matched 24 documents
Verification: 24 documents now have 'voices_ids' field
MongoDB connection closed


In [4]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def validate_keyphrases_in_subclusters():
    """
    Cross-check keyphrases field with subclusters keyphrases for voice data.
    Find any keyphrases that exist in main keyphrases but are missing from all subclusters.
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Find all documents that have both keyphrases and subclusters fields for voice data
        query = {
            "data": "voice",  # Filter for voice data
            "keyphrases": {"$exists": True, "$ne": None},
            "subclusters": {"$exists": True, "$ne": None}
        }
        
        documents = cluster_collection.find(query)
        
        missing_keyphrases = []
        total_documents_checked = 0
        
        for doc in documents:
            total_documents_checked += 1
            cluster_id = doc.get('cluster_id')
            main_keyphrases = doc.get('keyphrases', [])
            subclusters = doc.get('subclusters', {})
            
            # Collect all keyphrases from all subclusters
            subcluster_keyphrases = set()
            
            # subclusters is an object with keys like "0", "1", "2", etc.
            for subcluster_key, subcluster_data in subclusters.items():
                if isinstance(subcluster_data, dict) and 'keyphrases' in subcluster_data:
                    subcluster_keyphrase_list = subcluster_data.get('keyphrases', [])
                    if isinstance(subcluster_keyphrase_list, list):
                        subcluster_keyphrases.update(subcluster_keyphrase_list)
            
            # Check each main keyphrase against subcluster keyphrases
            for keyphrase in main_keyphrases:
                if keyphrase not in subcluster_keyphrases:
                    missing_keyphrases.append({
                        'cluster_id': cluster_id,
                        'missing_keyphrase': keyphrase,
                        'total_main_keyphrases': len(main_keyphrases),
                        'total_subcluster_keyphrases': len(subcluster_keyphrases)
                    })
        
        # Display results
        print(f"Total voice cluster documents checked: {total_documents_checked}")
        print(f"Total missing keyphrases found: {len(missing_keyphrases)}")
        print("-" * 80)
        
        if missing_keyphrases:
            print("MISSING KEYPHRASES REPORT (VOICE DATA):")
            print("-" * 80)
            
            # Group by cluster_id for better readability
            cluster_groups = {}
            for item in missing_keyphrases:
                cluster_id = item['cluster_id']
                if cluster_id not in cluster_groups:
                    cluster_groups[cluster_id] = []
                cluster_groups[cluster_id].append(item)
            
            for cluster_id, missing_items in cluster_groups.items():
                print(f"Voice Cluster ID: {cluster_id}")
                print(f"Missing keyphrases ({len(missing_items)}):")
                for item in missing_items:
                    print(f"  - '{item['missing_keyphrase']}'")
                print(f"Total main keyphrases: {missing_items[0]['total_main_keyphrases']}")
                print(f"Total subcluster keyphrases: {missing_items[0]['total_subcluster_keyphrases']}")
                print("-" * 40)
                
        else:
            print("✅ All keyphrases from main field are present in subclusters for voice data!")
            
        # Summary statistics
        if missing_keyphrases:
            clusters_with_issues = len(set(item['cluster_id'] for item in missing_keyphrases))
            print(f"\nSUMMARY (VOICE DATA):")
            print(f"Voice clusters with missing keyphrases: {clusters_with_issues}")
            print(f"Total missing keyphrase instances: {len(missing_keyphrases)}")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        # Close the connection
        if 'client' in locals():
            client.close()
            print("\nMongoDB connection closed")

def get_detailed_analysis():
    """
    Get more detailed analysis including sample voice data structure
    """
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        cluster_collection = db['cluster']
        
        # Get a sample document to understand voice data structure
        sample_doc = cluster_collection.find_one({
            "data": "voice",  # Filter for voice data
            "keyphrases": {"$exists": True},
            "subclusters": {"$exists": True}
        })
        
        if sample_doc:
            print("SAMPLE VOICE CLUSTER DOCUMENT STRUCTURE:")
            print("-" * 40)
            print(f"Voice Cluster ID: {sample_doc.get('cluster_id')}")
            print(f"Voice Cluster Name: {sample_doc.get('cluster_name', 'N/A')}")
            print(f"Data Type: {sample_doc.get('data', 'N/A')}")
            print(f"Main keyphrases count: {len(sample_doc.get('keyphrases', []))}")
            
            subclusters = sample_doc.get('subclusters', {})
            print(f"Voice subclusters count: {len(subclusters)}")
            
            if sample_doc.get('keyphrases'):
                print(f"Sample main keyphrases: {sample_doc['keyphrases'][:3]}...")
            
            if subclusters:
                print("Voice subcluster structure:")
                for key, subcluster in list(subclusters.items())[:2]:  # Show first 2 subclusters
                    if isinstance(subcluster, dict):
                        label = subcluster.get('label', 'No label')
                        keyphrases_count = len(subcluster.get('keyphrases', []))
                        print(f"  {key}: '{label}' ({keyphrases_count} keyphrases)")
                        if subcluster.get('keyphrases'):
                            print(f"    Sample keyphrases: {subcluster['keyphrases'][:2]}...")
            print("-" * 40)
        else:
            print("No voice cluster documents found with keyphrases and subclusters fields")
    
    except Exception as e:
        print(f"Error in detailed analysis: {str(e)}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("Starting voice keyphrases validation...")
    print("=" * 80)
    
    # First, get structure analysis
    get_detailed_analysis()
    
    # Then run validation
    validate_keyphrases_in_subclusters()

Starting voice keyphrases validation...
SAMPLE VOICE CLUSTER DOCUMENT STRUCTURE:
----------------------------------------
Voice Cluster ID: 0
Voice Cluster Name: N/A
Data Type: voice
Main keyphrases count: 5
Voice subclusters count: 2
Sample main keyphrases: ['Balance Inquiry Call', 'Account Statement Request', 'Transaction History Inquiry']...
Voice subcluster structure:
  0: 'Account Data Retrieval' (3 keyphrases)
    Sample keyphrases: ['Balance Inquiry Call', 'Account Statement Request']...
  1: 'Account Verification & Reconciliation' (2 keyphrases)
    Sample keyphrases: ['Account Reconciliation Discussion', 'Account Verification Call']...
----------------------------------------
Total voice cluster documents checked: 24
Total missing keyphrases found: 0
--------------------------------------------------------------------------------
✅ All keyphrases from main field are present in subclusters for voice data!

MongoDB connection closed


In [5]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def convert_subcluster_id_to_string():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['voice']
        
        # Find all documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query))
        
        print(f"Found {len(documents_to_update)} voice records with integer subcluster_id")
        
        if len(documents_to_update) == 0:
            print("No voice records found with integer subcluster_id")
            return
        
        # Update each document
        updated_count = 0
        for doc in documents_to_update:
            try:
                # Convert the integer subcluster_id to string
                new_subcluster_id = str(doc['subcluster_id'])
                
                # Update the document
                result = collection.update_one(
                    {"_id": doc["_id"]},
                    {"$set": {"subcluster_id": new_subcluster_id}}
                )
                
                if result.modified_count > 0:
                    updated_count += 1
                    print(f"Updated voice record {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{new_subcluster_id}'")
                
            except Exception as e:
                print(f"Error updating voice record {doc['_id']}: {e}")
        
        print(f"\nSummary: Successfully updated {updated_count} out of {len(documents_to_update)} voice records")
        
        # Verify the changes
        remaining_int_docs = collection.count_documents({"subcluster_id": {"$type": "number"}})
        string_docs = collection.count_documents({"subcluster_id": {"$type": "string"}})
        
        print(f"Verification:")
        print(f"- Voice records with integer subcluster_id: {remaining_int_docs}")
        print(f"- Voice records with string subcluster_id: {string_docs}")
        
    except Exception as e:
        print(f"Error connecting to MongoDB or updating voice records: {e}")
    finally:
        if 'client' in locals():
            client.close()
            print("MongoDB connection closed")

def preview_changes():
    """Preview what changes will be made without actually updating"""
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['voice']
        
        # Find documents where subcluster_id is an integer
        query = {"subcluster_id": {"$type": "number"}}
        documents_to_update = list(collection.find(query, {"_id": 1, "subcluster_id": 1}))
        
        print("PREVIEW MODE - No changes will be made")
        print(f"Found {len(documents_to_update)} voice records that would be updated:")
        
        for i, doc in enumerate(documents_to_update[:10]):  # Show first 10
            print(f"  Voice record {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{str(doc['subcluster_id'])}'")
        
        if len(documents_to_update) > 10:
            print(f"  ... and {len(documents_to_update) - 10} more voice records")
        
    except Exception as e:
        print(f"Error during preview: {e}")
    finally:
        if 'client' in locals():
            client.close()

if __name__ == "__main__":
    print("MongoDB Voice Subcluster_id Converter")
    print("=" * 40)
    
    # First, preview the changes
    print("\n1. PREVIEW CHANGES:")
    preview_changes()
    
    # Ask for confirmation
    print("\n2. CONFIRMATION:")
    response = input("Do you want to proceed with the voice conversion? (yes/no): ").lower().strip()
    
    if response == 'yes':
        print("\n3. EXECUTING VOICE CONVERSION:")
        convert_subcluster_id_to_string()
    else:
        print("Voice conversion cancelled.")

MongoDB Voice Subcluster_id Converter

1. PREVIEW CHANGES:
PREVIEW MODE - No changes will be made
Found 0 voice records that would be updated:

2. CONFIRMATION:
Voice conversion cancelled.


In [6]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def rename_domain_value():
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']
        
        logger.info("Connected to MongoDB successfully")
        
        # First, let's check how many documents have "voice Support" in domains array
        count_before = collection.count_documents({"domains": "voice Support"})
        logger.info(f"Found {count_before} documents with 'voice Support' in domains array")
        
        if count_before == 0:
            logger.info("No documents found with 'voice Support' domain. Checking for other variations...")
            # Check for case variations or similar patterns
            variations = ["Voice Support", "voice support", "VOICE SUPPORT", "Voice support"]
            for variation in variations:
                count = collection.count_documents({"domains": variation})
                if count > 0:
                    logger.info(f"Found {count} documents with '{variation}' domain")
        
        # Update documents where domains array contains "voice Support"
        # This will replace "voice Support" with "banking" in the domains array
        result = collection.update_many(
            {"domains": "voice Support"},
            {"$set": {"domains.$[elem]": "banking"}},
            array_filters=[{"elem": "voice Support"}]
        )
        
        logger.info(f"Updated {result.modified_count} documents")
        
        # Also handle case variations if they exist
        variations_to_update = ["Voice Support", "voice support", "VOICE SUPPORT", "Voice support"]
        total_updated = result.modified_count
        
        for variation in variations_to_update:
            result_var = collection.update_many(
                {"domains": variation},
                {"$set": {"domains.$[elem]": "banking"}},
                array_filters=[{"elem": variation}]
            )
            if result_var.modified_count > 0:
                logger.info(f"Updated {result_var.modified_count} documents with '{variation}' domain")
                total_updated += result_var.modified_count
        
        # Verify the changes
        count_after = collection.count_documents({"domains": "banking"})
        logger.info(f"After update: {count_after} documents have 'banking' in domains array")
        
        # Show a sample of updated documents
        logger.info("Sample of updated documents:")
        sample_docs = collection.find({"domains": "banking"}).limit(3)
        for doc in sample_docs:
            logger.info(f"Document ID: {doc.get('_id')}, Domains: {doc.get('domains')}")
        
        logger.info(f"Operation completed. Total documents updated: {total_updated}")
        
    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
    finally:
        # Close the connection
        client.close()
        logger.info("MongoDB connection closed")

def rollback_domain_value():
    """
    Function to rollback the changes if needed
    This will change "banking" back to "voice Support"
    """
    try:
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['cluster']
        
        logger.info("Starting rollback operation...")
        
        # Update documents where domains array contains "banking"
        result = collection.update_many(
            {"domains": "banking"},
            {"$set": {"domains.$[elem]": "voice Support"}},
            array_filters=[{"elem": "banking"}]
        )
        
        logger.info(f"Rollback completed. Updated {result.modified_count} documents")
        
    except Exception as e:
        logger.error(f"Rollback error: {str(e)}")
    finally:
        client.close()
        logger.info("MongoDB connection closed")

if __name__ == "__main__":
    # Run the domain rename operation
    rename_domain_value()
    
    # Uncomment the line below if you need to rollback the changes
    # rollback_domain_value()

2025-09-03 14:52:41,416 - INFO - Connected to MongoDB successfully
2025-09-03 14:52:42,904 - INFO - Found 0 documents with 'voice Support' in domains array
2025-09-03 14:52:42,906 - INFO - No documents found with 'voice Support' domain. Checking for other variations...
2025-09-03 14:52:43,147 - INFO - Found 24 documents with 'Voice Support' domain
2025-09-03 14:52:44,117 - INFO - Updated 0 documents
2025-09-03 14:52:44,362 - INFO - Updated 24 documents with 'Voice Support' domain
2025-09-03 14:52:45,524 - INFO - After update: 96 documents have 'banking' in domains array
2025-09-03 14:52:45,525 - INFO - Sample of updated documents:
2025-09-03 14:52:45,769 - INFO - Document ID: 68aacadc05037130937cbae2, Domains: ['banking']
2025-09-03 14:52:45,770 - INFO - Document ID: 68aacadc05037130937cbae3, Domains: ['banking']
2025-09-03 14:52:45,771 - INFO - Document ID: 68aacadc05037130937cbae4, Domains: ['banking']
2025-09-03 14:52:45,773 - INFO - Operation completed. Total documents updated: 24


In [1]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv
from datetime import datetime

# Load environment variables from .env file
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

def add_processed_at_field():
    """
    Add processed_at field to all documents in chat-chunks and voice collections
    """
    try:
        # Connect to MongoDB
        client = MongoClient(MONGO_CONNECTION_STRING)
        db = client[MONGO_DATABASE_NAME]
        
        # Define the collections to update
        collections_to_update = ['chat-chunks', 'voice']
        
        # Define the processed_at value
        processed_at_value = "2025-08-01 13:06:59"
        
        print(f"Connected to database: {MONGO_DATABASE_NAME}")
        print("-" * 50)
        
        for collection_name in collections_to_update:
            collection = db[collection_name]
            
            # Count total documents before update
            total_docs = collection.count_documents({})
            print(f"Collection: {collection_name}")
            print(f"Total documents: {total_docs}")
            
            if total_docs == 0:
                print("No documents found in this collection.")
                print("-" * 50)
                continue
            
            # Update all documents in the collection
            # Using $set to add the processed_at field to all documents
            update_result = collection.update_many(
                {},  # Empty filter matches all documents
                {"$set": {"processed_at": processed_at_value}}
            )
            
            print(f"Documents matched: {update_result.matched_count}")
            print(f"Documents modified: {update_result.modified_count}")
            
            # Verify the update by checking a few documents
            sample_docs = list(collection.find({}).limit(3))
            print(f"Sample documents after update:")
            for i, doc in enumerate(sample_docs, 1):
                print(f"  Document {i}: processed_at = {doc.get('processed_at', 'NOT FOUND')}")
            
            print("-" * 50)
        
        # Close the connection
        client.close()
        print("Database connection closed.")
        print("Update completed successfully!")
        
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return False
    
    return True

def verify_updates():
    """
    Verify that the processed_at field has been added to all documents
    """
    try:
        # Connect to MongoDB
        client = MongoClient(MONGO_CONNECTION_STRING)
        db = client[MONGO_DATABASE_NAME]
        
        collections_to_check = ['chat-chunks', 'voice']
        
        print("\n" + "=" * 50)
        print("VERIFICATION RESULTS")
        print("=" * 50)
        
        for collection_name in collections_to_check:
            collection = db[collection_name]
            
            # Count total documents
            total_docs = collection.count_documents({})
            
            # Count documents with processed_at field
            docs_with_processed_at = collection.count_documents({
                "processed_at": {"$exists": True}
            })
            
            # Count documents with the specific processed_at value
            docs_with_correct_value = collection.count_documents({
                "processed_at": "2025-08-01 13:06:59"
            })
            
            print(f"Collection: {collection_name}")
            print(f"  Total documents: {total_docs}")
            print(f"  Documents with processed_at field: {docs_with_processed_at}")
            print(f"  Documents with correct value: {docs_with_correct_value}")
            
            if total_docs == docs_with_correct_value:
                print(f"  ✅ SUCCESS: All documents updated correctly!")
            else:
                print(f"  ❌ WARNING: Some documents may not have been updated!")
            
            print("-" * 30)
        
        client.close()
        
    except Exception as e:
        print(f"Error during verification: {str(e)}")

if __name__ == "__main__":
    # Check if environment variables are loaded
    if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
        print("Error: Please make sure MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME are set in your .env file")
        exit(1)
    
    print("Starting update process...")
    print(f"Target collections: chat-chunks, voice")
    print(f"Field to add: processed_at")
    print(f"Value to set: 2025-08-01 13:06:59")
    print("\n")
    
    # Perform the update
    success = add_processed_at_field()
    
    if success:
        # Verify the updates
        verify_updates()
    else:
        print("Update failed. Please check the error messages above.")

Starting update process...
Target collections: chat-chunks, voice
Field to add: processed_at
Value to set: 2025-08-01 13:06:59


Connected to database: sparzaai
--------------------------------------------------
Collection: chat-chunks
Total documents: 600
Documents matched: 600
Documents modified: 600
Sample documents after update:
  Document 1: processed_at = 2025-08-01 13:06:59
  Document 2: processed_at = 2025-08-01 13:06:59
  Document 3: processed_at = 2025-08-01 13:06:59
--------------------------------------------------
Collection: voice
Total documents: 2040
Documents matched: 2040
Documents modified: 2040
Sample documents after update:
  Document 1: processed_at = 2025-08-01 13:06:59
  Document 2: processed_at = 2025-08-01 13:06:59
  Document 3: processed_at = 2025-08-01 13:06:59
--------------------------------------------------
Database connection closed.
Update completed successfully!

VERIFICATION RESULTS
Collection: chat-chunks
  Total documents: 600
  Documents with proc