In [14]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file
# (must run before the os.getenv() calls below so values from .env are visible)
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message when either setting is missing or empty,
# instead of letting MongoClient fail later with a less obvious error.
if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

# Only the database name is echoed; the connection string is never printed.
print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

# NOTE(review): per the PyMongo docs the MongoClient constructor does not block
# on connecting, so reaching this point does not prove the server is reachable;
# the first real query does.
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
# These module-level handles are read directly by the functions defined below.
clusters_collection = db['cluster']
chat_chunks_collection = db['chat-chunks']

def normalize_text(text):
    """
    Normalize text for better matching.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, then collapses whitespace runs to a single
    space and trims both ends.

    Args:
        text: The string to normalize. ``None`` is treated as empty text.

    Returns:
        The normalized string ('' when nothing usable remains).
    """
    if text is None:
        return ''
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # Trim/collapse AFTER removing punctuation: stripping first left stray
    # whitespace behind ("Topic !" -> "topic ", "a - b" -> "a  b"), which
    # broke the substring comparison done by the callers.
    return ' '.join(without_punctuation.split())

def match_chat_chunks_to_clusters():
    """
    Match chat-chunks to clusters and store the matches on each cluster.

    For every cluster document with data == "chat-chunks", the normalized
    dominant_topic of each chat chunk is compared with the cluster's
    normalized keyphrases; a chunk matches when either string contains the
    other. The matching chunk ids (as strings) are written to the cluster's
    chat_chunk_ids field; an empty list is written when nothing matched.

    Reads the module-level clusters_collection and chat_chunks_collection and
    reports progress with print(). A failed update is reported for that
    cluster only and does not stop the run.

    Raises:
        KeyError: if a cluster document has no 'cluster_id' field.
    """
    print("Fetching clusters with data: 'chat-chunks'...")
    # Get only clusters that have data field set to "chat-chunks"
    clusters = list(clusters_collection.find({"data": "chat-chunks"}))
    print(f"Found {len(clusters)} clusters with data='chat-chunks' to process\n")

    for cluster in clusters:
        cluster_id = cluster['cluster_id']
        # `or []` also covers documents that store keyphrases as null
        keyphrases = cluster.get('keyphrases') or []

        print(f"Processing Cluster ID: {cluster_id}")
        print(f"Cluster Name: {cluster.get('cluster_name', 'N/A')}")
        print(f"Keyphrases: {keyphrases}")

        # Non-string keyphrases cannot be normalized, so they are ignored
        normalized_keyphrases = [
            normalize_text(phrase) for phrase in keyphrases if isinstance(phrase, str)
        ]
        print(f"Normalized keyphrases: {normalized_keyphrases}")
        # Empty keyphrases would be a substring of every topic; drop them once
        # here instead of re-checking inside the inner loop.
        usable_keyphrases = [phrase for phrase in normalized_keyphrases if phrase]

        matching_chat_chunk_ids = []

        # NOTE: the collection is streamed again for every cluster so that no
        # more than one chunk is held in memory at a time.
        print("  Searching through chat chunks...")
        chat_chunks_cursor = chat_chunks_collection.find({}, {
            '_id': 1,
            'dominant_topic': 1
        })

        chat_chunk_count = 0
        for chat_chunk_count, chat_chunk in enumerate(chat_chunks_cursor, start=1):
            if chat_chunk_count % 1000 == 0:
                print(f"    Processed {chat_chunk_count} chat chunks...")

            chat_chunk_dominant_topic = chat_chunk.get('dominant_topic')
            if not isinstance(chat_chunk_dominant_topic, str) or not chat_chunk_dominant_topic:
                continue

            normalized_topic = normalize_text(chat_chunk_dominant_topic)
            if not normalized_topic:
                continue

            # First keyphrase that contains, or is contained in, the topic
            matched_keyphrase = next(
                (
                    keyphrase for keyphrase in usable_keyphrases
                    if keyphrase in normalized_topic or normalized_topic in keyphrase
                ),
                None,
            )
            if matched_keyphrase is not None:
                matching_chat_chunk_ids.append(str(chat_chunk['_id']))
                print(f"    Match found: {chat_chunk['_id']} - Topic: '{chat_chunk_dominant_topic}' matches keyphrase: '{matched_keyphrase}'")

        print(f"  Finished processing {chat_chunk_count} chat chunks")

        # De-duplicate while KEEPING cursor order. list(set(...)) shuffled the
        # ids differently on every run, so an unchanged result was still
        # written as a modification and never reported as "already has same
        # chat chunk data".
        matching_chat_chunk_ids = list(dict.fromkeys(matching_chat_chunk_ids))

        # Update cluster with chat_chunk_ids (only for clusters with data: "chat-chunks").
        # One update covers both cases: the list is simply empty without matches.
        try:
            result = clusters_collection.update_one(
                {'cluster_id': cluster_id, 'data': 'chat-chunks'},
                {'$set': {'chat_chunk_ids': matching_chat_chunk_ids}}
            )
            if result.modified_count > 0:
                if matching_chat_chunk_ids:
                    print(f"  ✓ Successfully updated cluster {cluster_id} with {len(matching_chat_chunk_ids)} chat chunk IDs")
                else:
                    print(f"  ✓ Set empty chat_chunk_ids array for cluster {cluster_id} (no matches found)")
            elif result.matched_count > 0:
                # Document found but the stored value was already identical
                if matching_chat_chunk_ids:
                    print(f"  ⚠ Cluster {cluster_id} already has same chat chunk data")
                else:
                    print(f"  ⚠ Cluster {cluster_id} already has empty chat_chunk_ids array")
            else:
                print(f"  ❌ Cluster {cluster_id} not found or doesn't have data: 'chat-chunks'")
        except Exception as e:
            # Best effort: report the failure and carry on with the next cluster
            print(f"  ❌ Error updating cluster {cluster_id}: {str(e)}")

        print(f"  Total unique chat chunks matched: {len(matching_chat_chunk_ids)}")
        print("-" * 50)

def verify_results():
    """
    Print a per-cluster report of the stored chat_chunk_ids.

    Lists every cluster with data == "chat-chunks" (ordered by cluster_id)
    together with its keyphrases, the number of linked chat chunk ids and a
    preview of the first three ids. Any error is printed, not raised.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("VERIFICATION RESULTS")
    print(banner)

    wanted_fields = {
        'cluster_id': 1,
        'cluster_name': 1,
        'keyphrases': 1,
        'chat_chunk_ids': 1,
        'data': 1
    }

    try:
        cursor = clusters_collection.find({"data": "chat-chunks"}, wanted_fields)
        documents = list(cursor.sort('cluster_id', 1))

        for doc in documents:
            linked_ids = doc.get('chat_chunk_ids', [])
            total_ids = len(linked_ids)

            identifier = doc['cluster_id']
            title = doc.get('cluster_name', 'N/A')
            source = doc.get('data', 'N/A')
            print(f"\nCluster {identifier}: {title} (data: {source})")
            print(f"  Keyphrases: {doc.get('keyphrases', [])}")
            print(f"  Chat Chunk IDs count: {total_ids}")

            if total_ids <= 0:
                continue

            # Preview only the first three ids and summarize the remainder
            print(f"  First 3 Chat Chunk IDs: {linked_ids[:3]}")
            remaining = total_ids - 3
            if remaining > 0:
                print(f"  ... and {remaining} more")
    except Exception as error:
        print(f"❌ Error during verification: {error!s}")

def get_summary_stats():
    """
    Print aggregate numbers about the chat chunk to cluster matching.

    Reports how many "chat-chunks" clusters exist, how many of them have a
    non-empty chat_chunk_ids list, the summed length of all chat_chunk_ids
    lists, the size of the chat-chunks collection and the ratio between the
    last two. Any error is printed, not raised.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("SUMMARY STATISTICS")
    print(divider)

    try:
        # All figures are gathered first so that nothing is printed if any
        # of the queries fails.
        cluster_total = clusters_collection.count_documents({"data": "chat-chunks"})
        clusters_linked = clusters_collection.count_documents({
            "data": "chat-chunks",
            'chat_chunk_ids': {'$exists': True, '$ne': []}
        })

        match_stage = {'$match': {"data": "chat-chunks", 'chat_chunk_ids': {'$exists': True}}}
        size_stage = {'$project': {'chat_chunk_count': {'$size': '$chat_chunk_ids'}}}
        sum_stage = {'$group': {'_id': None, 'total_chat_chunks_matched': {'$sum': '$chat_chunk_count'}}}
        grouped = list(clusters_collection.aggregate([match_stage, size_stage, sum_stage]))

        # The $group stage yields no row at all when no document matched
        if grouped:
            link_total = grouped[0]['total_chat_chunks_matched']
        else:
            link_total = 0

        chunk_total = chat_chunks_collection.count_documents({})

        print(f"Total clusters with data='chat-chunks': {cluster_total}")
        print(f"Clusters with matched chat chunks: {clusters_linked}")
        print(f"Clusters without matches: {cluster_total - clusters_linked}")
        print(f"Total chat chunks in database: {chunk_total}")
        print(f"Total chat chunk-cluster matches: {link_total}")

        if chunk_total > 0:
            percentage = (link_total / chunk_total) * 100
            print(f"Match percentage: {percentage:.2f}%")

    except Exception as error:
        print(f"❌ Error getting statistics: {error!s}")

# Main execution
# Runs the full pipeline: connectivity probe -> matching -> verification -> statistics.
if __name__ == "__main__":
    try:
        print("🚀 Starting chat chunk-cluster matching process...")
        print("This will match chat chunks to clusters with data: 'chat-chunks'")
        print("=" * 60)
        
        # Test database connection
        # These two queries are the first round trips to the server; a
        # connection problem surfaces here and is handled by the except below.
        test_cluster = clusters_collection.find_one({"data": "chat-chunks"})
        test_chat_chunk = chat_chunks_collection.find_one()
        
        # Missing data only produces warnings; processing continues regardless.
        if not test_cluster:
            print("⚠️  Warning: No clusters found with data: 'chat-chunks'")
        if not test_chat_chunk:
            print("⚠️  Warning: No chat chunks found in chat-chunks collection")
            
        # Printed whenever both queries returned without raising, even if
        # one of the warnings above was shown.
        print("✓ Database connection successful\n")
        
        # Execute the matching process
        match_chat_chunks_to_clusters()
        
        # Verify results
        verify_results()
        
        # Get summary statistics
        get_summary_stats()
        
        print("\n" + "=" * 60)
        print("✅ Process completed successfully!")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        # At module level locals() is the module namespace, so this checks
        # whether the module-level `client` was ever created.
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
🚀 Starting chat chunk-cluster matching process...
This will match chat chunks to clusters with data: 'chat-chunks'
✓ Database connection successful

Fetching clusters with data: 'chat-chunks'...
Found 10 clusters with data='chat-chunks' to process

Processing Cluster ID: 0
Cluster Name: Workforce & HR Management
Keyphrases: ['Shift Coverage Request', 'Training Schedule Update', 'Performance Review Planning', 'Break Schedule Coordination', 'Overtime Approval Need', 'Team Meeting Reminder', 'Staff Management Discussion', 'Workforce Planning Update', 'Employee Relations Coordination', 'Attendance Management Alert', 'Leave Request Approval', 'Skill Development Update']
Normalized keyphrases: ['shift coverage request', 'training schedule update', 'performance review planning', 'break schedule coordination', 'overtime approval need', 'team meeting reminder', 'staff management discussion', 'workforce planning update', 'employee relations coordinatio

In [15]:
# Import required libraries
from pymongo import MongoClient
import os
from dotenv import load_dotenv

# Load environment variables from .env file
# (must run before the os.getenv() calls below so values from .env are visible)
load_dotenv()

# Connect to MongoDB using environment variables
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message when either setting is missing or empty.
if not MONGO_CONNECTION_STRING or not MONGO_DATABASE_NAME:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

# Only the database name is echoed; the connection string is never printed.
print(f"Connecting to MongoDB...")
print(f"Database: {MONGO_DATABASE_NAME}")

# NOTE(review): per the PyMongo docs the MongoClient constructor does not block
# on connecting, so reaching this point does not prove the server is reachable.
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]

# Get collections
# Module-level handle read directly by the functions defined below.
clusters_collection = db['cluster']

def update_domains_to_chat_support():
    """
    Set domains to ["Chat Support"] on every chat-chunks cluster.

    Only cluster documents with data == "chat-chunks" are touched. Before the
    update the current distribution (["EU bank"], ["Chat Support"], anything
    else) is printed; afterwards verify_update() is called. Returns early
    when there are no such clusters. Any error is printed, not raised.
    """
    line = "=" * 50
    print("Starting domain update process for chat-chunks clusters...")
    print(line)

    try:
        only_chat_chunks = {"data": "chat-chunks"}

        # Count total chat-chunks clusters before update
        cluster_total = clusters_collection.count_documents(only_chat_chunks)
        print(f"Total chat-chunks clusters in collection: {cluster_total}")

        if cluster_total == 0:
            print("⚠ No chat-chunks clusters found (data: 'chat-chunks')")
            return

        # Snapshot of the domain values before anything is changed
        with_eu_bank = clusters_collection.count_documents(
            {**only_chat_chunks, "domains": ["EU bank"]}
        )
        print(f"Chat-chunks clusters with 'EU bank' domain: {with_eu_bank}")

        with_chat_support = clusters_collection.count_documents(
            {**only_chat_chunks, "domains": ["Chat Support"]}
        )
        print(f"Chat-chunks clusters with 'Chat Support' domain: {with_chat_support}")

        with_other = clusters_collection.count_documents(
            {**only_chat_chunks, "domains": {"$nin": [["EU bank"], ["Chat Support"]]}}
        )
        print(f"Chat-chunks clusters with other domains: {with_other}")

        print("\n" + line)
        print("UPDATING CHAT-CHUNKS CLUSTER DOMAINS...")
        print(line)

        # The filter restricts the write to chat-chunks clusters only
        outcome = clusters_collection.update_many(
            only_chat_chunks,
            {"$set": {"domains": ["Chat Support"]}}
        )

        print(f"✓ Successfully updated {outcome.modified_count} chat-chunks clusters")
        print(f"  Matched chat-chunks clusters: {outcome.matched_count}")

        # Re-read the collection to confirm the new state
        verify_update()

    except Exception as error:
        print(f"❌ Error during chat-chunks domain update: {error!s}")

def verify_update():
    """
    Report whether every chat-chunks cluster now has domains ["Chat Support"].

    Prints the number of chat-chunks clusters per domain value, a success or
    warning line depending on whether all of them carry ["Chat Support"], and
    up to five sample documents. Any error is printed, not raised.
    """
    line = "=" * 50
    print("\n" + line)
    print("CHAT-CHUNKS CLUSTER VERIFICATION")
    print(line)

    try:
        # All counts are collected before the first of them is printed
        with_chat_support = clusters_collection.count_documents({
            "data": "chat-chunks",
            "domains": ["Chat Support"]
        })
        with_eu_bank = clusters_collection.count_documents({
            "data": "chat-chunks",
            "domains": ["EU bank"]
        })
        with_other = clusters_collection.count_documents({
            "data": "chat-chunks",
            "domains": {"$nin": [["Chat Support"], ["EU bank"]]}
        })
        cluster_total = clusters_collection.count_documents({"data": "chat-chunks"})

        print(f"Total chat-chunks clusters: {cluster_total}")
        print(f"Chat-chunks clusters with 'Chat Support' domain: {with_chat_support}")
        print(f"Chat-chunks clusters with 'EU bank' domain: {with_eu_bank}")
        print(f"Chat-chunks clusters with other domains: {with_other}")

        if with_chat_support != cluster_total:
            print(f"\n⚠ WARNING: {cluster_total - with_chat_support} chat-chunks clusters still have different domains")
        else:
            print("\n✅ SUCCESS: All chat-chunks clusters now have 'Chat Support' domain!")

        # Show sample of updated chat-chunks cluster documents
        print("\nSample of updated chat-chunks clusters:")
        sample_fields = {
            'cluster_id': 1,
            'domains': 1,
            'dominant_label': 1,
            'data': 1
        }
        sample_cursor = clusters_collection.find({"data": "chat-chunks"}, sample_fields)
        sample_docs = list(sample_cursor.limit(5).sort('cluster_id', 1))

        for doc in sample_docs:
            identifier = doc.get('cluster_id', 'N/A')
            doc_domains = doc.get('domains', [])
            doc_label = doc.get('dominant_label', 'N/A')
            doc_source = doc.get('data', 'N/A')
            print(f"  Chat-chunks Cluster {identifier}: data={doc_source}, domains={doc_domains}, label='{doc_label}'")

    except Exception as error:
        print(f"❌ Error during verification: {error!s}")

def get_domain_statistics():
    """
    Print the distribution of domains values among chat-chunks clusters.

    Shows one line per distinct domains value (most frequent first), the
    share of chat-chunks clusters whose domains equal ["Chat Support"], and
    finally the number of clusters per data value across the whole
    collection. Any error is printed, not raised.
    """
    line = "=" * 50
    print("\n" + line)
    print("CHAT-CHUNKS CLUSTER DOMAIN STATISTICS")
    print(line)

    try:
        # Group chat-chunks clusters by their exact domains value
        by_domains = list(clusters_collection.aggregate([
            {'$match': {"data": "chat-chunks"}},
            {'$group': {'_id': '$domains', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))

        print("Domain distribution for chat-chunks clusters:")
        for row in by_domains:
            print(f"  {row['_id']}: {row['count']} chat-chunks clusters")

        cluster_total = clusters_collection.count_documents({"data": "chat-chunks"})
        if cluster_total > 0:
            with_chat_support = clusters_collection.count_documents({
                "data": "chat-chunks",
                "domains": ["Chat Support"]
            })
            share = (with_chat_support / cluster_total) * 100
            print(f"\nPercentage of chat-chunks clusters with 'Chat Support' domain: {share:.1f}%")

        # Same grouping over the whole collection, keyed by the data field
        print("\nComparison with other cluster types:")
        by_data = list(clusters_collection.aggregate([
            {'$group': {'_id': '$data', 'count': {'$sum': 1}}},
            {'$sort': {'count': -1}}
        ]))

        for row in by_data:
            print(f"  Clusters with data='{row['_id']}': {row['count']}")

    except Exception as error:
        print(f"❌ Error getting statistics: {error!s}")

def show_chat_chunks_cluster_summary():
    """
    Print how many chat chunks are attached to the chat-chunks clusters.

    Reports the number of chat-chunks clusters, how many of them have a
    non-empty chat_chunk_ids list, and the total, average and maximum length
    of those lists. Any error is printed, not raised.
    """
    line = "=" * 50
    print("\n" + line)
    print("CHAT-CHUNKS CLUSTER SUMMARY")
    print(line)

    try:
        # Clusters that have at least one chat chunk id stored
        clusters_linked = clusters_collection.count_documents({
            "data": "chat-chunks",
            "chat_chunk_ids": {"$exists": True, "$ne": []}
        })

        # Length of every chat_chunk_ids list, folded into sum / avg / max
        size_stats = list(clusters_collection.aggregate([
            {'$match': {"data": "chat-chunks", "chat_chunk_ids": {"$exists": True}}},
            {'$project': {'chat_chunk_count': {'$size': '$chat_chunk_ids'}}},
            {'$group': {
                '_id': None,
                'total_chat_chunks': {'$sum': '$chat_chunk_count'},
                'avg_chat_chunks_per_cluster': {'$avg': '$chat_chunk_count'},
                'max_chat_chunks_per_cluster': {'$max': '$chat_chunk_count'}
            }}
        ]))

        cluster_total = clusters_collection.count_documents({"data": "chat-chunks"})

        print(f"Total chat-chunks clusters: {cluster_total}")
        print(f"Chat-chunks clusters with assigned chat chunks: {clusters_linked}")

        # No aggregate row means no cluster has a chat_chunk_ids field yet
        if size_stats:
            stats = size_stats[0]
            print(f"Total chat chunks assigned to clusters: {stats['total_chat_chunks']}")
            print(f"Average chat chunks per cluster: {stats['avg_chat_chunks_per_cluster']:.2f}")
            print(f"Maximum chat chunks in a cluster: {stats['max_chat_chunks_per_cluster']}")

    except Exception as error:
        print(f"❌ Error getting chat-chunks summary: {error!s}")

# Main execution
# Runs: connectivity probe -> domain update (+ verification) -> statistics -> summary.
if __name__ == "__main__":
    try:
        print("💬 Starting domains update to 'Chat Support' for chat-chunks clusters...")
        print("This will only update clusters with data: 'chat-chunks'")
        print("=" * 60)
        
        # Test database connection
        # This query is the first round trip to the server; a connection
        # problem surfaces here and is handled by the except below.
        test_doc = clusters_collection.find_one({"data": "chat-chunks"})
        if test_doc:
            print("✓ Database connection successful")
            current_domains = test_doc.get('domains', 'N/A')
            data_type = test_doc.get('data', 'N/A')
            print(f"Sample chat-chunks cluster - data: {data_type}, domains: {current_domains}\n")
        else:
            print("⚠ No chat-chunks clusters found (data: 'chat-chunks') in clusters collection")
            print("Please ensure you have clusters with data: 'chat-chunks' before running this script")
            # NOTE(review): exit() is expected to raise SystemExit, which the
            # `except Exception` below does not catch; the finally clause
            # still runs and closes the client.
            exit(1)
        
        # Execute the domain update for chat-chunks clusters
        update_domains_to_chat_support()
        
        # Get detailed statistics
        get_domain_statistics()
        
        # Show chat-chunks cluster summary
        show_chat_chunks_cluster_summary()
        
        # NOTE(review): the functions above print their own errors instead of
        # raising, so this success banner is shown even if one of them failed.
        print("\n" + "=" * 60)
        print("✅ Chat-chunks cluster domain update process completed successfully!")
        print("All chat-chunks clusters now have domains: ['Chat Support']")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close database connection
        # At module level locals() is the module namespace, so this checks
        # whether the module-level `client` was ever created.
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
💬 Starting domains update to 'Chat Support' for chat-chunks clusters...
This will only update clusters with data: 'chat-chunks'
✓ Database connection successful
Sample chat-chunks cluster - data: chat-chunks, domains: ['banking']

Starting domain update process for chat-chunks clusters...
Total chat-chunks clusters in collection: 10
Chat-chunks clusters with 'EU bank' domain: 0
Chat-chunks clusters with 'Chat Support' domain: 0
Chat-chunks clusters with other domains: 10

UPDATING CHAT-CHUNKS CLUSTER DOMAINS...
✓ Successfully updated 10 chat-chunks clusters
  Matched chat-chunks clusters: 10

CHAT-CHUNKS CLUSTER VERIFICATION
Total chat-chunks clusters: 10
Chat-chunks clusters with 'Chat Support' domain: 10
Chat-chunks clusters with 'EU bank' domain: 0
Chat-chunks clusters with other domains: 0

✅ SUCCESS: All chat-chunks clusters now have 'Chat Support' domain!

Sample of updated chat-chunks clusters:
  Chat-chunks Cluster 0: data=chat-chunks

In [16]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional, Set
import os
from dotenv import load_dotenv
from collections import defaultdict
import threading
from concurrent.futures import ThreadPoolExecutor
import time

# Load environment variables
load_dotenv()

class OptimizedChatChunkClusterMatcher:
    def __init__(self, connection_string: str, database_name: str):
        """
        Connect to MongoDB and build the keyphrase lookup caches.

        Args:
            connection_string: MongoDB connection string passed to MongoClient.
            database_name: Name of the database holding the 'chat-chunks'
                and 'cluster' collections.
        """
        self.client = MongoClient(connection_string)
        database = self.client[database_name]
        self.db = database
        self.chat_chunks_collection = database['chat-chunks']
        self.clusters_collection = database['cluster']

        # Placeholders only: _load_cluster_cache() replaces both with dicts,
        # which is what makes the later per-chunk lookups cheap.
        self._cluster_cache = None
        self._subcluster_cache = None

        self._load_cluster_cache()
    
    def _load_cluster_cache(self):
        """
        Load all cluster data into memory for fast lookups
        Only load clusters where data equals "chat-chunks"
        """
        print("Loading chat-chunks cluster data into cache...")
        start_time = time.time()
        
        # Dictionary mapping keyphrase -> cluster info
        self._cluster_cache = {}
        # Dictionary mapping keyphrase -> subcluster info
        self._subcluster_cache = {}
        
        # Only get clusters where data = "chat-chunks"
        clusters = list(self.clusters_collection.find({"data": "chat-chunks"}))
        print(f"Found {len(clusters)} chat-chunks clusters to cache")
        
        for cluster in clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            keyphrases = cluster.get('keyphrases', [])
            subclusters = cluster.get('subclusters', {})
            
            # Cache cluster keyphrases
            for keyphrase in keyphrases:
                self._cluster_cache[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'subclusters': subclusters
                }
            
            # Cache subcluster keyphrases
            for subcluster_id, subcluster_data in subclusters.items():
                if not isinstance(subcluster_data, dict):
                    continue
                    
                subcluster_keyphrases = subcluster_data.get('keyphrases', [])
                for keyphrase in subcluster_keyphrases:
                    self._subcluster_cache[keyphrase] = {
                        'cluster_id': cluster_id,
                        'dominant_label': dominant_label,
                        'subcluster_id': int(subcluster_id),
                        'subcluster_label': subcluster_data.get('label')
                    }
        
        cache_time = time.time() - start_time
        print(f"Cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._cluster_cache)} cluster keyphrases")
        print(f"Cached {len(self._subcluster_cache)} subcluster keyphrases")
    
    def find_matching_cluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast cluster lookup using cached data
        """
        return self._cluster_cache.get(dominant_topic)
    
    def find_matching_subcluster_fast(self, dominant_topic: str) -> Optional[Dict]:
        """
        Fast subcluster lookup using cached data
        """
        return self._subcluster_cache.get(dominant_topic)
    
    def find_unmatched_chat_chunks(self, limit: int = None) -> List[Dict]:
        """
        Find chat-chunks that don't match any cluster or subcluster
        """
        unmatched = []
        
        # Get all chat-chunks with dominant_topic
        query = {"dominant_topic": {"$exists": True, "$ne": None}}
        cursor = self.chat_chunks_collection.find(query, {"dominant_topic": 1})
        
        if limit:
            cursor = cursor.limit(limit)
        
        for chat_chunk in cursor:
            dominant_topic = chat_chunk.get('dominant_topic')
            if not dominant_topic:
                continue
                
            # Check if it matches any cluster or subcluster
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            if not cluster_match and not subcluster_match:
                unmatched.append({
                    'chat_chunk_id': str(chat_chunk['_id']),
                    'dominant_topic': dominant_topic
                })
        
        return unmatched
    
    def get_unique_dominant_topics(self) -> Dict:
        """
        Get all unique dominant_topic values and their counts from chat-chunks
        """
        pipeline = [
            {"$match": {"dominant_topic": {"$exists": True, "$ne": None}}},
            {"$group": {"_id": "$dominant_topic", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]
        
        result = list(self.chat_chunks_collection.aggregate(pipeline))
        
        topics_info = {
            'total_unique_topics': len(result),
            'topics': result
        }
        
        return topics_info
    
    def analyze_matching_gaps(self) -> Dict:
        """
        Analyze what dominant_topics exist but don't match any chat-chunks clusters
        """
        print("Analyzing matching gaps...")
        
        # Get all unique dominant topics
        topics_info = self.get_unique_dominant_topics()
        print(f"Found {topics_info['total_unique_topics']} unique dominant topics in chat-chunks")
        
        # Check which ones don't match
        unmatched_topics = {}
        matched_topics = {}
        
        for topic_data in topics_info['topics']:
            topic = topic_data['_id']
            count = topic_data['count']
            
            cluster_match = self.find_matching_cluster_fast(topic)
            subcluster_match = self.find_matching_subcluster_fast(topic)
            
            if cluster_match or subcluster_match:
                matched_topics[topic] = {
                    'count': count,
                    'cluster_match': bool(cluster_match),
                    'subcluster_match': bool(subcluster_match)
                }
            else:
                unmatched_topics[topic] = count
        
        return {
            'total_topics': topics_info['total_unique_topics'],
            'matched_topics': len(matched_topics),
            'unmatched_topics': len(unmatched_topics),
            'unmatched_details': unmatched_topics,
            'matched_details': matched_topics,
            'unmatched_chat_chunk_count': sum(unmatched_topics.values()),
            'matched_chat_chunk_count': sum([data['count'] for data in matched_topics.values()])
        }
    
    def create_fallback_cluster_entry(self, unmatched_topics: List[str]) -> Dict:
        """
        Create a fallback cluster entry for unmatched topics (for chat-chunks)
        """
        fallback_cluster = {
            'cluster_id': 999,  # Use a high number to avoid conflicts
            'dominant_label': 'Unclassified Chat-Chunk Topics',
            'keyphrases': unmatched_topics,
            'data': 'chat-chunks',  # Specify that this is for chat-chunks
            'subclusters': {
                '0': {
                    'label': 'Miscellaneous Chat-Chunks',
                    'keyphrases': unmatched_topics
                }
            }
        }
        return fallback_cluster
    
    def add_fallback_cluster_to_cache(self, unmatched_topics: List[str]) -> None:
        """
        Add unmatched topics to cache as a fallback cluster
        """
        print(f"Adding {len(unmatched_topics)} unmatched topics to fallback cluster...")
        
        for topic in unmatched_topics:
            # Add to cluster cache
            self._cluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Chat-Chunk Topics',
                'subclusters': {'0': {'label': 'Miscellaneous Chat-Chunks', 'keyphrases': unmatched_topics}}
            }
            
            # Add to subcluster cache
            self._subcluster_cache[topic] = {
                'cluster_id': 999,
                'dominant_label': 'Unclassified Chat-Chunk Topics',
                'subcluster_id': 0,
                'subcluster_label': 'Miscellaneous Chat-Chunks'
            }
        
        print(f"✓ Added fallback cluster. Cache now has:")
        print(f"  - Cluster keyphrases: {len(self._cluster_cache)}")
        print(f"  - Subcluster keyphrases: {len(self._subcluster_cache)}")
    
    def process_chat_chunks_batch(self, chat_chunks: List[Dict]) -> List:
        """
        Process a batch of chat-chunks and return bulk operations in correct PyMongo format
        """
        bulk_operations = []
        
        for chat_chunk in chat_chunks:
            dominant_topic = chat_chunk.get('dominant_topic')
            if not dominant_topic:
                continue
            
            # Fast cluster lookup
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            update_data = {}
            
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                # Use PyMongo's UpdateOne class instead of dict
                bulk_operations.append(
                    UpdateOne(
                        {'_id': chat_chunk['_id']}, 
                        {'$set': update_data}
                    )
                )
        
        return bulk_operations
    
    def process_chat_chunks_optimized(self, batch_size: int = 5000, max_workers: int = 4, dry_run: bool = False) -> Dict:
        """
        Optimized chat-chunks processing with larger batches and optional threading
        """
        start_time = time.time()
        
        # Get total count more efficiently
        total_chat_chunks = self.chat_chunks_collection.estimated_document_count()
        processed = 0
        matched_clusters = 0
        matched_subclusters = 0
        total_updates = 0
        
        print(f"Processing ~{total_chat_chunks} chat-chunks in batches of {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")
        
        # Create index on dominant_topic if it doesn't exist (for faster queries)
        try:
            self.chat_chunks_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic created/verified")
        except Exception as e:
            print(f"Index creation note: {e}")
        
        # Process chat-chunks in larger batches
        cursor = self.chat_chunks_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},  # Only get chat-chunks with dominant_topic
            projection={'dominant_topic': 1}  # Only fetch the field we need
        ).batch_size(batch_size)
        
        batch = []
        batch_count = 0
        
        for chat_chunk in cursor:
            batch.append(chat_chunk)
            
            if len(batch) >= batch_size:
                batch_count += 1
                print(f"\n--- Processing batch {batch_count} ({len(batch)} chat-chunks) ---")
                
                # Process batch
                bulk_operations = self.process_chat_chunks_batch(batch)
                print(f"Generated {len(bulk_operations)} update operations")
                
                # Count matches for statistics
                batch_cluster_matches = 0
                batch_subcluster_matches = 0
                for chat_chunk in batch:
                    dominant_topic = chat_chunk.get('dominant_topic')
                    if dominant_topic:
                        if self.find_matching_cluster_fast(dominant_topic):
                            matched_clusters += 1
                            batch_cluster_matches += 1
                        if self.find_matching_subcluster_fast(dominant_topic):
                            matched_subclusters += 1
                            batch_subcluster_matches += 1
                
                print(f"Batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
                
                # Execute bulk update (or skip if dry run)
                if bulk_operations and not dry_run:
                    try:
                        print("Executing bulk write...")
                        result = self.chat_chunks_collection.bulk_write(
                            bulk_operations, 
                            ordered=False  # Faster unordered operations
                        )
                        total_updates += result.modified_count
                        print(f"✓ Updated {result.modified_count} documents in batch {batch_count}")
                        
                        # Verify some updates
                        if result.modified_count > 0:
                            sample_updated = list(self.chat_chunks_collection.find(
                                {"kmeans_cluster_id": {"$exists": True}},
                                {"dominant_topic": 1, "kmeans_cluster_id": 1, "subcluster_id": 1}
                            ).limit(3))
                            print(f"Sample updated documents: {len(sample_updated)} found with cluster IDs")
                        
                    except Exception as e:
                        print(f"❌ Bulk write error in batch {batch_count}: {e}")
                        print(f"Error type: {type(e).__name__}")
                        # Show sample operation for debugging in readable format
                        if bulk_operations:
                            sample_op = bulk_operations[0]
                            print(f"Sample operation: Update {sample_op._filter} with {sample_op._doc}")
                elif bulk_operations and dry_run:
                    print(f"DRY RUN: Would update {len(bulk_operations)} documents")
                    # Show sample operations in readable format
                    for i, op in enumerate(bulk_operations[:3]):
                        print(f"Sample operation {i+1}: Update {op._filter} with {op._doc}")
                else:
                    print("No operations to execute (no matches found)")
                
                processed += len(batch)
                batch = []
                
                # Progress update
                elapsed = time.time() - start_time
                rate = processed / elapsed if elapsed > 0 else 0
                print(f"Progress: {processed} chat-chunks processed ({rate:.1f} chat-chunks/sec)")
        
        # Process remaining chat-chunks in the last batch
        if batch:
            batch_count += 1
            print(f"\n--- Processing final batch {batch_count} ({len(batch)} chat-chunks) ---")
            
            bulk_operations = self.process_chat_chunks_batch(batch)
            print(f"Generated {len(bulk_operations)} update operations")
            
            # Count matches for final batch
            batch_cluster_matches = 0
            batch_subcluster_matches = 0
            for chat_chunk in batch:
                dominant_topic = chat_chunk.get('dominant_topic')
                if dominant_topic:
                    if self.find_matching_cluster_fast(dominant_topic):
                        matched_clusters += 1
                        batch_cluster_matches += 1
                    if self.find_matching_subcluster_fast(dominant_topic):
                        matched_subclusters += 1
                        batch_subcluster_matches += 1
            
            print(f"Final batch matches - Clusters: {batch_cluster_matches}, Subclusters: {batch_subcluster_matches}")
            
            if bulk_operations and not dry_run:
                try:
                    print("Executing final bulk write...")
                    result = self.chat_chunks_collection.bulk_write(
                        bulk_operations, 
                        ordered=False
                    )
                    total_updates += result.modified_count
                    print(f"✓ Updated {result.modified_count} documents in final batch")
                except Exception as e:
                    print(f"❌ Bulk write error in final batch: {e}")
                    print(f"Error type: {type(e).__name__}")
            elif bulk_operations and dry_run:
                print(f"DRY RUN: Would update {len(bulk_operations)} documents")
            
            processed += len(batch)
        
        total_time = time.time() - start_time
        
        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            updated_count = self.chat_chunks_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
            print(f"Total chat-chunk documents with kmeans_cluster_id: {updated_count}")
            
            subcluster_count = self.chat_chunks_collection.count_documents({"subcluster_id": {"$exists": True}})
            print(f"Total chat-chunk documents with subcluster_id: {subcluster_count}")
        
        stats = {
            'total_chat_chunks': processed,
            'matched_clusters': matched_clusters,
            'matched_subclusters': matched_subclusters,
            'total_updates': total_updates,
            'processing_time': total_time,
            'chat_chunks_per_second': processed / total_time if total_time > 0 else 0,
            'cluster_match_rate': (matched_clusters / processed * 100) if processed > 0 else 0,
            'subcluster_match_rate': (matched_subclusters / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }
        
        return stats
    
    def process_with_fallback(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """
        Process chat-chunks with automatic fallback cluster for unmatched topics
        """
        print("=== PROCESSING CHAT-CHUNKS WITH FALLBACK CLUSTER ===")
        
        # First, analyze gaps
        gaps = self.analyze_matching_gaps()
        
        if gaps['unmatched_chat_chunk_count'] > 0:
            print(f"Found {gaps['unmatched_chat_chunk_count']} unmatched chat-chunks")
            print(f"Unmatched topics: {list(gaps['unmatched_details'].keys())}")
            
            # Add fallback cluster to cache
            unmatched_topic_list = list(gaps['unmatched_details'].keys())
            self.add_fallback_cluster_to_cache(unmatched_topic_list)
            
            # Optionally save fallback cluster to database
            save_choice = input("Save fallback cluster to database permanently? (y/n): ")
            if save_choice.lower() == 'y':
                fallback_cluster = self.create_fallback_cluster_entry(unmatched_topic_list)
                try:
                    self.clusters_collection.insert_one(fallback_cluster)
                    print("✓ Fallback cluster saved to database")
                except Exception as e:
                    print(f"⚠️  Could not save fallback cluster: {e}")
        
        # Now process all chat-chunks (should be 100% match rate)
        return self.process_chat_chunks_optimized(batch_size=batch_size, dry_run=dry_run)
    
    def get_performance_stats(self) -> Dict:
        """
        Get database performance statistics
        """
        stats = {}
        
        # Collection sizes
        stats['total_chat_chunks'] = self.chat_chunks_collection.estimated_document_count()
        stats['chat_chunks_with_topic'] = self.chat_chunks_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })
        stats['total_chat_chunk_clusters'] = self.clusters_collection.count_documents({"data": "chat-chunks"})
        stats['total_all_clusters'] = self.clusters_collection.estimated_document_count()
        
        # Cache statistics
        stats['cached_cluster_keyphrases'] = len(self._cluster_cache) if self._cluster_cache else 0
        stats['cached_subcluster_keyphrases'] = len(self._subcluster_cache) if self._subcluster_cache else 0
        
        return stats
    
    def debug_matching_process(self, limit: int = 5) -> None:
        """
        Debug the matching process to see what's happening with chat-chunks
        """
        print("\n=== DEBUGGING CHAT-CHUNK MATCHING PROCESS ===")
        
        # Check if we have any cluster data
        if not self._cluster_cache and not self._subcluster_cache:
            print("❌ NO CHAT-CHUNK CLUSTER CACHE DATA! This is why updates are failing.")
            return
        
        print(f"✓ Chat-chunk cluster cache has {len(self._cluster_cache)} entries")
        print(f"✓ Chat-chunk subcluster cache has {len(self._subcluster_cache)} entries")
        
        # Sample some cluster keyphrases
        print(f"\nSample chat-chunk cluster keyphrases:")
        for i, keyphrase in enumerate(list(self._cluster_cache.keys())[:10]):
            cluster_info = self._cluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {cluster_info['cluster_id']}")
        
        # Sample some subcluster keyphrases  
        print(f"\nSample chat-chunk subcluster keyphrases:")
        for i, keyphrase in enumerate(list(self._subcluster_cache.keys())[:10]):
            subcluster_info = self._subcluster_cache[keyphrase]
            print(f"  {i+1}. '{keyphrase}' -> Cluster {subcluster_info['cluster_id']}, Subcluster {subcluster_info['subcluster_id']}")
        
        # Check some actual chat-chunks
        print(f"\n=== TESTING {limit} CHAT-CHUNKS ===")
        chat_chunks = list(self.chat_chunks_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        if not chat_chunks:
            print("❌ NO CHAT-CHUNKS with dominant_topic found!")
            return
        
        for i, chat_chunk in enumerate(chat_chunks, 1):
            dominant_topic = chat_chunk.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Chat-Chunk {i} ---")
            print(f"Chat-Chunk ID: {chat_chunk['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")
            
            # Test cluster matching
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            if cluster_match:
                print(f"✓ CLUSTER MATCH: ID={cluster_match['cluster_id']}, Label='{cluster_match['dominant_label']}'")
            else:
                print(f"❌ No cluster match for '{dominant_topic}'")
            
            # Test subcluster matching  
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            if subcluster_match:
                print(f"✓ SUBCLUSTER MATCH: Cluster={subcluster_match['cluster_id']}, Subcluster={subcluster_match['subcluster_id']}, Label='{subcluster_match['subcluster_label']}'")
            else:
                print(f"❌ No subcluster match for '{dominant_topic}'")
            
            # Show what the update operation would look like
            update_data = {}
            if cluster_match:
                update_data.update({
                    'kmeans_cluster_id': cluster_match['cluster_id'],
                    'dominant_label': cluster_match['dominant_label']
                })
            if subcluster_match:
                update_data.update({
                    'kmeans_cluster_id': subcluster_match['cluster_id'],
                    'dominant_label': subcluster_match['dominant_label'],
                    'subcluster_id': subcluster_match['subcluster_id'],
                    'subcluster_label': subcluster_match['subcluster_label']
                })
            
            if update_data:
                print(f"UPDATE OPERATION: {update_data}")
            else:
                print("NO UPDATE OPERATION (no matches)")
        
        print(f"\n=== DATABASE STATE CHECK ===")
        # Check existing updates
        existing_with_cluster = self.chat_chunks_collection.count_documents({"kmeans_cluster_id": {"$exists": True}})
        existing_with_subcluster = self.chat_chunks_collection.count_documents({"subcluster_id": {"$exists": True}})
        chat_chunks_with_topic = self.chat_chunks_collection.count_documents({"dominant_topic": {"$exists": True, "$ne": None}})
        
        print(f"Chat-chunks with dominant_topic: {chat_chunks_with_topic}")
        print(f"Chat-chunks already with kmeans_cluster_id: {existing_with_cluster}")
        print(f"Chat-chunks already with subcluster_id: {existing_with_subcluster}")
        
        if chat_chunks_with_topic == 0:
            print("❌ PROBLEM: No chat-chunks have 'dominant_topic' field!")
        elif existing_with_cluster == chat_chunks_with_topic:
            print("✓ All chat-chunks already processed!")
        else:
            print(f"📝 {chat_chunks_with_topic - existing_with_cluster} chat-chunks need processing")
    
    def get_preview(self, limit: int = 10) -> List[Dict]:
        """
        Get a preview of chat-chunk-cluster matches for testing
        """
        chat_chunks = list(self.chat_chunks_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}}
        ).limit(limit))
        
        preview = []
        
        for chat_chunk in chat_chunks:
            dominant_topic = chat_chunk.get('dominant_topic')
            if not dominant_topic:
                continue
            
            cluster_match = self.find_matching_cluster_fast(dominant_topic)
            subcluster_match = self.find_matching_subcluster_fast(dominant_topic)
            
            preview.append({
                'chat_chunk_id': str(chat_chunk['_id']),
                'dominant_topic': dominant_topic,
                'cluster_match': cluster_match,
                'subcluster_match': subcluster_match
            })
        
        return preview
    
    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

# Usage example
def main():
    """
    Interactive entry point for matching chat-chunks to clusters.

    Reads MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME from the
    environment, prints database statistics, optionally runs the gap analysis
    and the debug report, shows a five-chunk preview and then asks which
    processing mode to run. The MongoDB connection is always closed on exit.

    Raises:
        ValueError: If either environment variable is missing or empty.
    """
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")

    print(f"Connecting to database: {DATABASE_NAME}")

    # Initialize optimized chat-chunk matcher
    matcher = OptimizedChatChunkClusterMatcher(CONNECTION_STRING, DATABASE_NAME)

    try:
        # Show performance stats
        print("\n--- Database Statistics ---")
        perf_stats = matcher.get_performance_stats()
        for key, value in perf_stats.items():
            print(f"{key}: {value:,}")

        # Analyze matching gaps
        print("\n--- Gap Analysis ---")
        gap_choice = input("Analyze which chat-chunks aren't matching? (y/n): ")
        if gap_choice.strip().lower() == 'y':
            gaps = matcher.analyze_matching_gaps()
            print(f"\n=== MATCHING GAP ANALYSIS ===")
            print(f"Total unique topics: {gaps['total_topics']}")
            print(f"Matched topics: {gaps['matched_topics']}")
            print(f"Unmatched topics: {gaps['unmatched_topics']}")
            print(f"Matched chat-chunks: {gaps['matched_chat_chunk_count']}")
            print(f"Unmatched chat-chunks: {gaps['unmatched_chat_chunk_count']}")

            if gaps['unmatched_details']:
                print(f"\n--- UNMATCHED DOMINANT TOPICS ---")
                for topic, count in list(gaps['unmatched_details'].items())[:10]:
                    print(f"'{topic}' - {count} chat-chunks")

                if len(gaps['unmatched_details']) > 10:
                    print(f"... and {len(gaps['unmatched_details']) - 10} more")

                print(f"\n💡 To get 100% matches, you need to:")
                print(f"1. Add these topics to your chat-chunk cluster keyphrases, OR")
                print(f"2. Create a 'catch-all' cluster for unmatched chat-chunk topics")

        # Debug the matching process first
        print("\n--- Debugging Mode ---")
        debug_choice = input("Run debug mode to see why DB isn't updating? (y/n): ")
        if debug_choice.strip().lower() == 'y':
            matcher.debug_matching_process()

        # Get a preview first
        print("\n--- Preview of Chat-Chunk Matches ---")
        preview = matcher.get_preview(limit=5)

        for i, item in enumerate(preview, 1):
            print(f"\n--- Chat-Chunk {i} ---")
            print(f"Dominant Topic: {item['dominant_topic']}")

            if item['subcluster_match']:
                print(f"✓ Subcluster Match: Cluster ID={item['subcluster_match']['cluster_id']}, "
                      f"Subcluster ID={item['subcluster_match']['subcluster_id']}, "
                      f"Label={item['subcluster_match']['subcluster_label']}")
            elif item['cluster_match']:
                print(f"✓ Cluster Match: ID={item['cluster_match']['cluster_id']}, "
                      f"Label={item['cluster_match']['dominant_label']}")
            else:
                print("✗ No match found")

        # Process all chat-chunks
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated without changing DB)")
        print("2. Full processing (actually update the database)")
        print("3. Process with fallback cluster (100% match guarantee)")
        choice = input("Choose option (1, 2, or 3): ").strip()

        stats = None
        if choice == '3':
            # Use fallback processing
            fallback_choice = input("Dry run with fallback first? (y/n): ")
            dry_run = fallback_choice.strip().lower() == 'y'

            print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE'} processing with fallback...")
            batch_size = _prompt_batch_size()
            stats = matcher.process_with_fallback(batch_size=batch_size, dry_run=dry_run)
        elif choice in ('1', '2'):
            # Regular processing
            dry_run = (choice == '1')

            print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'}...")

            # Use larger batch size for better performance
            batch_size = _prompt_batch_size()

            stats = matcher.process_chat_chunks_optimized(batch_size=batch_size, dry_run=dry_run)
        else:
            # Tell the user why nothing happened instead of exiting silently
            print(f"Unrecognized option '{choice}'; no processing was performed")

        if stats is not None:
            print("\n--- Final Results ---")
            print(f"Total chat-chunks processed: {stats['total_chat_chunks']:,}")
            print(f"Total updates made: {stats['total_updates']:,}")
            print(f"Cluster matches: {stats['matched_clusters']:,} ({stats['cluster_match_rate']:.1f}%)")
            print(f"Subcluster matches: {stats['matched_subclusters']:,} ({stats['subcluster_match_rate']:.1f}%)")
            print(f"Processing time: {stats['processing_time']:.2f} seconds")
            print(f"Processing rate: {stats['chat_chunks_per_second']:.1f} chat-chunks/second")

    finally:
        matcher.close_connection()


def _prompt_batch_size(default: int = 5000) -> int:
    """Ask for a batch size; fall back to `default` on blank, non-numeric or non-positive input."""
    raw = input("Enter batch size (recommended: 5000-10000): ").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        print(f"Invalid batch size '{raw}'; using default {default}")
        return default
    if value < 1:
        print(f"Batch size must be at least 1; using default {default}")
        return default
    return value

# Run the interactive workflow only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Loading chat-chunks cluster data into cache...
Found 10 chat-chunks clusters to cache
Cache loaded in 1.89 seconds
Cached 65 cluster keyphrases
Cached 65 subcluster keyphrases

--- Database Statistics ---
total_chat_chunks: 600
chat_chunks_with_topic: 600
total_chat_chunk_clusters: 10
total_all_clusters: 59
cached_cluster_keyphrases: 65
cached_subcluster_keyphrases: 65

--- Gap Analysis ---
Analyzing matching gaps...
Found 65 unique dominant topics in chat-chunks

=== MATCHING GAP ANALYSIS ===
Total unique topics: 65
Matched topics: 65
Unmatched topics: 0
Matched chat-chunks: 600
Unmatched chat-chunks: 0

--- Debugging Mode ---

--- Preview of Chat-Chunk Matches ---

--- Chat-Chunk 1 ---
Dominant Topic: Account Access Problem
✓ Subcluster Match: Cluster ID=9, Subcluster ID=1, Label=Client & External Queries

--- Chat-Chunk 2 ---
Dominant Topic: Account Access Problem
✓ Subcluster Match: Cluster ID=9, Subcluster ID=1, Label=Client & External Queries

---

In [17]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables (python-dotenv; presumably from a local .env file)
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Fail fast: both settings are required before any connection is attempted
if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

# Only the database name is echoed; the connection string is never printed
print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("=" * 60)

# Connect to MongoDB and bind the module-level handle that the functions
# below use for every query on the 'chat-chunks' collection
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
chat_chunks_collection = db['chat-chunks']

def rename_chat_chunk_fields():
    """
    Rename two fields on every document of the chat-chunks collection.

    Renames performed:
    - is_urgent -> urgency
    - dominant_label -> dominant_cluster_label

    Before renaming, the function prints how many documents carry the old
    and the new field names; afterwards it prints the update result and
    calls verify_rename_changes(). Any exception is reported on stdout
    rather than propagated.
    """
    for banner_line in (
        "💬 Starting chat-chunk field rename process...",
        "Fields to rename:",
        "  - is_urgent → urgency",
        "  - dominant_label → dominant_cluster_label",
        "-" * 50,
    ):
        print(banner_line)

    rule = "=" * 50

    try:
        # Snapshot of the collection before anything is changed
        chunk_total = chat_chunks_collection.count_documents({})
        print(f"Total chat-chunks in collection: {chunk_total}")

        if chunk_total == 0:
            print("⚠ No chat-chunks found in collection")
            return

        def having(field_name):
            # Number of documents on which the given field is present
            return chat_chunks_collection.count_documents({field_name: {"$exists": True}})

        # Old field names first ...
        legacy_counts = [(name, having(name)) for name in ("is_urgent", "dominant_label")]
        for name, found in legacy_counts:
            print(f"Chat-chunks with '{name}' field: {found}")

        # ... then the names they are about to be renamed to
        target_counts = [(name, having(name)) for name in ("urgency", "dominant_cluster_label")]
        for name, found in target_counts:
            print(f"Chat-chunks already with '{name}' field: {found}")

        print("\n" + rule)
        print("RENAMING CHAT-CHUNK FIELDS...")
        print(rule)

        # One update_many with an empty filter renames both fields everywhere
        rename_spec = {
            "$rename": {
                "is_urgent": "urgency",
                "dominant_label": "dominant_cluster_label",
            }
        }
        outcome = chat_chunks_collection.update_many({}, rename_spec)

        # Report what the server did
        print("✓ Field rename operation completed:")
        print(f"  Matched chat-chunks: {outcome.matched_count}")
        print(f"  Modified chat-chunks: {outcome.modified_count}")
        print(f"  Operation acknowledged: {outcome.acknowledged}")

        # Re-read the collection to confirm the rename
        verify_rename_changes()

    except Exception as error:
        print(f"❌ Error during chat-chunk field rename: {error!s}")

def verify_rename_changes():
    """
    Report whether the chat-chunk field rename took effect.

    Prints which of the old and new field names appear on one sample
    document, then the collection-wide counts for all four names, and
    finally a success line when no document still has an old field name.
    Any exception is reported on stdout rather than propagated.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("VERIFICATION OF CHAT-CHUNK FIELD RENAME")
    print(rule)

    try:
        # Inspect a single document first
        sample = chat_chunks_collection.find_one()
        if not sample:
            print("⚠ No chat-chunk documents found in the collection")
        else:
            print("Sample chat-chunk document after rename:")
            print(f"  Chat-chunk ID: {sample.get('_id')}")

            presence_checks = (
                ("Has", 'urgency'),
                ("Has", 'dominant_cluster_label'),
                ("Has old", 'is_urgent'),
                ("Has old", 'dominant_label'),
            )
            for prefix, field_name in presence_checks:
                print(f"  {prefix} '{field_name}' field: {field_name in sample}")

            # Show sample values if they exist
            for field_name in ('urgency', 'dominant_cluster_label'):
                if field_name in sample:
                    print(f"  Sample '{field_name}' value: {sample[field_name]}")

        def having(field_name):
            # Number of documents on which the given field is present
            return chat_chunks_collection.count_documents({field_name: {"$exists": True}})

        # New names first, then the old ones (expected to be 0 after the rename)
        renamed_urgency = having("urgency")
        renamed_label = having("dominant_cluster_label")
        leftover_urgent = having("is_urgent")
        leftover_label = having("dominant_label")

        print("\nField counts after rename:")
        print(f"  Chat-chunks with 'urgency' field: {renamed_urgency}")
        print(f"  Chat-chunks with 'dominant_cluster_label' field: {renamed_label}")
        print(f"  Chat-chunks with old 'is_urgent' field: {leftover_urgent}")
        print(f"  Chat-chunks with old 'dominant_label' field: {leftover_label}")

        # Success means neither old field name is left anywhere
        if leftover_urgent != 0 or leftover_label != 0:
            print("\n⚠ WARNING: Some chat-chunks still have old field names")
        else:
            print("\n✅ SUCCESS: All chat-chunk fields have been renamed successfully!")

    except Exception as error:
        print(f"❌ Error during verification: {error!s}")

def get_chat_chunk_field_statistics():
    """
    Print value distributions for the renamed chat-chunk fields.

    Shows the total document count, every distinct 'urgency' value with its
    count and share, and the ten most frequent 'dominant_cluster_label'
    values. Any exception is reported on stdout rather than propagated.
    """
    rule = "=" * 50
    print("\n" + rule)
    print("CHAT-CHUNK FIELD STATISTICS")
    print(rule)

    try:
        chunk_total = chat_chunks_collection.count_documents({})

        def share_of_total(part):
            # Percentage of all chat-chunks; 0 when the collection is empty
            return (part / chunk_total) * 100 if chunk_total > 0 else 0

        # Group by urgency value, most frequent first
        urgency_pipeline = [
            {"$match": {"urgency": {"$exists": True}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
        ]
        urgency_rows = list(chat_chunks_collection.aggregate(urgency_pipeline))

        # Group by cluster label, keep only the top 10
        label_pipeline = [
            {"$match": {"dominant_cluster_label": {"$exists": True}}},
            {"$group": {"_id": "$dominant_cluster_label", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 10},
        ]
        label_rows = list(chat_chunks_collection.aggregate(label_pipeline))

        print(f"Total chat-chunks: {chunk_total}")

        if not urgency_rows:
            print("\nNo chat-chunks found with 'urgency' field")
        else:
            print("\nUrgency field distribution:")
            for row in urgency_rows:
                value, occurrences = row['_id'], row['count']
                print(f"  '{value}': {occurrences} chat-chunks ({share_of_total(occurrences):.1f}%)")

        if not label_rows:
            print("\nNo chat-chunks found with 'dominant_cluster_label' field")
        else:
            print("\nTop 10 dominant cluster labels:")
            rank = 0
            for row in label_rows:
                rank += 1
                # A missing label comes back as None; show it as the text 'null'
                shown_label = 'null' if row['_id'] is None else row['_id']
                occurrences = row['count']
                print(f"  {rank}. '{shown_label}': {occurrences} chat-chunks ({share_of_total(occurrences):.1f}%)")

    except Exception as error:
        print(f"❌ Error getting chat-chunk statistics: {error!s}")

def show_sample_chat_chunks():
    """
    Print up to three chat-chunk documents that carry both renamed fields.

    Only documents in which both ``urgency`` and ``dominant_cluster_label``
    exist are queried. For each one the id, the two renamed fields, the first
    non-empty text field (content, message, text, chunk_text; shortened to
    100 characters plus an ellipsis when longer) and, when present,
    ``priority`` are printed. Any exception is reported on stdout instead of
    being raised.
    """
    divider = "=" * 50
    print("\n" + divider)
    print("SAMPLE CHAT-CHUNK DOCUMENTS")
    print(divider)

    # Query: both renamed fields must be present on the document
    both_fields_present = {
        "urgency": {"$exists": True},
        "dominant_cluster_label": {"$exists": True},
    }
    # Projection: the renamed fields plus every text-like field that may exist
    wanted_fields = {
        "_id": 1,
        "urgency": 1,
        "dominant_cluster_label": 1,
        "content": 1,
        "message": 1,
        "text": 1,
        "chunk_text": 1,
        "priority": 1,
    }
    # Checked in this order; only the first non-empty one is displayed
    text_field_candidates = ('content', 'message', 'text', 'chunk_text')

    try:
        matching_docs = chat_chunks_collection.find(both_fields_present, wanted_fields)
        sample_chat_chunks = list(matching_docs.limit(3))

        if not sample_chat_chunks:
            print("No chat-chunks found with both renamed fields")
            return

        print("Sample chat-chunks with renamed fields:")
        for position, doc in enumerate(sample_chat_chunks, 1):
            print(f"\nChat-chunk {position}:")
            print(f"  ID: {doc.get('_id')}")
            print(f"  Urgency: {doc.get('urgency', 'N/A')}")
            print(f"  Dominant Cluster Label: {doc.get('dominant_cluster_label', 'N/A')}")

            for name in text_field_candidates:
                if name not in doc or not doc[name]:
                    continue
                raw_value = doc[name]
                as_text = str(raw_value)
                # Long values are cut to a 100-character preview
                shown = as_text[:100] + "..." if len(as_text) > 100 else raw_value
                print(f"  {name.title()}: {shown}")
                break

            if 'priority' in doc:
                print(f"  Priority: {doc.get('priority', 'N/A')}")

    except Exception as e:
        print(f"❌ Error showing sample chat-chunks: {str(e)}")

def check_collection_structure():
    """
    Print the fields of one sample chat-chunk and how often each field occurs.

    First lists the field names and Python value types of a single document
    returned by ``find_one()``. Then runs an aggregation that expands every
    document into its key/value pairs, counts how many documents contain each
    key, and prints the 15 most frequent keys with their share of the
    collection. Any exception is reported on stdout instead of being raised.
    """
    print("\n" + "=" * 50)
    print("CHAT-CHUNK COLLECTION STRUCTURE ANALYSIS")
    print("=" * 50)
    
    try:
        # Get a sample document to see its structure
        sample_doc = chat_chunks_collection.find_one()
        if sample_doc:
            print("Available fields in sample chat-chunk document:")
            for field in sorted(sample_doc.keys()):
                field_type = type(sample_doc[field]).__name__
                print(f"  - {field}: {field_type}")
        
        # Get field frequency analysis
        print(f"\nField frequency analysis (top 15 fields):")
        pipeline = [
            # Turn each document into an array of {k, v} pairs, one per field
            {"$project": {"fields": {"$objectToArray": "$$ROOT"}}},
            {"$unwind": "$fields"},
            # Count the documents in which each field name appears
            {"$group": {"_id": "$fields.k", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}},
            {"$limit": 15}
        ]
        
        field_stats = list(chat_chunks_collection.aggregate(pipeline))

        # The collection size does not depend on the field being printed, so
        # fetch it once instead of issuing one count query per row; this also
        # keeps the denominator identical for every percentage.
        total_docs = chat_chunks_collection.count_documents({})

        for stat in field_stats:
            field_name = stat['_id']
            count = stat['count']
            percentage = (count / total_docs) * 100 if total_docs > 0 else 0
            print(f"  {field_name}: {count}/{total_docs} documents ({percentage:.1f}%)")
            
    except Exception as e:
        print(f"❌ Error analyzing collection structure: {str(e)}")

# Main execution: sanity-check the connection, inspect the collection, run the
# field rename, then print statistics and samples. The client is always closed
# at the end, whether or not a step failed.
if __name__ == "__main__":
    try:
        # Test database connection
        test_chat_chunk = chat_chunks_collection.find_one()
        if test_chat_chunk:
            print("✓ Database connection successful")
            print(f"Sample chat-chunk fields: {list(test_chat_chunk.keys())}\n")
        else:
            print("⚠ No chat-chunks found in chat-chunks collection")
            print("Please ensure you have chat-chunk documents before running this script")
            # Raise SystemExit directly: the bare `exit` name is a helper the
            # `site` module adds for interactive sessions and is not meant to
            # be used from program code. SystemExit is not an Exception
            # subclass, so it bypasses the handler below but still runs
            # the `finally` clean-up.
            raise SystemExit(1)
        
        # Check collection structure first
        check_collection_structure()
        
        # Execute the field rename for chat-chunks
        rename_chat_chunk_fields()
        
        # Get detailed statistics
        get_chat_chunk_field_statistics()
        
        # Show sample chat-chunks
        show_sample_chat_chunks()
        
        print("\n" + "=" * 60)
        print("✅ Chat-chunk field rename process completed successfully!")
        print("Fields renamed:")
        print("  - is_urgent → urgency")
        print("  - dominant_label → dominant_cluster_label")
        print("=" * 60)
        
    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Close the connection; `client` is a module-level name, so look it
        # up in globals() explicitly.
        if 'client' in globals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
✓ Database connection successful
Sample chat-chunk fields: ['_id', 'chat_id', 'chat_members', 'dominant_topic', 'subtopics', 'raw_segments', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain', 'dominant_label']


CHAT-CHUNK COLLECTION STRUCTURE ANALYSIS
Available fields in sample chat-chunk document:
  - _id: ObjectId
  - chat_id: str
  - chat_members: list
  - domain: str
  - dominant_cluster_label: NoneType
  - dominant_label: str
  - dominant_topic: str
  - embeddings: list
  - kmeans_cluster_id: int
  - kmeans_cluster_keyphrase: str
  - raw_segments: list
  - subcluster_id: int
  - subcluster_label: str
  - subtopics: str

Field frequency analysis (top 15 fields):
  subtopics: 600/600 documents (100.0%)
  embeddings: 600/600 documents (100.0%)
  _id: 600/600 documents (100.0%)
  kmeans_cluster_id: 600/600 documents (100.0%)
  dominant_cluster_label: 600/60

In [18]:
from pymongo import MongoClient, UpdateOne
from typing import Dict, List, Optional
import os
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

class ChatChunkClusterKeyphraseUpdater:
    """Write the matching cluster keyphrase onto chat-chunk documents.

    On construction the keyphrases of every cluster document with
    ``data == "chat-chunks"`` are cached in memory. A chat-chunk matches when
    its ``dominant_topic`` is exactly equal to one of those keyphrases; the
    matched keyphrase is then stored in ``kmeans_cluster_keyphrase``.
    """

    def __init__(self, connection_string: str, database_name: str):
        """Connect to MongoDB and load the keyphrase cache.

        Args:
            connection_string: MongoDB connection string passed to MongoClient.
            database_name: Database holding the 'chat-chunks' and 'cluster'
                collections.

        Raises:
            Exception: whatever the initial cache load raises; the client is
                closed before the exception propagates.
        """
        self.client = MongoClient(connection_string)
        try:
            self.db = self.client[database_name]
            self.chat_chunks_collection = self.db['chat-chunks']
            self.clusters_collection = self.db['cluster']

            # Cache for keyphrase -> cluster mapping (ONLY chat-chunk cluster level)
            self._keyphrase_to_cluster = {}
            self._load_keyphrase_cache()
        except Exception:
            # The caller never receives the instance when construction fails,
            # so nobody else could close the client.
            self.client.close()
            raise

    def _load_keyphrase_cache(self):
        """Load only chat-chunk cluster-level keyphrases into memory for fast lookups.

        When the same keyphrase appears in more than one cluster, the cluster
        read last wins.
        """
        print("Loading chat-chunk cluster keyphrase cache...")
        start_time = time.time()

        # Only load clusters with data: "chat-chunks", and only the three
        # fields the cache is built from.
        chat_chunk_clusters = list(self.clusters_collection.find(
            {"data": "chat-chunks"},
            {"cluster_id": 1, "dominant_label": 1, "keyphrases": 1}
        ))
        print(f"Found {len(chat_chunk_clusters)} chat-chunk clusters to process")

        for cluster in chat_chunk_clusters:
            cluster_id = cluster.get('cluster_id')
            dominant_label = cluster.get('dominant_label')
            # `or []` also covers documents whose keyphrases field is null
            cluster_keyphrases = cluster.get('keyphrases') or []

            # Cache ONLY chat-chunk cluster-level keyphrases
            for keyphrase in cluster_keyphrases:
                self._keyphrase_to_cluster[keyphrase] = {
                    'cluster_id': cluster_id,
                    'dominant_label': dominant_label,
                    'matched_keyphrase': keyphrase
                }

        cache_time = time.time() - start_time
        print(f"Chat-chunk cluster cache loaded in {cache_time:.2f} seconds")
        print(f"Cached {len(self._keyphrase_to_cluster)} chat-chunk cluster keyphrases")

    def find_matching_keyphrase(self, dominant_topic: str) -> Optional[Dict]:
        """Find the matching keyphrase for a dominant topic (chat-chunk cluster level only).

        Args:
            dominant_topic: Topic value taken from a chat-chunk document.

        Returns:
            The cached dict with 'cluster_id', 'dominant_label' and
            'matched_keyphrase' on an exact match, otherwise None. Unhashable
            values (for example a list or sub-document stored in the field)
            can never be cache keys and also yield None.
        """
        try:
            return self._keyphrase_to_cluster.get(dominant_topic)
        except TypeError:
            return None

    def _match_batch(self, chat_chunks: List[Dict]) -> List[tuple]:
        """Return ``(_id, matched keyphrase)`` pairs for the chunks that match."""
        pairs = []
        for chat_chunk in chat_chunks:
            dominant_topic = chat_chunk.get('dominant_topic')
            if not dominant_topic:
                continue

            # Find matching keyphrase from chat-chunk clusters
            match_info = self.find_matching_keyphrase(dominant_topic)
            if match_info:
                pairs.append((chat_chunk['_id'], match_info['matched_keyphrase']))
        return pairs

    @staticmethod
    def _to_operations(matches: List[tuple]) -> List:
        """Turn ``(_id, keyphrase)`` pairs into UpdateOne operations."""
        return [
            UpdateOne(
                {'_id': chunk_id},
                {'$set': {'kmeans_cluster_keyphrase': keyphrase}}
            )
            for chunk_id, keyphrase in matches
        ]

    def process_chat_chunks_batch(self, chat_chunks: List[Dict]) -> List:
        """Process a batch of chat-chunks and return bulk operations.

        Args:
            chat_chunks: Documents containing at least '_id' and, optionally,
                'dominant_topic'.

        Returns:
            One UpdateOne per chunk whose dominant_topic matches a cached
            keyphrase; chunks without a topic or without a match are skipped.
        """
        return self._to_operations(self._match_batch(chat_chunks))

    def _apply_batch(self, batch: List[Dict], batch_number: int,
                     dry_run: bool, final: bool = False) -> tuple:
        """Match one batch and write it (or, in a dry run, only report it).

        Returns:
            ``(matched, modified)``: the number of chunks with a matching
            keyphrase and the number of documents actually modified.
        """
        if final:
            print(f"\n--- Processing final chat-chunk batch {batch_number} ({len(batch)} chat-chunks) ---")
        else:
            print(f"\n--- Processing chat-chunk batch {batch_number} ({len(batch)} chat-chunks) ---")

        matches = self._match_batch(batch)
        batch_matched = len(matches)

        if final:
            print(f"Generated {batch_matched} keyphrase updates for final chat-chunk batch")
        else:
            print(f"Generated {batch_matched} keyphrase updates for this chat-chunk batch")

        if not matches:
            return 0, 0

        if dry_run:
            print(f"DRY RUN: Would add keyphrase field to {batch_matched} chat-chunks")
            # Samples come from the matched pairs, so no private attribute of
            # the operation objects is read. They are shown for every batch,
            # including the final one, which is the only batch when the
            # collection is smaller than batch_size.
            for i, (_, keyphrase) in enumerate(matches[:3], 1):
                print(f"  Sample {i}: Would set keyphrase='{keyphrase}'")
            return batch_matched, 0

        try:
            result = self.chat_chunks_collection.bulk_write(
                self._to_operations(matches),
                ordered=False
            )
        except Exception as e:
            if final:
                print(f"❌ Bulk write error in final chat-chunk batch: {e}")
            else:
                print(f"❌ Bulk write error in chat-chunk batch {batch_number}: {e}")
            # NOTE(review): an unordered bulk write may have applied part of
            # the batch before failing. This assumes the exception carries a
            # `details` dict with an 'nModified' count (as PyMongo's
            # BulkWriteError is documented to do) -- confirm.
            details = getattr(e, 'details', None)
            modified = details.get('nModified', 0) if isinstance(details, dict) else 0
            if modified:
                print(f"  {modified} chat-chunks were modified before the error")
            return batch_matched, modified

        modified = result.modified_count
        if final:
            print(f"✓ Updated {modified} chat-chunks in final batch")
        else:
            print(f"✓ Updated {modified} chat-chunks with keyphrase field")
        return batch_matched, modified

    def add_keyphrase_field(self, batch_size: int = 5000, dry_run: bool = False) -> Dict:
        """Add kmeans_cluster_keyphrase field to all matching chat-chunks.

        Args:
            batch_size: Number of chat-chunks per bulk write; must be >= 1.
            dry_run: When True nothing is written; the method only reports
                what would be updated.

        Returns:
            Dict with 'total_chat_chunks_processed', 'chat_chunks_matched',
            'total_updates', 'processing_time', 'chat_chunks_per_second',
            'match_rate' (percent) and 'dry_run'.

        Raises:
            ValueError: if batch_size is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        start_time = time.time()
        has_topic = {"dominant_topic": {"$exists": True, "$ne": None}}

        # Get total count
        total_chat_chunks = self.chat_chunks_collection.count_documents(has_topic)

        processed = 0
        matched = 0
        total_updates = 0

        print(f"Processing {total_chat_chunks} chat-chunks with dominant_topic")
        print(f"Batch size: {batch_size}")
        print(f"DRY RUN MODE: {'ON' if dry_run else 'OFF'}")

        # Create index for faster queries; a failure here is reported but
        # does not stop the run.
        try:
            self.chat_chunks_collection.create_index([("dominant_topic", 1)], background=True)
            print("✓ Index on dominant_topic verified for chat-chunks collection")
        except Exception as e:
            print(f"Index note: {e}")

        # Process in batches; only _id and dominant_topic are fetched
        cursor = self.chat_chunks_collection.find(
            has_topic,
            projection={'dominant_topic': 1}
        ).batch_size(batch_size)

        batch = []
        batch_count = 0

        for chat_chunk in cursor:
            batch.append(chat_chunk)
            if len(batch) < batch_size:
                continue

            batch_count += 1
            batch_matched, batch_modified = self._apply_batch(batch, batch_count, dry_run)
            matched += batch_matched
            total_updates += batch_modified
            processed += len(batch)
            batch = []

            # Progress update
            elapsed = time.time() - start_time
            rate = processed / elapsed if elapsed > 0 else 0
            print(f"Progress: {processed}/{total_chat_chunks} ({rate:.1f} chat-chunks/sec)")

        # Process remaining chat-chunks in final batch
        if batch:
            batch_count += 1
            batch_matched, batch_modified = self._apply_batch(
                batch, batch_count, dry_run, final=True
            )
            matched += batch_matched
            total_updates += batch_modified
            processed += len(batch)

        total_time = time.time() - start_time

        # Final verification
        if not dry_run and total_updates > 0:
            print(f"\n--- Verification ---")
            keyphrase_count = self.chat_chunks_collection.count_documents({
                "kmeans_cluster_keyphrase": {"$exists": True}
            })
            print(f"Total chat-chunks with kmeans_cluster_keyphrase: {keyphrase_count}")

            # Show some sample results
            samples = list(self.chat_chunks_collection.find(
                {"kmeans_cluster_keyphrase": {"$exists": True}},
                {"dominant_topic": 1, "kmeans_cluster_keyphrase": 1}
            ).limit(5))

            print(f"\nSample chat-chunk results:")
            for i, sample in enumerate(samples, 1):
                print(f"  {i}. Topic: '{sample.get('dominant_topic')}' -> "
                      f"Keyphrase: '{sample.get('kmeans_cluster_keyphrase')}'")

        stats = {
            'total_chat_chunks_processed': processed,
            'chat_chunks_matched': matched,
            'total_updates': total_updates,
            'processing_time': total_time,
            'chat_chunks_per_second': processed / total_time if total_time > 0 else 0,
            'match_rate': (matched / processed * 100) if processed > 0 else 0,
            'dry_run': dry_run
        }

        return stats

    def debug_keyphrase_matching(self, limit: int = 10) -> None:
        """Debug the keyphrase matching process for chat-chunks.

        Prints up to ten cached keyphrases, then the match result for the
        first ``limit`` chat-chunks that have a dominant_topic.

        Args:
            limit: Maximum number of chat-chunks to test.
        """
        print("\n=== DEBUGGING CHAT-CHUNK CLUSTER KEYPHRASE MATCHING ===")

        # Check cache
        if not self._keyphrase_to_cluster:
            print("❌ NO CHAT-CHUNK CLUSTER KEYPHRASE CACHE DATA!")
            return

        print(f"✓ Chat-chunk cluster keyphrase cache: {len(self._keyphrase_to_cluster)} entries")

        # Show sample keyphrases
        print(f"\nSample chat-chunk cluster keyphrases:")
        for i, (keyphrase, info) in enumerate(list(self._keyphrase_to_cluster.items())[:10], 1):
            print(f"  {i}. '{keyphrase}' -> Chat-Chunk Cluster {info['cluster_id']} ({info['dominant_label']})")

        # Test with actual chat-chunks; only _id and dominant_topic are read
        # below, so nothing else is fetched.
        print(f"\n=== TESTING {limit} CHAT-CHUNKS ===")
        chat_chunks = list(self.chat_chunks_collection.find(
            {"dominant_topic": {"$exists": True, "$ne": None}},
            {"dominant_topic": 1}
        ).limit(limit))

        if not chat_chunks:
            print("❌ NO CHAT-CHUNKS with dominant_topic found!")
            return

        for i, chat_chunk in enumerate(chat_chunks, 1):
            dominant_topic = chat_chunk.get('dominant_topic', 'NO_TOPIC')
            print(f"\n--- Chat-Chunk {i} ---")
            print(f"Chat-Chunk ID: {chat_chunk['_id']}")
            print(f"Dominant Topic: '{dominant_topic}'")

            # Test keyphrase matching
            match_info = self.find_matching_keyphrase(dominant_topic)
            if match_info:
                print(f"✓ CHAT-CHUNK CLUSTER KEYPHRASE MATCH: '{match_info['matched_keyphrase']}'")
                print(f"  Cluster ID: {match_info['cluster_id']}")
                print(f"  Cluster Label: {match_info['dominant_label']}")
            else:
                print(f"❌ No chat-chunk cluster keyphrase match for '{dominant_topic}'")

    def get_keyphrase_stats(self) -> Dict:
        """Get statistics about keyphrase matching for chat-chunks.

        Returns:
            Dict with the number of chat-chunks that have a dominant_topic,
            the number that already have kmeans_cluster_keyphrase, the number
            of distinct non-null topics, how many of those match a cached
            keyphrase (count and percent), the cache size, and the number of
            chat-chunk clusters.
        """
        # Count chat-chunks with dominant_topic
        chat_chunks_with_topic = self.chat_chunks_collection.count_documents({
            "dominant_topic": {"$exists": True, "$ne": None}
        })

        # Count chat-chunks already with keyphrase field
        chat_chunks_with_keyphrase = self.chat_chunks_collection.count_documents({
            "kmeans_cluster_keyphrase": {"$exists": True}
        })

        # Get unique dominant topics and check match rates
        unique_topics = [
            topic
            for topic in self.chat_chunks_collection.distinct("dominant_topic")
            if topic is not None
        ]
        matchable_topics = sum(
            1 for topic in unique_topics if self.find_matching_keyphrase(topic)
        )

        # Count chat-chunk clusters
        total_chat_chunk_clusters = self.clusters_collection.count_documents({"data": "chat-chunks"})

        return {
            'total_chat_chunks_with_topic': chat_chunks_with_topic,
            'chat_chunks_with_keyphrase_field': chat_chunks_with_keyphrase,
            'unique_dominant_topics': len(unique_topics),
            'matchable_topics': matchable_topics,
            'topic_match_rate': (matchable_topics / len(unique_topics) * 100) if unique_topics else 0,
            'cached_chat_chunk_cluster_keyphrases': len(self._keyphrase_to_cluster),
            'total_chat_chunk_clusters': total_chat_chunk_clusters
        }

    def close_connection(self):
        """Close MongoDB connection"""
        self.client.close()

def main():
    """Interactive entry point for the chat-chunk keyphrase update.

    Reads MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME from the
    environment, prints the current statistics, optionally runs the debug
    report, then asks whether to do a dry run or a live update and with which
    batch size. The MongoDB connection is closed on every exit path.

    Raises:
        ValueError: if either environment variable is missing.
    """
    # Get configuration from environment variables
    CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
    DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')
    
    if not CONNECTION_STRING:
        raise ValueError("MONGO_CONNECTION_STRING not found in environment variables")
    if not DATABASE_NAME:
        raise ValueError("MONGO_DATABASE_NAME not found in environment variables")
    
    print(f"Connecting to database: {DATABASE_NAME}")
    print("Processing chat-chunks with chat-chunk clusters (data: 'chat-chunks')")
    
    # Initialize chat-chunk keyphrase updater
    updater = ChatChunkClusterKeyphraseUpdater(CONNECTION_STRING, DATABASE_NAME)
    
    try:
        # Show current statistics
        print("\n--- Current Chat-Chunk Statistics ---")
        stats = updater.get_keyphrase_stats()
        for key, value in stats.items():
            if isinstance(value, float):
                print(f"{key}: {value:.1f}")
            else:
                print(f"{key}: {value:,}")
        
        # Check if we have chat-chunk clusters
        if stats['total_chat_chunk_clusters'] == 0:
            print("\n❌ No chat-chunk clusters found (data: 'chat-chunks')!")
            print("Please ensure you have clusters with data: 'chat-chunks' before running this script.")
            return
        
        # Debug keyphrase matching
        print("\n--- Debug Mode ---")
        debug_choice = input("Run debug mode to see chat-chunk keyphrase matching? (y/n): ")
        if debug_choice.strip().lower() == 'y':
            updater.debug_keyphrase_matching()
        
        # Choose processing mode
        print("\n--- Processing Options ---")
        print("1. Dry run (see what would be updated)")
        print("2. Live processing (actually add keyphrase field)")
        choice = input("Choose option (1 or 2): ").strip()
        
        if choice not in ('1', '2'):
            # Tell the user why nothing happened instead of exiting silently
            print(f"\nUnrecognized option '{choice}'. Nothing was processed.")
            return
        
        dry_run = (choice == '1')
        
        print(f"\nStarting {'DRY RUN' if dry_run else 'LIVE PROCESSING'} for chat-chunks...")
        
        # Get batch size; empty, non-numeric or non-positive input falls back
        # to the recommended default rather than aborting with a traceback.
        default_batch_size = 5000
        raw_batch_size = input("Enter batch size (recommended: 5000): ").strip()
        try:
            batch_size = int(raw_batch_size) if raw_batch_size else default_batch_size
        except ValueError:
            batch_size = 0
        if batch_size < 1:
            print(f"Invalid batch size '{raw_batch_size}'; using {default_batch_size}.")
            batch_size = default_batch_size
        
        # Process chat-chunks
        results = updater.add_keyphrase_field(batch_size=batch_size, dry_run=dry_run)
        
        print("\n--- Final Results ---")
        print(f"Total chat-chunks processed: {results['total_chat_chunks_processed']:,}")
        print(f"Chat-chunks with matching keyphrases: {results['chat_chunks_matched']:,}")
        print(f"Total updates made: {results['total_updates']:,}")
        print(f"Match rate: {results['match_rate']:.1f}%")
        print(f"Processing time: {results['processing_time']:.2f} seconds")
        print(f"Processing rate: {results['chat_chunks_per_second']:.1f} chat-chunks/second")
        
        if dry_run:
            print(f"\nDRY RUN COMPLETE: Would add keyphrase field to {results['chat_chunks_matched']:,} chat-chunks")
        elif results['total_updates'] > 0:
            print(f"\n✅ Successfully added kmeans_cluster_keyphrase field to {results['total_updates']:,} chat-chunks!")
        else:
            # Zero modified documents is not necessarily an error: matching
            # chat-chunks may already hold the same keyphrase value.
            print("\nNo chat-chunks were modified.")
        
    finally:
        updater.close_connection()

# Run the interactive workflow only when this code is executed directly
# (module name "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Connecting to database: sparzaai
Processing chat-chunks with chat-chunk clusters (data: 'chat-chunks')
Loading chat-chunk cluster keyphrase cache...
Found 10 chat-chunk clusters to process
Chat-chunk cluster cache loaded in 1.75 seconds
Cached 65 chat-chunk cluster keyphrases

--- Current Chat-Chunk Statistics ---
total_chat_chunks_with_topic: 600
chat_chunks_with_keyphrase_field: 600
unique_dominant_topics: 65
matchable_topics: 65
topic_match_rate: 100.0
cached_chat_chunk_cluster_keyphrases: 65
total_chat_chunk_clusters: 10

--- Debug Mode ---

=== DEBUGGING CHAT-CHUNK CLUSTER KEYPHRASE MATCHING ===
✓ Chat-chunk cluster keyphrase cache: 65 entries

Sample chat-chunk cluster keyphrases:
  1. 'Shift Coverage Request' -> Chat-Chunk Cluster 0 (Workforce & HR Management)
  2. 'Training Schedule Update' -> Chat-Chunk Cluster 0 (Workforce & HR Management)
  3. 'Performance Review Planning' -> Chat-Chunk Cluster 0 (Workforce & HR Management)
  4. 'Break Schedule Coordination' -> Chat-Chunk Cl

In [19]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Connection settings are read from the environment (populated by load_dotenv above)
MONGO_CONNECTION_STRING = os.environ.get('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.environ.get('MONGO_DATABASE_NAME')

# Both settings are mandatory; stop right away if either is empty or unset
if not (MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME):
    raise ValueError("Please set MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME in your environment variables")

# Open the connection and pick the chat-chunks collection
client = MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DATABASE_NAME]
collection = db['chat-chunks']

try:
    # Collection size before the update, used below to check the match count
    total_chat_chunks = collection.count_documents({})
    print(f"Total chat-chunks in collection: {total_chat_chunks}")

    # An empty filter matches every document, so every chat-chunk gets
    # domain = "banking" (existing values are overwritten)
    result = collection.update_many({}, {"$set": {"domain": "banking"}})

    print(f"Matched chat-chunks: {result.matched_count}")
    print(f"Modified chat-chunks: {result.modified_count}")

    if result.matched_count != total_chat_chunks:
        print(f"Expected {total_chat_chunks} chat-chunks, but matched {result.matched_count}")
    else:
        print(f"Successfully updated all {total_chat_chunks} chat-chunks!")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # The connection is released whether or not the update succeeded
    client.close()

# Alternative: Add domain field only to chat-chunks that don't already have it
def add_domain_conditionally(chunks=None):
    """Set ``domain`` to ``"banking"`` only on chat-chunks that lack the field.

    Args:
        chunks: Optional collection to operate on. When omitted, a dedicated
            client is opened from MONGO_CONNECTION_STRING / MONGO_DATABASE_NAME
            and closed again before returning. The module-level ``collection``
            is deliberately not used: the script above closes its client in a
            ``finally`` block before this function can be called, and PyMongo 4
            refuses operations on a closed client.

    Errors are caught and printed; nothing is returned.
    """
    own_client = None
    try:
        if chunks is None:
            own_client = MongoClient(MONGO_CONNECTION_STRING)
            chunks = own_client[MONGO_DATABASE_NAME]['chat-chunks']

        missing_domain = {"domain": {"$exists": False}}

        # Count chat-chunks without domain field
        chat_chunks_without_domain = chunks.count_documents(missing_domain)
        print(f"Chat-chunks without domain field: {chat_chunks_without_domain}")
        
        if chat_chunks_without_domain == 0:
            print("All chat-chunks already have domain field!")
            return
        
        result = chunks.update_many(
            missing_domain,  # Only chat-chunks without 'domain' field
            {"$set": {"domain": "banking"}}
        )
        
        print(f"Chat-chunks matched for update: {result.matched_count}")
        print(f"Modified chat-chunks: {result.modified_count}")
        
        if result.modified_count > 0:
            print(f"Successfully added domain field to {result.modified_count} chat-chunks!")
        
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Only close a connection this function opened itself
        if own_client is not None:
            own_client.close()

# Function to verify domain field addition
def verify_domain_update(chunks=None):
    """Report how many chat-chunks have a ``domain`` field and show samples.

    Args:
        chunks: Optional collection to inspect. When omitted, a dedicated
            client is opened from MONGO_CONNECTION_STRING / MONGO_DATABASE_NAME
            and closed again before returning. The module-level ``collection``
            is deliberately not used: the script above closes its client in a
            ``finally`` block before this function can be called, and PyMongo 4
            refuses operations on a closed client.

    Errors are caught and printed; nothing is returned.
    """
    own_client = None
    try:
        if chunks is None:
            own_client = MongoClient(MONGO_CONNECTION_STRING)
            chunks = own_client[MONGO_DATABASE_NAME]['chat-chunks']

        has_domain = {"domain": {"$exists": True}}

        # Count chat-chunks with domain field
        chat_chunks_with_domain = chunks.count_documents(has_domain)
        total_chat_chunks = chunks.count_documents({})
        missing = total_chat_chunks - chat_chunks_with_domain
        
        print(f"\n--- Verification ---")
        print(f"Total chat-chunks: {total_chat_chunks}")
        print(f"Chat-chunks with domain field: {chat_chunks_with_domain}")
        print(f"Chat-chunks without domain field: {missing}")
        
        if chat_chunks_with_domain == total_chat_chunks:
            print("✅ All chat-chunks have domain field!")
        else:
            print(f"❌ {missing} chat-chunks still missing domain field")
        
        # Show sample chat-chunks with domain field; only _id and domain are
        # printed, so nothing else is fetched.
        sample_chat_chunks = list(chunks.find(has_domain, {"domain": 1}).limit(3))
        if sample_chat_chunks:
            print(f"\nSample chat-chunks with domain field:")
            for i, chat_chunk in enumerate(sample_chat_chunks, 1):
                print(f"  {i}. ID: {chat_chunk['_id']}, Domain: {chat_chunk.get('domain')}")
                
    except Exception as e:
        print(f"An error occurred during verification: {e}")
    finally:
        # Only close a connection this function opened itself
        if own_client is not None:
            own_client.close()

# Uncomment the line below if you want to run the conditional update instead
# add_domain_conditionally()

# Uncomment the line below to verify the domain field addition
# verify_domain_update()

Total chat-chunks in collection: 600
Matched chat-chunks: 600
Modified chat-chunks: 0
Successfully updated all 600 chat-chunks!


In [20]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Read the MongoDB settings from the environment
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

# Both settings are mandatory; stop right away if either is empty or unset
if not (mongo_connection_string and mongo_database_name):
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

# Announce the target before connecting
print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: chat-chunks")
print("=" * 60)

# Open the connection and pick the chat-chunks collection used by the
# functions defined below
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
chat_chunks_collection = db['chat-chunks']

def analyze_urgency_values():
    """Print the distinct ``urgency`` values and how many chat-chunks have each.

    Returns:
        The aggregation result: a list of ``{"_id": value, "count": n}`` dicts
        sorted by count in descending order, or an empty list when any step
        raises (the error is printed).
    """
    print("Analyzing current urgency values...")
    print("-" * 40)

    # One group per urgency value, most frequent first
    count_per_value = [
        {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
    ]

    try:
        distinct_values = chat_chunks_collection.distinct("urgency")
        print(f"Unique urgency values found: {distinct_values}")

        urgency_stats = list(chat_chunks_collection.aggregate(count_per_value))

        total_chat_chunks = chat_chunks_collection.count_documents({})
        print(f"\nTotal chat-chunks: {total_chat_chunks}")
        print(f"Urgency distribution:")

        for bucket in urgency_stats:
            # The aggregation groups documents without a usable value under None
            value = 'null/missing' if bucket['_id'] is None else bucket['_id']
            count = bucket['count']
            if total_chat_chunks > 0:
                percentage = (count / total_chat_chunks) * 100
            else:
                percentage = 0
            print(f"  '{value}': {count} chat-chunks ({percentage:.1f}%)")

    except Exception as e:
        print(f"Error analyzing urgency values: {str(e)}")
        return []

    return urgency_stats

def update_urgency_to_boolean():
    """Replace the string ``urgency`` values with booleans.

    Documents with ``urgency == "Critical"`` are set to True and documents
    with ``urgency == "High"`` are set to False; other values are not
    touched.

    Returns:
        Dict with 'critical_updated', 'high_updated' and 'total_updated'
        (modified-document counts), or None when an update raises (the error
        is printed).
    """
    print("\nStarting urgency field update...")
    print("Conversion rules:")
    print("  'Critical' → true")
    print("  'High' → false")
    print("-" * 40)

    try:
        # 'Critical' becomes True
        critical_outcome = chat_chunks_collection.update_many(
            {"urgency": "Critical"}, {"$set": {"urgency": True}}
        )
        print(f"✓ Updated 'Critical' urgency:")
        print(f"  Matched: {critical_outcome.matched_count}")
        print(f"  Modified: {critical_outcome.modified_count}")

        # 'High' becomes False
        high_outcome = chat_chunks_collection.update_many(
            {"urgency": "High"}, {"$set": {"urgency": False}}
        )
        print(f"✓ Updated 'High' urgency:")
        print(f"  Matched: {high_outcome.matched_count}")
        print(f"  Modified: {high_outcome.modified_count}")

        critical_modified = critical_outcome.modified_count
        high_modified = high_outcome.modified_count
        total_updated = critical_modified + high_modified
        print(f"\nTotal chat-chunks updated: {total_updated}")

        summary = {
            'critical_updated': critical_modified,
            'high_updated': high_modified,
            'total_updated': total_updated,
        }
        return summary

    except Exception as e:
        print(f"Error updating urgency values: {str(e)}")
        return None

def verify_boolean_conversion():
    """Print a report of the ``urgency`` field after the boolean conversion.

    The report lists how many chat-chunks hold ``True`` / ``False``, how many
    still hold the strings ``"Critical"`` / ``"High"``, a breakdown of every
    other value, up to five sample documents with a boolean urgency, and a
    final success or warning line depending on whether any of the two string
    labels remain.

    Returns nothing. Any exception raised while querying or printing is
    caught and reported on stdout.
    """
    print("\n" + "=" * 50)
    print("VERIFICATION OF URGENCY CONVERSION")
    print("=" * 50)

    try:
        # Count boolean urgency values
        true_count = chat_chunks_collection.count_documents({"urgency": True})
        false_count = chat_chunks_collection.count_documents({"urgency": False})

        # Count any remaining string values
        critical_count = chat_chunks_collection.count_documents({"urgency": "Critical"})
        high_count = chat_chunks_collection.count_documents({"urgency": "High"})

        # Everything that is neither a boolean nor one of the two known labels
        other_urgency = list(chat_chunks_collection.aggregate([
            {"$match": {"urgency": {"$nin": [True, False, "Critical", "High"]}}},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))

        print(f"Boolean urgency values:")
        print(f"  urgency: true → {true_count} chat-chunks")
        print(f"  urgency: false → {false_count} chat-chunks")

        print(f"\nRemaining string values:")
        print(f"  urgency: 'Critical' → {critical_count} chat-chunks")
        print(f"  urgency: 'High' → {high_count} chat-chunks")

        if other_urgency:
            print(f"\nOther urgency values:")
            for other in other_urgency:
                value = other['_id'] if other['_id'] is not None else 'null/missing'
                count = other['count']
                print(f"  urgency: '{value}' → {count} chat-chunks")

        # Show sample chat-chunks with boolean urgency
        print(f"\nSample chat-chunks with boolean urgency:")
        samples = list(chat_chunks_collection.find(
            {"urgency": {"$in": [True, False]}},
            {
                "chunk_id": 1,
                "urgency": 1,
                "priority": 1,
                "content": 1
            }
        ).limit(5))

        for i, sample in enumerate(samples, 1):
            chunk_id = sample.get('chunk_id', sample.get('_id', 'N/A'))
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            content = sample.get('content', 'N/A')
            # Truncate only real strings: a null or non-string content value
            # used to raise here and abort the report before the success check.
            if isinstance(content, str) and len(content) > 50:
                content = content[:50] + '...'

            print(f"  {i}. {chunk_id}: urgency={urgency}, priority='{priority}'")
            print(f"     Content: {content}")

        # Success check
        if critical_count == 0 and high_count == 0:
            print(f"\n✅ SUCCESS: All 'Critical' and 'High' urgency values converted to boolean!")
            print(f"Summary: {true_count} critical (true) + {false_count} high (false) = {true_count + false_count} total")
        else:
            print(f"\n⚠ WARNING: Some string urgency values remain")

    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_urgency_values():
    """Report chat-chunks whose ``urgency`` was not covered by the conversion.

    "Other" means any value that is not ``True``, ``False``, ``"Critical"``
    or ``"High"``. The function prints a count per such value and up to three
    sample documents. It only reports; no document is modified.

    Returns nothing. Any exception raised while querying or printing is
    caught and reported on stdout.
    """
    print("\n" + "=" * 50)
    print("HANDLING OTHER URGENCY VALUES")
    print("=" * 50)

    # One filter shared by the breakdown and the sample query so the two
    # cannot drift apart.
    leftover_filter = {"urgency": {"$nin": [True, False, "Critical", "High"]}}

    try:
        other_urgency = list(chat_chunks_collection.aggregate([
            {"$match": leftover_filter},
            {"$group": {"_id": "$urgency", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))

        if not other_urgency:
            print("No other urgency values found - all chat-chunks have been processed!")
            return

        print("Found chat-chunks with other urgency values:")
        for other in other_urgency:
            value = other['_id'] if other['_id'] is not None else 'null/missing'
            count = other['count']
            print(f"  '{value}': {count} chat-chunks")

        print(f"\nSample chat-chunks with other urgency values:")
        samples = list(chat_chunks_collection.find(
            leftover_filter,
            {
                "chunk_id": 1,
                "urgency": 1,
                "priority": 1,
                "content": 1
            }
        ).limit(3))

        for i, sample in enumerate(samples, 1):
            chunk_id = sample.get('chunk_id', sample.get('_id', 'N/A'))
            urgency = sample.get('urgency')
            priority = sample.get('priority', 'N/A')
            content = sample.get('content', 'N/A')
            # Truncate only real strings: a null or non-string content value
            # used to raise here and cut the sample listing short.
            if isinstance(content, str) and len(content) > 50:
                content = content[:50] + '...'

            print(f"  {i}. {chunk_id}: urgency={urgency}, priority='{priority}'")
            print(f"     Content: {content}")

    except Exception as e:
        print(f"Error handling other urgency values: {str(e)}")

# Script entry point: analyze, confirm with the user, convert, then verify.
if __name__ == "__main__":
    try:
        # Probe the collection first; an empty result aborts the run.
        test_chat_chunk = chat_chunks_collection.find_one()
        if not test_chat_chunk:
            print("⚠ No chat-chunks found in chat-chunks collection")
            exit(1)

        print("✓ Database connection successful")
        print(f"Sample chat-chunk fields: {list(test_chat_chunk.keys())}\n")

        # Show the current distribution before anything is modified.
        analyze_urgency_values()

        # Spell out the planned changes and require an explicit "y".
        print("\nThis will update urgency values:")
        print("  'Critical' → true")
        print("  'High' → false")

        confirm = input("\nProceed with urgency conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)

        # Run the conversion; a falsy result means the update step failed
        # and the follow-up reports are skipped.
        update_result = update_urgency_to_boolean()

        if update_result:
            verify_boolean_conversion()
            handle_other_urgency_values()

            print("\n" + "=" * 60)
            print("✅ URGENCY CONVERSION COMPLETED SUCCESSFULLY!")
            print("Summary:")
            print(f"  Critical chat-chunks (now true): {update_result['critical_updated']}")
            print(f"  High chat-chunks (now false): {update_result['high_updated']}")
            print(f"  Total chat-chunks updated: {update_result['total_updated']}")
            print("=" * 60)

    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Release the connection whenever a client was created.
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: chat-chunks
✓ Database connection successful
Sample chat-chunk fields: ['_id', 'chat_id', 'chat_members', 'dominant_topic', 'subtopics', 'raw_segments', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current urgency values...
----------------------------------------
Unique urgency values found: []

Total chat-chunks: 600
Urgency distribution:
  'null/missing': 600 chat-chunks (100.0%)

This will update urgency values:
  'Critical' → true
  'High' → false

Starting urgency field update...
Conversion rules:
  'Critical' → true
  'High' → false
----------------------------------------
✓ Updated 'Critical' urgency:
  Matched: 0
  Modified: 0
✓ Updated 'High' urgency:
  Matched: 0
  Modified: 0

Total chat-chunks updated: 0

VERIFICATION OF URGENCY CONVERSION
Boolean urgency values:
  urgency: true → 0 chat-chunks
  urgency: false → 0 ch

In [21]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv calls below
# can see the connection settings.
load_dotenv()

# Connection settings; both must be present and non-empty.
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message instead of passing None to MongoClient.
if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

# Banner: only the database name is echoed, never the connection string.
print(f"Connecting to MongoDB...")
print(f"Database: {mongo_database_name}")
print("Collection: chat-chunks")
print("=" * 60)

# Module-level handles shared by every function in this cell; `client` is
# closed by the main block's `finally` clause.
client = MongoClient(mongo_connection_string)
db = client[mongo_database_name]
chat_chunks_collection = db['chat-chunks']

def analyze_subcluster_id_values():
    """Print the current distribution of ``subcluster_id`` values and types.

    Lists the distinct values, then groups all chat-chunks by the pair
    (value, BSON type name) and prints each group's size and its share of
    the collection.

    Returns:
        The list of aggregation result documents, each shaped like
        ``{"_id": {"value": ..., "type": ...}, "count": ...}``, or an empty
        list when a database call raised (the error is printed).
    """
    print("Analyzing current subcluster_id values...")
    print("-" * 40)

    try:
        # Get all unique subcluster_id values
        subcluster_values = chat_chunks_collection.distinct("subcluster_id")
        print(f"Unique subcluster_id values found: {subcluster_values}")

        # Group on value AND type so that e.g. 1 and "1" are reported apart.
        subcluster_stats = list(chat_chunks_collection.aggregate([
            {"$group": {
                "_id": {"value": "$subcluster_id", "type": {"$type": "$subcluster_id"}},
                "count": {"$sum": 1}
            }},
            {"$sort": {"count": -1}}
        ]))

        total_chat_chunks = chat_chunks_collection.count_documents({})
        print(f"\nTotal chat-chunks: {total_chat_chunks}")
        print(f"Subcluster_id distribution:")

        for stat in subcluster_stats:
            # The grouped key may carry no "value" entry for documents that
            # lack the field; .get() keeps that case from raising KeyError
            # and aborting the whole report.
            raw_value = stat['_id'].get('value')
            value = raw_value if raw_value is not None else 'null/missing'
            data_type = stat['_id']['type']
            count = stat['count']
            percentage = (count / total_chat_chunks) * 100 if total_chat_chunks > 0 else 0
            print(f"  '{value}' (type: {data_type}): {count} chat-chunks ({percentage:.1f}%)")

        return subcluster_stats

    except Exception as e:
        print(f"Error analyzing subcluster_id values: {str(e)}")
        return []

def update_subcluster_id_to_string():
    """Convert integer ``subcluster_id`` values in chat-chunks to strings.

    Every document whose ``subcluster_id`` has BSON type ``int`` gets the
    field replaced by ``str(value)``. All per-document updates are sent in a
    single unordered ``bulk_write`` instead of one ``update_one`` round trip
    per document.

    Returns:
        The number of documents actually modified, or ``0`` when nothing
        matched or a database call raised (the error is printed).
    """
    # Local import: this is the only place that builds bulk operations.
    from pymongo import UpdateOne

    print("\nStarting subcluster_id field update...")
    print("Conversion rule: All integer values → string values")
    print("-" * 40)

    try:
        # NOTE(review): the "int" alias selects 32-bit integers; values stored
        # as 64-bit integers or doubles are not picked up - confirm intended.
        integer_chat_chunks = list(chat_chunks_collection.find(
            {"subcluster_id": {"$type": "int"}},
            {"_id": 1, "subcluster_id": 1}
        ))

        print(f"Found {len(integer_chat_chunks)} chat-chunks with integer subcluster_id")

        operations = [
            UpdateOne(
                {"_id": chat_chunk['_id']},
                {"$set": {"subcluster_id": str(chat_chunk['subcluster_id'])}}
            )
            for chat_chunk in integer_chat_chunks
        ]

        updated_count = 0
        # bulk_write rejects an empty request list, so skip it when there is
        # nothing to convert.
        if operations:
            bulk_result = chat_chunks_collection.bulk_write(operations, ordered=False)
            updated_count = bulk_result.modified_count

        print(f"✓ Updated subcluster_id from integer to string:")
        print(f"  Total processed: {len(integer_chat_chunks)}")
        print(f"  Successfully updated: {updated_count}")

        return updated_count

    except Exception as e:
        print(f"Error updating subcluster_id values: {str(e)}")
        return 0

def verify_string_conversion():
    """Print a report of ``subcluster_id`` types after the string conversion.

    The report shows a per-type count for all non-null values, separate
    counts for explicit nulls and for documents without the field, up to
    five sample documents with a string ``subcluster_id``, and a final
    success or warning line depending on whether any ``int`` values remain.

    Returns nothing. Any exception raised while querying or printing is
    caught and reported on stdout.
    """
    print("\n" + "=" * 50)
    print("VERIFICATION OF SUBCLUSTER_ID CONVERSION")
    print("=" * 50)

    try:
        # Count string subcluster_id values
        string_count = chat_chunks_collection.count_documents({"subcluster_id": {"$type": "string"}})

        # Count any remaining integer values
        integer_count = chat_chunks_collection.count_documents({"subcluster_id": {"$type": "int"}})

        # Per-type breakdown of every value that is neither null nor missing
        other_types = list(chat_chunks_collection.aggregate([
            {"$match": {"subcluster_id": {"$nin": [None]}}},
            {"$group": {"_id": {"$type": "$subcluster_id"}, "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))

        print(f"Subcluster_id by data type:")
        for type_stat in other_types:
            data_type = type_stat['_id']
            count = type_stat['count']
            print(f"  {data_type}: {count} chat-chunks")

        # An equality filter on None matches explicit nulls AND absent
        # fields, which made "null" double-count the "missing" documents.
        # Selecting by BSON type counts explicit nulls only.
        null_count = chat_chunks_collection.count_documents({"subcluster_id": {"$type": "null"}})
        missing_count = chat_chunks_collection.count_documents({"subcluster_id": {"$exists": False}})

        if null_count > 0:
            print(f"  null: {null_count} chat-chunks")
        if missing_count > 0:
            print(f"  missing: {missing_count} chat-chunks")

        # Show sample chat-chunks with string subcluster_id
        print(f"\nSample chat-chunks with string subcluster_id:")
        samples = list(chat_chunks_collection.find(
            {"subcluster_id": {"$type": "string"}},
            {
                "chunk_id": 1,
                "subcluster_id": 1,
                "content": 1
            }
        ).limit(5))

        for i, sample in enumerate(samples, 1):
            chunk_id = sample.get('chunk_id', sample.get('_id', 'N/A'))
            subcluster_id = sample.get('subcluster_id')
            content = sample.get('content', 'N/A')
            # Truncate only real strings: a null or non-string content value
            # used to raise here and abort the report before the success check.
            if isinstance(content, str) and len(content) > 50:
                content = content[:50] + '...'

            print(f"  {i}. {chunk_id}: subcluster_id=\"{subcluster_id}\"")
            print(f"     Content: {content}")

        # Success check
        if integer_count == 0:
            print(f"\n✅ SUCCESS: All integer subcluster_id values converted to strings!")
            print(f"Summary: {string_count} chat-chunks now have string subcluster_id")
        else:
            print(f"\n⚠ WARNING: {integer_count} integer subcluster_id values remain")

    except Exception as e:
        print(f"Error during verification: {str(e)}")

def handle_other_subcluster_id_values():
    """Report chat-chunks whose ``subcluster_id`` is null or absent.

    Prints how many documents hold an explicit null, how many lack the field
    entirely, and up to three sample documents from either group. It only
    reports; no document is modified.

    Returns nothing. Any exception raised while querying or printing is
    caught and reported on stdout.
    """
    print("\n" + "=" * 50)
    print("CHECKING FOR OTHER SUBCLUSTER_ID VALUES")
    print("=" * 50)

    # An equality filter on None matches explicit nulls AND absent fields,
    # which made the "null" figure include the "missing" documents as well.
    # Selecting by BSON type keeps the two groups disjoint.
    null_filter = {"subcluster_id": {"$type": "null"}}
    missing_filter = {"subcluster_id": {"$exists": False}}

    try:
        null_count = chat_chunks_collection.count_documents(null_filter)
        missing_count = chat_chunks_collection.count_documents(missing_filter)

        if null_count > 0:
            print(f"Found {null_count} chat-chunks with null subcluster_id")

        if missing_count > 0:
            print(f"Found {missing_count} chat-chunks with missing subcluster_id field")

        if null_count == 0 and missing_count == 0:
            print("All chat-chunks have valid subcluster_id values!")

        # Show sample of any problematic chat-chunks
        if null_count > 0 or missing_count > 0:
            print(f"\nSample chat-chunks with null/missing subcluster_id:")
            samples = list(chat_chunks_collection.find(
                {"$or": [null_filter, missing_filter]},
                {
                    "chunk_id": 1,
                    "subcluster_id": 1,
                    "content": 1
                }
            ).limit(3))

            for i, sample in enumerate(samples, 1):
                chunk_id = sample.get('chunk_id', sample.get('_id', 'N/A'))
                subcluster_id = sample.get('subcluster_id', 'MISSING_FIELD')
                content = sample.get('content', 'N/A')
                # Truncate only real strings: a null or non-string content
                # value used to raise here and cut the sample listing short.
                if isinstance(content, str) and len(content) > 50:
                    content = content[:50] + '...'

                print(f"  {i}. {chunk_id}: subcluster_id={subcluster_id}")
                print(f"     Content: {content}")

    except Exception as e:
        print(f"Error checking other subcluster_id values: {str(e)}")

# Script entry point: analyze, confirm with the user, convert, then verify.
if __name__ == "__main__":
    try:
        # Probe the collection first; an empty result aborts the run.
        test_chat_chunk = chat_chunks_collection.find_one()
        if not test_chat_chunk:
            print("⚠ No chat-chunks found in chat-chunks collection")
            exit(1)

        print("✓ Database connection successful")
        print(f"Sample chat-chunk fields: {list(test_chat_chunk.keys())}\n")

        # Show the current distribution before anything is modified.
        analyze_subcluster_id_values()

        # Spell out the planned change and require an explicit "y".
        print("\nThis will convert all integer subcluster_id values to strings")
        print("Example: subcluster_id: 1 → subcluster_id: \"1\"")

        confirm = input("\nProceed with subcluster_id conversion? (y/n): ")
        if confirm.lower() != 'y':
            print("Operation cancelled.")
            exit(0)

        # Run the conversion; the follow-up reports only make sense when at
        # least one document was changed.
        updated_count = update_subcluster_id_to_string()

        if not updated_count > 0:
            print("\n⚠ No updates were made. Check if subcluster_id fields are already strings or if there are no integer values.")
        else:
            verify_string_conversion()
            handle_other_subcluster_id_values()

            print("\n" + "=" * 60)
            print("✅ SUBCLUSTER_ID CONVERSION COMPLETED SUCCESSFULLY!")
            print("Summary:")
            print(f"  Total chat-chunks updated: {updated_count}")
            print("  All integer subcluster_id values converted to strings")
            print("=" * 60)

    except Exception as e:
        print(f"\n❌ Error during execution: {str(e)}")
        print("Please check your environment variables and database connection.")
    finally:
        # Release the connection whenever a client was created.
        if 'client' in locals():
            client.close()
            print("Database connection closed.")

Connecting to MongoDB...
Database: sparzaai
Collection: chat-chunks
✓ Database connection successful
Sample chat-chunk fields: ['_id', 'chat_id', 'chat_members', 'dominant_topic', 'subtopics', 'raw_segments', 'embeddings', 'kmeans_cluster_id', 'subcluster_id', 'subcluster_label', 'dominant_cluster_label', 'kmeans_cluster_keyphrase', 'domain']

Analyzing current subcluster_id values...
----------------------------------------
Unique subcluster_id values found: [0, 1, 2]

Total chat-chunks: 600
Subcluster_id distribution:
  '1' (type: int): 305 chat-chunks (50.8%)
  '0' (type: int): 276 chat-chunks (46.0%)
  '2' (type: int): 19 chat-chunks (3.2%)

This will convert all integer subcluster_id values to strings
Example: subcluster_id: 1 → subcluster_id: "1"

Starting subcluster_id field update...
Conversion rule: All integer values → string values
----------------------------------------
Found 600 chat-chunks with integer subcluster_id
✓ Updated subcluster_id from integer to string:
  Total

In [22]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv calls below
# can see the connection settings.
load_dotenv()

# Connection settings read by rename_field_in_cluster_collection(); both
# must be present and non-empty.
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message instead of passing None to MongoClient.
if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def rename_field_in_cluster_collection():
    """
    Rename 'chat_chunk_ids' to 'chat_chunks_ids' on cluster documents.

    Only documents in the 'cluster' collection with data="chat-chunks" that
    currently carry the 'chat_chunk_ids' field are touched. The function
    prints how many documents matched, how many were modified, and how many
    carry the new field name afterwards. It opens its own MongoDB client and
    closes it before returning; errors are printed, not raised.
    """
    client = None  # stays None when the client could not be created
    try:
        client = MongoClient(mongo_connection_string)
        cluster_collection = client[mongo_database_name]['cluster']

        # Documents that still use the old field name
        pending = {
            "data": "chat-chunks",
            "chat_chunk_ids": {"$exists": True}
        }

        count_before = cluster_collection.count_documents(pending)
        print(f"Found {count_before} documents matching criteria (data='chat-chunks' and chat_chunk_ids exists)")

        if not count_before:
            print("No documents found to update.")
            return

        # $rename moves the value to the new key in one server-side update
        result = cluster_collection.update_many(
            pending,
            {"$rename": {"chat_chunk_ids": "chat_chunks_ids"}},
        )

        print(f"Successfully updated {result.modified_count} documents")
        print(f"Matched {result.matched_count} documents")

        # Re-count using the new field name as a sanity check
        count_after = cluster_collection.count_documents({
            "data": "chat-chunks",
            "chat_chunks_ids": {"$exists": True}
        })
        print(f"Verification: {count_after} documents now have 'chat_chunks_ids' field")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
    finally:
        # Close the connection only if it was actually opened
        if client is not None:
            client.close()
            print("MongoDB connection closed")

# Entry point for this cell: perform the one-off field rename.
if __name__ == "__main__":
    rename_field_in_cluster_collection()

Found 10 documents matching criteria (data='chat-chunks' and chat_chunk_ids exists)
Successfully updated 10 documents
Matched 10 documents
Verification: 10 documents now have 'chat_chunks_ids' field
MongoDB connection closed


In [23]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv calls below
# can see the connection settings.
load_dotenv()

# Connection settings read by the validation functions below; both must be
# present and non-empty.
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message instead of passing None to MongoClient.
if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def validate_keyphrases_in_subclusters():
    """
    Check that every cluster-level keyphrase also appears in a subcluster.

    For each 'cluster' document with data="chat-chunks" and non-null
    'keyphrases' and 'subclusters' fields, the keyphrases of all subclusters
    are pooled and every main keyphrase absent from that pool is recorded
    (exact match). The findings are printed grouped by cluster_id, followed
    by a short summary. The function opens its own MongoDB client and closes
    it before returning; errors are printed with a traceback, not raised.
    """
    client = None  # stays None when the client could not be created
    try:
        client = MongoClient(mongo_connection_string)
        cluster_collection = client[mongo_database_name]['cluster']

        # Chat-chunks clusters that have both fields populated
        documents = cluster_collection.find({
            "data": "chat-chunks",
            "keyphrases": {"$exists": True, "$ne": None},
            "subclusters": {"$exists": True, "$ne": None}
        })

        missing_keyphrases = []
        total_documents_checked = 0

        for doc in documents:
            total_documents_checked += 1
            cluster_id = doc.get('cluster_id')
            main_keyphrases = doc.get('keyphrases', [])

            # Pool the keyphrases of every well-formed subcluster entry;
            # entries that are not dicts or whose keyphrases are not a list
            # are ignored.
            pooled = set()
            for entry in doc.get('subclusters', {}).values():
                if not isinstance(entry, dict):
                    continue
                phrases = entry.get('keyphrases')
                if isinstance(phrases, list):
                    pooled.update(phrases)

            missing_keyphrases.extend(
                {
                    'cluster_id': cluster_id,
                    'missing_keyphrase': phrase,
                    'total_main_keyphrases': len(main_keyphrases),
                    'total_subcluster_keyphrases': len(pooled)
                }
                for phrase in main_keyphrases
                if phrase not in pooled
            )

        # Display results
        print(f"Total chat-chunks documents checked: {total_documents_checked}")
        print(f"Total missing keyphrases found: {len(missing_keyphrases)}")
        print("-" * 80)

        if not missing_keyphrases:
            print("✅ All keyphrases from main field are present in subclusters for chat-chunks data!")
        else:
            print("MISSING KEYPHRASES REPORT (CHAT-CHUNKS):")
            print("-" * 80)

            # Bucket the findings per cluster, keeping first-seen order
            cluster_groups = {}
            for item in missing_keyphrases:
                cluster_groups.setdefault(item['cluster_id'], []).append(item)

            for cluster_id, missing_items in cluster_groups.items():
                first = missing_items[0]
                print(f"Chat-Chunks Cluster ID: {cluster_id}")
                print(f"Missing keyphrases ({len(missing_items)}):")
                for item in missing_items:
                    print(f"  - '{item['missing_keyphrase']}'")
                print(f"Total main keyphrases: {first['total_main_keyphrases']}")
                print(f"Total subcluster keyphrases: {first['total_subcluster_keyphrases']}")
                print("-" * 40)

            # Summary statistics
            clusters_with_issues = len({item['cluster_id'] for item in missing_keyphrases})
            print(f"\nSUMMARY (CHAT-CHUNKS):")
            print(f"Chat-chunks clusters with missing keyphrases: {clusters_with_issues}")
            print(f"Total missing keyphrase instances: {len(missing_keyphrases)}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
    finally:
        # Close the connection only if it was actually opened
        if client is not None:
            client.close()
            print("\nMongoDB connection closed")

def get_detailed_analysis():
    """
    Print the structure of one sample chat-chunks cluster document.

    Looks up a single 'cluster' document with data="chat-chunks" that has
    both 'keyphrases' and 'subclusters', and prints its identifiers, counts,
    a few sample keyphrases, the first two subclusters and the size of its
    chat-chunk id list. When no such document exists, prints how many
    chat-chunks cluster documents there are and the field names of one of
    them. The function opens its own MongoDB client and closes it before
    returning; errors are printed, not raised.
    """
    client = None  # stays None when the client could not be created
    try:
        client = MongoClient(mongo_connection_string)
        cluster_collection = client[mongo_database_name]['cluster']

        sample_doc = cluster_collection.find_one({
            "data": "chat-chunks",
            "keyphrases": {"$exists": True},
            "subclusters": {"$exists": True}
        })

        if not sample_doc:
            print("No chat-chunks documents found with both keyphrases and subclusters fields.")

            # Fall back to describing whatever chat-chunks documents exist
            chat_chunks_count = cluster_collection.count_documents({"data": "chat-chunks"})
            print(f"Total chat-chunks documents in collection: {chat_chunks_count}")

            any_chat_chunk_doc = cluster_collection.find_one({"data": "chat-chunks"})
            if any_chat_chunk_doc:
                print("Available fields in chat-chunks documents:")
                print(f"  Fields: {list(any_chat_chunk_doc.keys())}")
            return

        main_phrases = sample_doc.get('keyphrases', [])

        print("SAMPLE CHAT-CHUNKS DOCUMENT STRUCTURE:")
        print("-" * 40)
        print(f"Cluster ID: {sample_doc.get('cluster_id')}")
        print(f"Cluster Name: {sample_doc.get('cluster_name', 'N/A')}")
        print(f"Data Type: {sample_doc.get('data')}")
        print(f"Main keyphrases count: {len(main_phrases)}")

        subclusters = sample_doc.get('subclusters', {})
        print(f"Subclusters count: {len(subclusters)}")

        if main_phrases:
            print(f"Sample main keyphrases: {main_phrases[:3]}...")

        if subclusters:
            print("Subcluster structure:")
            for position, (key, subcluster) in enumerate(subclusters.items()):
                if position >= 2:  # only the first two subclusters are shown
                    break
                if not isinstance(subcluster, dict):
                    continue
                label = subcluster.get('label', 'No label')
                phrases = subcluster.get('keyphrases', [])
                print(f"  {key}: '{label}' ({len(phrases)} keyphrases)")
                if phrases:
                    print(f"    Sample keyphrases: {phrases[:2]}...")

        # The id list may live under the new or the old field name; the new
        # name wins when both are present.
        for id_field in ('chat_chunks_ids', 'chat_chunk_ids'):
            if id_field in sample_doc:
                print(f"Chat chunks count: {len(sample_doc[id_field])}")
                break

        print("-" * 40)

    except Exception as e:
        print(f"Error in detailed analysis: {str(e)}")
    finally:
        if client is not None:
            client.close()

# Entry point for this cell: describe one sample document, then validate
# keyphrases across all chat-chunks clusters.
if __name__ == "__main__":
    print("Starting chat-chunks keyphrase validation...")
    print("=" * 80)
    
    # First, print the structure of a sample document for context
    get_detailed_analysis()
    
    # Then cross-check main keyphrases against subcluster keyphrases
    validate_keyphrases_in_subclusters()

Starting chat-chunks keyphrase validation...
SAMPLE CHAT-CHUNKS DOCUMENT STRUCTURE:
----------------------------------------
Cluster ID: 0
Cluster Name: Workforce & HR Management
Data Type: chat-chunks
Main keyphrases count: 12
Subclusters count: 2
Sample main keyphrases: ['Shift Coverage Request', 'Training Schedule Update', 'Performance Review Planning']...
Subcluster structure:
  0: 'Workforce Scheduling & Planning' (4 keyphrases)
    Sample keyphrases: ['Shift Coverage Request', 'Break Schedule Coordination']...
  1: 'Employee Development & Relations' (8 keyphrases)
    Sample keyphrases: ['Training Schedule Update', 'Performance Review Planning']...
Chat chunks count: 98
----------------------------------------
Total chat-chunks documents checked: 10
Total missing keyphrases found: 0
--------------------------------------------------------------------------------
✅ All keyphrases from main field are present in subclusters for chat-chunks data!

MongoDB connection closed


In [24]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv

# Load environment variables (python-dotenv) so the os.getenv calls below
# can see the connection settings.
load_dotenv()

# Connection settings read by convert_subcluster_id_to_string(); both must
# be present and non-empty.
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

# Fail fast with a clear message instead of passing None to MongoClient.
if not mongo_connection_string or not mongo_database_name:
    raise ValueError("MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME must be set in environment variables")

def convert_subcluster_id_to_string():
    """Rewrite numeric ``subcluster_id`` values in ``chat-chunks`` as strings.

    Every document whose ``subcluster_id`` matches the ``{"$type": "number"}``
    filter is updated one at a time, setting the field to ``str(value)``.
    A line per modified document, a summary, and a post-run count of numeric
    vs. string values are printed.

    Nothing is returned and nothing is raised: a failure on a single document
    is printed and the loop moves on, while any other failure is printed and
    ends the run. The client is closed before returning if it was created.

    NOTE(review): MongoDB's "number" type alias is broader than the "integer"
    wording used in the messages (it also covers doubles), so such a value
    would presumably be stored as e.g. '3.0' -- confirm the collection only
    holds integer ids.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['chat-chunks']

        numeric_filter = {"subcluster_id": {"$type": "number"}}

        # Only _id and subcluster_id are read below, so project just those
        # two fields instead of pulling every full document into memory.
        documents_to_update = list(
            collection.find(numeric_filter, {"_id": 1, "subcluster_id": 1})
        )

        print(f"Found {len(documents_to_update)} chat-chunks documents with integer subcluster_id")

        if not documents_to_update:
            print("No chat-chunks documents found with integer subcluster_id")
            return

        # Update each document individually so one failure does not stop the rest
        updated_count = 0
        for doc in documents_to_update:
            try:
                new_subcluster_id = str(doc['subcluster_id'])

                result = collection.update_one(
                    {"_id": doc["_id"]},
                    {"$set": {"subcluster_id": new_subcluster_id}}
                )

                if result.modified_count > 0:
                    updated_count += 1
                    print(f"Updated chat-chunk document {doc['_id']}: subcluster_id {doc['subcluster_id']} -> '{new_subcluster_id}'")

            except Exception as e:
                print(f"Error updating chat-chunk document {doc['_id']}: {e}")

        print(f"\nSummary: Successfully updated {updated_count} out of {len(documents_to_update)} chat-chunks documents")

        # Verify the changes by re-counting both representations
        remaining_int_docs = collection.count_documents(numeric_filter)
        string_docs = collection.count_documents({"subcluster_id": {"$type": "string"}})

        print("Verification for chat-chunks collection:")
        print(f"- Documents with integer subcluster_id: {remaining_int_docs}")
        print(f"- Documents with string subcluster_id: {string_docs}")

    except Exception as e:
        print(f"Error connecting to MongoDB or updating chat-chunks documents: {e}")
    finally:
        # client stays None when MongoClient() itself failed
        if client is not None:
            client.close()
            print("MongoDB connection closed")

def preview_changes():
    """Dry run: list the chat-chunks documents the conversion would touch.

    Prints the number of documents whose ``subcluster_id`` matches the
    ``{"$type": "number"}`` filter, the before/after value for at most the
    first ten of them, and the total size of the collection. No document is
    written. Any exception is printed rather than raised.
    """
    client = None
    try:
        client = MongoClient(mongo_connection_string)
        collection = client[mongo_database_name]['chat-chunks']

        # Fetch only the two fields that are displayed
        candidates = list(collection.find(
            {"subcluster_id": {"$type": "number"}},
            {"_id": 1, "subcluster_id": 1},
        ))
        candidate_total = len(candidates)

        print("PREVIEW MODE - No changes will be made to chat-chunks")
        print(f"Found {candidate_total} chat-chunks documents that would be updated:")

        # Show the first 10 only
        for candidate in candidates[:10]:
            current_value = candidate['subcluster_id']
            print(f"  Chat-chunk document {candidate['_id']}: subcluster_id {current_value} -> '{current_value!s}'")

        # Summarise whatever was left out of the listing
        if candidate_total > 10:
            print(f"  ... and {candidate_total - 10} more chat-chunks documents")

        # Collection size, for context
        total_docs = collection.count_documents({})
        print(f"\nTotal documents in chat-chunks collection: {total_docs}")

    except Exception as e:
        print(f"Error during chat-chunks preview: {e}")
    finally:
        # client stays None when the connection attempt itself failed
        if client is not None:
            client.close()

if __name__ == "__main__":
    # Title line followed by a 50-character rule
    print("MongoDB Chat-Chunks subcluster_id Converter", "=" * 50, sep="\n")

    # Step 1: show what would change without writing anything
    print("\n1. PREVIEW CHANGES FOR CHAT-CHUNKS:")
    preview_changes()

    # Step 2: require an explicit "yes" (case-insensitive, surrounding
    # whitespace ignored) before any write happens
    print("\n2. CONFIRMATION:")
    response = (
        input("Do you want to proceed with the chat-chunks conversion? (yes/no): ")
        .lower()
        .strip()
    )

    # Step 3: anything other than "yes" cancels
    if response != 'yes':
        print("Chat-chunks conversion cancelled.")
    else:
        print("\n3. EXECUTING CHAT-CHUNKS CONVERSION:")
        convert_subcluster_id_to_string()

MongoDB Chat-Chunks subcluster_id Converter

1. PREVIEW CHANGES FOR CHAT-CHUNKS:
PREVIEW MODE - No changes will be made to chat-chunks
Found 0 chat-chunks documents that would be updated:

Total documents in chat-chunks collection: 600

2. CONFIRMATION:
Chat-chunks conversion cancelled.


In [None]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Logging: INFO and above, formatted as "timestamp - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load settings via python-dotenv before reading the environment
load_dotenv()

# MongoDB connection settings; presence is checked in the __main__ section
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def copy_processed_at_field():
    """
    Copy processed_at field from 'sample ticket' collection to 'tickets' collection

    Documents are paired by identical ``_id``. For every 'sample ticket'
    document that has a ``processed_at`` field, the 'tickets' document with
    the same ``_id`` gets that value set; no document is created when there
    is no match (``upsert=False``).

    A document counts as updated when the filter matched, even if the stored
    value was already the same. Missing targets and per-document errors are
    logged and counted as failed. Nothing is returned and no exception
    propagates: unexpected errors are logged instead.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]

        # Get collections
        sample_ticket_collection = db['sample ticket']
        tickets_collection = db['tickets']

        logger.info("Connected to MongoDB successfully")

        # Only _id and processed_at are needed from the source documents
        sample_tickets = list(sample_ticket_collection.find(
            {"processed_at": {"$exists": True}},
            {"_id": 1, "processed_at": 1}
        ))

        logger.info(f"Found {len(sample_tickets)} documents with processed_at field in 'sample ticket' collection")

        if not sample_tickets:
            logger.warning("No documents found with processed_at field in 'sample ticket' collection")
            return

        updated_count = 0
        failed_count = 0

        # _id is unique within a collection, so the fetched list can be
        # walked directly without building an intermediate _id -> value map.
        for sample_doc in sample_tickets:
            doc_id = sample_doc['_id']
            try:
                result = tickets_collection.update_one(
                    {"_id": doc_id},
                    {"$set": {"processed_at": sample_doc['processed_at']}},
                    upsert=False  # Don't create new documents if they don't exist
                )

                if result.matched_count > 0:
                    updated_count += 1
                    if updated_count % 100 == 0:  # Log progress every 100 updates
                        logger.info(f"Updated {updated_count} documents so far...")
                else:
                    logger.warning(f"Document with _id {doc_id} not found in tickets collection")
                    failed_count += 1

            except Exception as e:
                logger.error(f"Failed to update document {doc_id}: {str(e)}")
                failed_count += 1

        logger.info("Operation completed:")
        logger.info(f"- Successfully updated: {updated_count} documents")
        logger.info(f"- Failed/Not found: {failed_count} documents")

        # Verify the operation
        verify_count = tickets_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'tickets' collection now have 'processed_at' field")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # client stays None when MongoClient() itself failed. Closing remains
        # best-effort, but a failure is reported instead of silently dropped,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        if client is not None:
            try:
                client.close()
                logger.info("MongoDB connection closed")
            except Exception as close_error:
                logger.warning(f"Failed to close MongoDB connection: {close_error}")

def copy_all_processed_at_regardless_of_id():
    """
    Alternative approach: Copy processed_at values based on document order/position
    Use this if documents don't have matching _ids between collections

    Both collections are read sorted by ascending ``_id`` and paired by
    position: the n-th 'sample ticket' document supplies ``processed_at`` for
    the n-th 'tickets' document. When the collections differ in size only the
    common prefix is processed, and a warning is logged. Source documents
    without a ``processed_at`` field are skipped.

    Only updates that actually changed a document (``modified_count``) are
    counted. Nothing is returned and no exception propagates: errors are
    logged instead.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]

        # Get collections
        sample_ticket_collection = db['sample ticket']
        tickets_collection = db['tickets']

        logger.info("Connected to MongoDB successfully")

        # Fetch only what the pairing needs (processed_at from the source,
        # _id from the target) rather than every full document.
        sample_tickets = list(
            sample_ticket_collection.find({}, {"processed_at": 1}).sort("_id", 1)
        )
        tickets = list(tickets_collection.find({}, {"_id": 1}).sort("_id", 1))

        logger.info(f"Sample ticket collection has {len(sample_tickets)} documents")
        logger.info(f"Tickets collection has {len(tickets)} documents")

        # Only the common prefix of the two collections can be paired
        min_count = min(len(sample_tickets), len(tickets))

        if len(sample_tickets) != len(tickets):
            logger.warning(f"Collections have different sizes. Will process {min_count} documents")

        updated_count = 0

        # zip() stops at the shorter list, i.e. after min_count pairs
        for i, (sample_doc, ticket_doc) in enumerate(zip(sample_tickets, tickets)):
            if 'processed_at' not in sample_doc:
                continue

            try:
                result = tickets_collection.update_one(
                    {"_id": ticket_doc['_id']},
                    {"$set": {"processed_at": sample_doc['processed_at']}}
                )

                if result.modified_count > 0:
                    updated_count += 1
                    if updated_count % 100 == 0:
                        logger.info(f"Updated {updated_count} documents so far...")

            except Exception as e:
                logger.error(f"Failed to update document at index {i}: {str(e)}")

        logger.info(f"Successfully updated {updated_count} documents with processed_at field")

        # Verify the operation
        verify_count = tickets_collection.count_documents({"processed_at": {"$exists": True}})
        logger.info(f"Verification: {verify_count} documents in 'tickets' collection now have 'processed_at' field")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # client stays None when MongoClient() itself failed. Closing remains
        # best-effort, but a failure is reported instead of silently dropped,
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        if client is not None:
            try:
                client.close()
                logger.info("MongoDB connection closed")
            except Exception as close_error:
                logger.warning(f"Failed to close MongoDB connection: {close_error}")

if __name__ == "__main__":
    # Check if environment variables are set. SystemExit is raised directly
    # because the exit() helper is injected by the site module for
    # interactive use and is not guaranteed to exist in every runtime.
    if not mongo_connection_string:
        logger.error("MONGO_CONNECTION_STRING environment variable is not set")
        raise SystemExit(1)

    if not mongo_database_name:
        logger.error("MONGO_DATABASE_NAME environment variable is not set")
        raise SystemExit(1)

    # Let the user pick the pairing strategy
    print("Choose the method to copy processed_at field:")
    print("1. Copy based on matching document _id (recommended)")
    print("2. Copy based on document order/position")

    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == "1":
        logger.info("Starting copy operation based on matching _id...")
        copy_processed_at_field()
    elif choice == "2":
        logger.info("Starting copy operation based on document order...")
        copy_all_processed_at_regardless_of_id()
    else:
        # Any other input is rejected without touching the database
        logger.error("Invalid choice. Please run the script again and choose 1 or 2.")

Choose the method to copy processed_at field:
1. Copy based on matching document _id (recommended)
2. Copy based on document order/position


2025-09-02 22:31:31,423 - INFO - Starting copy operation based on document order...
2025-09-02 22:31:31,427 - INFO - Connected to MongoDB successfully
2025-09-02 22:32:29,732 - INFO - Sample ticket collection has 0 documents
2025-09-02 22:32:29,734 - INFO - Tickets collection has 2000 documents
2025-09-02 22:32:29,735 - INFO - Successfully updated 0 documents with processed_at field
2025-09-02 22:32:30,003 - INFO - Verification: 2000 documents in 'tickets' collection now have 'processed_at' field
2025-09-02 22:32:30,263 - INFO - MongoDB connection closed


In [1]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging

# Logging: INFO and above, formatted as "timestamp - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load settings via python-dotenv before reading the environment
load_dotenv()

# MongoDB connection settings; presence is checked in the __main__ section
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def update_total_messages():
    """
    Updates all documents in the chat-chunks collection to add a total_messages field
    that contains the count of arrays in the raw_segments field.

    The update is one ``update_many`` call with a pipeline-style update:
    ``total_messages`` becomes the size of ``raw_segments`` when that field
    is an array and 0 otherwise. Afterwards three documents are read back
    and a warning is logged for any whose stored ``total_messages`` differs
    from the locally computed count.

    Raises:
        Exception: any error is logged and then re-raised to the caller.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['chat-chunks']

        logger.info("Connected to MongoDB successfully")

        # Get total document count for progress tracking
        total_docs = collection.count_documents({})
        logger.info(f"Total documents to process: {total_docs}")

        # The count is computed server-side, so no document is transferred
        # to the client for the update itself.
        update_pipeline = [
            {
                "$set": {
                    "total_messages": {
                        "$cond": {
                            "if": {"$isArray": "$raw_segments"},
                            "then": {"$size": "$raw_segments"},
                            "else": 0
                        }
                    }
                }
            }
        ]

        # Execute the update
        logger.info("Starting bulk update operation...")
        result = collection.update_many({}, update_pipeline)

        # Log results
        logger.info("Update completed successfully!")
        logger.info(f"Documents matched: {result.matched_count}")
        logger.info(f"Documents modified: {result.modified_count}")

        # Verify the update with a sample
        logger.info("Verifying update with sample documents:")
        sample_docs = collection.find({}, {"_id": 1, "raw_segments": 1, "total_messages": 1}).limit(3)

        for i, doc in enumerate(sample_docs, 1):
            # Mirror the pipeline's $isArray test: anything that is not a
            # list counts as 0. Calling len() on any truthy value would
            # report false mismatches for strings or sub-documents and raise
            # TypeError for scalars.
            raw_segments = doc.get('raw_segments')
            raw_segments_count = len(raw_segments) if isinstance(raw_segments, list) else 0
            total_messages = doc.get('total_messages', 0)
            logger.info(f"Sample {i}: ID={doc['_id']}, raw_segments array length={raw_segments_count}, total_messages={total_messages}")

            # Verify consistency
            if raw_segments_count != total_messages:
                logger.warning(f"Mismatch found in document {doc['_id']}")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise
    finally:
        # client stays None when MongoClient() itself failed
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

def update_total_messages_batch_approach():
    """
    Alternative approach using batch processing for very large collections
    This processes documents in batches to avoid memory issues

    Documents are read 1000 at a time in ascending ``_id`` order. For each
    one, ``total_messages`` is set to ``len(raw_segments)`` when that field
    is a list and to 0 otherwise, and every batch is written with a single
    ``bulk_write`` call.

    Raises:
        Exception: any error is logged and then re-raised to the caller.
    """
    # Imported locally so the block does not depend on the file-level import
    # list; bulk_write() only accepts operation objects such as UpdateOne and
    # rejects plain {"updateOne": ...} dicts with TypeError.
    from pymongo import UpdateOne

    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['chat-chunks']

        logger.info("Connected to MongoDB successfully (Batch Approach)")

        batch_size = 1000  # Process 1000 documents at a time
        skip = 0
        total_processed = 0

        while True:
            # Sort by _id so skip/limit pages are stable: without an explicit
            # order, consecutive pages may overlap or miss documents.
            documents = list(
                collection.find({}, {"_id": 1, "raw_segments": 1})
                .sort("_id", 1)
                .skip(skip)
                .limit(batch_size)
            )

            if not documents:
                break

            # Prepare bulk operations
            bulk_operations = []

            for doc in documents:
                raw_segments = doc.get('raw_segments', [])
                total_messages = len(raw_segments) if isinstance(raw_segments, list) else 0

                bulk_operations.append(
                    UpdateOne(
                        {"_id": doc["_id"]},
                        {"$set": {"total_messages": total_messages}},
                    )
                )

            # Execute bulk operations; the list is never empty here because
            # an empty page already ended the loop
            result = collection.bulk_write(bulk_operations)
            total_processed += result.modified_count
            logger.info(f"Processed batch: {len(documents)} documents, Modified: {result.modified_count}")

            skip += batch_size

        logger.info(f"Batch update completed! Total documents processed: {total_processed}")

    except Exception as e:
        logger.error(f"An error occurred in batch approach: {str(e)}")
        raise
    finally:
        # client stays None when MongoClient() itself failed
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

if __name__ == "__main__":
    # Validate environment variables. SystemExit is raised directly because
    # the exit() helper is injected by the site module for interactive use
    # and is not guaranteed to exist in every runtime.
    if not mongo_connection_string:
        logger.error("MONGO_CONNECTION_STRING environment variable is not set")
        raise SystemExit(1)

    if not mongo_database_name:
        logger.error("MONGO_DATABASE_NAME environment variable is not set")
        raise SystemExit(1)

    # update_total_messages() performs the whole update in one server-side
    # call and is the default. For very large collections (millions of
    # documents), call update_total_messages_batch_approach() instead.
    try:
        logger.info("Starting update process...")
        update_total_messages()
        logger.info("Update process completed successfully!")

    except Exception as e:
        # Report the failure and end with a non-zero exit status
        logger.error(f"Update process failed: {str(e)}")
        raise SystemExit(1) from e

2025-09-04 12:07:17,237 - INFO - Starting update process...
2025-09-04 12:07:17,443 - INFO - Connected to MongoDB successfully
2025-09-04 12:07:19,023 - INFO - Total documents to process: 600
2025-09-04 12:07:19,025 - INFO - Starting bulk update operation...
2025-09-04 12:07:19,824 - INFO - Update completed successfully!
2025-09-04 12:07:19,825 - INFO - Documents matched: 600
2025-09-04 12:07:19,827 - INFO - Documents modified: 600
2025-09-04 12:07:19,829 - INFO - Verifying update with sample documents:
2025-09-04 12:07:20,075 - INFO - Sample 1: ID=68a8525133508dc58e3ead33, raw_segments array length=9, total_messages=9
2025-09-04 12:07:20,076 - INFO - Sample 2: ID=68a8525133508dc58e3ead34, raw_segments array length=9, total_messages=9
2025-09-04 12:07:20,078 - INFO - Sample 3: ID=68a8525133508dc58e3ead35, raw_segments array length=9, total_messages=9
2025-09-04 12:07:20,323 - INFO - MongoDB connection closed
2025-09-04 12:07:20,328 - INFO - Update process completed successfully!
