In [None]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Read settings from a .env file into the process environment
load_dotenv()

# MongoDB connection settings; each is None when its variable is not set
MONGO_CONNECTION_STRING = os.environ.get('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.environ.get('MONGO_DATABASE_NAME')

def remove_socialmedia_ids_from_all_records():
    """
    Strip the 'socialmedia_ids' field from every document in the cluster collection.

    Opens its own MongoDB connection from the module-level settings, prints
    progress and result counts to stdout, and closes the connection before
    returning. Exceptions are caught and printed rather than propagated.
    Always returns None.
    """
    client = None  # stays None when the client could not be constructed
    try:
        client = MongoClient(MONGO_CONNECTION_STRING)
        clusters = client[MONGO_DATABASE_NAME].cluster

        # Nothing to do on an empty collection
        total = clusters.count_documents({})
        print(f"Found {total} total records in cluster collection")
        if total == 0:
            print("No records found in cluster collection. Nothing to update.")
            return

        # Nothing to do when no document carries the field
        with_field = clusters.count_documents({"socialmedia_ids": {"$exists": True}})
        print(f"Found {with_field} records that have 'socialmedia_ids' field")
        if with_field == 0:
            print("No records with 'socialmedia_ids' field found. Nothing to update.")
            return

        # Empty filter targets every document in the collection
        outcome = clusters.update_many(
            {},
            {"$unset": {"socialmedia_ids": ""}},
        )

        print("\nUpdate operation completed:")
        print(f"Records matched: {outcome.matched_count}")
        print(f"Records modified: {outcome.modified_count}")

        if outcome.modified_count <= 0:
            print("No records were modified. This could mean:")
            print("- The records don't have the 'socialmedia_ids' field")
            print("- Or the field was already removed")
        else:
            print(f"Successfully removed 'socialmedia_ids' field from {outcome.modified_count} records.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Only close (and report closing) a client that was actually created
        if client is not None:
            client.close()
            print("\nMongoDB connection closed.")

def preview_all_records_before_update():
    """
    Print up to 10 cluster documents that currently have a 'socialmedia_ids' field.

    Opens its own MongoDB connection from the module-level settings and closes
    it before returning. Exceptions are caught and printed rather than
    propagated. Always returns None.
    """
    client = None  # stays None when the client could not be constructed
    try:
        client = MongoClient(MONGO_CONNECTION_STRING)
        clusters = client[MONGO_DATABASE_NAME].cluster

        # Cap the preview at the first 10 matching documents
        cursor = clusters.find({"socialmedia_ids": {"$exists": True}}).limit(10)

        print("Preview of ALL records that will be updated:")
        print("-" * 60)

        shown = 0
        for shown, doc in enumerate(cursor, start=1):
            print(f"Record {shown}:")
            print(f"  _id: {doc.get('_id')}")
            # Remaining fields fall back to 'N/A' when absent
            for field in ('data', 'cluster_id', 'socialmedia_ids'):
                print(f"  {field}: {doc.get(field, 'N/A')}")
            print()

        if shown:
            print(f"Showing {shown} records (there may be more)")
        else:
            print("No records found that have the 'socialmedia_ids' field.")

    except Exception as e:
        print(f"An error occurred during preview: {str(e)}")

    finally:
        if client is not None:
            client.close()

if __name__ == "__main__":
    print("MongoDB Operation: Remove 'socialmedia_ids' field from ALL records")
    print("=" * 70)

    # Step 1: show a sample of what would be changed
    print("\n1. PREVIEWING ALL RECORDS:")
    preview_all_records_before_update()

    # Step 2: require an explicit yes/y before touching any data
    print("\n" + "=" * 70)
    confirmation = input("Do you want to proceed with removing 'socialmedia_ids' field from ALL records? (yes/no): ").lower().strip()

    if confirmation not in ('yes', 'y'):
        print("Operation cancelled by user.")
    else:
        print("\n2. EXECUTING UPDATE OPERATION:")
        remove_socialmedia_ids_from_all_records()


In [7]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random

# Logging: timestamped messages at INFO level and above
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Read settings from a .env file into the process environment
load_dotenv()

# MongoDB connection settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def connect_to_mongodb():
    """
    Open a MongoDB connection and return the configured database handle.

    Uses the module-level ``mongo_connection_string`` and
    ``mongo_database_name`` settings. ``MongoClient`` does not contact the
    server when it is constructed, so a ``ping`` command is issued to confirm
    the server is reachable before success is logged.

    Returns:
        The database object ``client[mongo_database_name]``. The underlying
        client is left open for the caller to use.

    Raises:
        Exception: any error from creating the client, pinging the server or
            selecting the database is logged and re-raised.
    """
    client = None
    try:
        client = MongoClient(mongo_connection_string)
        # Force a round trip now so connection problems surface here rather
        # than at the first query made by the caller.
        client.admin.command('ping')
        db = client[mongo_database_name]
        logger.info("Successfully connected to MongoDB")
        return db
    except Exception as e:
        logger.error(f"Failed to connect to MongoDB: {e}")
        # Do not leave a half-initialised client behind on failure
        if client is not None:
            client.close()
        raise

def get_all_email_records(db):
    """
    Fetch every document from the 'emailmessages' collection as a list.

    Only ``sender_id`` and ``sender_name`` are requested; ``_id`` is excluded.
    Errors are logged and re-raised.
    """
    try:
        # Projection: keep the two sender fields, drop _id
        wanted_fields = {"sender_id": 1, "sender_name": 1, "_id": 0}
        cursor = db['emailmessages'].find({}, wanted_fields)
        email_records = list(cursor)

        logger.info(f"Retrieved {len(email_records)} email records from emailmessages collection")
        return email_records
    except Exception as e:
        logger.error(f"Failed to retrieve email records: {e}")
        raise

def create_random_samples_for_remaining(existing_emails, needed_count):
    """
    Build ``needed_count`` extra records by sampling ``existing_emails`` with replacement.

    Each result is a new dict holding only the ``sender_id`` and
    ``sender_name`` of a randomly chosen existing record. Errors (for
    example an empty ``existing_emails`` or a missing key) are logged and
    re-raised.
    """
    try:
        # One random.choice per sample, so the same record may be picked repeatedly
        additional_samples = [
            {"sender_id": picked["sender_id"], "sender_name": picked["sender_name"]}
            for picked in (random.choice(existing_emails) for _ in range(needed_count))
        ]

        logger.info(f"Created {len(additional_samples)} additional random samples")
        return additional_samples

    except Exception as e:
        logger.error(f"Failed to create random samples: {e}")
        raise

def update_socialmedia_collection(db, all_email_data, target_count=2237, original_count=2004):
    """
    Assign email sender data to randomly chosen documents in 'socialmedia'.

    Steps:
      1. Remove any existing ``username``, ``email_id`` and ``data_source``
         fields from every document.
      2. If the collection holds fewer than ``target_count`` documents, insert
         placeholder documents (``{"created_for_distribution": True}``) to
         reach that size.
      3. Shuffle the document ids, pair them one-to-one with
         ``all_email_data`` and write ``username``, ``email_id`` and
         ``data_source`` to each paired document in a single bulk write.
      4. Log the resulting counts.

    Args:
        db: Database handle exposing the ``socialmedia`` collection.
        all_email_data: Sequence of dicts with ``sender_name`` and
            ``sender_id`` keys.
        target_count: Minimum number of documents the collection should hold.
        original_count: Number of leading entries of ``all_email_data`` tagged
            ``"original"``; later entries are tagged ``"random_sample"``.
            Defaults to 2004, the value that used to be hard-coded.

    Returns:
        True on success.

    Raises:
        ValueError: if there are more email entries than documents available
            to receive them (checked before any assignment is written).
        Exception: any other error is logged and re-raised.
    """
    # pymongo is already a dependency of this file; imported here so the
    # function carries everything it needs.
    from pymongo import UpdateOne

    try:
        socialmedia_collection = db['socialmedia']

        # Get current count of socialmedia records
        current_count = socialmedia_collection.count_documents({})
        logger.info(f"Current socialmedia records: {current_count}")

        # Clear existing email-related fields if any
        socialmedia_collection.update_many(
            {},
            {"$unset": {"username": "", "email_id": "", "data_source": ""}}
        )

        # If we have fewer records than target, create new ones
        if current_count < target_count:
            records_to_create = target_count - current_count
            logger.info(f"Creating {records_to_create} additional socialmedia records")

            new_records = [{"created_for_distribution": True} for _ in range(records_to_create)]
            result = socialmedia_collection.insert_many(new_records)
            logger.info(f"Created {len(result.inserted_ids)} new records")

        # Collect every document id, then shuffle for a random assignment
        socialmedia_ids = [record["_id"] for record in socialmedia_collection.find({}, {"_id": 1})]

        # Fail up front instead of hitting an IndexError part-way through
        if len(all_email_data) > len(socialmedia_ids):
            raise ValueError(
                f"Cannot distribute {len(all_email_data)} email records over "
                f"{len(socialmedia_ids)} socialmedia documents"
            )

        random.shuffle(socialmedia_ids)

        # One UpdateOne per (document, email) pair, sent as a single bulk
        # write instead of one network round trip per document.
        operations = [
            UpdateOne(
                {"_id": socialmedia_id},
                {"$set": {
                    "username": email_data["sender_name"],
                    "email_id": email_data["sender_id"],
                    # Track whether the entry is original or sampled
                    "data_source": "original" if i < original_count else "random_sample",
                }},
            )
            for i, (socialmedia_id, email_data) in enumerate(zip(socialmedia_ids, all_email_data))
        ]

        # bulk_write rejects an empty operation list, so skip it in that case
        updated_count = 0
        if operations:
            updated_count = socialmedia_collection.bulk_write(operations).modified_count

        logger.info(f"Updated {updated_count} socialmedia records with email data")

        # Verify the final counts
        final_count = socialmedia_collection.count_documents({})
        original_data_count = socialmedia_collection.count_documents({"data_source": "original"})
        sampled_data_count = socialmedia_collection.count_documents({"data_source": "random_sample"})

        logger.info(f"Final socialmedia collection count: {final_count}")
        logger.info(f"Records with original email data: {original_data_count}")
        logger.info(f"Records with randomly sampled data: {sampled_data_count}")

        return True

    except Exception as e:
        logger.error(f"Failed to update socialmedia collection: {e}")
        raise

def main():
    """
    Populate the 'socialmedia' collection with sender data from 'emailmessages'.

    Reads all email records, pads them with random re-samples (or truncates
    them) to exactly 2237 entries, hands them to
    ``update_socialmedia_collection`` and logs summary counts plus up to five
    sample documents. Any failure is logged and re-raised.
    """
    try:
        db = connect_to_mongodb()
        social = db['socialmedia']

        # Starting sizes of both collections
        emailmessages_count = db['emailmessages'].count_documents({})
        socialmedia_count = social.count_documents({})
        logger.info(f"Initial emailmessages collection count: {emailmessages_count}")
        logger.info(f"Initial socialmedia collection count: {socialmedia_count}")

        all_email_records = get_all_email_records(db)
        actual_email_count = len(all_email_records)
        logger.info(f"Retrieved {actual_email_count} email records")

        # Work out the gap between what we have and what we want
        target_total = 2237
        additional_needed = target_total - actual_email_count
        logger.info(f"Target total records: {target_total}")
        logger.info(f"Actual email records: {actual_email_count}")
        logger.info(f"Additional samples needed: {additional_needed}")

        if additional_needed > 0:
            # Too few: top up with random re-samples of the existing records
            final_email_data = all_email_records + create_random_samples_for_remaining(
                all_email_records, additional_needed
            )
        elif additional_needed < 0:
            # Too many: keep only the first target_total records
            logger.warning(f"Have more emails ({actual_email_count}) than target ({target_total}). Using first {target_total} records.")
            final_email_data = all_email_records[:target_total]
        else:
            final_email_data = all_email_records.copy()

        logger.info(f"Final email data prepared: {len(final_email_data)} records")

        update_socialmedia_collection(db, final_email_data, target_total)

        # Gather all summary counts first, then log them together
        summary_queries = [
            ("Total socialmedia records", {}),
            ("Records with username field", {"username": {"$exists": True}}),
            ("Records with email_id field", {"email_id": {"$exists": True}}),
            ("Records with original email data", {"data_source": "original"}),
            ("Records with randomly sampled data", {"data_source": "random_sample"}),
        ]
        summary = [(label, social.count_documents(query)) for label, query in summary_queries]

        logger.info("=== FINAL RESULTS ===")
        for label, value in summary:
            logger.info(f"{label}: {value}")

        # Show a handful of populated documents as a spot check
        logger.info("=== SAMPLE RECORDS ===")
        sample_cursor = social.find(
            {"username": {"$exists": True}},
            {"username": 1, "email_id": 1, "data_source": 1, "_id": 0}
        ).limit(5)

        for position, doc in enumerate(list(sample_cursor), 1):
            logger.info(f"Sample {position}: username='{doc.get('username')}', email_id='{doc.get('email_id')}', source='{doc.get('data_source')}'")

    except Exception as e:
        logger.error(f"Script execution failed: {e}")
        raise

# Run the distribution only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()

2025-09-05 22:12:16,559 - INFO - Successfully connected to MongoDB


2025-09-05 22:12:18,351 - INFO - Initial emailmessages collection count: 2004
2025-09-05 22:12:18,352 - INFO - Initial socialmedia collection count: 0
2025-09-05 22:12:19,366 - INFO - Retrieved 2004 email records from emailmessages collection
2025-09-05 22:12:19,367 - INFO - Retrieved 2004 email records
2025-09-05 22:12:19,368 - INFO - Target total records: 2237
2025-09-05 22:12:19,368 - INFO - Actual email records: 2004
2025-09-05 22:12:19,369 - INFO - Additional samples needed: 233
2025-09-05 22:12:19,371 - INFO - Created 233 additional random samples
2025-09-05 22:12:19,372 - INFO - Final email data prepared: 2237 records
2025-09-05 22:12:19,620 - INFO - Current socialmedia records: 0
2025-09-05 22:12:19,869 - INFO - Creating 2237 additional socialmedia records
2025-09-05 22:12:21,043 - INFO - Created 2237 new records
2025-09-05 22:22:26,519 - INFO - Updated 2237 socialmedia records with email data
2025-09-05 22:22:27,266 - INFO - Final socialmedia collection count: 2237
2025-09-05 

In [8]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random

# Logging: timestamped messages at INFO level and above
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Read settings from a .env file into the process environment
load_dotenv()

# MongoDB connection settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def update_channel_field():
    """
    Set the ``channel`` field on randomly chosen 'socialmedia' documents.

    Document ids are shuffled and consecutive slices of the shuffled list are
    assigned to each channel according to a fixed per-channel quota. Documents
    beyond the combined quota are left untouched. Existing ``channel`` values
    on selected documents are overwritten.

    Returns:
        True when the updates ran; False when the collection holds fewer
        documents than the combined quota or when any error occurred (errors
        are logged, not raised).
    """
    # Per-channel quotas, applied in this order
    channel_distribution = {
        "Trustpilot": 1147,
        "App Store/Google Play": 701,
        "Twitter": 257,
        "Reddit": 132
    }

    total_expected_records = sum(channel_distribution.values())
    logger.info(f"Expected total records to update: {total_expected_records}")

    client = None  # stays None when the client could not be constructed
    try:
        client = MongoClient(mongo_connection_string)
        collection = client[mongo_database_name]['socialmedia']

        current_count = collection.count_documents({})
        logger.info(f"Current collection count: {current_count}")

        # Refuse to run when the quotas cannot all be filled
        if current_count < total_expected_records:
            logger.error(f"Not enough documents in collection. Need {total_expected_records}, but only have {current_count}")
            return False

        # Shuffle the ids so each channel receives a random subset
        all_ids = [doc["_id"] for doc in collection.find({}, {"_id": 1})]
        random.shuffle(all_ids)

        update_count = 0
        start = 0
        for channel, count in channel_distribution.items():
            logger.info(f"Updating {count} records with channel: {channel}")

            # Next `count` shuffled ids belong to this channel
            end = start + count
            channel_ids = all_ids[start:end]
            start = end

            result = collection.update_many(
                {"_id": {"$in": channel_ids}},
                {"$set": {"channel": channel}}
            )

            update_count += result.modified_count
            logger.info(f"Updated {result.modified_count} documents with channel: {channel}")

        logger.info(f"Total documents updated: {update_count}")

        verify_distribution(collection)

        return True

    except Exception as e:
        logger.error(f"Error updating channel field: {str(e)}")
        return False

    finally:
        if client is not None:
            client.close()

def verify_distribution(collection):
    """
    Log how many documents in ``collection`` carry each ``channel`` value.

    Counts are produced by an aggregation grouped on ``channel`` and logged
    from the largest group to the smallest.
    """
    logger.info("Verifying channel distribution...")

    group_by_channel = {"$group": {"_id": "$channel", "count": {"$sum": 1}}}
    largest_first = {"$sort": {"count": -1}}

    # Materialise the full result before logging any line
    buckets = list(collection.aggregate([group_by_channel, largest_first]))

    for bucket in buckets:
        logger.info(f"Channel: {bucket['_id']} | Count: {bucket['count']}")

def add_channel_field_to_existing_without_channel():
    """
    Assign channels only to 'socialmedia' documents that have no ``channel`` field.

    The channel-less documents are shuffled and consecutive slices are given
    to each channel according to a fixed per-channel quota.

    Returns:
        True when the updates ran; False when there are fewer channel-less
        documents than the combined quota or when any error occurred (errors
        are logged, not raised).
    """
    # Per-channel quotas, applied in this order
    channel_distribution = {
        "Trustpilot": 1147,
        "App Store/Google Play": 701,
        "Twitter": 257,
        "Reddit": 132
    }

    client = None  # stays None when the client could not be constructed
    try:
        client = MongoClient(mongo_connection_string)
        collection = client[mongo_database_name]['socialmedia']

        # Candidates: documents that do not have a channel yet
        candidates = list(collection.find({"channel": {"$exists": False}}, {"_id": 1}))

        if len(candidates) < sum(channel_distribution.values()):
            logger.error(f"Not enough documents without channel field. Need {sum(channel_distribution.values())}, but only have {len(candidates)}")
            return False

        # Shuffle so each channel receives a random subset
        random.shuffle(candidates)

        start = 0
        for channel, count in channel_distribution.items():
            end = start + count
            channel_ids = [doc["_id"] for doc in candidates[start:end]]
            start = end

            result = collection.update_many(
                {"_id": {"$in": channel_ids}},
                {"$set": {"channel": channel}}
            )

            logger.info(f"Updated {result.modified_count} documents with channel: {channel}")

        verify_distribution(collection)
        return True

    except Exception as e:
        logger.error(f"Error: {str(e)}")
        return False

    finally:
        if client is not None:
            client.close()

if __name__ == "__main__":
    logger.info("Starting channel field update process...")

    # Approach 1 (active): assign channels across all documents, overwriting
    # any existing channel values.
    # Approach 2 (alternative): call
    # add_channel_field_to_existing_without_channel() instead to touch only
    # documents that have no channel field yet.
    success = update_channel_field()

    if not success:
        logger.error("Channel field update failed!")
    else:
        logger.info("Channel field update completed successfully!")

2025-09-05 22:22:48,015 - INFO - Starting channel field update process...
2025-09-05 22:22:48,016 - INFO - Expected total records to update: 2237
2025-09-05 22:22:49,669 - INFO - Current collection count: 2237
2025-09-05 22:22:50,719 - INFO - Updating 1147 records with channel: Trustpilot
2025-09-05 22:22:51,251 - INFO - Updated 1147 documents with channel: Trustpilot
2025-09-05 22:22:51,252 - INFO - Updating 701 records with channel: App Store/Google Play
2025-09-05 22:22:51,524 - INFO - Updated 701 documents with channel: App Store/Google Play
2025-09-05 22:22:51,527 - INFO - Updating 257 records with channel: Twitter
2025-09-05 22:22:51,780 - INFO - Updated 257 documents with channel: Twitter
2025-09-05 22:22:51,782 - INFO - Updating 132 records with channel: Reddit
2025-09-05 22:22:52,296 - INFO - Updated 132 documents with channel: Reddit
2025-09-05 22:22:52,297 - INFO - Total documents updated: 2237
2025-09-05 22:22:52,298 - INFO - Verifying channel distribution...
2025-09-05 22:

In [30]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
import itertools

# Logging: timestamped messages at INFO level and above
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Read settings from a .env file into the process environment
load_dotenv()

# MongoDB connection settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

class CSVToMongoProcessor:
    def __init__(self):
        """
        Connect to MongoDB and bind the 'socialmedia' collection.

        Sets ``self.client``, ``self.db`` and ``self.collection`` from the
        module-level connection settings. ``MongoClient`` does not contact the
        server when it is constructed, so a ``ping`` command is issued to
        confirm the server is reachable before success is logged.

        Raises:
            Exception: any connection or lookup error is logged and re-raised;
                the client is closed first if it had been created.
        """
        client = None
        try:
            client = MongoClient(mongo_connection_string)
            # Force a round trip now so connection problems surface here
            # rather than at the first query.
            client.admin.command('ping')
            self.client = client
            self.db = self.client[mongo_database_name]
            self.collection = self.db['socialmedia']
            logger.info("Successfully connected to MongoDB")
        except Exception as e:
            logger.error(f"Failed to connect to MongoDB: {e}")
            # Do not leave a half-initialised client behind on failure
            if client is not None:
                client.close()
            raise

    def load_csv_data(self, csv_file_path):
        """
        Read the topic CSV and return its rows as a list of dicts.

        Column names are stripped of surrounding whitespace before use. Each
        returned dict has the keys ``dominant_topic``, ``count`` (int),
        ``channel_distribution`` and ``subtopics``, taken from the CSV columns
        'dominant topic', 'count', 'Channel Distribution' and 'Subtopics';
        text values are converted with ``str`` and stripped.

        Errors (unreadable file, missing column, non-integer count) are
        logged and re-raised.
        """
        try:
            df = pd.read_csv(csv_file_path)
            logger.info(f"Successfully loaded CSV with {len(df)} rows")

            # Tolerate stray spaces around header names
            df.columns = df.columns.str.strip()

            return [
                {
                    'dominant_topic': str(row['dominant topic']).strip(),
                    'count': int(row['count']),
                    'channel_distribution': str(row['Channel Distribution']).strip(),
                    'subtopics': str(row['Subtopics']).strip()
                }
                for _, row in df.iterrows()
            ]

        except Exception as e:
            logger.error(f"Error loading CSV data: {e}")
            raise

    def generate_unique_subtopic_combinations(self, subtopics_string, count):
        """
        Build ``count`` subtopic strings from a comma-separated subtopic list.

        Candidates are the individual subtopics plus every 2- and 3-subtopic
        combination (and the 4-subtopic combinations when those are not yet
        enough), each joined with ", ". If more are still needed, random
        groups of 2-5 subtopics are added, preferring an ordering that has not
        been used yet. The candidates are then shuffled and the first
        ``count`` are returned.

        Repeats can occur once no unused ordering is found for a random group,
        and are unavoidable when only one subtopic is supplied.

        Args:
            subtopics_string: Comma-separated subtopics.
            count: Number of strings to return; zero or less yields ``[]``.

        Returns:
            A list of ``count`` strings.

        Raises:
            Exception: any error is logged and re-raised.
        """
        try:
            # A negative count would otherwise slice off the tail of the list
            if count <= 0:
                return []

            # Split subtopics by comma and clean them
            subtopics_list = [topic.strip() for topic in subtopics_string.split(',')]
            logger.info(f"Generating {count} unique combinations from {len(subtopics_list)} subtopics")

            # 1. Individual subtopics
            all_possible_combinations = list(subtopics_list)

            # 2. Pairs and 3. triples (combinations() is empty when the list is too short)
            for size in (2, 3):
                all_possible_combinations.extend(
                    ", ".join(combo) for combo in itertools.combinations(subtopics_list, size)
                )

            # 4. Quadruples, only if still short of the requested count
            if len(all_possible_combinations) < count:
                all_possible_combinations.extend(
                    ", ".join(combo) for combo in itertools.combinations(subtopics_list, 4)
                )

            if len(subtopics_list) < 2:
                # A lone subtopic cannot form groups of two or more (the random
                # fill below would call randint(2, 1) and fail), so repeat it.
                shortfall = count - len(all_possible_combinations)
                if shortfall > 0:
                    logger.warning(
                        f"Only one subtopic available; repeating it to reach {count} entries"
                    )
                    all_possible_combinations.extend([subtopics_list[0]] * shortfall)

            # 5. Random groups of 2-5 subtopics until there are enough
            seen = set(all_possible_combinations)  # O(1) membership checks
            largest_group = min(5, len(subtopics_list))
            while len(all_possible_combinations) < count:
                combo = random.sample(subtopics_list, random.randint(2, largest_group))

                ascending = ", ".join(sorted(combo))
                descending = ", ".join(sorted(combo, reverse=True))
                if ascending not in seen:
                    chosen = ascending
                elif descending not in seen:
                    chosen = descending
                else:
                    # Both sorted orders are taken: accept a random ordering,
                    # which may repeat an existing entry
                    random.shuffle(combo)
                    chosen = ", ".join(combo)

                seen.add(chosen)
                all_possible_combinations.append(chosen)

            # Shuffle to randomize selection and take required count
            random.shuffle(all_possible_combinations)
            unique_combinations = all_possible_combinations[:count]

            logger.info(f"Successfully generated {len(unique_combinations)} unique subtopic combinations")
            return unique_combinations

        except Exception as e:
            logger.error(f"Error generating subtopic combinations: {e}")
            raise

    def get_channel_status(self, channel):
        """
        Count total, processed and unprocessed documents for ``channel``.

        A document counts as processed when it has a ``dominant_topic`` field.
        The three figures are logged and returned as a dict with the keys
        ``total``, ``processed`` and ``unprocessed``. On any error the error
        is logged and a dict of zeros is returned instead of raising.
        """
        try:
            in_channel = {'channel': channel}
            total_records = self.collection.count_documents(in_channel)
            processed_records = self.collection.count_documents(
                {**in_channel, 'dominant_topic': {'$exists': True}}
            )

            status = {
                'total': total_records,
                'processed': processed_records,
                # Whatever has no dominant_topic yet
                'unprocessed': total_records - processed_records
            }

            logger.info(f"Channel '{channel}' Status:")
            logger.info(f"  Total Records: {status['total']}")
            logger.info(f"  Processed Records: {status['processed']}")
            logger.info(f"  Unprocessed Records: {status['unprocessed']}")

            return status

        except Exception as e:
            logger.error(f"Error getting channel status: {e}")
            return {'total': 0, 'processed': 0, 'unprocessed': 0}

    def update_next_batch_records(self, channel, dominant_topic, subtopic_combinations, target_count):
        """
        Tag up to ``target_count`` unprocessed documents of ``channel``.

        A document is unprocessed when it has neither a ``dominant_topic`` nor
        a ``subtopics`` field. Each selected document receives
        ``dominant_topic`` and one entry of ``subtopic_combinations`` (by
        position, falling back to the first entry when the list is shorter).
        Every update re-checks that the document is still unprocessed.
        Per-document failures are logged and counted, not raised.

        Returns:
            The number of documents actually modified.

        Raises:
            Exception: errors outside the per-document loop are logged and
                re-raised.
        """
        try:
            logger.info(f"Updating next {target_count} unprocessed records for channel: {channel}")

            # Shared condition for "not processed yet"
            still_unprocessed = {
                'dominant_topic': {'$exists': False},
                'subtopics': {'$exists': False}
            }

            pending = list(
                self.collection.find({'channel': channel, **still_unprocessed}).limit(target_count)
            )

            # Shrink the request when fewer documents are available than asked for
            if len(pending) < target_count:
                logger.warning(f"Only {len(pending)} unprocessed records available for {channel}, requested {target_count}")
                target_count = len(pending)
                subtopic_combinations = subtopic_combinations[:target_count]

            logger.info(f"Found {len(pending)} unprocessed records to update")

            updated_count = 0
            failed_updates = 0

            for position, doc in enumerate(pending):
                try:
                    # Use the matching combination, or the first one when the list is too short
                    pick = position if position < len(subtopic_combinations) else 0
                    subtopic_to_use = subtopic_combinations[pick]

                    # Filter repeats the unprocessed check so an already-tagged
                    # document is never overwritten
                    result = self.collection.update_one(
                        {'_id': doc['_id'], **still_unprocessed},
                        {'$set': {'dominant_topic': dominant_topic, 'subtopics': subtopic_to_use}}
                    )

                    if result.modified_count > 0:
                        updated_count += 1
                        logger.debug(f"Updated record {position+1}/{len(pending)}: {doc['_id']}")
                    else:
                        failed_updates += 1
                        logger.warning(f"Failed to update record {doc['_id']} - may have been processed by another operation")

                except Exception as e:
                    failed_updates += 1
                    logger.error(f"Error updating record {doc['_id']}: {e}")

            # Cross-check against the database
            verification_count = self.collection.count_documents({
                'channel': channel,
                'dominant_topic': dominant_topic
            })

            logger.info("Update Results:")
            logger.info(f"  Records Successfully Updated: {updated_count}")
            logger.info(f"  Failed Updates: {failed_updates}")
            logger.info(f"  Verification Count in DB: {verification_count} records with dominant_topic '{dominant_topic}'")

            return updated_count

        except Exception as e:
            logger.error(f"Error updating batch records: {e}")
            raise

    def process_csv_sequentially(self, csv_file_path):
        """Walk the CSV rows in file order and tag the next unprocessed records for each row.

        For every row the channel's current status is fetched, the requested
        count is capped at the number of unprocessed records still available,
        that many subtopic combinations are generated, and the next batch of
        records is updated with the row's dominant topic.

        Args:
            csv_file_path: Path handed to ``self.load_csv_data``.

        Raises:
            Exception: Any error is logged and then re-raised unchanged.
        """
        try:
            rows = self.load_csv_data(csv_file_path)

            # Report the starting state of every channel named in the CSV
            channels = {entry['channel_distribution'] for entry in rows}
            logger.info(f"Processing channels: {channels}")
            for name in channels:
                self.get_channel_status(name)

            total_rows = len(rows)
            for position, entry in enumerate(rows, start=1):
                channel = entry['channel_distribution']
                dominant_topic = entry['dominant_topic']
                requested = entry['count']
                subtopics_string = entry['subtopics']

                logger.info(f"\n--- Processing Row {position}/{total_rows} ---")
                logger.info(f"Dominant Topic: {dominant_topic}")
                logger.info(f"Channel: {channel}")
                logger.info(f"Count: {requested}")

                # How many records of this channel are still untagged right now
                available = self.get_channel_status(channel)['unprocessed']

                if available == 0:
                    logger.warning(f"No unprocessed records available for channel: {channel}. Skipping...")
                    continue

                # Never ask for more records than the channel still has
                if available < requested:
                    logger.warning(f"Only {available} unprocessed records available, but need {requested}")
                    batch_size = available
                else:
                    batch_size = requested

                logger.info(f"Generating {batch_size} unique subtopic combinations...")
                combos = self.generate_unique_subtopic_combinations(subtopics_string, batch_size)

                written = self.update_next_batch_records(channel, dominant_topic, combos, batch_size)
                logger.info(f"Completed processing '{dominant_topic}': {written} records updated")

                # Called for its logging side effect: shows the channel status after the update
                self.get_channel_status(channel)

        except Exception as exc:
            logger.error(f"Error in sequential processing: {exc}")
            raise

    def get_final_summary(self):
        """Log a per-channel breakdown of records that carry a ``dominant_topic`` field.

        Runs one aggregation over ``self.collection`` that counts documents per
        (channel, dominant_topic) pair and rolls those counts up per channel,
        then logs totals, remaining records and the topics ordered by count
        (largest first). Any error is logged and not re-raised.
        """
        try:
            # Stage 1: keep only documents that have a dominant_topic field
            only_tagged = {
                '$match': {
                    'dominant_topic': {'$exists': True}
                }
            }
            # Stage 2: count documents per (channel, dominant_topic) pair
            per_channel_topic = {
                '$group': {
                    '_id': {
                        'channel': '$channel',
                        'dominant_topic': '$dominant_topic'
                    },
                    'count': {'$sum': 1}
                }
            }
            # Stage 3: collect the topic counts of each channel and total them
            per_channel = {
                '$group': {
                    '_id': '$_id.channel',
                    'topics': {
                        '$push': {
                            'topic': '$_id.dominant_topic',
                            'count': '$count'
                        }
                    },
                    'total_processed': {'$sum': '$count'}
                }
            }
            # Stage 4: order channels by name
            by_channel_name = {'$sort': {'_id': 1}}

            summary_rows = list(self.collection.aggregate(
                [only_tagged, per_channel_topic, per_channel, by_channel_name]
            ))

            logger.info("\n=== FINAL PROCESSING SUMMARY ===")
            for entry in summary_rows:
                channel = entry['_id']
                processed = entry['total_processed']
                topic_counts = entry['topics']

                # Everything in the channel, tagged or not
                channel_total = self.collection.count_documents({'channel': channel})
                left_over = channel_total - processed

                logger.info(f"\nChannel: {channel}")
                logger.info(f"  Total Records: {channel_total}")
                logger.info(f"  Processed Records: {processed}")
                logger.info(f"  Remaining Records: {left_over}")
                logger.info("  Dominant Topics:")

                ranked = sorted(topic_counts, key=lambda item: item['count'], reverse=True)
                for item in ranked:
                    logger.info(f"    - {item['topic']}: {item['count']} records")

        except Exception as exc:
            logger.error(f"Error generating final summary: {exc}")

    def close_connection(self):
        """Shut down ``self.client``; a failure is logged instead of raised."""
        try:
            self.client.close()
            logger.info("MongoDB connection closed")
        except Exception as close_error:
            logger.error(f"Error closing MongoDB connection: {close_error}")

def main():
    """Run the CSV-to-MongoDB tagging workflow end to end.

    Builds a ``CSVToMongoProcessor``, processes ``data.csv`` row by row and
    logs the final per-channel summary. The MongoDB connection is closed
    whether or not processing succeeded. Errors are logged rather than
    propagated to the caller.
    """
    processor = None
    try:
        # Initialize processor
        processor = CSVToMongoProcessor()

        # Process CSV file sequentially
        csv_file_path = "data.csv"  # Update this path to your CSV file location
        processor.process_csv_sequentially(csv_file_path)

        # Get final summary
        processor.get_final_summary()

        logger.info("\n=== PROCESSING COMPLETED SUCCESSFULLY ===")

    except Exception as e:
        logger.error(f"Error in main execution: {e}")

    finally:
        # Close the connection even when processing failed part-way.
        # processor is still None only if its constructor raised.
        if processor is not None:
            processor.close_connection()


if __name__ == "__main__":
    main()

2025-09-06 00:37:35,363 - INFO - Successfully connected to MongoDB
2025-09-06 00:37:35,368 - INFO - Successfully loaded CSV with 168 rows
2025-09-06 00:37:35,379 - INFO - Processing channels: {'Trustpilot', 'Twitter', 'App Store/Google Play', 'Reddit'}
2025-09-06 00:37:37,393 - INFO - Channel 'Trustpilot' Status:
2025-09-06 00:37:37,395 - INFO -   Total Records: 1147
2025-09-06 00:37:37,397 - INFO -   Processed Records: 0
2025-09-06 00:37:37,399 - INFO -   Unprocessed Records: 1147
2025-09-06 00:37:37,957 - INFO - Channel 'Twitter' Status:
2025-09-06 00:37:37,959 - INFO -   Total Records: 257
2025-09-06 00:37:37,960 - INFO -   Processed Records: 0
2025-09-06 00:37:37,964 - INFO -   Unprocessed Records: 257
2025-09-06 00:37:38,479 - INFO - Channel 'App Store/Google Play' Status:
2025-09-06 00:37:38,482 - INFO -   Total Records: 701
2025-09-06 00:37:38,485 - INFO -   Processed Records: 0
2025-09-06 00:37:38,487 - INFO -   Unprocessed Records: 701
2025-09-06 00:37:39,077 - INFO - Channel 

In [32]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from datetime import datetime, timedelta

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

class SocialMediaUpdater:
    """Generates replacement field values for social media records.

    Attributes are set in ``__init__``:
        channels: The channel names this updater knows about.
        domain: The domain value written to records ("banking").
    """

    def __init__(self):
        self.channels = ["Twitter", "Reddit", "Trustpilot", "App Store/Google Play"]
        self.domain = "banking"

    def generate_user_id(self):
        """Return a random 6-digit user ID as a string."""
        return str(random.randint(100000, 999999))

    def generate_created_at(self):
        """Return a random UTC timestamp from the last 365 days.

        The result is formatted as ``YYYY-MM-DDTHH:MM:SSZ``. The trailing
        ``Z`` denotes UTC, so the window is anchored on the current UTC time
        rather than on naive local time.
        """
        # Local import: only this method needs timezone, and the surrounding
        # file imports just datetime and timedelta.
        from datetime import timezone

        end_date = datetime.now(timezone.utc)
        start_date = end_date - timedelta(days=365)
        window_seconds = int((end_date - start_date).total_seconds())
        random_date = start_date + timedelta(seconds=random.randint(0, window_seconds))
        return random_date.strftime('%Y-%m-%dT%H:%M:%SZ')

    def generate_proper_id(self, channel):
        """Return a random record ID in the format used for *channel*.

        Twitter IDs are prefixed with ``T``, Reddit with ``R`` and Trustpilot
        with ``TP``; every other channel (including App Store/Google Play)
        gets a bare 6-digit number. The result is always a string.
        """
        number = random.randint(100000, 999999)
        if channel == "Twitter":
            return f"T{number}"
        if channel == "Reddit":
            return f"R{number}"
        if channel == "Trustpilot":
            return f"TP{number}"
        return str(number)

    def get_proper_id_field_name(self, channel):
        """Return the document field name that stores the ID for *channel*.

        Unknown channels fall back to the generic ``"id"`` field.
        """
        if channel == "Twitter":
            return "tweet_id"
        if channel == "Reddit":
            return "post_id"
        if channel in ["Trustpilot", "App Store/Google Play"]:
            return "review_id"
        return "id"

def connect_to_mongodb():
    """Connect to MongoDB and return the configured database object.

    Uses the module-level ``mongo_connection_string`` and
    ``mongo_database_name`` and verifies the connection with a ``ping``.

    Returns:
        The database object on success, or ``None`` if connecting failed.
        The error is logged, and a client that was already created is closed
        so a failed attempt does not leave it open.
    """
    client = None
    try:
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]

        # Test the connection
        client.admin.command('ping')
        logger.info(f"Successfully connected to MongoDB database: {mongo_database_name}")

        return db
    except Exception as e:
        logger.error(f"Error connecting to MongoDB: {e}")
        # The caller never sees this client, so nobody else can release it.
        if client is not None:
            client.close()
        return None

def analyze_existing_data():
    """Log an overview of the ``socialmedia`` collection.

    Logs the total document count, up to five sample documents and the
    number of records per ``channel`` value.

    Returns:
        A ``(sample_docs, channel_stats)`` tuple on success, or ``None`` if
        the connection or any query failed (the error is logged).
    """
    db = None
    try:
        db = connect_to_mongodb()
        if db is None:
            return None

        collection = db['socialmedia']

        # Get total count
        total_count = collection.count_documents({})
        logger.info(f"Total documents in collection: {total_count}")

        # Get sample documents to understand structure
        sample_docs = list(collection.find().limit(5))

        logger.info("Sample document structure:")
        for index, doc in enumerate(sample_docs, start=1):
            logger.info(f"Document {index}: {doc}")

        # Get unique channels
        pipeline = [
            {"$group": {"_id": "$channel", "count": {"$sum": 1}}}
        ]
        channel_stats = list(collection.aggregate(pipeline))

        logger.info("Channel distribution:")
        for stat in channel_stats:
            logger.info(f"  {stat['_id']}: {stat['count']} records")

        return sample_docs, channel_stats

    except Exception as e:
        logger.error(f"Error analyzing existing data: {e}")
        return None

    finally:
        # Each call opens its own client through connect_to_mongodb(), so
        # release it here. Compare with None explicitly because pymongo
        # database objects do not support truth-value testing.
        if db is not None:
            db.client.close()

def _build_record_update(doc, updater):
    """Build the ``$set``/``$unset`` update document for one socialmedia record."""
    channel = doc.get('channel')

    set_fields = {
        'domain': updater.domain,
        'user_id': updater.generate_user_id(),
        'created_at': updater.generate_created_at(),
    }

    # A missing or unrecognised channel is replaced by a randomly chosen valid one
    if channel not in updater.channels:
        channel = random.choice(updater.channels)
        set_fields['channel'] = channel

    # Write the ID under the field name that belongs to the (possibly new) channel
    id_field = updater.get_proper_id_field_name(channel)
    set_fields[id_field] = updater.generate_proper_id(channel)

    update = {'$set': set_fields}

    # Drop ID fields left over from a different channel
    stale_fields = {
        field: ""
        for field in ('tweet_id', 'post_id', 'review_id', 'id')
        if field != id_field and field in doc
    }
    if stale_fields:
        update['$unset'] = stale_fields

    return update


def update_all_records():
    """Rewrite every document in ``socialmedia`` with freshly generated fields.

    For each document this sets ``domain``, a new ``user_id``, a new
    ``created_at``, a valid ``channel`` (only when the current one is missing
    or unknown) and the channel-specific ID field, and removes ID fields that
    belong to other channels. A failure on one document is logged and counted,
    and the remaining documents are still processed.

    Returns:
        ``True`` when the run completed (even if individual documents
        failed), ``False`` if the connection or the run as a whole failed.
    """
    db = None
    try:
        db = connect_to_mongodb()
        if db is None:
            return False

        collection = db['socialmedia']
        updater = SocialMediaUpdater()

        # Materialise the documents first so the updates below cannot affect
        # the iteration.
        all_docs = list(collection.find())
        logger.info(f"Found {len(all_docs)} documents to update")

        updated_count = 0
        error_count = 0

        for doc in all_docs:
            try:
                result = collection.update_one(
                    {"_id": doc['_id']},
                    _build_record_update(doc, updater)
                )

                if result.modified_count > 0:
                    updated_count += 1
                    # Report progress only when the counter actually advanced,
                    # so the same milestone (or 0) is never logged twice.
                    if updated_count % 100 == 0:
                        logger.info(f"Updated {updated_count} documents so far...")

            except Exception as e:
                logger.error(f"Error updating document {doc.get('_id', 'unknown')}: {e}")
                error_count += 1

        logger.info(f"Update completed! Updated: {updated_count}, Errors: {error_count}")
        return True

    except Exception as e:
        logger.error(f"Error in update_all_records: {e}")
        return False

    finally:
        # Release the client opened by connect_to_mongodb(). Compare with
        # None explicitly because pymongo database objects do not support
        # truth-value testing.
        if db is not None:
            db.client.close()

def verify_updated_data():
    """Log spot checks and coverage statistics for the ``socialmedia`` collection.

    For each known channel one sample record is fetched and its ID format,
    ``channel``, ``user_id``, ``domain`` and ``created_at`` are logged. Then
    the total document count, the per-channel distribution and the number of
    documents carrying each required field are logged.

    Returns:
        ``True`` if all checks ran, ``False`` if connecting or a query failed
        (the error is logged).
    """
    # channel -> (ID field, description of the check, check on the ID as text)
    id_checks = {
        "Twitter": ("tweet_id", "starts with 'T'", lambda value: value.startswith('T')),
        "Reddit": ("post_id", "starts with 'R'", lambda value: value.startswith('R')),
        "Trustpilot": ("review_id", "starts with 'TP'", lambda value: value.startswith('TP')),
        "App Store/Google Play": ("review_id", "6 digits", lambda value: len(value) == 6),
    }

    db = None
    try:
        db = connect_to_mongodb()
        if db is None:
            return False

        collection = db['socialmedia']

        # Get sample of updated records
        logger.info("=== VERIFICATION OF UPDATED DATA ===")

        # Check each channel
        for channel, (id_field, label, check) in id_checks.items():
            sample_doc = collection.find_one({"channel": channel})
            if not sample_doc:
                continue

            logger.info(f"\n{channel} Sample Record:")

            # Check ID field. The value is converted to text first so a
            # non-string ID is reported instead of aborting the verification.
            if id_field in sample_doc:
                id_value = sample_doc[id_field]
                logger.info(f"  {id_field}: {id_value} ({label}: {check(str(id_value))})")

            # Check other required fields
            logger.info(f"  channel: {sample_doc.get('channel', 'NOT SET')}")
            user_id = sample_doc.get('user_id', 'NOT SET')
            logger.info(f"  user_id: {user_id} (6 digits: {len(str(user_id)) == 6 if user_id != 'NOT SET' else False})")
            logger.info(f"  domain: {sample_doc.get('domain', 'NOT SET')}")
            logger.info(f"  created_at: {sample_doc.get('created_at', 'NOT SET')}")

        # Get final statistics
        total_count = collection.count_documents({})
        logger.info(f"\nTotal documents: {total_count}")

        # Channel distribution
        pipeline = [
            {"$group": {"_id": "$channel", "count": {"$sum": 1}}}
        ]
        channel_stats = list(collection.aggregate(pipeline))

        logger.info("\nFinal Channel Distribution:")
        for stat in channel_stats:
            logger.info(f"  {stat['_id']}: {stat['count']} records")

        # Check for required fields (present and not null)
        logger.info("\nField Coverage Check:")
        for field in ('domain', 'user_id', 'created_at', 'channel'):
            count_with_field = collection.count_documents({field: {"$exists": True, "$ne": None}})
            logger.info(f"  {field}: {count_with_field}/{total_count} records")

        return True

    except Exception as e:
        logger.error(f"Error in verification: {e}")
        return False

    finally:
        # Release the client opened by connect_to_mongodb(). Compare with
        # None explicitly because pymongo database objects do not support
        # truth-value testing.
        if db is not None:
            db.client.close()

def create_backup_before_update():
    """Export the whole ``socialmedia`` collection to a timestamped CSV file.

    The file is written to the current working directory as
    ``socialmedia_backup_<YYYYmmdd_HHMMSS>.csv``; ``_id`` values are converted
    to strings so they survive the CSV round trip.

    Returns:
        ``True`` if the backup file was written, ``False`` if connecting,
        reading or writing failed (the error is logged).
    """
    db = None
    try:
        db = connect_to_mongodb()
        if db is None:
            return False

        collection = db['socialmedia']

        # Export to CSV as backup
        all_docs = list(collection.find())
        df = pd.DataFrame(all_docs)

        # Convert ObjectId to string for CSV
        if '_id' in df.columns:
            df['_id'] = df['_id'].astype(str)

        backup_filename = f"socialmedia_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df.to_csv(backup_filename, index=False)

        logger.info(f"Backup created: {backup_filename} ({len(all_docs)} records)")
        return True

    except Exception as e:
        logger.error(f"Error creating backup: {e}")
        return False

    finally:
        # Release the client opened by connect_to_mongodb(). Compare with
        # None explicitly because pymongo database objects do not support
        # truth-value testing.
        if db is not None:
            db.client.close()

def main():
    """Drive the four-step record update: analyze, back up, update, verify.

    The run stops early when the analysis or the update step reports failure.
    A failed backup only produces a warning and the update still proceeds.
    """
    logger.info("Starting update of existing social media records...")

    # Step 1: look at what is currently stored
    logger.info("Step 1: Analyzing existing data structure...")
    if analyze_existing_data() is None:
        logger.error("Could not analyze existing data. Exiting.")
        return

    # Step 2: snapshot the collection before touching it
    logger.info("Step 2: Creating backup of existing data...")
    if not create_backup_before_update():
        logger.warning("Backup creation failed. Continuing with update...")

    # Step 3: rewrite the records
    logger.info("Step 3: Updating all records with proper format...")
    if not update_all_records():
        logger.error("Update failed. Check the logs for details.")
        return

    # Step 4: confirm the result
    logger.info("Step 4: Verifying updated data...")
    verify_updated_data()

    logger.info("Update process completed successfully!")


if __name__ == "__main__":
    main()

2025-09-06 01:08:04,186 - INFO - Starting update of existing social media records...
2025-09-06 01:08:04,188 - INFO - Step 1: Analyzing existing data structure...
2025-09-06 01:08:06,071 - INFO - Successfully connected to MongoDB database: sparzaai
2025-09-06 01:08:06,319 - INFO - Total documents in collection: 2337
2025-09-06 01:08:06,597 - INFO - Sample document structure:
2025-09-06 01:08:06,599 - INFO - Document 1: {'_id': ObjectId('68bb12eb35db675a8b09a217'), 'email_id': 'raymondwarner5185@gmail.com', 'username': 'Raymond Warner', 'channel': 'Trustpilot', 'dominant_topic': 'Long Wait Times', 'subtopics': 'Peak Hour Congestion, Resource Allocation Issues, Service Level Agreement Breach'}
2025-09-06 01:08:06,600 - INFO - Document 2: {'_id': ObjectId('68bb12eb35db675a8b09a218'), 'email_id': 'theresaolsen8118@gmail.com', 'username': 'Theresa Olsen', 'channel': 'Trustpilot', 'dominant_topic': 'Long Wait Times', 'subtopics': 'Hold Time Excessive, Queue Management Inefficient, Call Back 

In [29]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def remove_fields_from_collection():
    """
    Remove specified fields from the socialmedia collection

    Unsets ``dominant_topic``, ``subtopics``, ``data_source`` and
    ``created_for_distribution`` on every document, then verifies the result
    against the whole collection (not just one sample document) and logs
    which fields are gone and which, if any, still exist somewhere.

    Raises:
        Exception: Any connection or query error is logged and re-raised.
        The client is closed in either case.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        logger.info("Connected to MongoDB successfully")

        # Get initial document count
        initial_count = collection.count_documents({})
        logger.info(f"Total documents in collection: {initial_count}")

        # Fields to remove
        fields_to_remove = [
            "dominant_topic",
            "subtopics",
            "data_source",
            "created_for_distribution"
        ]

        # Create the update operation to unset (remove) the fields
        unset_operation = {field: "" for field in fields_to_remove}

        logger.info(f"Removing fields: {fields_to_remove}")

        # Remove the fields from all documents in the collection
        result = collection.update_many(
            {},  # Empty filter means update all documents
            {"$unset": unset_operation}
        )

        logger.info("Fields removed successfully!")
        logger.info(f"Documents matched: {result.matched_count}")
        logger.info(f"Documents modified: {result.modified_count}")

        # Verify against the whole collection: a field counts as removed only
        # if no document carries it any more. limit=1 lets the server stop at
        # the first hit. Nothing is reported for an empty collection.
        if initial_count:
            still_existing = [
                field for field in fields_to_remove
                if collection.count_documents({field: {"$exists": True}}, limit=1)
            ]
            removed_fields_check = [
                field for field in fields_to_remove if field not in still_existing
            ]
            logger.info(f"Verification - Fields successfully removed: {removed_fields_check}")

            if still_existing:
                logger.warning(f"These fields still exist: {still_existing}")
            else:
                logger.info("All target fields have been successfully removed!")

    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
        raise

    finally:
        # Close the connection (client is None only if MongoClient() itself failed)
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

def main():
    """
    Entry point: validate configuration, ask for confirmation, then remove the fields.

    Nothing is changed unless the user answers "yes" or "y" (case-insensitive).
    Interrupts and unexpected errors are logged instead of propagated.
    """
    try:
        # Both settings must be present before touching the database
        if not (mongo_connection_string and mongo_database_name):
            logger.error("Missing required environment variables: MONGO_CONNECTION_STRING or MONGO_DATABASE_NAME")
            return

        # Safety prompt: the operation affects every document in the collection
        answer = input("Are you sure you want to remove the specified fields from all documents in the 'socialmedia' collection? (yes/no): ")

        if answer.lower() not in ('yes', 'y'):
            logger.info("Operation cancelled by user")
            return

        remove_fields_from_collection()

    except KeyboardInterrupt:
        logger.info("Operation interrupted by user")
    except Exception as exc:
        logger.error(f"Unexpected error: {str(exc)}")


if __name__ == "__main__":
    main()

2025-09-06 00:29:33,586 - INFO - Connected to MongoDB successfully
2025-09-06 00:29:35,183 - INFO - Total documents in collection: 2237
2025-09-06 00:29:35,185 - INFO - Removing fields: ['dominant_topic', 'subtopics', 'data_source', 'created_for_distribution']
2025-09-06 00:29:35,449 - INFO - Fields removed successfully!
2025-09-06 00:29:35,450 - INFO - Documents matched: 2237
2025-09-06 00:29:35,451 - INFO - Documents modified: 267
2025-09-06 00:29:35,690 - INFO - Verification - Fields successfully removed: ['dominant_topic', 'subtopics', 'data_source', 'created_for_distribution']
2025-09-06 00:29:35,691 - INFO - All target fields have been successfully removed!
2025-09-06 00:29:35,932 - INFO - MongoDB connection closed


Reddit removal

In [1]:
import os
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def remove_fields_from_reddit_records():
    """
    Remove specific fields from records where channel = 'Reddit' in socialmedia collection

    Unsets the engagement/content fields listed below on every Reddit record,
    then verifies the result against all Reddit records (not just one sample)
    and logs the fields of one sample record for reference. Returns early,
    after logging a warning, when there are no Reddit records.

    Raises:
        Exception: Any connection or query error is logged and re-raised.
        The client is closed in either case.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        logger.info("Connected to MongoDB successfully")

        # Fields to remove from Reddit records
        fields_to_remove = [
            "comment_count",
            "content_generated_at",
            "like_count",
            "priority",
            "sentiment",
            "share_count",
            "text",
            "urgency"
        ]

        reddit_filter = {"channel": "Reddit"}

        # First, let's check how many Reddit records exist
        reddit_count = collection.count_documents(reddit_filter)
        logger.info(f"Found {reddit_count} records with channel = 'Reddit'")

        if reddit_count == 0:
            logger.warning("No records found with channel = 'Reddit'")
            return

        # Create the unset operation for the specified fields
        unset_fields = {field: "" for field in fields_to_remove}

        # Remove the fields from Reddit records
        result = collection.update_many(
            reddit_filter,  # Filter: only Reddit records
            {"$unset": unset_fields}  # Remove specified fields
        )

        logger.info(f"Successfully updated {result.modified_count} Reddit records")
        logger.info(f"Removed fields: {', '.join(fields_to_remove)}")

        # Verify against every Reddit record: a field counts as removed only
        # if no Reddit record carries it any more. limit=1 lets the server
        # stop at the first hit.
        still_existing = [
            field for field in fields_to_remove
            if collection.count_documents({"channel": "Reddit", field: {"$exists": True}}, limit=1)
        ]
        removed_fields_check = [
            field for field in fields_to_remove if field not in still_existing
        ]
        logger.info(f"Verification - Fields successfully removed: {removed_fields_check}")
        if still_existing:
            logger.warning(f"These fields still exist: {still_existing}")

        # Show what one Reddit record looks like after the update
        sample_record = collection.find_one(reddit_filter)
        if sample_record:
            logger.info(f"Remaining fields in sample record: {list(sample_record.keys())}")

    except Exception as e:
        logger.error(f"Error occurred: {str(e)}")
        raise

    finally:
        # Close the connection (client is None only if MongoClient() itself failed)
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

def main():
    """
    Validate the configuration and run the Reddit field removal.

    Returns True when the operation finished, False when a required
    environment variable is missing or the operation raised (the error is
    logged).
    """
    required_settings = (
        ("MONGO_CONNECTION_STRING", mongo_connection_string),
        ("MONGO_DATABASE_NAME", mongo_database_name),
    )
    try:
        # Fail on the first missing setting, checked in the order listed above
        for env_name, value in required_settings:
            if not value:
                raise ValueError(f"{env_name} environment variable is not set")

        logger.info("Starting field removal operation for Reddit records")
        remove_fields_from_reddit_records()
        logger.info("Field removal operation completed successfully")

    except Exception as exc:
        logger.error(f"Script execution failed: {str(exc)}")
        return False

    else:
        return True


if __name__ == "__main__":
    print("✅ Script executed successfully" if main() else "❌ Script execution failed")

2025-09-07 22:58:36,688 - INFO - Starting field removal operation for Reddit records
2025-09-07 22:58:36,967 - INFO - Connected to MongoDB successfully
2025-09-07 22:58:38,517 - INFO - Found 157 records with channel = 'Reddit'
2025-09-07 22:58:38,763 - INFO - Successfully updated 4 Reddit records
2025-09-07 22:58:38,765 - INFO - Removed fields: comment_count, content_generated_at, like_count, priority, sentiment, share_count, text, urgency
2025-09-07 22:58:39,007 - INFO - Verification - Fields successfully removed: ['comment_count', 'content_generated_at', 'like_count', 'priority', 'sentiment', 'share_count', 'text', 'urgency']
2025-09-07 22:58:39,008 - INFO - Remaining fields in sample record: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'post_id', 'user_id', 'subreddit']
2025-09-07 22:58:39,256 - INFO - MongoDB connection closed
2025-09-07 22:58:39,260 - INFO - Field removal operation completed successfully


✅ Script executed successfully


In [11]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Get connection details from environment variables
mongo_connection_string = os.getenv('MONGO_CONNECTION_STRING')
mongo_database_name = os.getenv('MONGO_DATABASE_NAME')

def add_subreddit_field():
    """
    Add subreddit field to documents where channel = 'Reddit'

    Sets ``subreddit`` to ``"r/Subreddit"`` on every Reddit document with a
    single ``update_many`` call and logs one updated document as a spot check.
    Returns early when there are no Reddit documents.

    Raises:
        Exception: Any connection or query error is logged and re-raised.
        The client is closed on every path, including the early return.
    """
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        logger.info("Connected to MongoDB successfully")

        # Check how many documents have channel = 'Reddit'
        reddit_count = collection.count_documents({"channel": "Reddit"})
        logger.info(f"Found {reddit_count} documents with channel = 'Reddit'")

        if reddit_count == 0:
            logger.info("No documents found with channel = 'Reddit'")
            return

        # Update documents where channel = 'Reddit' to add subreddit field
        update_result = collection.update_many(
            {"channel": "Reddit"},  # Filter: only documents where channel is Reddit
            {"$set": {"subreddit": "r/Subreddit"}}  # Add/update subreddit field
        )

        logger.info(f"Successfully updated {update_result.modified_count} documents")
        logger.info(f"Matched {update_result.matched_count} documents")

        # Verify the update by checking a sample document
        sample_doc = collection.find_one({"channel": "Reddit", "subreddit": {"$exists": True}})
        if sample_doc:
            logger.info(f"Sample updated document: {sample_doc.get('_id')} has subreddit: {sample_doc.get('subreddit')}")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

    finally:
        # Close the connection (client is None only if MongoClient() itself failed)
        if client is not None:
            client.close()
            logger.info("Database connection closed")

def add_subreddit_field_with_different_values(*, randomize=False):
    """
    Alternative: set the subreddit field one document at a time.

    Args:
        randomize: When False (the default) every Reddit document receives the
            fixed value "r/Subreddit". When True each document receives a
            value picked at random from a small list of sample subreddit names.

    Raises:
        Exception: any failure is logged and then re-raised.
    """
    import random  # function-scope import: only this helper needs it

    # Sample subreddit names used when randomize=True
    subreddit_options = [
        "r/technology", "r/news", "r/worldnews", "r/science",
        "r/askreddit", "r/todayilearned", "r/funny", "r/pics"
    ]

    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        logger.info("Connected to MongoDB successfully")

        # Only the _id is needed to address each document in the update below
        reddit_docs = collection.find({"channel": "Reddit"}, {"_id": 1})

        updated_count = 0
        for doc in reddit_docs:
            if randomize:
                subreddit_value = random.choice(subreddit_options)
            else:
                subreddit_value = "r/Subreddit"

            collection.update_one(
                {"_id": doc["_id"]},
                {"$set": {"subreddit": subreddit_value}}
            )
            updated_count += 1

        logger.info(f"Successfully updated {updated_count} documents with subreddit field")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

    finally:
        # Release the connection on every path, not only after a clean run
        if client is not None:
            client.close()
            logger.info("Database connection closed")

def verify_update():
    """
    Verify that the subreddit field was added correctly

    Logs the total number of documents with channel = 'Reddit', how many of
    them carry a ``subreddit`` field, and up to three sample documents.

    Raises:
        Exception: any failure is logged and then re-raised.
    """
    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        # Connect to MongoDB
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        # Count documents with channel = 'Reddit' and subreddit field
        reddit_with_subreddit = collection.count_documents({
            "channel": "Reddit",
            "subreddit": {"$exists": True}
        })

        # Count total Reddit documents
        total_reddit = collection.count_documents({"channel": "Reddit"})

        logger.info(f"Total Reddit documents: {total_reddit}")
        logger.info(f"Reddit documents with subreddit field: {reddit_with_subreddit}")

        # Show a few sample documents
        samples = collection.find({"channel": "Reddit", "subreddit": {"$exists": True}}).limit(3)
        logger.info("Sample documents:")
        for doc in samples:
            logger.info(f"ID: {doc.get('_id')}, Channel: {doc.get('channel')}, Subreddit: {doc.get('subreddit')}")

    except Exception as e:
        logger.error(f"An error occurred during verification: {str(e)}")
        raise

    finally:
        # Close the connection even when a query above failed
        if client is not None:
            client.close()

if __name__ == "__main__":
    # Apply the bulk update first, then report the resulting counts
    for step in (add_subreddit_field, verify_update):
        step()

2025-09-06 15:07:58,343 - INFO - Connected to MongoDB successfully
2025-09-06 15:08:00,003 - INFO - Found 157 documents with channel = 'Reddit'
2025-09-06 15:08:00,300 - INFO - Successfully updated 157 documents
2025-09-06 15:08:00,306 - INFO - Matched 157 documents
2025-09-06 15:08:00,558 - INFO - Sample updated document: 68bb12eb35db675a8b09a220 has subreddit: r/Subreddit
2025-09-06 15:08:00,803 - INFO - Database connection closed
2025-09-06 15:08:02,772 - INFO - Total Reddit documents: 157
2025-09-06 15:08:02,776 - INFO - Reddit documents with subreddit field: 157
2025-09-06 15:08:02,779 - INFO - Sample documents:
2025-09-06 15:08:03,028 - INFO - ID: 68bb12eb35db675a8b09a220, Channel: Reddit, Subreddit: r/Subreddit
2025-09-06 15:08:03,029 - INFO - ID: 68bb12eb35db675a8b09a222, Channel: Reddit, Subreddit: r/Subreddit
2025-09-06 15:08:03,030 - INFO - ID: 68bb12eb35db675a8b09a224, Channel: Reddit, Subreddit: r/Subreddit


Twitter removal


In [None]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Configure logging: INFO and above, "timestamp - level - message" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Read key/value pairs from the .env file into the process environment
load_dotenv()

# MongoDB settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def remove_twitter_fields():
    """
    Strip a fixed set of fields from documents whose channel mentions 'Twitter'.

    The channel match is a case-insensitive regex search for "Twitter". The
    function logs how many documents match, shows up to two sample documents,
    asks for interactive confirmation, runs the ``$unset`` update and finally
    re-counts the documents that still carry any of the fields. Errors are
    logged and re-raised; the client, if one was created, is always closed.
    """
    target_fields = (
        "hashtags", "like_count", "priority", "quote_count",
        "reply_count", "retweet_count", "sentiment", "text",
    )
    client = None
    try:
        # Open the connection and select the collection
        client = MongoClient(mongo_connection_string)
        collection = client[mongo_database_name]['socialmedia']

        logger.info("Connected to MongoDB successfully")

        # Update document for $unset: every target field mapped to ""
        unset_spec = {name: "" for name in target_fields}

        # All Twitter documents
        twitter_filter = {"channel": {"$regex": "Twitter", "$options": "i"}}
        # Twitter documents that hold at least one of the target fields
        has_target_filter = {
            "$and": [
                {"channel": {"$regex": "Twitter", "$options": "i"}},
                {"$or": [{name: {"$exists": True}} for name in target_fields]},
            ]
        }

        twitter_total = collection.count_documents(twitter_filter)
        logger.info(f"Found {twitter_total} documents with 'Twitter' in channel field")
        if twitter_total == 0:
            logger.warning("No documents found with 'Twitter' in channel field")
            return

        affected_total = collection.count_documents(has_target_filter)
        logger.info(f"Found {affected_total} Twitter documents that have at least one of the target fields")
        if affected_total == 0:
            logger.info("No Twitter documents found with the specified fields to remove")
            return

        # Log the key lists of a couple of documents before touching anything
        before_samples = list(collection.find(has_target_filter).limit(2))
        logger.info("Sample documents before field removal:")
        for position, document in enumerate(before_samples, 1):
            logger.info(f"Sample {position}: {list(document.keys())}")

        # Interactive safety gate
        print(f"\nAbout to remove fields {list(target_fields)} from {affected_total} Twitter documents.")
        answer = input("Do you want to proceed? (yes/no): ").strip().lower()
        if answer != 'yes' and answer != 'y':
            logger.info("Operation cancelled by user")
            return

        # The update targets every Twitter document, not only the affected ones
        outcome = collection.update_many(twitter_filter, {"$unset": unset_spec})

        logger.info(f"Successfully updated {outcome.modified_count} documents")
        logger.info(f"Matched {outcome.matched_count} documents")

        # Log the key lists of a couple of documents after the update
        after_samples = list(collection.find(twitter_filter).limit(2))
        logger.info("Sample documents after field removal:")
        for position, document in enumerate(after_samples, 1):
            logger.info(f"Sample {position}: {list(document.keys())}")

        # Any Twitter document still holding a target field means the removal was incomplete
        leftover_total = collection.count_documents(has_target_filter)
        if leftover_total != 0:
            logger.warning(f"⚠️ {leftover_total} Twitter documents still have some of the target fields")
        else:
            logger.info("✅ All specified fields have been successfully removed from Twitter documents")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

def preview_twitter_documents():
    """
    Preview Twitter documents to see their current structure

    Prints, for up to five documents whose channel matches "Twitter"
    (case-insensitive regex), the channel value, the full key list and which
    of the removal target fields are present. Errors are logged, not raised.
    """
    # Fields the removal step targets; constant, so built once outside the loop
    target_fields = ["hashtags", "like_count", "priority", "quote_count",
                     "reply_count", "retweet_count", "sentiment", "text"]

    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        query = {"channel": {"$regex": "Twitter", "$options": "i"}}
        docs = list(collection.find(query).limit(5))

        print("\n=== Preview of Twitter Documents ===")
        for i, doc in enumerate(docs, 1):
            print(f"\nDocument {i}:")
            print(f"  Channel: {doc.get('channel', 'N/A')}")
            print(f"  Fields: {list(doc.keys())}")

            # Check which target fields exist
            existing_target_fields = [field for field in target_fields if field in doc]
            if existing_target_fields:
                print(f"  Target fields present: {existing_target_fields}")
            else:
                print("  No target fields present")

    except Exception as e:
        logger.error(f"Error in preview: {str(e)}")

    finally:
        # The error above is swallowed, so the client must be closed here
        if client is not None:
            client.close()

if __name__ == "__main__":
    print("MongoDB Field Remover for Twitter Documents")
    print("==========================================")

    # Both settings are required before any menu action can reach the database
    if not (mongo_connection_string and mongo_database_name):
        logger.error("Missing required environment variables: MONGO_CONNECTION_STRING or MONGO_DATABASE_NAME")
        exit(1)

    # Menu entries that simply run a function; '3' (exit) is handled separately
    menu_actions = {'1': preview_twitter_documents, '2': remove_twitter_fields}

    while True:
        print("\nOptions:")
        print("1. Preview Twitter documents")
        print("2. Remove specified fields from Twitter documents")
        print("3. Exit")

        choice = input("\nEnter your choice (1-3): ").strip()

        if choice == '3':
            logger.info("Exiting...")
            break

        action = menu_actions.get(choice)
        if action is None:
            print("Invalid choice. Please enter 1, 2, or 3.")
        else:
            action()

MongoDB Field Remover for Twitter Documents

Options:
1. Preview Twitter documents
2. Remove specified fields from Twitter documents
3. Exit


2025-09-06 17:59:18,555 - INFO - Connected to MongoDB successfully
2025-09-06 17:59:20,035 - INFO - Found 282 documents with 'Twitter' in channel field
2025-09-06 17:59:20,287 - INFO - Found 192 Twitter documents that have at least one of the target fields
2025-09-06 17:59:20,536 - INFO - Sample documents before field removal:
2025-09-06 17:59:20,539 - INFO - Sample 1: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'tweet_id', 'user_id', 'hashtags', 'quote_count', 'reply_count', 'retweet_count']
2025-09-06 17:59:20,541 - INFO - Sample 2: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'tweet_id', 'user_id', 'hashtags', 'quote_count', 'reply_count', 'retweet_count']



About to remove fields ['hashtags', 'like_count', 'priority', 'quote_count', 'reply_count', 'retweet_count', 'sentiment', 'text'] from 192 Twitter documents.


2025-09-06 17:59:23,020 - INFO - Successfully updated 192 documents
2025-09-06 17:59:23,021 - INFO - Matched 282 documents
2025-09-06 17:59:23,270 - INFO - Sample documents after field removal:
2025-09-06 17:59:23,273 - INFO - Sample 1: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'tweet_id', 'user_id']
2025-09-06 17:59:23,275 - INFO - Sample 2: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'tweet_id', 'user_id']
2025-09-06 17:59:23,525 - INFO - ✅ All specified fields have been successfully removed from Twitter documents
2025-09-06 17:59:23,768 - INFO - MongoDB connection closed



Options:
1. Preview Twitter documents
2. Remove specified fields from Twitter documents
3. Exit


2025-09-06 17:59:25,593 - INFO - Exiting...


App/Google

In [5]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Configure logging: INFO and above, "timestamp - level - message" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Read key/value pairs from the .env file into the process environment
load_dotenv()

# MongoDB settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def remove_fields_from_app_store_records():
    """
    Remove specific fields from documents where channel = 'App Store/Google Play'

    Counts the matching documents, asks for interactive confirmation, then
    issues a single ``$unset`` update and logs the matched/modified counts and
    the remaining keys of one sample document. Errors are logged, not raised.
    """
    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        # Connect to MongoDB
        logger.info("Connecting to MongoDB...")
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        # Define the fields to remove
        fields_to_remove = [
            "rating",
            "priority",
            "review_helpful",
            "sentiment",
            "text",
            "urgency",
            "platform",
            "Title",  # capitalised key, distinct from a lowercase "title"
            "content_generated_at"
        ]

        # Create the unset operation for removing fields
        unset_operation = {field: "" for field in fields_to_remove}

        # First, let's check how many documents match our criteria
        filter_criteria = {"channel": "App Store/Google Play"}
        matching_count = collection.count_documents(filter_criteria)
        logger.info(f"Found {matching_count} documents with channel = 'App Store/Google Play'")

        if matching_count == 0:
            logger.warning("No documents found matching the criteria. Operation cancelled.")
            return

        # Ask for confirmation before proceeding. The answer is stripped and
        # lower-cased so surrounding whitespace does not cancel the run.
        confirmation = input(
            f"Are you sure you want to remove fields from {matching_count} documents? (yes/no): "
        ).strip().lower()
        if confirmation != 'yes':
            logger.info("Operation cancelled by user.")
            return

        # Perform the update operation to remove fields
        logger.info(f"Removing fields: {fields_to_remove}")
        result = collection.update_many(
            filter_criteria,
            {"$unset": unset_operation}
        )

        # Log the results
        logger.info("Operation completed successfully!")
        logger.info(f"Documents matched: {result.matched_count}")
        logger.info(f"Documents modified: {result.modified_count}")

        # Optional: Show a sample document after the operation
        sample_doc = collection.find_one(filter_criteria)
        if sample_doc:
            logger.info("Sample document after field removal:")
            logger.info(f"Document ID: {sample_doc.get('_id')}")
            remaining_fields = list(sample_doc.keys())
            logger.info(f"Remaining fields: {remaining_fields}")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed.")

def preview_operation():
    """
    Preview which documents will be affected without making changes

    Logs the number of documents where channel = 'App Store/Google Play' and,
    for up to three sample documents, which of the removal target fields each
    one currently holds. Errors are logged, not raised.
    """
    # Keep this list in step with the one used by the removal itself;
    # "content_generated_at" is removed there, so it is previewed here too.
    fields_to_remove = [
        "rating", "priority", "review_helpful", "sentiment", "text",
        "urgency", "platform", "Title", "content_generated_at"
    ]

    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        # Connect to MongoDB
        logger.info("Connecting to MongoDB for preview...")
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        filter_criteria = {"channel": "App Store/Google Play"}

        # Get sample documents
        sample_docs = list(collection.find(filter_criteria).limit(3))

        logger.info(f"Preview: Found {collection.count_documents(filter_criteria)} documents matching criteria")
        logger.info("Sample documents that will be affected:")

        for i, doc in enumerate(sample_docs, 1):
            logger.info(f"\nSample Document {i}:")
            logger.info(f"  _id: {doc.get('_id')}")
            logger.info(f"  channel: {doc.get('channel')}")

            # Show which fields will be removed
            existing_fields_to_remove = [field for field in fields_to_remove if field in doc]

            if existing_fields_to_remove:
                logger.info(f"  Fields that will be removed: {existing_fields_to_remove}")
            else:
                logger.info("  No target fields found in this document")

    except Exception as e:
        logger.error(f"An error occurred during preview: {str(e)}")
    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()

def main():
    """
    Entry point: show a three-item menu and run the selected action once.

    Refuses to continue (after logging two error lines) when either MongoDB
    environment setting is missing. An unrecognised choice only logs a warning.
    """
    logger.info("MongoDB Field Removal Tool")
    logger.info("=" * 50)

    # Both settings must be present before any action can reach the database
    if not (mongo_connection_string and mongo_database_name):
        logger.error("MongoDB connection details not found in environment variables.")
        logger.error("Please ensure MONGO_CONNECTION_STRING and MONGO_DATABASE_NAME are set in your .env file")
        return

    # Menu text, one line per print call
    menu_lines = (
        "\nWhat would you like to do?",
        "1. Preview the operation (recommended)",
        "2. Execute the field removal",
        "3. Exit",
    )
    for line in menu_lines:
        print(line)

    choice = input("Enter your choice (1-3): ")

    # Choices that map directly onto a function call
    handlers = {
        "1": preview_operation,
        "2": remove_fields_from_app_store_records,
    }
    if choice in handlers:
        handlers[choice]()
    elif choice == "3":
        logger.info("Exiting...")
    else:
        logger.warning("Invalid choice. Please run the script again.")


if __name__ == "__main__":
    main()

2025-09-08 12:13:37,827 - INFO - MongoDB Field Removal Tool



What would you like to do?
1. Preview the operation (recommended)
2. Execute the field removal
3. Exit


2025-09-08 12:13:39,162 - INFO - Connecting to MongoDB...
2025-09-08 12:13:40,742 - INFO - Found 726 documents with channel = 'App Store/Google Play'
2025-09-08 12:13:43,892 - INFO - Removing fields: ['rating', 'priority', 'review_helpful', 'sentiment', 'text', 'urgency', 'platform', 'Title', 'content_generated_at']
2025-09-08 12:13:44,228 - INFO - Operation completed successfully!
2025-09-08 12:13:44,232 - INFO - Documents matched: 726
2025-09-08 12:13:44,236 - INFO - Documents modified: 18
2025-09-08 12:13:44,482 - INFO - Sample document after field removal:
2025-09-08 12:13:44,485 - INFO - Document ID: 68bb12eb35db675a8b09a21b
2025-09-08 12:13:44,486 - INFO - Remaining fields: ['_id', 'email_id', 'username', 'channel', 'dominant_topic', 'subtopics', 'created_at', 'domain', 'review_id', 'user_id']
2025-09-08 12:13:44,740 - INFO - MongoDB connection closed.


platform creation

In [9]:
import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import logging
import random
from collections import Counter
from itertools import combinations

# Configure logging: INFO and above, "timestamp - level - message" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Read key/value pairs from the .env file into the process environment
load_dotenv()

# MongoDB settings; each is None when its variable is not set
mongo_connection_string = os.environ.get('MONGO_CONNECTION_STRING')
mongo_database_name = os.environ.get('MONGO_DATABASE_NAME')

def assign_platform_to_records():
    """
    Assigns platform values to records where channel = "App Store/Google Play"

    70% of the matching records (rounded down) are assigned
    "Google Play Store" and the remainder "App Store". Which record receives
    which value is decided by ``random.shuffle``, so seed ``random`` first for
    a repeatable assignment. The writes are sent as batched ``update_many``
    calls rather than one request per record. Errors are logged, not raised.
    """
    # Number of _id values per update request; keeps each $in filter small
    batch_size = 1000

    # Bound before the try so the finally clause can tell whether a client exists
    client = None
    try:
        # Connect to MongoDB
        logger.info("Connecting to MongoDB...")
        client = MongoClient(mongo_connection_string)
        db = client[mongo_database_name]
        collection = db['socialmedia']

        # Find all records with channel = "App Store/Google Play";
        # only the _id is needed to address each record in the updates below
        logger.info("Finding records with channel = 'App Store/Google Play'...")
        query = {"channel": "App Store/Google Play"}
        record_ids = [doc["_id"] for doc in collection.find(query, {"_id": 1})]

        if not record_ids:
            logger.info("No records found with channel = 'App Store/Google Play'")
            return

        total_records = len(record_ids)
        logger.info(f"Found {total_records} records to update")

        # Integer arithmetic yields floor(70% of total) without float rounding
        google_play_count = total_records * 70 // 100
        app_store_count = total_records - google_play_count

        logger.info(f"Will assign {google_play_count} records to 'Google Play Store' ({google_play_count/total_records*100:.1f}%)")
        logger.info(f"Will assign {app_store_count} records to 'App Store' ({app_store_count/total_records*100:.1f}%)")

        # Create platform assignment list
        platform_assignments = ["Google Play Store"] * google_play_count + ["App Store"] * app_store_count

        # Shuffle to randomize assignment
        random.shuffle(platform_assignments)

        # Group the ids by their assigned platform so each platform can be
        # written with a handful of bulk updates
        ids_by_platform = {}
        for record_id, platform in zip(record_ids, platform_assignments):
            ids_by_platform.setdefault(platform, []).append(record_id)

        # Update records with platform assignments
        logger.info("Starting platform assignment...")
        updated_count = 0

        for platform, platform_ids in ids_by_platform.items():
            for start in range(0, len(platform_ids), batch_size):
                batch = platform_ids[start:start + batch_size]
                try:
                    result = collection.update_many(
                        {"_id": {"$in": batch}},
                        {"$set": {"platform": platform}}
                    )
                    updated_count += result.modified_count
                except Exception as e:
                    # A failed batch is logged and skipped so the rest still run
                    logger.error(f"Error updating {len(batch)} records to '{platform}': {str(e)}")

        logger.info(f"Successfully updated {updated_count} out of {total_records} records")

        # Verify the distribution
        verify_distribution(collection)

    except Exception as e:
        logger.error(f"Error in assign_platform_to_records: {str(e)}")
    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()
            logger.info("MongoDB connection closed")

def verify_distribution(collection):
    """
    Log how the ``platform`` values are distributed among the updated records.

    Args:
        collection: Collection object to aggregate over; it is queried for
            documents with channel = "App Store/Google Play" that have a
            ``platform`` field, grouped by that field.

    Errors are logged, not raised.
    """
    try:
        logger.info("Verifying platform distribution...")

        # Stage 1: restrict to the channel's records that carry a platform value
        match_stage = {"$match": {"channel": "App Store/Google Play", "platform": {"$exists": True}}}
        # Stage 2: one output row per distinct platform, with its record count
        group_stage = {"$group": {"_id": "$platform", "count": {"$sum": 1}}}

        rows = list(collection.aggregate([match_stage, group_stage]))

        grand_total = 0
        for row in rows:
            grand_total += row["count"]

        logger.info("Platform distribution verification:")
        for row in rows:
            platform, count = row["_id"], row["count"]
            if grand_total > 0:
                percentage = count / grand_total * 100
            else:
                percentage = 0
            logger.info(f"  {platform}: {count} records ({percentage:.1f}%)")

    except Exception as e:
        logger.error(f"Error in verify_distribution: {str(e)}")

def reset_platform_field():
    """
    Drop the ``platform`` field from every 'App Store/Google Play' record.

    Optional helper for starting the assignment over. Errors are logged,
    not raised.
    """
    client = None
    try:
        logger.info("Resetting platform field...")
        client = MongoClient(mongo_connection_string)
        socialmedia = client[mongo_database_name]['socialmedia']

        channel_filter = {"channel": "App Store/Google Play"}
        drop_platform = {"$unset": {"platform": ""}}
        outcome = socialmedia.update_many(channel_filter, drop_platform)

        logger.info(f"Reset platform field for {outcome.modified_count} records")

    except Exception as e:
        logger.error(f"Error in reset_platform_field: {str(e)}")
    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()

def check_existing_assignments():
    """
    Report whether any 'App Store/Google Play' record already has a platform.

    Returns:
        True when at least one such record has a ``platform`` field; False
        when none do, or when the check itself fails (the error is logged).
    """
    client = None
    try:
        logger.info("Checking for existing platform assignments...")
        client = MongoClient(mongo_connection_string)
        socialmedia = client[mongo_database_name]['socialmedia']

        channel_filter = {"channel": "App Store/Google Play"}
        assigned_filter = {
            "channel": "App Store/Google Play",
            "platform": {"$exists": True},
        }

        # Records of the channel that already carry a platform value
        already_assigned = socialmedia.count_documents(assigned_filter)
        # All records of the channel
        channel_total = socialmedia.count_documents(channel_filter)

        logger.info(f"Records with channel 'App Store/Google Play': {channel_total}")
        logger.info(f"Records already with platform assignment: {already_assigned}")

        any_assigned = already_assigned > 0
        if any_assigned:
            logger.warning("Some records already have platform assignments. Consider using reset_platform_field() first.")

        return any_assigned

    except Exception as e:
        logger.error(f"Error in check_existing_assignments: {str(e)}")
        return False
    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()

if __name__ == "__main__":
    # Fixed seed so the shuffle produces the same assignment on every run
    random.seed(42)

    # Offer a reset only when some records already carry a platform value
    has_existing = check_existing_assignments()
    if has_existing:
        response = input("Some records already have platform assignments. Do you want to reset them first? (y/n): ")
        wants_reset = response.lower() == 'y'
        if wants_reset:
            reset_platform_field()

    # The assignment runs regardless of the answer above
    assign_platform_to_records()

    logger.info("Script execution completed!")

2025-09-08 12:20:55,881 - INFO - Checking for existing platform assignments...
2025-09-08 12:20:57,624 - INFO - Records with channel 'App Store/Google Play': 726
2025-09-08 12:20:57,627 - INFO - Records already with platform assignment: 0
2025-09-08 12:20:57,874 - INFO - Connecting to MongoDB...
2025-09-08 12:20:57,887 - INFO - Finding records with channel = 'App Store/Google Play'...
2025-09-08 12:21:00,562 - INFO - Found 726 records to update
2025-09-08 12:21:00,564 - INFO - Will assign 508 records to 'Google Play Store' (70.0%)
2025-09-08 12:21:00,565 - INFO - Will assign 218 records to 'App Store' (30.0%)
2025-09-08 12:21:00,567 - INFO - Starting platform assignment...
2025-09-08 12:24:12,862 - INFO - Successfully updated 726 out of 726 records
2025-09-08 12:24:12,865 - INFO - Verifying platform distribution...
2025-09-08 12:24:13,115 - INFO - Platform distribution verification:
2025-09-08 12:24:13,116 - INFO -   App Store: 218 records (30.0%)
2025-09-08 12:24:13,117 - INFO -   Goo

In [10]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Read key/value pairs from the .env file into the process environment
load_dotenv()

# MongoDB settings; each is None when its variable is not set
MONGO_CONNECTION_STRING = os.environ.get('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.environ.get('MONGO_DATABASE_NAME')

def remove_socialmedia_ids_from_email_records():
    """
    Unset 'socialmedia_ids' on cluster records whose ``data`` equals 'email'.

    Prints the number of matching records and how many of them hold the
    field, stops early when either count is zero, otherwise runs one
    ``update_many`` and prints the matched/modified counts. Errors are
    printed, not raised; the client, if one was created, is always closed.
    """
    client = None
    try:
        # Connect and select the 'cluster' collection of the configured database
        client = MongoClient(MONGO_CONNECTION_STRING)
        cluster_collection = client[MONGO_DATABASE_NAME].cluster

        email_filter = {"data": "email"}

        # Guard 1: are there any email records at all?
        email_total = cluster_collection.count_documents(email_filter)
        print(f"Found {email_total} records with data = 'email'")
        if email_total == 0:
            print("No records found with data = 'email'. Nothing to update.")
            return

        # Guard 2: do any of them still hold the field?
        with_field_total = cluster_collection.count_documents({
            "data": "email",
            "socialmedia_ids": {"$exists": True}
        })
        print(f"Found {with_field_total} records with data = 'email' that have 'socialmedia_ids' field")
        if with_field_total == 0:
            print("No records with 'socialmedia_ids' field found. Nothing to update.")
            return

        # Remove the field from every email record in a single request
        outcome = cluster_collection.update_many(
            email_filter,
            {"$unset": {"socialmedia_ids": ""}}
        )

        print("\nUpdate operation completed:")
        print(f"Records matched: {outcome.matched_count}")
        print(f"Records modified: {outcome.modified_count}")

        if outcome.modified_count <= 0:
            print("No records were modified. This could mean:")
            print("- The records don't have the 'socialmedia_ids' field")
            print("- Or the field was already removed")
        else:
            print(f"Successfully removed 'socialmedia_ids' field from {outcome.modified_count} records.")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    finally:
        # Close the connection if one was opened
        if client is not None:
            client.close()
            print("\nMongoDB connection closed.")

def preview_records_before_update():
    """
    Print a short preview of the 'email' records that still have 'socialmedia_ids'.

    Queries the cluster collection for documents where data == "email" and the
    'socialmedia_ids' field exists, and prints at most five of them (id, data,
    the socialmedia_ids value and the names of the remaining fields).
    Any exception is caught and reported on stdout; nothing is returned.
    """
    mongo = None  # stays None if the client could not be constructed
    try:
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        clusters = mongo[MONGO_DATABASE_NAME].cluster

        # Same criteria as the update, restricted to documents that have the field
        selector = {"data": "email", "socialmedia_ids": {"$exists": True}}
        sample = clusters.find(selector).limit(5)  # preview only the first 5

        print("Preview of records that will be updated:")
        print("-" * 50)

        # Fields that are printed explicitly and therefore left out of the
        # "other fields" listing
        shown_explicitly = ('_id', 'data', 'socialmedia_ids')

        position = 0
        for position, doc in enumerate(sample, start=1):
            print(f"Record {position}:")
            print(f"  _id: {doc.get('_id')}")
            print(f"  data: {doc.get('data')}")
            print(f"  socialmedia_ids: {doc.get('socialmedia_ids', 'N/A')}")
            remaining_keys = [key for key in doc if key not in shown_explicitly]
            if remaining_keys:
                print(f"  other fields: {remaining_keys}")
            print()

        if not position:
            print("No records found that match the criteria.")

    except Exception as exc:
        print(f"An error occurred during preview: {exc!s}")

    finally:
        # Close the MongoDB connection only if it was actually opened
        if mongo is not None:
            mongo.close()

if __name__ == "__main__":
    # Banner
    print("MongoDB Operation: Remove 'socialmedia_ids' field from email records")
    print("=" * 70)

    # Step 1: show the user what is about to change
    print("\n1. PREVIEWING RECORDS:")
    preview_records_before_update()

    # Step 2: require an explicit yes/y before touching any data
    print(f"\n{'=' * 70}")
    confirmation = input("Do you want to proceed with removing 'socialmedia_ids' field? (yes/no): ").lower().strip()

    if confirmation not in ('yes', 'y'):
        print("Operation cancelled by user.")
    else:
        print("\n2. EXECUTING UPDATE OPERATION:")
        remove_socialmedia_ids_from_email_records()

MongoDB Operation: Remove 'socialmedia_ids' field from email records

1. PREVIEWING RECORDS:
Preview of records that will be updated:
--------------------------------------------------
Record 1:
  _id: 68aacadc05037130937cbae2
  data: email
  socialmedia_ids: {'Twitter': [], 'Reddit': [], 'Trustpilot': [], 'App Store/Google Play': []}
  other fields: ['cluster_id', 'keyphrases', 'keyphrase_count', 'domains', 'email_ids', 'created_at', 'dominant_label', 'original_keyphrases_count', 'processing_date', 'subclusters', 'uniqueness_validated']

Record 2:
  _id: 68aacadc05037130937cbae3
  data: email
  socialmedia_ids: {'Twitter': ['68bb12eb35db675a8b09a6fe', '68bb12eb35db675a8b09a6c7', '68bb12eb35db675a8b09a6bc', '68bb12eb35db675a8b09a6e2', '68bb12eb35db675a8b09a6ce', '68bb12eb35db675a8b09a6ef', '68bb12eb35db675a8b09a701', '68bb12eb35db675a8b09a6b1'], 'Reddit': [], 'Trustpilot': [], 'App Store/Google Play': []}
  other fields: ['cluster_id', 'keyphrases', 'keyphrase_count', 'domains', 'email

In [None]:
# Import required libraries
from pymongo import MongoClient
from collections import defaultdict
import re
import os
from dotenv import load_dotenv

# Load environment variables from .env file so the os.getenv() lookups
# below can see them
load_dotenv()

# MongoDB settings read from the environment (no connection is opened here).
# os.getenv() returns None for a variable that is not set, so either value
# may be None if the environment is incomplete.
MONGO_CONNECTION_STRING = os.getenv('MONGO_CONNECTION_STRING')
MONGO_DATABASE_NAME = os.getenv('MONGO_DATABASE_NAME')

def remove_socialmedia_ids_from_all_records():
    """
    Unset the 'socialmedia_ids' field on every document in the cluster collection.

    Prints the total document count and the number of documents that have the
    field, returns early (after printing a message) when either is zero, and
    otherwise runs a single update_many with an empty filter and reports the
    matched/modified counts. Exceptions are caught and printed; the client is
    closed in all cases where it was created. Returns None.
    """
    mongo = None  # stays None if the client could not be constructed
    try:
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        clusters = mongo[MONGO_DATABASE_NAME].cluster

        # Nothing to do on an empty collection
        total = clusters.count_documents({})
        print(f"Total records in cluster collection: {total}")
        if total == 0:
            print("No records found in the cluster collection. Nothing to update.")
            return

        # Nothing to do if no document has the field
        carrying_field = clusters.count_documents({"socialmedia_ids": {"$exists": True}})
        print(f"Found {carrying_field} records that have 'socialmedia_ids' field")
        if carrying_field == 0:
            print("No records with 'socialmedia_ids' field found. Nothing to update.")
            return

        # Empty filter matches ALL records; $unset drops the field
        match_everything = {}
        drop_field = {"$unset": {"socialmedia_ids": ""}}
        outcome = clusters.update_many(match_everything, drop_field)

        # Report what the server did
        print("\nUpdate operation completed:")
        print(f"Records matched: {outcome.matched_count}")
        print(f"Records modified: {outcome.modified_count}")

        if outcome.modified_count > 0:
            print(f"Successfully removed 'socialmedia_ids' field from {outcome.modified_count} records.")
        else:
            for line in (
                "No records were modified. This could mean:",
                "- The records don't have the 'socialmedia_ids' field",
                "- Or the field was already removed",
            ):
                print(line)

    except Exception as exc:
        print(f"An error occurred: {exc!s}")

    finally:
        # Close the MongoDB connection only if it was actually opened
        if mongo is not None:
            mongo.close()
            print("\nMongoDB connection closed.")

def preview_records_before_update():
    """
    Print a preview of the records that have 'socialmedia_ids', plus a per-type summary.

    Prints at most five documents that have the 'socialmedia_ids' field (id,
    data, the socialmedia_ids value and the names of the remaining fields).
    When at least one document was shown, also prints how many documents with
    the field exist per 'data' value, largest group first.
    Any exception is caught and reported on stdout; nothing is returned.
    """
    mongo = None  # stays None if the client could not be constructed
    try:
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        clusters = mongo[MONGO_DATABASE_NAME].cluster

        # Only documents that still have the field; preview the first 5
        sample = clusters.find({"socialmedia_ids": {"$exists": True}}).limit(5)

        print("Preview of records that will be updated:")
        print("-" * 50)

        # Fields that are printed explicitly and therefore left out of the
        # "other fields" listing
        shown_explicitly = ('_id', 'data', 'socialmedia_ids')

        position = 0
        for position, doc in enumerate(sample, start=1):
            print(f"Record {position}:")
            print(f"  _id: {doc.get('_id')}")
            print(f"  data: {doc.get('data', 'N/A')}")
            print(f"  socialmedia_ids: {doc.get('socialmedia_ids', 'N/A')}")
            remaining_keys = [key for key in doc if key not in shown_explicitly]
            if remaining_keys:
                print(f"  other fields: {remaining_keys}")
            print()

        if position == 0:
            print("No records found that have the 'socialmedia_ids' field.")
            return

        # Count affected documents per 'data' value, most frequent first
        per_type = list(clusters.aggregate([
            {"$match": {"socialmedia_ids": {"$exists": True}}},
            {"$group": {"_id": "$data", "count": {"$sum": 1}}},
            {"$sort": {"count": -1}}
        ]))

        if per_type:
            print("\nSummary of record types that will be affected:")
            for group in per_type:
                # A None group key is shown with a readable placeholder
                label = 'null/missing' if group['_id'] is None else group['_id']
                print(f"  - data = '{label}': {group['count']} records")

    except Exception as exc:
        print(f"An error occurred during preview: {exc!s}")

    finally:
        # Close the MongoDB connection only if it was actually opened
        if mongo is not None:
            mongo.close()

def get_collection_stats():
    """
    Print basic statistics about the cluster collection.

    Reports the total number of documents, how many have / do not have the
    'socialmedia_ids' field, and the number of documents per 'data' value
    (largest group first). Any exception is caught and reported on stdout;
    nothing is returned.
    """
    mongo = None  # stays None if the client could not be constructed
    try:
        mongo = MongoClient(MONGO_CONNECTION_STRING)
        clusters = mongo[MONGO_DATABASE_NAME].cluster

        # Gather all numbers first, then print
        n_total = clusters.count_documents({})
        n_with_field = clusters.count_documents({"socialmedia_ids": {"$exists": True}})

        # Documents per 'data' value, most frequent first
        group_by_data = {"$group": {"_id": "$data", "count": {"$sum": 1}}}
        largest_first = {"$sort": {"count": -1}}
        per_type = list(clusters.aggregate([group_by_data, largest_first]))

        print("Collection Statistics:")
        print("-" * 30)
        print(f"Total records: {n_total}")
        print(f"Records with 'socialmedia_ids': {n_with_field}")
        print(f"Records without 'socialmedia_ids': {n_total - n_with_field}")

        print("\nData type distribution:")
        for group in per_type:
            # A None group key is shown with a readable placeholder
            label = 'null/missing' if group['_id'] is None else group['_id']
            print(f"  - data = '{label}': {group['count']} records")

    except Exception as exc:
        print(f"An error occurred while getting stats: {exc!s}")

    finally:
        # Close the MongoDB connection only if it was actually opened
        if mongo is not None:
            mongo.close()

if __name__ == "__main__":
    # Banner
    print("MongoDB Operation: Remove 'socialmedia_ids' field from ALL records")
    print("=" * 70)

    # Step 1: overall numbers for the collection
    print("\n1. COLLECTION STATISTICS:")
    get_collection_stats()

    # Step 2: show the user what is about to change
    print("\n2. PREVIEWING RECORDS:")
    preview_records_before_update()

    # Step 3: require an explicit yes/y before touching any data
    print(f"\n{'=' * 70}")
    print("WARNING: This will remove 'socialmedia_ids' field from ALL records in the cluster collection!")
    confirmation = input("Do you want to proceed with removing 'socialmedia_ids' field from ALL records? (yes/no): ").lower().strip()

    if confirmation not in ('yes', 'y'):
        print("Operation cancelled by user.")
    else:
        print("\n3. EXECUTING UPDATE OPERATION:")
        remove_socialmedia_ids_from_all_records()

MongoDB Operation: Remove 'socialmedia_ids' field from ALL records

1. COLLECTION STATISTICS:
Collection Statistics:
------------------------------
Total records: 98
Records with 'socialmedia_ids': 98
Records without 'socialmedia_ids': 0

Data type distribution:
  - data = 'email': 33 records
  - data = 'voice': 24 records
  - data = 'tickets': 16 records
  - data = 'socialmedia': 15 records
  - data = 'chat-chunks': 10 records

2. PREVIEWING RECORDS:
Preview of records that will be updated:
--------------------------------------------------
Record 1:
  _id: 68aacadc05037130937cbae2
  data: email
  socialmedia_ids: {'Twitter': ['68bb12eb35db675a8b09a8d3', '68bb12eb35db675a8b09a8df', '68bb12eb35db675a8b09a8b4', '68bb12eb35db675a8b09a8cf', '68bb12eb35db675a8b09a720', '68bb12eb35db675a8b09a743', '68bb12eb35db675a8b09a708', '68bb12eb35db675a8b09a71c', '68bb12eb35db675a8b09a74c', '68bb12eb35db675a8b09a8cc', '68bb12eb35db675a8b09a70d', '68bb12eb35db675a8b09a731', '68bb12eb35db675a8b09a705', 