In [6]:
import pandas as pd
from langdetect import detect
import os
import glob
import random

# ===== Configuration Variables =====
# BATCH_NAME selects which sub-folder of comments/ is processed; it is also
# used to build the output folder name below.
BATCH_NAME = 'test'  # e.g., 'january_2024', 'test_batch', 'main_collection', etc.
# NOTE: the threshold is exclusive. filter_popular_comments compares with `>`,
# so with the value 5 a parent comment needs at least 6 replies to be kept.
MIN_REPLY_COUNT = 5  # A parent comment is "popular" when it has MORE than this many replies
# Upper bound on the number of popular parent comments drawn per file; the
# sampled output also contains the replies to those parents.
SAMPLE_LENGTH = 50  # Maximum number of popular parent comments to sample for final dataset

# Input CSV files are read from INPUT_DIR; results are written to OUTPUT_DIR.
INPUT_DIR = f'comments/{BATCH_NAME}/'
OUTPUT_DIR = f'comments/{BATCH_NAME}_sampled/'

# ===== Processing Functions =====

def is_valid_comment(comment):
    """Return True when a comment should be kept as English text.

    The value is converted to ``str`` and passed to ``langdetect.detect``.

    Args:
        comment: The comment text (any object; it is stringified first).

    Returns:
        bool: True if the detected language code is ``'en'``, or if detection
        fails; False if another language is detected.

    Notes:
        Detection failures are treated as "keep" on purpose: text that the
        detector cannot classify (presumably empty or symbol-only comments)
        is not discarded. Only ``Exception`` subclasses are handled, so
        ``KeyboardInterrupt`` and ``SystemExit`` still propagate and a long
        run can be interrupted.
    """
    text = str(comment)
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: an undetectable comment is kept, not dropped.
        return True

def filter_english_comments(df):
    """Return the rows of ``df`` whose ``Comment`` passes ``is_valid_comment``.

    Args:
        df: DataFrame with a ``Comment`` column.

    Returns:
        A new DataFrame containing only the accepted rows, with ``Comment``
        converted to ``str``. The input frame is left unmodified.
    """
    # Stringify into a separate Series so the caller's frame is not
    # overwritten as a side effect of filtering.
    comments = df['Comment'].astype(str)
    # astype(bool) keeps the mask boolean even for an empty frame, where
    # apply() would otherwise return a non-boolean empty Series.
    keep_mask = comments.apply(is_valid_comment).astype(bool)
    return df.assign(Comment=comments).loc[keep_mask]



In [7]:
def analyze_comment_structure(df_english):
    """Print how many rows have Depth 0 (parent comments) and Depth 1 (replies)."""
    depths = df_english['Depth']
    parent_total = int((depths == 0).sum())
    reply_total = int((depths == 1).sum())

    print(f"Parent comments (Depth 0): {parent_total}")
    print(f"Replies (Depth 1): {reply_total}")

def count_replies_per_parent(df_english):
    """Build a table with one row per ParentID and the number of Depth-1 rows that reference it.

    Returns a DataFrame with the columns ``ParentID`` and ``ReplyCount``.
    """
    is_reply = df_english['Depth'] == 1
    replies = df_english[is_reply]
    replies_by_parent = replies.groupby('ParentID').size()
    return replies_by_parent.reset_index(name='ReplyCount')



In [8]:
def filter_popular_comments(df_english, reply_counts, min_reply_count):
    """Keep popular parent comments together with their replies.

    A parent is popular when its ReplyCount is strictly greater than
    ``min_reply_count``. Returns ``(df_filtered, popular_parents)``, where
    ``popular_parents`` is the Series of qualifying ParentID values.
    """
    above_threshold = reply_counts['ReplyCount'] > min_reply_count
    popular_parents = reply_counts.loc[above_threshold, 'ParentID']

    # A row is kept if it is a popular parent itself or a reply to one.
    is_popular_parent = df_english['ID'].isin(popular_parents)
    is_reply_to_popular = df_english['ParentID'].isin(popular_parents)
    df_filtered = df_english[is_popular_parent | is_reply_to_popular]

    print(f"Number of popular parents (>{min_reply_count} replies): {len(popular_parents)}")
    return df_filtered, popular_parents

def sample_popular_comments(df_english, popular_parents, sample_length, seed=None):
    """Randomly pick popular parent comments and return them with their replies.

    Args:
        df_english: DataFrame with ``ID`` and ``ParentID`` columns.
        popular_parents: Collection of parent IDs eligible for sampling.
        sample_length: Maximum number of parents to draw. If fewer parents
            are available, all of them are used.
        seed: Optional seed for a reproducible draw. When None (default) the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        tuple: ``(df_sampled, sample_size)``. ``df_sampled`` holds the drawn
        parents plus every row whose ``ParentID`` is one of them. When there
        is nothing to sample (no popular parents, or ``sample_length`` <= 0)
        the result is ``(None, 0)``.
    """
    candidates = list(popular_parents)
    sample_size = min(sample_length, len(candidates))
    # Covers both "no candidates" and a zero/negative request; the latter
    # would otherwise raise in random.sample or produce an empty "sample".
    if sample_size <= 0:
        return None, 0

    rng = random if seed is None else random.Random(seed)
    sampled_parents = rng.sample(candidates, sample_size)

    in_sample = df_english['ID'].isin(sampled_parents) | df_english['ParentID'].isin(sampled_parents)
    return df_english[in_sample], sample_size



In [9]:
def process_single_file(filename, output_dir, min_reply_count, sample_length):
    """Filter, analyse and sample the comments of one CSV file.

    Steps: load the CSV, keep English comments, print parent/reply counts,
    select parents with more than ``min_reply_count`` replies (plus their
    replies), save them, then save a random sample of up to
    ``sample_length`` of those parents with their replies.

    Args:
        filename: Path of the input CSV. Required columns are ``Comment``,
            ``Depth``, ``ID`` and ``ParentID``.
        output_dir: Directory for the result files; created if missing.
        min_reply_count: Exclusive reply-count threshold for "popular".
        sample_length: Maximum number of popular parents to sample.

    Returns:
        None. Progress is printed. Files that are empty, lack required
        columns, or contain no English comments or no replies are skipped.
        Any exception is reported and swallowed so that a batch run
        continues with the next file.

    Outputs:
        ``<file_id>_popular.csv`` and, when there is something to sample,
        ``<file_id>_sampled<N>.csv`` inside ``output_dir``.
    """
    # splitext removes only the final extension; str.replace('.csv', '')
    # would also strip a '.csv' that occurs in the middle of the name.
    file_id = os.path.splitext(os.path.basename(filename))[0]
    print(f"\n{'='*60}")
    print(f"Processing: {file_id}")
    print(f"File: {filename}")

    try:
        # Load the CSV file
        df = pd.read_csv(filename)
        print(f"Total comments: {len(df)}")

        # Check if file is empty
        if len(df) == 0:
            print("File is empty, skipping...")
            return

        # Check if required columns exist
        required_columns = ['Comment', 'Depth', 'ID', 'ParentID']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            print(f"Missing required columns: {missing_columns}")
            print(f"Available columns: {list(df.columns)}")
            print("Skipping this file...")
            return

        # Filter English comments
        df_english = filter_english_comments(df)
        print(f"Total English comments: {len(df_english)}")

        # Check if we have any English comments
        if len(df_english) == 0:
            print("No English comments found, skipping...")
            return

        # Analyze comment structure
        analyze_comment_structure(df_english)

        # Count replies per parent
        reply_counts = count_replies_per_parent(df_english)

        # Check if we have any replies
        if len(reply_counts) == 0:
            print("No replies found, skipping...")
            return

        # Filter popular comments
        df_filtered, popular_parents = filter_popular_comments(df_english, reply_counts, min_reply_count)

        # Make sure the destination exists so the function also works when
        # called on its own ('' means the current directory; nothing to create).
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Save popular comments. os.path.join yields the same path as plain
        # concatenation when output_dir ends with '/', and a correct one
        # when it does not.
        popular_filename = os.path.join(output_dir, f'{file_id}_popular.csv')
        df_filtered.to_csv(popular_filename, index=False)
        print(f"Popular comments saved to: {popular_filename}")

        # Sample popular comments
        df_sampled, sample_size = sample_popular_comments(df_english, popular_parents, sample_length)

        if df_sampled is not None:
            sampled_filename = os.path.join(output_dir, f'{file_id}_sampled{sample_size}.csv')
            df_sampled.to_csv(sampled_filename, index=False)
            print(f"Sampled {sample_size} popular comments saved to: {sampled_filename}")
        else:
            print("No popular comments found to sample")

    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the batch.
        print(f"Error processing {file_id}: {e}")
        print("Skipping this file...")




In [11]:
# Fixed seed so that Restart & Run All draws the same random samples.
SAMPLING_SEED = 42

# glob returns files in arbitrary order; sorting keeps the processing order
# (and therefore the sequence of random draws) stable between runs.
csv_files = sorted(glob.glob(os.path.join(INPUT_DIR, "*.csv")))
if not csv_files:
    print(f"No CSV files found in {INPUT_DIR}")
    print("Available files:")
    for file in sorted(glob.glob("comments/*/*.csv")):
        print(f"  {file}")
else:
    print(f"Processing batch: {BATCH_NAME}")
    print(f"Found {len(csv_files)} CSV files to process")
    print(f"Output directory: {OUTPUT_DIR}")
    
    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # Seed once, right before processing, so sampling is reproducible.
    random.seed(SAMPLING_SEED)
    
    # Process each CSV file
    for filename in csv_files:
        process_single_file(filename, OUTPUT_DIR, MIN_REPLY_COUNT, SAMPLE_LENGTH)
    
    print(f"\n{'='*60}")
    print("Batch processing completed!")
    print(f"All results saved to: {OUTPUT_DIR}")

Processing batch: test
Found 3 CSV files to process
Output directory: comments/test_sampled/

Processing: Debunking_the_bulls**t_ZnmsMg6joGo
File: comments/test/Debunking_the_bulls**t_ZnmsMg6joGo.csv
Total comments: 0
File is empty, skipping...

Processing: Why_children_get_so_many_vaccines__1gluMtaUmg
File: comments/test/Why_children_get_so_many_vaccines__1gluMtaUmg.csv
Total comments: 740
Total English comments: 667
Parent comments (Depth 0): 300
Replies (Depth 1): 367
Number of popular parents (>5 replies): 13
Popular comments saved to: comments/test_sampled/Why_children_get_so_many_vaccines__1gluMtaUmg_popular.csv
Sampled 13 popular comments saved to: comments/test_sampled/Why_children_get_so_many_vaccines__1gluMtaUmg_sampled13.csv

Processing: Doctor_Mike_vs_20_Anti-Vaxxers_|_Surrounded_o69BiOqY1Ec
File: comments/test/Doctor_Mike_vs_20_Anti-Vaxxers_|_Surrounded_o69BiOqY1Ec.csv
Total comments: 9143
Total English comments: 8604
Parent comments (Depth 0): 6819
Replies (Depth 1): 1785