CODE TO CHOOSE WHAT IS LABELED AS A GOOD POST AND A BAD POST

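Good post: Viral. Its engagement score (the equal-weight average of the min-max-normalized score and the min-max-normalized comment count) is at or above the 90th percentile (top 10%).
Bad post: Low-performing. Its engagement score is at or below the 10th percentile (bottom 10%). Posts tied at the threshold are all included, so this group can be larger than 10% of the dataset.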

In [4]:
import pandas as pd
import numpy as np
import os
import glob

In [2]:
def filter_posts_by_performance(input_csv, output_dir="./filtered_data",
                                 top_percentile=90, bottom_percentile=10):
    """
    Split posts into 'viral' (high-performing) and 'low-performing' groups.

    Each post gets an engagement score: the equal-weight average of the
    min-max-normalized ``score`` and ``num_comments`` columns. Posts at or
    above the ``top_percentile`` of that score are viral; posts at or below
    the ``bottom_percentile`` are low-performing. The two groups are kept
    mutually exclusive: a post that meets the viral threshold is never also
    labeled low-performing, even when both thresholds coincide.

    Because the thresholds are inclusive, posts tied at a threshold are all
    kept, so a group can hold more than its nominal share of the dataset.
    The printed summary reports the actual share of each group.

    Parameters:
    -----------
    input_csv : str
        Path to the input CSV file. Must contain 'score' and 'num_comments'
        columns; a 'title' column, if present, is used for sample output.
    output_dir : str
        Directory to save filtered CSVs (created if it does not exist)
    top_percentile : int
        Percentile threshold for viral posts (default: 90 = top 10%)
    bottom_percentile : int
        Percentile threshold for low-performing posts (default: 10 = bottom 10%)

    Returns:
    --------
    tuple of (viral_posts, low_posts, df)
        viral_posts : DataFrame sorted by engagement score, descending
        low_posts : DataFrame sorted by engagement score, ascending
        df : the cleaned DataFrame with 'score_norm', 'comments_norm' and
             'engagement_score' columns added

    Raises:
    -------
    ValueError
        If the percentiles do not satisfy
        0 <= bottom_percentile <= top_percentile <= 100, or if no rows
        remain after dropping non-numeric / missing values.
    KeyError
        If the input CSV lacks the 'score' or 'num_comments' column.
    """
    required_columns = ("score", "num_comments")

    # Reject inverted or out-of-range percentiles up front: an inverted pair
    # would silently produce overlapping "viral" and "low" groups.
    if not (0 <= bottom_percentile <= top_percentile <= 100):
        raise ValueError(
            "Percentiles must satisfy 0 <= bottom_percentile <= top_percentile <= 100 "
            f"(got bottom_percentile={bottom_percentile}, top_percentile={top_percentile})"
        )

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load the data
    print(f"Loading data from {input_csv}...")
    df = pd.read_csv(input_csv, low_memory=False)
    print(f"Loaded {len(df)} posts")

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

    # Clean and convert numeric columns (unparseable values become NaN)
    df["score"] = pd.to_numeric(df["score"], errors="coerce")
    df["num_comments"] = pd.to_numeric(df["num_comments"], errors="coerce")

    # Remove rows with missing values in key columns; copy so the derived
    # columns below are added to an independent frame.
    df = df.dropna(subset=list(required_columns)).copy()
    print(f"After removing NaN values: {len(df)} posts")

    if df.empty:
        raise ValueError(
            f"No posts with numeric 'score' and 'num_comments' found in {input_csv}"
        )

    # Calculate normalized engagement score (same as your popularity metric)
    def minmax_norm(series):
        """Scale a series to [0, 1]; a constant series maps to 0.5 everywhere."""
        smin, smax = series.min(), series.max()
        if pd.isna(smin) or pd.isna(smax) or smax == smin:
            return pd.Series(0.5, index=series.index)
        return (series - smin) / (smax - smin)

    df["score_norm"] = minmax_norm(df["score"])
    df["comments_norm"] = minmax_norm(df["num_comments"])
    df["engagement_score"] = 0.5 * df["score_norm"] + 0.5 * df["comments_norm"]

    # Calculate percentile thresholds
    viral_threshold = np.percentile(df["engagement_score"], top_percentile)
    low_threshold = np.percentile(df["engagement_score"], bottom_percentile)

    # Build the two masks. The low mask excludes viral rows so that a post
    # can never carry both labels when the thresholds coincide.
    is_viral = df["engagement_score"] >= viral_threshold
    is_low = (df["engagement_score"] <= low_threshold) & ~is_viral

    # Filter viral posts (top performers)
    viral_posts = df[is_viral].copy()
    viral_posts = viral_posts.sort_values("engagement_score", ascending=False)

    # Filter low-performing posts (bottom performers)
    low_posts = df[is_low].copy()
    low_posts = low_posts.sort_values("engagement_score", ascending=True)

    # Actual group sizes relative to the cleaned dataset; ties at a threshold
    # can push these above the nominal percentage.
    viral_share = 100 * len(viral_posts) / len(df)
    low_share = 100 * len(low_posts) / len(df)

    # Generate output filenames
    base_name = os.path.splitext(os.path.basename(input_csv))[0]
    viral_output = os.path.join(output_dir, f"{base_name}_viral.csv")
    low_output = os.path.join(output_dir, f"{base_name}_low_performing.csv")

    # Save filtered datasets
    viral_posts.to_csv(viral_output, index=False)
    low_posts.to_csv(low_output, index=False)

    # Print summary statistics
    print("\n" + "="*60)
    print("FILTERING SUMMARY")
    print("="*60)
    print(f"\nOriginal dataset: {len(df)} posts")
    print(f"\nEngagement score range: {df['engagement_score'].min():.4f} to {df['engagement_score'].max():.4f}")
    print(f"Viral threshold (top {100-top_percentile}%): {viral_threshold:.4f}")
    print(f"Low-performing threshold (bottom {bottom_percentile}%): {low_threshold:.4f}")

    print(f"\n🔥 VIRAL POSTS (Top {100-top_percentile}%): {len(viral_posts)} posts")
    print(f"   Actual share of dataset: {viral_share:.1f}%")
    print(f"   Score range: {viral_posts['score'].min():.0f} to {viral_posts['score'].max():.0f}")
    print(f"   Comments range: {viral_posts['num_comments'].min():.0f} to {viral_posts['num_comments'].max():.0f}")
    print(f"   Engagement score range: {viral_posts['engagement_score'].min():.4f} to {viral_posts['engagement_score'].max():.4f}")
    print(f"   Saved to: {viral_output}")

    print(f"\n📉 LOW-PERFORMING POSTS (Bottom {bottom_percentile}%): {len(low_posts)} posts")
    print(f"   Actual share of dataset: {low_share:.1f}%")
    print(f"   Score range: {low_posts['score'].min():.0f} to {low_posts['score'].max():.0f}")
    print(f"   Comments range: {low_posts['num_comments'].min():.0f} to {low_posts['num_comments'].max():.0f}")
    print(f"   Engagement score range: {low_posts['engagement_score'].min():.4f} to {low_posts['engagement_score'].max():.4f}")
    print(f"   Saved to: {low_output}")

    print("\n" + "="*60)

    # Show sample posts from each category
    sample_columns = ["title", "score", "num_comments", "engagement_score"]
    if "title" in viral_posts.columns:
        print("\n📊 SAMPLE VIRAL POSTS:")
        print(viral_posts[sample_columns].head(3).to_string(index=False))

    if "title" in low_posts.columns:
        print("\n📊 SAMPLE LOW-PERFORMING POSTS:")
        print(low_posts[sample_columns].head(3).to_string(index=False))

    return viral_posts, low_posts, df



In [3]:

# Driver cell: locate the raw CSV export under `data_dir` and run the
# viral / low-performing split on it, printing guidance when no input is found.
print("Starting automatic filtering...")
print("Searching for CSV files...")

data_dir = "./updated_data_rp3/data/careeradvice/"
output_dir = "./filtered_data"

# glob returns matches in arbitrary order; sort so that "the first CSV file"
# is the same file on every run.
csv_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))

if not csv_files:
    print(f"❌ No CSV files found in {data_dir}")
    print("Checking current directory for the folder...")

    # Distinguish "wrong path" from "right path, data not extracted yet"
    if not os.path.exists(data_dir):
        print(f"❌ Directory does not exist: {data_dir}")
        print("\nPlease update the 'data_dir' variable with the correct path to your CSV file.")
    else:
        print("Directory exists but no CSV files found.")
        zip_files = sorted(glob.glob(os.path.join(data_dir, "*.zip")))
        if zip_files:
            print(f"Found zip file(s): {zip_files}")
            print("Please unzip the file first.")
else:
    print(f"Found {len(csv_files)} CSV file(s):")
    for f in csv_files:
        print(f"  - {f}")

    # Use the first CSV file found (first in sorted order)
    input_csv = csv_files[0]
    if len(csv_files) > 1:
        print(f"\nNote: only the first file is processed; {len(csv_files) - 1} other CSV file(s) are ignored.")
    print(f"\nProcessing: {input_csv}")

    viral, low, all_data = filter_posts_by_performance(
        input_csv=input_csv,
        output_dir=output_dir,
        top_percentile=90,  # Top 10% are viral
        bottom_percentile=10  # Bottom 10% are low-performing
    )

    print(f"\n✅ Filtering complete! Check the '{output_dir}/' directory for results.")

Starting automatic filtering...
Searching for CSV files...
Found 1 CSV file(s):
  - ./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv

Processing: ./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv
Loading data from ./updated_data_rp3/data/careeradvice/combined_careeradvice_raw.csv...
Loaded 68692 posts
After removing NaN values: 68684 posts

FILTERING SUMMARY

Original dataset: 68684 posts

Engagement score range: 0.0000 to 0.6570
Viral threshold (top 10%): 0.0019
Low-performing threshold (bottom 10%): 0.0000

🔥 VIRAL POSTS (Top 10%): 6962 posts
   Score range: 0 to 17044
   Comments range: 1 to 3169
   Engagement score range: 0.0019 to 0.6570
   Saved to: ./filtered_data/combined_careeradvice_raw_viral.csv

📉 LOW-PERFORMING POSTS (Bottom 10%): 25306 posts
   Score range: 0 to 1
   Comments range: 0 to 0
   Engagement score range: 0.0000 to 0.0000
   Saved to: ./filtered_data/combined_careeradvice_raw_low_performing.csv


📊 SAMPLE VIRAL POSTS:
        