In [None]:
# ============================================================================
# BUILD DATASET FROM MCL DOWNLOAD OUTPUT
# ============================================================================
#
# PURPOSE: Process the raw output from 00_data_download.ipynb into a clean,
#          analysis-ready dataset with proper list classifications.
#
# INPUT:
#   - Output from 00_data_download.ipynb:
#     * posts_with_provenance_DATE.rds (posts with list membership)
#     * producer_provenance_mapping_DATE.csv (surface ID → list mapping)
#   - Optional: Re-elected MPs file for experience distinction
#
# OUTPUT:
#   - combined_datasets/political_posts_TIMESTAMP.rds
#
# KEY OPERATIONS:
#   1. Load posts with provenance from 00_data_download
#   2. Apply surface ID fallback for Stories content
#   3. Standardize list classifications (main_list, sub_list)
#   4. Optionally distinguish re-elected vs new MPs
#   5. Add standardized date/time variables
#   6. Save combined dataset for analysis
#
# ============================================================================

library(tidyverse)
library(lubridate)

# Print the run banner. All four pieces go through a single cat() call;
# sep = "" keeps the emitted text identical to printing them one by one.
cat(
  "\n",
  "========================================================================\n",
  "BUILD DATASET FROM MCL DOWNLOAD OUTPUT\n",
  "========================================================================\n\n",
  sep = ""
)

In [None]:
# ============================================================================
# CONFIGURATION
# ============================================================================

# Path to 00_data_download output directory
# Update this to match your actual output location
DATA_DOWNLOAD_OUTPUT_DIR <- "data_download_output"

# Re-elected MPs file, read in STEP 4 to separate returning MPs from new
# ones. Use NULL when the file is unavailable or the split is not wanted.
REELECTED_MPS_FILE <- "rawdata/reelected_mps.rds"  # Update path as needed

# Directory the combined dataset is written to
OUTPUT_DIR <- "combined_datasets"

# Producer-list name (exactly as it appears in the provenance output) mapped
# to the standardized main_list / sub_list labels. Extend this list so it
# covers every producer list used in 00_data_download.
LIST_MAPPINGS <- list(
  "MPs (Re-elected)" = list(
    main_list = "MPs_Reelected",
    sub_list = "MPs_Reelected"
  ),
  "MPs (ALL)" = list(
    main_list = "MPs",
    sub_list = "MPs"
  ),
  "Prominent Politicians" = list(
    main_list = "Prominent_Politicians",
    sub_list = "Prominent_Politicians"
  ),
  "Extremists" = list(
    main_list = "Extremists",
    sub_list = "Extremists"
  )
)

# Echo the active settings so they are recorded in the notebook output.
cat("Configuration:\n")
cat("  Data download output:", DATA_DOWNLOAD_OUTPUT_DIR, "\n")
cat(
  "  Re-elected MPs file:",
  if (is.null(REELECTED_MPS_FILE)) "Not configured" else REELECTED_MPS_FILE,
  "\n"
)
cat("  Output directory:", OUTPUT_DIR, "\n\n")

In [None]:
# ============================================================================
# STEP 1: LOAD POSTS WITH PROVENANCE
# ============================================================================

cat("STEP 1: Loading posts with provenance from 00_data_download output...\n")
cat("------------------------------------------------------------------------\n\n")

# Collect every provenance export present in the download directory.
provenance_files <- list.files(
  path = DATA_DOWNLOAD_OUTPUT_DIR,
  pattern = "posts_with_provenance_.*\\.rds$",
  full.names = TRUE
)

# Nothing to build from: abort and point the user at the download notebook.
if (length(provenance_files) == 0) {
  stop("No posts_with_provenance_*.rds files found in ", DATA_DOWNLOAD_OUTPUT_DIR, "\n",
       "Please run 00_data_download.ipynb first.")
}

# Several exports may coexist; pick the one with the latest modification time.
posts_file <- provenance_files[which.max(file.mtime(provenance_files))]
cat("Loading:", posts_file, "\n")

posts_data <- readRDS(posts_file)

# Short overview of what was loaded (only the first 15 column names are shown).
cat("✓ Loaded posts data\n")
cat("  Total posts:", format(nrow(posts_data), big.mark = ","), "\n")
cat("  Columns:", ncol(posts_data), "\n")
cat("  Column names:\n")
cat("   ", paste(head(names(posts_data), 15), collapse = ", "), "...\n\n")

# list_names is read by STEP 3; report its distribution, or flag its absence.
if (!("list_names" %in% names(posts_data))) {
  cat("⚠ Warning: 'list_names' column not found - provenance may not have been added\n\n")
} else {
  cat("List membership (from provenance):\n")
  print(table(posts_data$list_names, useNA = "ifany"))
  cat("\n")
}

In [None]:
# ============================================================================
# STEP 2: APPLY SURFACE ID FALLBACK FOR STORIES
# ============================================================================

cat("STEP 2: Applying surface ID fallback for Stories...\n")
cat("------------------------------------------------------------------------\n\n")

# Copy `owner_col` into `target_col` on rows that are stories, have a missing
# target value and a non-missing owner value. Every other row keeps its
# current target value. Rows with a missing content_type are never filled.
#
# data        data frame with a `content_type` column plus both named columns
# target_col  name (string) of the column to fill
# owner_col   name (string) of the column supplying the replacement values
#
# Returns `data` with `target_col` updated.
fill_story_field <- function(data, target_col, owner_col) {
  needs_fill <- is.na(data[[target_col]]) &
    data[["content_type"]] %in% "stories" &
    !is.na(data[[owner_col]])

  data[[target_col]] <- if_else(
    needs_fill,
    data[[owner_col]],
    data[[target_col]]
  )
  data
}

# Identify the surface ID column: first candidate present in the data wins.
possible_cols <- c("surface.id", "surface_id", "surfaceId")
surface_id_col <- intersect(possible_cols, names(posts_data))[1]

if (is.na(surface_id_col)) {
  # No candidate exists. Later steps index the data by surface_id_col and
  # would abort on a missing column, so create an all-NA placeholder instead
  # of merely naming a column that is not there. The placeholder takes the
  # type of post_owner.id when available so the stories fallback can fill it.
  surface_id_col <- "surface.id"
  cat("⚠ Warning: No surface ID column found. Creating 'surface.id' filled with NA\n")

  all_na_index <- rep(NA_integer_, nrow(posts_data))
  posts_data[[surface_id_col]] <- if ("post_owner.id" %in% names(posts_data)) {
    posts_data[["post_owner.id"]][all_na_index]
  } else {
    rep(NA_character_, nrow(posts_data))
  }
}

cat("Using surface ID column:", surface_id_col, "\n")

# Stories cannot be reshared, so for stories the post owner is the surface.
if ("content_type" %in% names(posts_data)) {

  stories_with_na <- sum(
    is.na(posts_data[[surface_id_col]]) &
      posts_data$content_type %in% "stories"
  )

  if (stories_with_na > 0) {
    cat("Found", stories_with_na, "stories with NA surface.id\n")

    if ("post_owner.id" %in% names(posts_data)) {
      cat("Applying fallback: For stories, post_owner = surface (stories can't be reshared)\n")

      # Count what is actually filled: stories whose post_owner.id is also
      # missing stay NA and must not be reported as fixed.
      was_missing <- is.na(posts_data[[surface_id_col]])
      posts_data <- fill_story_field(posts_data, surface_id_col, "post_owner.id")
      n_filled <- sum(was_missing & !is.na(posts_data[[surface_id_col]]))

      cat("✓ Applied fallback to", n_filled, "of", stories_with_na, "stories\n\n")
    } else {
      cat("⚠ post_owner.id column not found - cannot fill surface ID for stories\n\n")
    }

    # Same fallback for the descriptive surface fields, each applied only
    # when both the surface column and its post_owner counterpart exist.
    optional_pairs <- c(
      "surface.name" = "post_owner.name",
      "surface.username" = "post_owner.username"
    )
    for (target_col in names(optional_pairs)) {
      owner_col <- optional_pairs[[target_col]]
      if (all(c(target_col, owner_col) %in% names(posts_data))) {
        posts_data <- fill_story_field(posts_data, target_col, owner_col)
      }
    }
  } else {
    cat("✓ No stories with NA surface.id found\n\n")
  }
} else {
  cat("⚠ content_type column not found - skipping stories fallback\n\n")
}

# Report remaining NAs
remaining_na <- sum(is.na(posts_data[[surface_id_col]]))
cat("Remaining posts with NA surface ID:", remaining_na, "\n\n")

In [None]:
# ============================================================================
# STEP 3: STANDARDIZE LIST CLASSIFICATIONS
# ============================================================================

cat("STEP 3: Standardizing list classifications...\n")
cat("------------------------------------------------------------------------\n\n")

# Map list_names from provenance to standardized main_list and sub_list
if ("list_names" %in% names(posts_data)) {

  cat("Mapping producer list names to standard categories...\n\n")

  # Vectorised lookup of one LIST_MAPPINGS field ("main_list" or "sub_list").
  #
  # For each element of `list_names`:
  #   * NA stays NA;
  #   * only the first membership is used when several are joined by "; ";
  #   * a name present in LIST_MAPPINGS yields the configured `field` value;
  #   * any other name is returned with every character that is not
  #     alphanumeric or "_" replaced by "_".
  #
  # Always returns an unnamed character vector of the same length as the
  # input, including for zero-length input.
  map_list_field <- function(list_names, field) {
    list_names <- as.character(list_names)

    first_list <- vapply(
      strsplit(list_names, "; ", fixed = TRUE),
      function(parts) parts[1],
      character(1)
    )

    # Named character vector: producer list name -> standardized label.
    # An entry lacking `field` becomes NA, so the cleaned-name default applies.
    lookup <- vapply(
      LIST_MAPPINGS,
      function(entry) {
        value <- entry[[field]]
        if (is.null(value)) NA_character_ else as.character(value)[1]
      },
      character(1)
    )
    mapped <- unname(lookup[first_list])

    # Start from the cleaned-name default, then overwrite where a mapping hit.
    result <- gsub("[^[:alnum:]_]", "_", first_list)
    has_mapping <- !is.na(mapped)
    result[has_mapping] <- mapped[has_mapping]
    result
  }

  # Thin wrappers kept under their original names; both accept a single
  # value or a whole vector of list names.
  get_main_list <- function(list_name) {
    map_list_field(list_name, "main_list")
  }

  get_sub_list <- function(list_name) {
    map_list_field(list_name, "sub_list")
  }

  # Apply mappings
  posts_data <- posts_data %>%
    mutate(
      main_list = get_main_list(list_names),
      sub_list = get_sub_list(list_names)
    )

  cat("✓ Applied list mappings\n\n")

  # Show results
  cat("Posts by main_list:\n")
  print(table(posts_data$main_list, useNA = "ifany"))
  cat("\n")

} else {
  cat("⚠ Warning: 'list_names' column not found\n")
  cat("Creating default list classification from available data...\n\n")

  # Without provenance, keep any existing classification columns and fill
  # in "Unknown" only for the ones that are missing.
  if (!"main_list" %in% names(posts_data)) {
    posts_data$main_list <- "Unknown"
  }
  if (!"sub_list" %in% names(posts_data)) {
    posts_data$sub_list <- "Unknown"
  }
}

In [None]:
# ============================================================================
# STEP 4: OPTIONAL - DISTINGUISH RE-ELECTED VS NEW MPS
# ============================================================================

cat("STEP 4: Checking for re-elected MPs distinction...\n")
cat("------------------------------------------------------------------------\n\n")

# This step is optional - it only runs when REELECTED_MPS_FILE is set and the
# file exists. It adds an `is_reelected` column to posts_data and relabels
# MP rows in main_list / sub_list as "MPs_Reelected" or "MPs_New".

if (!is.null(REELECTED_MPS_FILE) && file.exists(REELECTED_MPS_FILE)) {

  cat("Loading re-elected MPs list...\n")
  reelected_mps <- readRDS(REELECTED_MPS_FILE)

  # First column of the file whose name is one of the known ID column names;
  # NA when none of them is present.
  id_col <- intersect(names(reelected_mps), c("ids", "id", "surface_id", "surface.id"))[1]

  if (!is.na(id_col)) {
    reelected_ids <- unique(reelected_mps[[id_col]])
    # Drop missing IDs: unique() keeps NA, and `NA %in% c(..., NA)` is TRUE,
    # so a single NA in the file would flag every post with a missing
    # surface ID as re-elected.
    reelected_ids <- reelected_ids[!is.na(reelected_ids)]
    cat("✓ Loaded", length(reelected_ids), "re-elected MP IDs\n\n")

    # Labels that identify a row as belonging to an MP list. Only rows
    # carrying one of these labels are relabelled; all others are untouched.
    mp_patterns <- c("MPs", "MPs_Reelected", "MPs_New", "MPs (ALL)", "MPs (Re-elected)")

    posts_data <- posts_data %>%
      mutate(
        # FALSE for posts whose surface ID is missing or not in the file.
        is_reelected = .data[[surface_id_col]] %in% reelected_ids,
        main_list = case_when(
          main_list %in% mp_patterns & is_reelected ~ "MPs_Reelected",
          main_list %in% mp_patterns & !is_reelected ~ "MPs_New",
          TRUE ~ main_list
        ),
        sub_list = case_when(
          sub_list %in% mp_patterns & is_reelected ~ "MPs_Reelected",
          sub_list %in% mp_patterns & !is_reelected ~ "MPs_New",
          TRUE ~ sub_list
        )
      )

    # Report breakdown: posts and distinct accounts per MP label.
    cat("MP classification results:\n")
    mp_summary <- posts_data %>%
      filter(grepl("^MPs", main_list)) %>%
      group_by(main_list) %>%
      summarise(
        n_posts = n(),
        n_accounts = n_distinct(.data[[surface_id_col]], na.rm = TRUE),
        .groups = "drop"
      )
    print(mp_summary)
    cat("\n")

  } else {
    cat("⚠ Could not find ID column in re-elected MPs file\n")
    cat("  Available columns:", paste(names(reelected_mps), collapse = ", "), "\n\n")
  }

} else {
  cat("Re-elected MPs file not configured or not found\n")
  cat("Skipping MP experience distinction (all MPs treated as single group)\n\n")
}

In [None]:
# ============================================================================
# STEP 5: ADD STANDARDIZED DATE/TIME VARIABLES
# ============================================================================

cat("STEP 5: Adding standardized date/time variables...\n")
cat("------------------------------------------------------------------------\n\n")

# First column of posts_data whose name is one of the known timestamp names;
# NA when none of them is present.
time_col <- intersect(names(posts_data), c("creation_time", "created_time", "timestamp"))[1]

if (!is.na(time_col)) {
  cat("Using time column:", time_col, "\n")

  # year and month are derived from `date` rather than from the raw
  # timestamp. as.Date() converts a POSIXct value in UTC by default, whereas
  # lubridate's accessors use the timestamp's own time zone, so taking them
  # from different sources can give a year/month that disagrees with `date`
  # for posts close to midnight.
  posts_data <- posts_data %>%
    mutate(
      date = as.Date(.data[[time_col]]),
      year = year(date),
      month = month(date),
      # Start of the week containing `date`, as defined by floor_date().
      week = floor_date(date, "week")
    )

  cat("✓ Added date, year, month, week variables\n")

  if (all(is.na(posts_data$date))) {
    # min()/max() on an all-NA (or empty) vector only yield Inf plus warnings.
    cat("  Date range: unavailable (no non-missing dates)\n\n")
  } else {
    cat("  Date range:",
        format(min(posts_data$date, na.rm = TRUE), "%Y-%m-%d"), "to",
        format(max(posts_data$date, na.rm = TRUE), "%Y-%m-%d"), "\n\n")
  }

} else {
  cat("⚠ Warning: No timestamp column found\n")
  cat("  Available columns:", paste(head(names(posts_data), 20), collapse = ", "), "\n\n")
}

In [None]:
# ============================================================================
# STEP 6: SAVE COMBINED DATASET
# ============================================================================

cat("STEP 6: Saving combined dataset...\n")
cat("------------------------------------------------------------------------\n\n")

# The destination folder may not exist on a first run; create it (and any
# missing parents) only in that case.
output_dir_missing <- !dir.exists(OUTPUT_DIR)
if (output_dir_missing) {
  dir.create(OUTPUT_DIR, recursive = TRUE)
}

# The file name carries the current time, down to the second.
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_file <- file.path(
  OUTPUT_DIR,
  paste0("political_posts_", timestamp, ".rds")
)

posts_data %>%
  saveRDS(output_file)

# Confirmation with the headline counts of what was written.
cat("✓ Saved combined dataset\n")
cat("  File:", output_file, "\n")
cat("  Total posts:", format(nrow(posts_data), big.mark = ","), "\n")
cat(
  "  Total unique surfaces:",
  posts_data[[surface_id_col]] %>%
    n_distinct(na.rm = TRUE) %>%
    format(big.mark = ","),
  "\n\n"
)

In [None]:
# ============================================================================
# SUMMARY REPORT
# ============================================================================

cat("========================================================================\n")
cat("BUILD DATASET COMPLETE\n")
cat("========================================================================\n\n")

cat("FINAL DATASET STRUCTURE:\n\n")

# The `date` column only exists when STEP 5 found a timestamp column.
# Without this check, `date` inside summarise() would resolve to lubridate's
# date() function and the report would abort after the file was saved.
has_date <- "date" %in% names(posts_data)

# Summary by main_list: post count, distinct accounts and covered date range
# (NA dates when no `date` column is available), largest lists first.
summary_by_list <- posts_data %>%
  group_by(main_list) %>%
  summarise(
    n_posts = n(),
    n_accounts = n_distinct(.data[[surface_id_col]], na.rm = TRUE),
    min_date = if (!!has_date) min(.data[["date"]], na.rm = TRUE) else as.Date(NA),
    max_date = if (!!has_date) max(.data[["date"]], na.rm = TRUE) else as.Date(NA),
    .groups = "drop"
  ) %>%
  arrange(desc(n_posts))

cat("By main_list:\n")
print(summary_by_list)
cat("\n")

# Check for key columns: one line per column, ticked when present in the data.
cat("Key columns present:\n")
key_cols <- c(surface_id_col, "main_list", "sub_list", "date", "creation_time",
              "statistics.view_count", "statistics.reaction_count",
              "statistics.share_count", "statistics.comment_count")
for (col in key_cols) {
  present <- col %in% names(posts_data)
  cat("  ", col, ":", if (present) "✓" else "✗", "\n")
}
cat("\n")

cat("Output file:", output_file, "\n\n")

cat("Next steps:\n")
cat("  1. Run 02_data_cleaning.ipynb to validate and create analysis datasets\n")
cat("  2. Optionally run 03_enrich_surface_info.ipynb for account metadata\n")
cat("  3. Run 05_breakpoint_analysis.ipynb for main analysis\n\n")

cat("========================================================================\n")
cat("✓ ALL DONE!\n")
cat("========================================================================\n")