In [1]:
# ============================================================================
# DATA LOADING AND COMBINATION SCRIPT - DISTINGUISHING RE-ELECTED vs NEW MPS
# Italian Political Accounts: MPs (Re-elected/New), Prominent Politicians, and Extremists
# VERSION: 3.2 (2025-12-06) - Added re-elected MP distinction
# BASED ON: combine_datasets_v2.R with exact same file loading strategy
# ============================================================================
# IMPORTANT: This script identifies accounts by surface.id/name/username
# surface = the page where post appeared (what we study)
# post_owner = the author of the post (different for reshares)
# 
# SPECIAL CASE - STORIES:
# - Stories have NA surface.id (data collection quirk)
# - Stories CANNOT be reshared (always original content)
# - For stories: post_owner = the page (no distinction)
# - Fallback: Use post_owner fields for stories with NA surface
#
# NEW IN v3.2:
# - Distinguishes between re-elected MPs (in parliament 2021 & 2022)
#   and new MPs (elected only in 2022)
# - Uses Parlamentari_ITA_Leg_XIX_no_XVIII.rds for re-elected MP IDs
# - Uses EXACT same file paths as combine_datasets_v2.R
#
# DATA SOURCES:
# - MPs: ../combined_query_results/combined_queries_2025-11-21_deduplicated_20251123_101758.rds
# - Prominent Politicians: 3 files (Original, 2021, 2025)
# - Extremists: 5 files (FG, GM, MT, 2021_EXT, 2025_EXT)
# ============================================================================

# Load required libraries
# ============================================================================
library(tidyverse)
library(lubridate)

# Startup banner: script title plus a one-line summary of what gets combined.
# Written in a single call with an explicit empty separator, so the space that
# follows each rule line is spelled out in the string that comes after it.
cat(
  "\n",
  strrep("=", 80), " \n",
  "ITALIAN POLITICAL ACCOUNTS DATA COMBINATION - VERSION 3.2\n",
  strrep("=", 80), " \n",
  "SURFACE fields (pages) + Stories fallback + RE-ELECTED MP DISTINCTION\n",
  "MPs: Re-elected (2021+2022) vs New (2022 only)\n",
  "Prominent Politicians: 3 sub-lists | Extremists: 5 sub-lists\n",
  "BASED ON: combine_datasets_v2.R with exact same file paths\n",
  strrep("=", 80), " \n\n",
  sep = ""
)

# ============================================================================
# STEP 0: LOAD RE-ELECTED MPs LIST
# ============================================================================

cat("STEP 0: Loading re-elected MPs list...\n")
cat(strrep("-", 80), "\n\n")

# Extract the unique, non-missing identifiers from the `ids` column.
#
# mps_table: object read from the re-elected MPs file; must have an `ids`
#            column.
# Returns:   vector of unique ids with NA removed. NA is dropped because
#            `NA %in% c(NA, ...)` is TRUE, so a missing id in this list would
#            match every post whose surface.id is missing.
# Errors:    stops when there is no `ids` column, listing the columns found.
extract_reelected_ids <- function(mps_table) {
  if (!"ids" %in% names(mps_table)) {
    stop(
      "ERROR: 'ids' column not found in re-elected MPs file.\n",
      "Available columns: ", paste(names(mps_table), collapse = ", "),
      call. = FALSE
    )
  }

  ids <- mps_table[["ids"]]
  unique(ids[!is.na(ids)])
}

# NOTE(review): the file name reads like "Leg. XIX, not XVIII", which could
# mean MPs who were NOT in the previous legislature. Confirm that this file
# really lists re-elected MPs; if it lists new MPs instead, the
# Re-elected/New labels assigned in Step 1 are inverted.
reelected_file <- "rawdata/Parlamentari_ITA_Leg_XIX_no_XVIII.rds"

if (file.exists(reelected_file)) {
  reelected_mps <- readRDS(reelected_file)

  cat("✓ Loaded re-elected MPs list\n")
  cat("  File:", reelected_file, "\n")

  # Show the structure before validating it, so a failure is easy to diagnose
  cat("  Columns:", paste(names(reelected_mps), collapse = ", "), "\n")

  reelected_ids <- extract_reelected_ids(reelected_mps)
  cat("  Re-elected MPs:", length(reelected_ids), "\n\n")
} else {
  # Missing file is not fatal: an empty id vector makes Step 1 fall back to a
  # single "MPs" category
  cat("⚠ WARNING: Re-elected MPs file not found at:", reelected_file, "\n")
  cat("Will treat all MPs as a single group (no distinction)\n\n")
  reelected_ids <- character(0)
}

# ============================================================================
# STEP 1: LOAD MAIN LIST 1 - ITALIAN MPS (PARLIAMENT SEPT 2022)
# ============================================================================

# ----------------------------------------------------------------------------
# Shared helpers for Steps 1-3
# ----------------------------------------------------------------------------

# Count stories whose surface.id is missing.
# `%in%` is used instead of `==` so a missing content_type gives FALSE rather
# than NA; an NA would make the sum NA and break the `if (n > 0)` checks below.
count_story_id_gaps <- function(posts) {
  sum(posts$content_type %in% "stories" & is.na(posts$surface.id))
}

# Replace a missing surface value with the post owner's value, for stories
# only. All other rows keep their surface value unchanged.
fill_story_gap <- function(surface_value, owner_value, content_type) {
  use_owner <- content_type %in% "stories" &
    is.na(surface_value) &
    !is.na(owner_value)
  if_else(use_owner, owner_value, surface_value)
}

# Apply the stories fallback to surface.id, surface.name and surface.username.
# Per the header notes, stories cannot be reshared, so post_owner is the page.
apply_stories_fallback <- function(posts) {
  posts %>%
    mutate(
      surface.id = fill_story_gap(surface.id, post_owner.id, content_type),
      surface.name = fill_story_gap(surface.name, post_owner.name, content_type),
      surface.username = fill_story_gap(
        surface.username, post_owner.username, content_type
      )
    )
}

# Load every available file of one main list and tag its rows.
#
# file_specs:         list of lists, each with `file`, `sub_list` and `date`.
# main_list_name:     value written to the `main_list` column.
# description_prefix: text placed before the sub-list name in
#                     `list_description`.
# Returns:            named list of data frames keyed by sub-list name; files
#                     that do not exist are reported and skipped.
load_sub_lists <- function(file_specs, main_list_name, description_prefix) {
  loaded <- list()

  for (spec in file_specs) {
    file_path <- spec$file
    sub_list_name <- spec$sub_list

    if (!file.exists(file_path)) {
      cat("⚠ WARNING: File not found for", sub_list_name, "\n")
      cat("  Expected:", file_path, "\n\n")
      next
    }

    posts <- readRDS(file_path) %>%
      mutate(
        main_list = main_list_name,
        sub_list = sub_list_name,
        list_description = paste0(description_prefix, sub_list_name),
        source_file = file_path,
        date_collected = as.Date(spec$date)
      )

    # Stories fallback, only touched when there is something to fill
    stories_na <- count_story_id_gaps(posts)
    if (stories_na > 0) {
      posts <- apply_stories_fallback(posts)
      # Report what was actually filled, not what was merely missing
      stories_filled <- stories_na - count_story_id_gaps(posts)
      cat("  Applied stories fallback to", stories_filled, "stories in",
          sub_list_name, "\n")
    }

    # Unlike the MPs list this is not fatal, but it should not pass silently
    n_missing <- sum(is.na(posts$surface.id))
    if (n_missing > 0) {
      cat("  ⚠ WARNING:", n_missing, "posts still have NA surface.id in",
          sub_list_name, "\n")
    }

    loaded[[sub_list_name]] <- posts

    cat("✓ Loaded", sub_list_name, "\n")
    cat("  Posts:", nrow(posts), "\n")
    cat("  Unique surfaces:", n_distinct(posts$surface.id, na.rm = TRUE), "\n\n")
  }

  loaded
}

cat("STEP 1: Loading Italian Parliament Members (elected Sept 2022)...\n")
cat(strrep("-", 80), "\n\n")

mp_file <- "../combined_query_results/combined_queries_2025-11-21_deduplicated_20251123_101758.rds"

# Fail fast: the MPs file is mandatory
if (!file.exists(mp_file)) {
  stop("MPs data file not found: ", mp_file)
}

mps_data <- readRDS(mp_file)

# STORIES FALLBACK: For stories, post_owner IS the page (stories can't be reshared)
stories_with_na <- count_story_id_gaps(mps_data)
if (stories_with_na > 0) {
  cat("Found", stories_with_na, "stories with NA surface.id\n")
  cat("Applying fallback: For stories, post_owner = page (stories can't be reshared)\n")

  mps_data <- apply_stories_fallback(mps_data)

  # Stories whose post_owner.id is also missing cannot be filled, so count
  # the difference instead of assuming every gap was closed
  stories_filled <- stories_with_na - count_story_id_gaps(mps_data)
  cat("✓ Applied fallback to", stories_filled, "stories\n\n")
}

# CLASSIFY MPs: Re-elected vs New
if (length(reelected_ids) > 0) {
  cat("Classifying MPs as Re-elected vs New...\n")

  mps_data <- mps_data %>%
    mutate(
      is_reelected = surface.id %in% reelected_ids,
      main_list = if_else(is_reelected, "MPs_Reelected", "MPs_New"),
      sub_list = main_list,
      list_description = if_else(
        is_reelected,
        "Italian Parliament members re-elected (served in 2021 & 2022)",
        "Italian Parliament members elected only in 2022 (new)"
      ),
      source_file = mp_file,
      date_collected = as.Date("2025-11-21")
    )

  # Zero matches most likely means the two id columns are not comparable
  if (!any(mps_data$is_reelected)) {
    cat("⚠ WARNING: No MP posts matched the re-elected IDs -",
        "check that 'ids' and surface.id use the same identifier format\n")
  }

  # Report breakdown
  mp_breakdown <- mps_data %>%
    group_by(main_list) %>%
    summarise(
      n_posts = n(),
      n_accounts = n_distinct(surface.id, na.rm = TRUE),
      .groups = "drop"
    )

  cat("✓ MPs classified:\n")
  print(mp_breakdown)
  cat("\n")
} else {
  # No re-elected list available - treat all as one group
  cat("⚠ No re-elected MPs list - using single 'MPs' category\n")

  mps_data <- mps_data %>%
    mutate(
      main_list = "MPs",
      sub_list = "MPs",
      list_description = "Italian Parliament members elected September 2022",
      source_file = mp_file,
      date_collected = as.Date("2025-11-21")
    )
}

mp_date_range <- range(as.Date(mps_data$creation_time), na.rm = TRUE)

cat("✓ Loaded MPs data\n")
cat("  Posts:", nrow(mps_data), "\n")
cat("  Unique surfaces (pages):", n_distinct(mps_data$surface.id, na.rm = TRUE), "\n")
cat("  Date range:",
    format(mp_date_range[1], "%Y-%m-%d"), "to",
    format(mp_date_range[2], "%Y-%m-%d"), "\n\n")

# CHECK FOR NA IN surface.id
na_surface_ids <- mps_data %>%
  filter(is.na(surface.id))

if (nrow(na_surface_ids) > 0) {
  cat("\n")
  cat("✗✗✗ ERROR: NA VALUES FOUND IN surface.id ✗✗✗\n")
  cat(strrep("=", 80), "\n\n")
  cat("Dataset: MPs\n")
  cat("Number of posts with NA surface.id:", nrow(na_surface_ids), "\n\n")

  cat("Example cases (first 10):\n")
  cat(strrep("-", 80), "\n")
  # any_of() keeps this diagnostic from failing with an unrelated error if
  # one of the columns is absent from the data
  print(
    na_surface_ids %>%
      select(any_of(c(
        "id", "surface.id", "surface.name", "surface.username",
        "content_type", "creation_time", "statistics.views"
      ))) %>%
      head(10)
  )
  cat("\n")
  cat(strrep("-", 80), "\n\n")

  # The script does not prompt: it lists the options and then always stops
  cat("⚠ WARNING: This will cause problems in analysis\n")
  cat("Options:\n")
  cat("  1. STOP here and investigate\n")
  cat("  2. Remove these posts (data loss but clean dataset)\n")
  cat("  3. Continue anyway (may cause issues later)\n\n")

  stop("NA values found in surface.id - please review and decide how to handle")
}

# ============================================================================
# CONTINUE WITH REST OF ORIGINAL SCRIPT
# (Steps 2-4: Prominent Politicians, Extremists, Combination)
# ============================================================================

# ============================================================================
# STEP 2: LOAD PROMINENT POLITICIANS (3 SUB-LISTS)
# ============================================================================

cat("STEP 2: Loading Prominent Politicians...\n")
cat(strrep("-", 80), "\n\n")

# Define prominent politician files
prominent_files <- list(
  list(file = "../combined_query_results/combined_queries_2025-11-26_deduplicated_20251127_102907.rds",
       sub_list = "Prominent_Politicians_Original",
       date = "2025-11-26"),
  list(file = "../combined_query_results/posts_with_provenance_2025-12-05_2021_PROM_POL.rds",
       sub_list = "Prominent_Politicians_2021",
       date = "2025-12-05"),
  list(file = "../combined_query_results/posts_with_provenance_2025-12-05_2025_PROM_POL.rds",
       sub_list = "Prominent_Politicians_2025",
       date = "2025-12-05")
)

prominent_data_list <- load_sub_lists(
  prominent_files,
  main_list_name = "Prominent_Politicians",
  description_prefix = "Prominent Italian politicians - "
)

if (length(prominent_data_list) == 0) {
  stop("No prominent politician data files found!")
}

# ============================================================================
# STEP 3: LOAD EXTREMISTS (5 SUB-LISTS)
# ============================================================================

cat("STEP 3: Loading Extremist accounts...\n")
cat(strrep("-", 80), "\n\n")

extremist_files <- list(
  list(file = "../multi_list_provenance_analysis/posts_with_provenance_2025-12-03-FG.rds",
       sub_list = "Extremist_FG",
       date = "2025-12-03"),
  list(file = "../multi_list_provenance_analysis/posts_with_provenance_2025-12-04_GM.rds",
       sub_list = "Extremist_GM",
       date = "2025-12-04"),
  list(file = "../multi_list_provenance_analysis/posts_with_provenance_2025-12-03-MT.rds",
       sub_list = "Extremist_MT",
       date = "2025-12-03"),
  list(file = "../multi_list_provenance_analysis/posts_with_provenance_2025-12-05_2021_EXT.rds",
       sub_list = "Extremist_2021_EXT",
       date = "2025-12-05"),
  list(file = "../multi_list_provenance_analysis/posts_with_provenance_2025-12-05_2025_EXT.rds",
       sub_list = "Extremist_2025_EXT",
       date = "2025-12-05")
)

extremist_data_list <- load_sub_lists(
  extremist_files,
  main_list_name = "Extremists",
  description_prefix = "Extremist accounts - "
)

if (length(extremist_data_list) == 0) {
  stop("No extremist data files found!")
}

# ============================================================================
# STEP 4: COMBINE ALL DATASETS
# ============================================================================

cat("STEP 4: Combining all datasets...\n")
cat(strrep("-", 80), "\n\n")

# Collapse the sub-lists of each group into one table per group
cat("Combining Prominent Politicians sub-lists...\n")
prominent_combined <- bind_rows(prominent_data_list)
cat("✓ Combined", length(prominent_data_list), "Prominent Politicians files\n\n")

cat("Combining Extremist sub-lists...\n")
extremist_combined <- bind_rows(extremist_data_list)
cat("✓ Combined", length(extremist_data_list), "Extremist files\n\n")

# Stack the three groups in a fixed order: MPs, prominent politicians, extremists
cat("Combining all main lists...\n")
all_data <- bind_rows(list(mps_data, prominent_combined, extremist_combined))

cat("✓ Combined all datasets\n")
cat("  Total posts:", nrow(all_data), "\n")
cat("  Unique surfaces:", n_distinct(all_data$surface.id, na.rm = TRUE), "\n\n")

# Per main list: post count, distinct pages and the first/last post date
cat("Summary by main_list:\n")
summary_table <- summarise(
  group_by(all_data, main_list),
  n_posts = n(),
  n_surfaces = n_distinct(surface.id, na.rm = TRUE),
  min_date = min(as.Date(creation_time), na.rm = TRUE),
  max_date = max(as.Date(creation_time), na.rm = TRUE),
  .groups = "drop"
)
print(summary_table)
cat("\n")

# Per sub-list: post count and distinct pages, sorted by main list then sub-list
cat("Summary by sub_list:\n")
sub_summary <- arrange(
  summarise(
    group_by(all_data, main_list, sub_list),
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  ),
  main_list, sub_list
)
print(sub_summary)
cat("\n")

# ============================================================================
# STEP 5: ADD STANDARDIZED DATE AND TIME VARIABLES
# ============================================================================

cat("STEP 5: Adding standardized date/time variables...\n")
cat(strrep("-", 80), "\n\n")

# Every calendar part is derived from the same `date` column. Taking year and
# month straight from creation_time could disagree with `date` for posts near
# midnight when creation_time carries a non-UTC time zone, because as.Date()
# and lubridate's accessors do not necessarily resolve the day in the same zone.
all_data <- all_data %>%
  mutate(
    date = as.Date(creation_time),
    year = year(date),
    month = month(date),
    # Week start follows lubridate's default (the `lubridate.week.start` option)
    week = floor_date(date, "week")
  )

cat("✓ Added date, year, month, week variables\n\n")

# ============================================================================
# STEP 6: SAVE COMBINED DATASET
# ============================================================================

cat("STEP 6: Saving combined dataset...\n")
cat(strrep("-", 80), "\n\n")

# The output folder is created on first use
if (!dir.exists("combined_datasets")) {
  dir.create("combined_datasets")
}

# A run timestamp in the file name keeps earlier outputs from being overwritten
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_file <- sprintf(
  "combined_datasets/italian_political_accounts_%s.rds",
  timestamp
)

saveRDS(all_data, file = output_file)

cat("✓ Saved combined dataset\n")
cat("  File:", output_file, "\n")
cat("  Total posts:", nrow(all_data), "\n")
cat("  Total unique surfaces:", n_distinct(all_data$surface.id, na.rm = TRUE), "\n\n")

# ============================================================================
# STEP 7: GENERATE SUMMARY REPORT
# ============================================================================

cat(strrep("=", 80), "\n")
cat("DATA COMBINATION COMPLETE\n")
cat(strrep("=", 80), "\n\n")

cat("FINAL DATASET STRUCTURE:\n\n")

# The MP entries depend on whether the re-elected list was available, so the
# list is assembled first and numbered afterwards. Hard-coded numbers skipped
# "2." whenever the single "MPs" category was used.
if (length(reelected_ids) > 0) {
  cat("Main Lists (with MP distinction):\n")
  main_list_labels <- c(
    "MPs_Reelected - Parliament members re-elected (2021 & 2022)",
    "MPs_New - Parliament members elected only in 2022"
  )
} else {
  cat("Main Lists:\n")
  main_list_labels <- "MPs - All Italian Parliament members (no distinction)"
}

main_list_labels <- c(
  main_list_labels,
  "Prominent_Politicians - Prominent figures (3 sub-lists)",
  "Extremists - Extremist accounts (5 sub-lists)"
)

cat(
  sprintf("  %d. %s\n", seq_along(main_list_labels), main_list_labels),
  sep = ""
)
cat("\n")

# Reprint the tables computed in Step 4
cat("By main_list:\n")
print(summary_table)
cat("\n")

cat("By sub_list:\n")
print(sub_summary)
cat("\n")

cat("Output file:", output_file, "\n\n")

cat("Next steps:\n")
cat("  1. Run validation: source('validate_and_clean_dataset.R')\n")
cat("  2. Inspect data quality and temporal coverage\n")
cat("  3. Begin analysis\n\n")

cat(strrep("=", 80), "\n")
cat("✓ ALL DONE!\n")
cat(strrep("=", 80), "\n\n")

[NOTICE] 2 output(s) filtered out