# In [1]:
# ============================================================================
# APPEND OCTOBER-NOVEMBER 2025 DATA TO COMBINED DATASET
# VERSION: 1.0 (2025-12-23)
# BASED ON: combine_datasets_v3.2.R structure and conventions
# ============================================================================
# 
# PURPOSE:
# This script appends new posts from October-November 2025 to the existing
# combined dataset created by combine_datasets_v3.2.R
#
# INPUT FILES (from ../multi_list_provenance_analysis/):
# - subset_mps.rds
# - subset_prominent_politicians.rds  
# - subset_extremist.rds
#
# These files have a different structure with boolean flags:
# - in_MPs_Re_elected, in_MPs_ALL (for MPs)
# - in_Prominent_Politicians (for prominent politicians)
# - in_Extremiste_cluster* (for extremists)
#
# OUTPUT:
# Updated combined dataset in combined_datasets/ directory
# ============================================================================

# Load required libraries
library(tidyverse)
library(lubridate)

# Print the run banner: a blank line, then the title and subtitle framed by
# 80-character "=" rules. The trailing space after each rule reproduces the
# separator that cat() inserts between its arguments.
banner_rule <- strrep("=", 80)
cat(
  "\n",
  banner_rule, " \n",
  "APPEND OCTOBER-NOVEMBER 2025 DATA\n",
  banner_rule, " \n",
  "Adding new posts to existing combined dataset\n",
  banner_rule, " \n\n",
  sep = ""
)

# ============================================================================
# STEP 0A: LOAD RE-ELECTED MPs LIST (same as original script)
# ============================================================================

cat("STEP 0A: Loading re-elected MPs list...\n")
cat(strrep("-", 80), "\n\n")

# RDS file expected to carry an `ids` column; its unique values become
# `reelected_ids`, which STEP 2 matches against surface.id.
reelected_file <- "rawdata/Parlamentari_ITA_Leg_XIX_no_XVIII.rds"

if (!file.exists(reelected_file)) {
  # Missing file is not fatal: an empty ID vector makes STEP 2 fall back to a
  # single undifferentiated "MPs" group.
  cat("⚠ WARNING: Re-elected MPs file not found at:", reelected_file, "\n")
  cat("Will treat all MPs as a single group (no distinction)\n\n")
  reelected_ids <- character(0)
} else {
  reelected_mps <- readRDS(reelected_file)
  reelected_col_names <- paste(names(reelected_mps), collapse = ", ")

  cat("✓ Loaded re-elected MPs list\n")
  cat("  File:", reelected_file, "\n")
  cat("  Columns:", reelected_col_names, "\n")

  # The file exists but has an unexpected layout: abort rather than guess.
  if (!"ids" %in% names(reelected_mps)) {
    stop("ERROR: 'ids' column not found in re-elected MPs file.\n",
         "Available columns: ", reelected_col_names)
  }

  reelected_ids <- unique(reelected_mps$ids)
  cat("  Re-elected MPs:", length(reelected_ids), "\n\n")
}

# ============================================================================
# STEP 0B: FIND AND LOAD THE MOST RECENT COMBINED DATASET
# ============================================================================

cat("STEP 0B: Loading existing combined dataset...\n")
cat(strrep("-", 80), "\n\n")

# Candidate files: every timestamped combined dataset already on disk.
combined_files <- list.files(
  path = "combined_datasets",
  pattern = "^italian_political_accounts_.*\\.rds$",
  full.names = TRUE
)

if (length(combined_files) == 0) {
  stop("No existing combined dataset found in combined_datasets/\n",
       "Please run combine_datasets_v3.2.R first.")
}

# "Most recent" is decided by file modification time, not by the timestamp
# embedded in the file name.
combined_files_info <- file.info(combined_files)
newest_idx <- which.max(combined_files_info$mtime)
most_recent_file <- rownames(combined_files_info)[newest_idx]

cat("Found", length(combined_files), "combined dataset(s)\n")
cat("Loading most recent:", basename(most_recent_file), "\n")

existing_data <- readRDS(most_recent_file)

cat("✓ Loaded existing dataset\n")
cat("  Posts:", nrow(existing_data), "\n")
cat(
  "  Unique surfaces:",
  n_distinct(existing_data$surface.id, na.rm = TRUE),
  "\n"
)

existing_first_day <- min(existing_data$date, na.rm = TRUE)
existing_last_day <- max(existing_data$date, na.rm = TRUE)
cat(
  "  Date range:",
  format(existing_first_day, "%Y-%m-%d"), "to",
  format(existing_last_day, "%Y-%m-%d"), "\n\n"
)

# Posts and distinct (non-NA) surfaces per main_list in the loaded dataset.
cat("Existing main_list breakdown:\n")
existing_breakdown <- existing_data %>%
  group_by(main_list) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  )
print(existing_breakdown)
cat("\n")

# ============================================================================
# STEP 1: LOAD NEW SUBSET FILES
# ============================================================================

cat("STEP 1: Loading new subset files...\n")
cat(strrep("-", 80), "\n\n")

new_data_dir <- "../multi_list_provenance_analysis"

# One RDS per account list; the names (mps / prominent / extremist) are the
# keys later steps use to look the data up in `new_data_list`.
subset_basenames <- c(
  mps = "subset_mps.rds",
  prominent = "subset_prominent_politicians.rds",
  extremist = "subset_extremist.rds"
)
new_files <- setNames(
  as.list(file.path(new_data_dir, subset_basenames)),
  names(subset_basenames)
)

# Read whichever files exist; a missing file only produces a warning line.
new_data_list <- list()

for (name in names(new_files)) {
  file_path <- new_files[[name]]

  if (!file.exists(file_path)) {
    cat("⚠ WARNING: File not found:", file_path, "\n\n")
    next
  }

  temp_data <- readRDS(file_path)
  new_data_list[[name]] <- temp_data

  cat("✓ Loaded", name, "\n")
  cat("  File:", basename(file_path), "\n")
  cat("  Posts:", nrow(temp_data), "\n")
  cat("  Columns:", ncol(temp_data), "\n")

  # Date range is derived from creation_time, ignoring missing values.
  post_days <- as.Date(temp_data$creation_time)
  cat(
    "  Date range:",
    format(min(post_days, na.rm = TRUE), "%Y-%m-%d"), "to",
    format(max(post_days, na.rm = TRUE), "%Y-%m-%d"), "\n\n"
  )
}

# Nothing at all could be read: there is nothing to append.
if (length(new_data_list) == 0) {
  stop("No new data files found!")
}

# ============================================================================
# STEP 2: PROCESS MPs DATA (using same logic as original script)
# ============================================================================

cat("STEP 2: Processing MPs data...\n")
cat(strrep("-", 80), "\n\n")

if (is.null(new_data_list$mps)) {
  mps_new <- NULL
  cat("⚠ No MPs data to process\n\n")
} else {
  mps_new <- new_data_list$mps

  # Stories fallback: a story row whose surface field is NA takes the matching
  # post_owner field, but only when that owner field is itself non-NA.
  fill_story_surface <- function(surface, owner, content_type) {
    use_owner <- is.na(surface) & content_type == "stories" & !is.na(owner)
    if_else(use_owner, owner, surface)
  }

  stories_with_na <- sum(
    is.na(mps_new$surface.id) & mps_new$content_type == "stories",
    na.rm = TRUE
  )
  if (stories_with_na > 0) {
    cat("Found", stories_with_na, "stories with NA surface.id\n")
    cat("Applying fallback: For stories, post_owner = page (stories can't be reshared)\n")

    mps_new <- mps_new %>%
      mutate(
        surface.id = fill_story_surface(
          surface.id, post_owner.id, content_type
        ),
        surface.name = fill_story_surface(
          surface.name, post_owner.name, content_type
        ),
        surface.username = fill_story_surface(
          surface.username, post_owner.username, content_type
        )
      )
    cat("✓ Applied fallback to", stories_with_na, "stories\n\n")
  }

  # Label each post. With a re-elected list available, MPs are split by
  # whether surface.id appears in `reelected_ids` (rows with NA surface.id do
  # not match and therefore land in "MPs_New"); otherwise all rows share one
  # "MPs" label.
  if (length(reelected_ids) > 0) {
    cat("Classifying MPs as Re-elected vs New...\n")
    cat("Using surface.id membership in reelected_ids list\n\n")

    mps_new <- mps_new %>%
      mutate(
        is_reelected = surface.id %in% reelected_ids,
        main_list = if_else(is_reelected, "MPs_Reelected", "MPs_New"),
        sub_list = main_list,
        list_description = if_else(
          is_reelected,
          "Italian Parliament members re-elected (served in 2021 & 2022)",
          "Italian Parliament members elected only in 2022 (new)"
        )
      )

    mp_breakdown <- mps_new %>%
      group_by(main_list) %>%
      summarise(
        n_posts = n(),
        n_accounts = n_distinct(surface.id, na.rm = TRUE),
        .groups = "drop"
      )

    cat("✓ MPs classified:\n")
    print(mp_breakdown)
    cat("\n")
  } else {
    cat("⚠ No re-elected MPs list - using single 'MPs' category\n")

    mps_new <- mps_new %>%
      mutate(
        main_list = "MPs",
        sub_list = "MPs",
        list_description = "Italian Parliament members elected September 2022"
      )
  }

  # Provenance columns are the same for both labelling paths.
  mps_new <- mps_new %>%
    mutate(
      source_file = new_files$mps,
      date_collected = as.Date("2025-12-23")
    )

  # Report (but do not drop) posts that still have no surface.id.
  na_surface_ids <- mps_new %>%
    filter(is.na(surface.id))

  if (nrow(na_surface_ids) > 0) {
    cat("\n")
    cat("⚠ WARNING: NA VALUES FOUND IN surface.id\n")
    cat("Number of posts with NA surface.id:", nrow(na_surface_ids), "\n")
    cat("Content types affected:\n")
    print(table(na_surface_ids$content_type, useNA = "ifany"))
    cat("\n")
  }

  cat("✓ Processed MPs data:", nrow(mps_new), "posts\n")
  cat(
    "  Unique surfaces (pages):",
    n_distinct(mps_new$surface.id, na.rm = TRUE),
    "\n\n"
  )
}

# ============================================================================
# STEP 3: PROCESS PROMINENT POLITICIANS DATA
# ============================================================================

cat("STEP 3: Processing Prominent Politicians data...\n")
cat("-" %>% rep(80) %>% paste0(collapse = ""), "\n\n")

if (!is.null(new_data_list$prominent)) {
  prominent_new <- new_data_list$prominent

  # Stories fallback: a story row whose surface field is NA takes the matching
  # post_owner field, but only when that owner field is itself non-NA.
  stories_with_na <- sum(
    is.na(prominent_new$surface.id) & prominent_new$content_type == "stories",
    na.rm = TRUE
  )
  if (stories_with_na > 0) {
    cat("Applying stories fallback to", stories_with_na, "stories...\n")
    prominent_new <- prominent_new %>%
      mutate(
        surface.id = if_else(
          is.na(surface.id) & content_type == "stories" & !is.na(post_owner.id),
          post_owner.id, surface.id
        ),
        surface.name = if_else(
          is.na(surface.name) & content_type == "stories" & !is.na(post_owner.name),
          post_owner.name, surface.name
        ),
        surface.username = if_else(
          is.na(surface.username) & content_type == "stories" & !is.na(post_owner.username),
          post_owner.username, surface.username
        )
      )
  }

  # `list_ids` is optional. Its presence is tested once here because naming a
  # missing column inside mutate() raises "object not found"; is.null() on a
  # column reference cannot be used for that check.
  has_list_ids <- "list_ids" %in% names(prominent_new)

  if (has_list_ids) {
    cat("Found list_ids column. Unique values:\n")
    print(table(prominent_new$list_ids, useNA = "ifany"))
    cat("\n")
  }

  # Classify as Prominent Politicians. With `list_ids` available the sub-list
  # is taken from the year found in it ("2021" is tested before "2025"); rows
  # matching neither, rows with NA list_ids, and files without the column all
  # get the Oct/Nov 2025 label.
  if (has_list_ids) {
    prominent_new <- prominent_new %>%
      mutate(
        main_list = "Prominent_Politicians",
        sub_list = case_when(
          grepl("2021", list_ids) ~ "Prominent_Politicians_2021",
          grepl("2025", list_ids) ~ "Prominent_Politicians_2025",
          TRUE ~ "Prominent_Politicians_Oct_Nov_2025"
        )
      )
  } else {
    prominent_new <- prominent_new %>%
      mutate(
        main_list = "Prominent_Politicians",
        sub_list = "Prominent_Politicians_Oct_Nov_2025"
      )
  }

  # Description and provenance columns are identical for both paths.
  prominent_new <- prominent_new %>%
    mutate(
      list_description = "Prominent Italian politicians - October/November 2025 extension",
      source_file = new_files$prominent,
      date_collected = as.Date("2025-12-23")
    )

  cat("✓ Processed Prominent Politicians data:", nrow(prominent_new), "posts\n")
  cat("  Unique surfaces:", n_distinct(prominent_new$surface.id, na.rm = TRUE), "\n\n")

} else {
  prominent_new <- NULL
  cat("⚠ No Prominent Politicians data to process\n\n")
}

# ============================================================================
# STEP 4: PROCESS EXTREMIST DATA
# ============================================================================

cat("STEP 4: Processing Extremist data...\n")
cat(strrep("-", 80), "\n\n")

if (is.null(new_data_list$extremist)) {
  extremist_new <- NULL
  cat("⚠ No Extremist data to process\n\n")
} else {
  extremist_new <- new_data_list$extremist

  # Stories fallback: a story row whose surface field is NA takes the matching
  # post_owner field, but only when that owner field is itself non-NA.
  fill_story_surface <- function(surface, owner, content_type) {
    use_owner <- is.na(surface) & content_type == "stories" & !is.na(owner)
    if_else(use_owner, owner, surface)
  }

  stories_with_na <- sum(
    is.na(extremist_new$surface.id) & extremist_new$content_type == "stories",
    na.rm = TRUE
  )
  if (stories_with_na > 0) {
    cat("Applying stories fallback to", stories_with_na, "stories...\n")
    extremist_new <- extremist_new %>%
      mutate(
        surface.id = fill_story_surface(
          surface.id, post_owner.id, content_type
        ),
        surface.name = fill_story_surface(
          surface.name, post_owner.name, content_type
        ),
        surface.username = fill_story_surface(
          surface.username, post_owner.username, content_type
        )
      )
  }

  # Cluster flag columns are only reported here; they do not take part in the
  # classification below.
  cluster_cols <- grep("^in_Extremist", names(extremist_new), value = TRUE)
  cat("Found cluster columns:", paste(cluster_cols, collapse = ", "), "\n\n")

  has_list_names <- "list_names" %in% names(extremist_new)

  if (has_list_names) {
    cat("Found list_names column. Unique values:\n")
    print(table(extremist_new$list_names, useNA = "ifany"))
    cat("\n")
  }

  # Map a list_names value to a sub-list label. Patterns are tried top to
  # bottom and the first match wins; the cluster patterns ignore case, the
  # others are case-sensitive. Anything unmatched gets the Oct/Nov 2025 label.
  label_extremist_sub_list <- function(list_names) {
    case_when(
      grepl("cluster1", list_names, ignore.case = TRUE) ~ "Extremist_Cluster1",
      grepl("cluster2", list_names, ignore.case = TRUE) ~ "Extremist_Cluster2",
      grepl("FG", list_names) ~ "Extremist_FG",
      grepl("GM", list_names) ~ "Extremist_GM",
      grepl("MT", list_names) ~ "Extremist_MT",
      grepl("2021", list_names) ~ "Extremist_2021_EXT",
      grepl("2025", list_names) ~ "Extremist_2025_EXT",
      TRUE ~ "Extremist_Oct_Nov_2025"
    )
  }

  # With list_names the sub-list is derived per row; without it every row
  # receives the same fixed sub-list and description.
  if (has_list_names) {
    extremist_new <- extremist_new %>%
      mutate(
        main_list = "Extremists",
        sub_list = label_extremist_sub_list(list_names),
        list_description = paste0("Extremist accounts - ", sub_list),
        source_file = new_files$extremist,
        date_collected = as.Date("2025-12-23")
      )
  } else {
    extremist_new <- extremist_new %>%
      mutate(
        main_list = "Extremists",
        sub_list = "Extremist_Oct_Nov_2025",
        list_description = "Extremist accounts - October/November 2025 extension",
        source_file = new_files$extremist,
        date_collected = as.Date("2025-12-23")
      )
  }

  cat("Extremist sub_list breakdown:\n")
  extremist_breakdown <- extremist_new %>%
    group_by(sub_list) %>%
    summarise(n_posts = n(), .groups = "drop")
  print(extremist_breakdown)
  cat("\n")

  cat("✓ Processed Extremist data:", nrow(extremist_new), "posts\n")
  cat(
    "  Unique surfaces:",
    n_distinct(extremist_new$surface.id, na.rm = TRUE),
    "\n\n"
  )
}

# ============================================================================
# STEP 5: COMBINE NEW DATA
# ============================================================================

cat("STEP 5: Combining new datasets...\n")
cat(strrep("-", 80), "\n\n")

# Keep only the per-list tables that STEPS 2-4 actually produced (the others
# are NULL).
new_datasets <- Filter(
  Negate(is.null),
  list(mps_new, prominent_new, extremist_new)
)

if (length(new_datasets) == 0) {
  stop("No new data to combine!")
}

existing_cols <- names(existing_data)
cat("Existing dataset has", length(existing_cols), "columns\n")

# Classification/provenance columns that must survive the column filter even
# if the existing dataset does not have them.
required_cols <- c(
  "main_list", "sub_list", "list_description", "source_file", "date_collected"
)

# Reduce a new table to the columns it shares with the existing dataset plus
# whichever of the required columns it carries.
restrict_to_known_columns <- function(df) {
  shared_cols <- intersect(names(df), existing_cols)
  wanted_cols <- unique(c(shared_cols, required_cols))
  df %>% select(any_of(wanted_cols))
}

new_data_harmonized <- lapply(new_datasets, restrict_to_known_columns)
new_data_combined <- bind_rows(new_data_harmonized)

cat("✓ Combined new data\n")
cat("  Total new posts:", nrow(new_data_combined), "\n")
cat("  Columns:", ncol(new_data_combined), "\n\n")

# Derive the calendar variables from creation_time, but only when the new
# data arrived without a `date` column.
if (!"date" %in% names(new_data_combined)) {
  new_data_combined <- new_data_combined %>%
    mutate(
      date = as.Date(creation_time),
      year = year(creation_time),
      month = month(creation_time),
      week = floor_date(date, "week")
    )
}

cat("New data by main_list:\n")
new_breakdown <- new_data_combined %>%
  group_by(main_list) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  )
print(new_breakdown)
cat("\n")

# ============================================================================
# STEP 6: CHECK FOR DUPLICATES
# ============================================================================

cat("STEP 6: Checking for duplicates...\n")
cat("-" %>% rep(80) %>% paste0(collapse = ""), "\n\n")

# Deduplicate on the post `id` column, which must exist on both sides.
# `&&` is used because this is a scalar condition for `if`.
if ("id" %in% names(existing_data) && "id" %in% names(new_data_combined)) {

  existing_ids <- existing_data$id
  new_ids <- new_data_combined$id

  # `%in%` matches NA to NA, so a missing id in the existing data would make
  # every new post with a missing id look like a duplicate. A missing id is
  # not evidence of duplication, so such posts are kept and reported instead.
  missing_new_ids <- sum(is.na(new_ids))
  if (missing_new_ids > 0) {
    cat("⚠", missing_new_ids,
        "new post(s) have a missing 'id' and cannot be checked; they are kept\n\n")
  }

  duplicates <- sum(!is.na(new_ids) & new_ids %in% existing_ids)

  if (duplicates > 0) {
    cat("⚠ Found", duplicates, "posts already in existing dataset\n")
    cat("  These will be removed from new data to avoid duplicates\n\n")

    new_data_combined <- new_data_combined %>%
      filter(is.na(id) | !id %in% existing_ids)

    cat("✓ Removed duplicates. Remaining new posts:", nrow(new_data_combined), "\n\n")
  } else {
    cat("✓ No duplicate post IDs found\n\n")
  }

} else {
  cat("⚠ Cannot check for duplicates - 'id' column not found\n")
  cat("  Proceeding without deduplication\n\n")
}

# ============================================================================
# STEP 7: MERGE WITH EXISTING DATA
# ============================================================================

cat("STEP 7: Merging with existing data...\n")
cat("-" %>% rep(80) %>% paste0(collapse = ""), "\n\n")

# Target layout: the existing dataset's columns first, followed by any
# columns that only the new data has.
all_cols <- union(names(existing_data), names(new_data_combined))

# Add every column of `cols` that `df` lacks as an all-NA column, then put
# the columns in the order of `cols`. The padding is sized with nrow(df)
# because assigning a length-1 NA to a zero-row base data.frame is an error;
# that case arises when every new post was removed as a duplicate.
pad_missing_columns <- function(df, cols) {
  for (col in setdiff(cols, names(df))) {
    df[[col]] <- rep(NA, nrow(df))
  }
  df %>% select(all_of(cols))
}

existing_data_aligned <- pad_missing_columns(existing_data, all_cols)
new_data_aligned <- pad_missing_columns(new_data_combined, all_cols)

# Stack existing rows first, new rows after.
combined_data <- bind_rows(existing_data_aligned, new_data_aligned)

cat("✓ Merged datasets\n")
cat("  Previous posts:", nrow(existing_data), "\n")
cat("  New posts added:", nrow(new_data_aligned), "\n")
cat("  Total posts:", nrow(combined_data), "\n")
cat("  Total unique surfaces:", n_distinct(combined_data$surface.id, na.rm = TRUE), "\n\n")

# ============================================================================
# STEP 8: VALIDATE ACCOUNT MATCHING BETWEEN ORIGINAL AND NEW DATA
# ============================================================================

cat("STEP 8: Validating account matching between original and new data...\n")
cat("-" %>% rep(80) %>% paste0(collapse = ""), "\n\n")

# Per main_list: the set of non-NA surface ids in the original dataset.
original_accounts <- existing_data %>%
  group_by(main_list) %>%
  summarise(
    original_accounts = list(unique(surface.id[!is.na(surface.id)])),
    n_original = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  )

# Per main_list: the set of non-NA surface ids in the (deduplicated) new data.
new_accounts <- new_data_combined %>%
  group_by(main_list) %>%
  summarise(
    new_accounts = list(unique(surface.id[!is.na(surface.id)])),
    n_new = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  )

# Compare the two id sets list by list. After the full join a list present on
# one side only has a NULL id set on the other; `%in%` against NULL is all
# FALSE and `sum()` of an empty vector is 0, so the set arithmetic below needs
# no special cases. Plain `if` is used for the scalar per-row conditions.
account_comparison <- original_accounts %>%
  full_join(new_accounts, by = "main_list") %>%
  rowwise() %>%
  mutate(
    n_original = if (length(original_accounts) == 0) 0L else as.integer(n_original),
    n_new = if (length(new_accounts) == 0) 0L else as.integer(n_new),
    # Accounts in new data that are also in original
    n_matching = sum(new_accounts %in% original_accounts),
    # Accounts in new data that are NOT in original (unexpected)
    n_new_only = sum(!new_accounts %in% original_accounts),
    # Accounts in original that are NOT in new data
    n_original_only = sum(!original_accounts %in% new_accounts)
  ) %>%
  ungroup() %>%
  select(main_list, n_original, n_new, n_matching, n_new_only, n_original_only)

cat("Account matching validation:\n")
cat("=" %>% rep(60) %>% paste0(collapse = ""), "\n")
print(account_comparison)
cat("\n")

cat("Legend:\n")
cat("  n_original     = Unique accounts in original dataset\n")
cat("  n_new          = Unique accounts in new subset data\n")
cat("  n_matching     = Accounts in new data that exist in original\n")
cat("  n_new_only     = Accounts in new data NOT in original (⚠ unexpected)\n")
cat("  n_original_only = Accounts in original NOT in new data (may be inactive)\n\n")

# Check for unexpected new accounts
total_new_only <- sum(account_comparison$n_new_only, na.rm = TRUE)
if (total_new_only > 0) {
  cat("⚠ WARNING:", total_new_only, "account(s) in new data not found in original dataset!\n")
  cat("  These accounts may need investigation:\n\n")

  # List up to 10 unexpected accounts for each main_list that has any.
  for (i in seq_len(nrow(account_comparison))) {
    ml <- account_comparison$main_list[i]
    n_unexpected <- account_comparison$n_new_only[i]
    if (is.na(n_unexpected) || n_unexpected == 0) {
      next
    }

    # Same basis as the counts above: rows of this main_list only (`%in%` so
    # that an NA main_list matches its own group) and non-NA surface ids only.
    orig_ids <- existing_data %>%
      filter(main_list %in% ml, !is.na(surface.id)) %>%
      pull(surface.id) %>%
      unique()

    list_new_rows <- new_data_combined %>%
      filter(main_list %in% ml, !is.na(surface.id))

    unexpected_ids <- setdiff(unique(list_new_rows$surface.id), orig_ids)

    cat("  ", ml, ":\n", sep = "")

    # Names/usernames come from this list's rows, so an account that also
    # appears under another main_list is not mixed in.
    unexpected_info <- list_new_rows %>%
      filter(surface.id %in% unexpected_ids) %>%
      select(surface.id, surface.name, surface.username) %>%
      distinct() %>%
      head(10)

    print(unexpected_info)
    cat("\n")
  }
} else {
  cat("✓ All accounts in new data match accounts in original dataset\n\n")
}

# ============================================================================
# STEP 9: FINAL SUMMARY
# ============================================================================

cat("STEP 9: Generating summary...\n")
cat(strrep("-", 80), "\n\n")

# Posts, distinct non-NA surfaces and date span for each main_list.
cat("Summary by main_list:\n")
summary_main <- combined_data %>%
  group_by(main_list) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    min_date = min(date, na.rm = TRUE),
    max_date = max(date, na.rm = TRUE),
    .groups = "drop"
  )
print(summary_main)
cat("\n")

# Same counts one level down, per (main_list, sub_list) pair.
cat("Summary by sub_list:\n")
sub_list_counts <- combined_data %>%
  group_by(main_list, sub_list) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  )
summary_sub <- arrange(sub_list_counts, main_list, sub_list)
print(summary_sub)
cat("\n")

# Earliest and latest post date across the merged dataset.
overall_first_day <- min(combined_data$date, na.rm = TRUE)
overall_last_day <- max(combined_data$date, na.rm = TRUE)
cat(
  "Overall date range:",
  format(overall_first_day, "%Y-%m-%d"), "to",
  format(overall_last_day, "%Y-%m-%d"), "\n\n"
)

# ============================================================================
# STEP 10: MONTHLY BREAKDOWN
# ============================================================================

cat("STEP 10: Monthly breakdown of posts...\n")
cat(strrep("-", 80), "\n\n")

# Sub-section rule (60 "=") and the "YYYY-MM" label used in every table.
section_rule <- strrep("=", 60)
month_label <- function(x) format(x, "%Y-%m")

# First day of each post's month. The column is written back onto
# `combined_data`, so it is still present when the dataset is saved.
combined_data <- combined_data %>%
  mutate(year_month = floor_date(date, "month"))

# A) posts and distinct surfaces per month, all lists together
cat("A) Overall monthly breakdown:\n")
cat(section_rule, "\n")
monthly_overall <- combined_data %>%
  group_by(year_month) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(year_month) %>%
  mutate(year_month = month_label(year_month))

print(monthly_overall, n = 50)
cat("\n")

# B) posts per month and main_list, long format
cat("B) Monthly breakdown by main_list:\n")
cat(section_rule, "\n")
monthly_by_list <- combined_data %>%
  group_by(year_month, main_list) %>%
  summarise(n_posts = n(), .groups = "drop") %>%
  arrange(year_month, main_list) %>%
  mutate(year_month = month_label(year_month))

print(monthly_by_list, n = 100)
cat("\n")

# C) the same counts with one column per main_list (0 where a list has no
# posts in a month) and a row total over the numeric columns
cat("C) Monthly breakdown (pivot table format):\n")
cat(section_rule, "\n")
monthly_pivot <- combined_data %>%
  mutate(year_month = month_label(year_month)) %>%
  group_by(year_month, main_list) %>%
  summarise(n_posts = n(), .groups = "drop") %>%
  pivot_wider(
    names_from = main_list,
    values_from = n_posts,
    values_fill = 0
  ) %>%
  arrange(year_month) %>%
  mutate(Total = rowSums(across(where(is.numeric))))

print(monthly_pivot, n = 50)
cat("\n")

# D) months from 2025-10 onward. NOTE(review): the filter has no upper bound,
# so any month after November 2025 present in the data is listed here too.
cat("D) Focus on newly added months (Oct-Nov 2025):\n")
cat(section_rule, "\n")
new_months_data <- combined_data %>%
  filter(year_month >= as.Date("2025-10-01")) %>%
  mutate(year_month = month_label(year_month)) %>%
  group_by(year_month, main_list) %>%
  summarise(
    n_posts = n(),
    n_surfaces = n_distinct(surface.id, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(year_month, main_list)

if (nrow(new_months_data) == 0) {
  cat("No posts found for Oct-Nov 2025\n")
} else {
  print(new_months_data)
}
cat("\n")

# ============================================================================
# STEP 11: SAVE UPDATED DATASET
# ============================================================================

cat("STEP 11: Saving updated dataset...\n")
cat(strrep("-", 80), "\n\n")

# Output goes next to the existing combined datasets; create the folder if it
# is missing.
output_dir <- "combined_datasets"
if (!dir.exists(output_dir)) {
  dir.create(output_dir)
}

# The file name carries the current local time (YYYYMMDD_HHMMSS), so each run
# writes a new file rather than overwriting the one it loaded.
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_file <- file.path(
  output_dir,
  paste0("italian_political_accounts_", timestamp, ".rds")
)

saveRDS(combined_data, output_file)

cat("✓ Saved updated dataset\n")
cat("  File:", output_file, "\n")
cat("  Total posts:", nrow(combined_data), "\n")
cat(
  "  Total unique surfaces:",
  n_distinct(combined_data$surface.id, na.rm = TRUE),
  "\n\n"
)

# ============================================================================
# COMPLETION
# ============================================================================

# Closing banner: row counts before and after the append, then the output path.
closing_rule <- strrep("=", 80)

cat(closing_rule, "\n")
cat("DATA APPEND COMPLETE\n")
cat(closing_rule, "\n\n")

n_previous <- nrow(existing_data)
n_added <- nrow(new_data_aligned)
n_total <- nrow(combined_data)

cat("Summary of changes:\n")
cat("  Previous dataset:", n_previous, "posts\n")
cat("  New posts added:", n_added, "posts\n")
cat("  Updated dataset:", n_total, "posts\n\n")

cat("Output file:", output_file, "\n\n")

cat(closing_rule, "\n")
cat("✓ ALL DONE!\n")
cat(closing_rule, "\n\n")

# [NOTICE] 2 output(s) filtered out