In [3]:
# xml_parse.ipynb (No TQDM)
# Purpose: Parse the official SemEval train/test XML data files (single large XML format), 
# extract raw text and publisher info, and save to separate CSV files for BERT input.

import os
import pandas as pd
import xml.etree.ElementTree as ET
import re
# Removed: from tqdm import tqdm 

# --- Configuration ---
# NOTE: data_dir must point at the folder where semeval_download.ipynb
# extracted the official files; adjust it if your layout differs.
data_dir = "hyperpartisan_data_official"

# Training split: ground-truth annotations and the article bodies.
train_gt_path = os.path.join(data_dir, "ground-truth-training-byarticle-20181122.xml")
train_articles_xml_path = os.path.join(data_dir, "articles-training-byarticle-20181122.xml")

# Test split: ground-truth annotations and the article bodies.
test_gt_path = os.path.join(data_dir, "ground-truth-test-byarticle-20181207.xml")
test_articles_xml_path = os.path.join(data_dir, "articles-test-byarticle-20181207.xml")

# CSV outputs are written next to the source XML files.
output_test_csv = os.path.join(data_dir, "official_test_data.csv")
output_train_csv = os.path.join(data_dir, "official_train_data.csv")

# --- Parsing Functions ---
def parse_ground_truth(xml_path):
    """Read a ground-truth XML file into per-article lookup tables.

    Every ``<article>`` element found anywhere under the root is inspected.
    Elements lacking an ``id`` or a ``hyperpartisan`` attribute are reported
    on stdout and left out of both tables.

    Args:
        xml_path: Path to the ground-truth XML file.

    Returns:
        A ``(labels, publishers)`` tuple of dicts keyed by article id.
        ``labels`` maps to a bool that is True when the ``hyperpartisan``
        attribute equals "true" (case-insensitive). ``publishers`` maps to
        the ``portal`` attribute, or "unknown" when that attribute is absent.

    Raises:
        FileNotFoundError: If ``xml_path`` does not exist.
    """
    print(f"Parsing ground truth: {xml_path}")
    if not os.path.exists(xml_path):
        raise FileNotFoundError(f"Ground truth file not found: {xml_path}")

    entries = ET.parse(xml_path).getroot().findall('.//article')
    # Every <article> tag counts as an entry, including the ones skipped below.
    articles_in_gt = len(entries)

    labels = {}
    publishers = {}
    for entry in entries:
        article_id = entry.get('id')
        if article_id is None:
            print(f"Warning: Found article tag without 'id' in {xml_path}. Skipping.")
            continue

        flag = entry.get('hyperpartisan')
        if flag is None:
            print(f"Warning: Article {article_id} missing 'hyperpartisan' attribute in {xml_path}. Skipping.")
            continue

        labels[article_id] = flag.lower() == 'true'
        # The publisher is taken from the ground-truth 'portal' attribute.
        publishers[article_id] = entry.get('portal', "unknown")

    print(f"Parsed {articles_in_gt} entries from ground truth. Found {len(labels)} valid article IDs with labels.")
    return labels, publishers

def _extract_article_text(article_element):
    """Return the readable text of one ``<article>`` element."""
    paragraphs = article_element.findall('.//p')
    if paragraphs:
        # itertext() also yields the text inside and after inline children
        # (e.g. <a> links); p.text alone stops at the first child element and
        # silently drops the rest of the paragraph.
        pieces = ("".join(p.itertext()).strip() for p in paragraphs)
        return "\n".join(piece for piece in pieces if piece)

    # No <p> tags: fall back to every text fragment in the article. Fragments
    # are joined with a space so adjacent words are not glued together.
    fragments = (node.strip() for node in article_element.itertext())
    return " ".join(fragment for fragment in fragments if fragment)


def parse_single_articles_xml(xml_path, ground_truth_labels, ground_truth_publishers):
    """Parse one large XML file holding many ``<article>`` elements.

    Articles without an ``id`` attribute, or whose id is not a key of
    ``ground_truth_labels``, are reported on stdout and skipped.

    Args:
        xml_path: Path to the articles XML file.
        ground_truth_labels: Dict mapping article id to its boolean label.
        ground_truth_publishers: Dict mapping article id to its publisher;
            ids missing from it get the publisher "unknown".

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``text``,
        ``publisher`` and ``hyperpartisan``, one row per kept article. The
        frame has these columns but no rows when no article is kept. A
        column-less empty DataFrame is returned when the file cannot be
        parsed or another error occurs while reading it.

    Raises:
        FileNotFoundError: If ``xml_path`` does not exist.
    """
    print(f"Parsing articles XML file: {xml_path}")
    if not os.path.exists(xml_path):
        raise FileNotFoundError(f"Articles XML file not found: {xml_path}")

    columns = ['id', 'title', 'text', 'publisher', 'hyperpartisan']
    articles_data = []
    skipped_count = 0

    try:
        root = ET.parse(xml_path).getroot()
        all_articles = root.findall('.//article')
        print(f"Found {len(all_articles)} article tags in {os.path.basename(xml_path)}. Parsing...")

        for position, article_element in enumerate(all_articles, start=1):
            article_id = article_element.get('id')
            if not article_id:
                print(f"Warning: Found article tag without 'id' in {xml_path} (entry ~{position}). Skipping.")
                skipped_count += 1
                continue

            if article_id not in ground_truth_labels:
                print(f"Warning: Article ID '{article_id}' not found in ground truth. Skipping.")
                skipped_count += 1
                continue

            text = _extract_article_text(article_element)
            if not text:
                print(f"Warning: No text found in article {article_id} in {xml_path}.")

            articles_data.append({
                'id': article_id,
                # A missing or empty title attribute becomes a placeholder.
                'title': article_element.get('title') or 'No Title',
                'text': text,
                'publisher': ground_truth_publishers.get(article_id, "unknown"),
                'hyperpartisan': ground_truth_labels[article_id],
            })

    except ET.ParseError as e:
        print(f"FATAL ERROR: Could not parse XML file {xml_path}: {e}")
        return pd.DataFrame()  # Callers detect failure through DataFrame.empty.
    except Exception as e:
        # Deliberate best-effort boundary: report the problem and hand back an
        # empty frame instead of aborting the whole run.
        print(f"FATAL ERROR: Unexpected error parsing {xml_path}: {e}")
        return pd.DataFrame()

    # Explicit columns keep df['text'] valid even when every article was
    # skipped; without them an empty list yields a column-less frame and the
    # lookup below raises KeyError.
    df = pd.DataFrame(articles_data, columns=columns)
    empty_text_count = (df['text'] == "").sum()
    if empty_text_count > 0:
        print(f"Warning: Found {empty_text_count} articles with empty text after parsing.")

    print(f"Successfully parsed {len(articles_data)} articles. Skipped {skipped_count} entries.")
    return df
# --- End Parsing Functions ---


# --- Main Execution ---
# Each split is handled the same way: parse the ground truth, parse the
# articles, print a short summary and export the result to CSV. A missing
# input file is reported and leaves that split as an empty DataFrame.

# --- Training split ---
print("\n===== PROCESSING OFFICIAL TRAINING DATA =====")
try:
    train_labels, train_publishers = parse_ground_truth(train_gt_path)
    train_df = parse_single_articles_xml(
        train_articles_xml_path, train_labels, train_publishers
    )
    if train_df.empty:
        print("Failed to create training DataFrame.")
    else:
        print("\nTraining Data Info:")
        train_df.info(memory_usage='deep')  # 'deep' also measures the string contents
        print("\nTraining Label Distribution:")
        train_label_share = train_df['hyperpartisan'].value_counts(normalize=True)
        print(train_label_share)
        print("\nTraining Publisher Distribution (Top 10):")
        train_top_publishers = train_df['publisher'].value_counts().head(10)
        print(train_top_publishers)
        # Export the parsed training split.
        train_df.to_csv(output_train_csv, index=False)
        print(f"\nSaved official training data to {output_train_csv}")
except FileNotFoundError as e:
    print(f"Error processing training data: {e}")
    train_df = pd.DataFrame()

# --- Test split ---
print("\n===== PROCESSING OFFICIAL TEST DATA =====")
try:
    test_labels, test_publishers = parse_ground_truth(test_gt_path)
    test_df = parse_single_articles_xml(
        test_articles_xml_path, test_labels, test_publishers
    )
    if test_df.empty:
        print("Failed to create test DataFrame.")
    else:
        print("\nTest Data Info:")
        test_df.info(memory_usage='deep')  # 'deep' also measures the string contents
        print("\nTest Label Distribution:")
        test_label_share = test_df['hyperpartisan'].value_counts(normalize=True)
        print(test_label_share)
        print("\nTest Publisher Distribution (Top 10):")
        test_top_publishers = test_df['publisher'].value_counts().head(10)
        print(test_top_publishers)
        # Export the parsed test split.
        test_df.to_csv(output_test_csv, index=False)
        print(f"\nSaved official test data to {output_test_csv}")
except FileNotFoundError as e:
    print(f"Error processing test data: {e}")
    test_df = pd.DataFrame()

print("\nXML parsing and CSV creation complete.")


===== PROCESSING OFFICIAL TRAINING DATA =====
Parsing ground truth: hyperpartisan_data_official/ground-truth-training-byarticle-20181122.xml
Parsed 645 entries from ground truth. Found 645 valid article IDs with labels.
Parsing articles XML file: hyperpartisan_data_official/articles-training-byarticle-20181122.xml
Found 645 article tags in articles-training-byarticle-20181122.xml. Parsing...
Successfully parsed 645 articles. Skipped 0 entries.

Training Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 645 entries, 0 to 644
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             645 non-null    object
 1   title          645 non-null    object
 2   text           645 non-null    object
 3   publisher      645 non-null    object
 4   hyperpartisan  645 non-null    bool  
dtypes: bool(1), object(4)
memory usage: 3.2 MB

Training Label Distribution:
hyperpartisan
False    0.631008
True     0.3689