In [None]:
# xml_parse.ipynb
# Purpose: Parse the official SemEval train/test XML data, extract raw text
# and publisher info, and save to separate CSV files for BERT input.

import os
import pandas as pd
import xml.etree.ElementTree as ET
import re
from tqdm import tqdm # Optional: for progress bar

# --- Configuration ---
# *** IMPORTANT: Adjust these paths to match where semeval_download.ipynb extracted the files ***
# Everything below is resolved relative to this root directory.
data_dir = "hyperpartisan_data_official"

# Training split: articles location and its ground-truth XML.
train_articles_dir, train_gt_path = (
    os.path.join(data_dir, entry)
    for entry in (
        "articles-training-byarticle-20181122",
        "ground-truth-training-byarticle-20181122.xml",
    )
)

# Test split: articles location and its ground-truth XML.
test_articles_dir, test_gt_path = (
    os.path.join(data_dir, entry)
    for entry in (
        "articles-test-byarticle-20181207",
        "ground-truth-test-byarticle-20181207.xml",
    )
)

# CSV files written by this notebook, one per split.
output_train_csv, output_test_csv = (
    os.path.join(data_dir, entry)
    for entry in ("official_train_data.csv", "official_test_data.csv")
)

# --- Parsing Functions ---
def parse_ground_truth(xml_path):
    """Read a ground-truth XML file into two lookup tables keyed by article ID.

    Returns a ``(labels, publishers)`` pair of dicts. ``labels`` maps each ID
    to True when the 'hyperpartisan' attribute equals "true" (case-insensitive)
    and False otherwise. ``publishers`` maps each ID to the 'portal' attribute,
    or to the placeholder "extract_from_filename" when that attribute is
    missing or empty. Entries lacking an 'id' or 'hyperpartisan' attribute are
    reported and left out of both dicts.

    Raises FileNotFoundError when ``xml_path`` does not exist.
    """
    print(f"Parsing ground truth: {xml_path}")
    if not os.path.exists(xml_path):
        raise FileNotFoundError(f"Ground truth file not found: {xml_path}")

    label_by_id = {}
    publisher_by_id = {}

    # findall() returns a list, so the total entry count is simply its length.
    entries = ET.parse(xml_path).getroot().findall('.//article')
    for entry in entries:
        attrs = entry.attrib

        article_id = attrs.get('id')
        if article_id is None:
            print(f"Warning: Found article tag without 'id' in {xml_path}. Skipping.")
            continue

        raw_label = attrs.get('hyperpartisan')
        if raw_label is None:
            print(f"Warning: Article {article_id} missing 'hyperpartisan' attribute in {xml_path}. Skipping.")
            continue

        label_by_id[article_id] = raw_label.lower() == 'true'
        # An absent or empty portal is marked so the publisher can be
        # recovered from the article's file name later on.
        publisher_by_id[article_id] = attrs.get('portal') or "extract_from_filename"

    print(f"Parsed {len(entries)} entries from ground truth. Found {len(label_by_id)} valid article IDs with labels.")
    return label_by_id, publisher_by_id

def _extract_article_text(root):
    """Return the body text found under the article element *root*.

    Every <p> element contributes one line made of all text inside it,
    including text within nested inline elements and the text that follows
    them; paragraphs that are empty after stripping are left out. When there
    are no <p> elements at all, every non-blank text node under *root* is
    joined with single spaces instead (this may include non-body text).

    Nested <p> elements are not expected; their text would appear twice.
    """
    paragraphs = root.findall('.//p')
    if paragraphs:
        # Element.text stops at the first child element, so a paragraph such
        # as "<p>foo <a>bar</a> baz</p>" would be cut down to "foo ".
        # itertext() walks the whole subtree, tails included.
        lines = ("".join(p.itertext()).strip() for p in paragraphs)
        return "\n".join(line for line in lines if line)

    # Join with a space so words from adjacent nodes do not run together.
    return " ".join(chunk.strip() for chunk in root.itertext() if chunk.strip())


def parse_articles_directory(dir_path, ground_truth_labels, ground_truth_publishers):
    """Parse every article XML file in a directory into a labelled DataFrame.

    Args:
        dir_path: Directory holding the ``*.xml`` article files. The root
            element of each file is assumed to be the <article> element.
        ground_truth_labels: Dict mapping article ID to its label. Files whose
            ID is not a key of this dict are skipped.
        ground_truth_publishers: Dict mapping article ID to a publisher name,
            or to "extract_from_filename" when the name has to be recovered
            from a file name of the form ``article<ID>_<PUBLISHER>.xml``.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``text``,
        ``publisher`` and ``hyperpartisan``, one row per parsed article in
        sorted file-name order. The columns exist even when nothing could be
        parsed, so callers can safely test ``df.empty``.

    Raises:
        FileNotFoundError: If ``dir_path`` is not an existing directory.
    """
    print(f"Parsing articles directory: {dir_path}")
    if not os.path.isdir(dir_path):
        raise FileNotFoundError(f"Articles directory not found: {dir_path}")

    try:
        from tqdm import tqdm as progress
    except ImportError:
        # tqdm only draws a progress bar; iterate plainly when it is absent.
        def progress(iterable, **_ignored):
            return iterable

    columns = ['id', 'title', 'text', 'publisher', 'hyperpartisan']
    id_pattern = re.compile(r'article(\d+)')
    publisher_pattern = re.compile(r'article\d+_([a-zA-Z0-9-]+)\.xml')

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the resulting CSV reproducible between runs and machines.
    filenames = sorted(f for f in os.listdir(dir_path) if f.endswith('.xml'))
    print(f"Found {len(filenames)} article XML files.")

    articles_data = []
    skipped_count = 0

    for filename in progress(filenames, desc=f"Parsing {os.path.basename(dir_path)}"):
        file_path = os.path.join(dir_path, filename)
        try:
            root = ET.parse(file_path).getroot()  # Assumes the root element is <article>

            article_id = root.get('id')
            if not article_id:
                print(f"Warning: ID attribute missing in file {filename}. Trying to extract from filename...")
                # Attempt to extract ID from filename (e.g., article12345.xml)
                id_match = id_pattern.match(filename)
                if id_match is None:
                    print(f"  Could not extract ID from filename {filename}. Skipping.")
                    skipped_count += 1
                    continue
                article_id = id_match.group(1)
                print(f"  Extracted ID '{article_id}' from filename.")

            if article_id not in ground_truth_labels:
                print(f"Warning: Article ID '{article_id}' from file {filename} not found in ground truth. Skipping.")
                skipped_count += 1
                continue

            text = _extract_article_text(root)
            if not text:
                print(f"Warning: No text found in article {article_id} ({filename}).")

            # --- Determine Publisher ---
            publisher = ground_truth_publishers.get(article_id, "unknown")
            if publisher in ("extract_from_filename", "unknown"):
                # Try extracting from filename (pattern: article<ID>_<PUBLISHER>.xml)
                publisher_match = publisher_pattern.match(filename)
                publisher = publisher_match.group(1) if publisher_match else "unknown"

            articles_data.append({
                'id': article_id,
                'title': root.get('title') or 'No Title',  # Missing or empty title
                'text': text,
                'publisher': publisher,
                'hyperpartisan': ground_truth_labels[article_id],
            })

        except ET.ParseError:
            print(f"Warning: Skipping file {filename} due to XML parse error.")
            skipped_count += 1
        except Exception as e:
            # Per-file boundary: one bad file must not abort the whole run.
            print(f"Warning: Skipping file {filename} due to unexpected error: {e}")
            skipped_count += 1

    # Passing the columns explicitly keeps df['text'] valid when no article
    # was parsed; a DataFrame built from an empty list has no columns at all.
    df = pd.DataFrame(articles_data, columns=columns)

    empty_text_count = (df['text'] == "").sum()
    if empty_text_count > 0:
        print(f"Warning: Found {empty_text_count} articles with empty text after parsing.")

    print(f"Successfully parsed {len(articles_data)} articles. Skipped {skipped_count} files.")
    return df

# --- Main Execution ---

def _report_and_save(df, split, output_csv):
    """Print summary statistics for one data split and write it to CSV.

    Args:
        df: DataFrame for the split; it is expected to have the
            'hyperpartisan' and 'publisher' columns when it is not empty.
        split: Lowercase split name used in the messages
            ("training" or "test").
        output_csv: Path of the CSV file to write.

    Nothing is written when ``df`` is empty; a failure message is printed
    instead.
    """
    if df.empty:
        print(f"Failed to create {split} DataFrame.")
        return

    heading = split.capitalize()
    print(f"\n{heading} Data Info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print(f"\n{heading} Label Distribution:")
    print(df['hyperpartisan'].value_counts(normalize=True))
    print(f"\n{heading} Publisher Distribution (Top 10):")
    print(df['publisher'].value_counts().head(10))

    df.to_csv(output_csv, index=False)
    print(f"\nSaved official {split} data to {output_csv}")


# --- Process Training Data ---
print("\n===== PROCESSING OFFICIAL TRAINING DATA =====")
try:
    train_labels, train_publishers = parse_ground_truth(train_gt_path)
    train_df = parse_articles_directory(train_articles_dir, train_labels, train_publishers)
except (FileNotFoundError, ET.ParseError) as e:
    # A missing or malformed input is reported rather than raised so that the
    # other split can still be processed.
    print(f"Error processing training data: {e}")
    train_df = pd.DataFrame()
else:
    _report_and_save(train_df, "training", output_train_csv)

# --- Process Test Data ---
print("\n===== PROCESSING OFFICIAL TEST DATA =====")
try:
    test_labels, test_publishers = parse_ground_truth(test_gt_path)
    test_df = parse_articles_directory(test_articles_dir, test_labels, test_publishers)
except (FileNotFoundError, ET.ParseError) as e:
    print(f"Error processing test data: {e}")
    test_df = pd.DataFrame()
else:
    _report_and_save(test_df, "test", output_test_csv)

print("\nXML parsing and CSV creation complete.")