In [2]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Directory that holds the hyperpartisan XML files and receives the CSV
data_dir = "hyperpartisan_data"

# Use the correct XML file paths that match your directory
article_xml_path = os.path.join(data_dir, "articles-training-byarticle-20181122.xml")
ground_truth_xml_path = os.path.join(data_dir, "ground-truth-training-byarticle-20181122.xml")

# Fixed column order; also guarantees the columns exist when no article is parsed
ARTICLE_COLUMNS = ['id', 'title', 'published_at', 'text']


def extract_article_text(article):
    """Return the text of every <p> under *article*, one paragraph per line.

    ``Element.text`` only holds the characters that precede the first child
    element, so a paragraph containing inline markup (such as ``<a>`` links)
    would be cut off at the first tag. ``itertext()`` walks the whole
    paragraph subtree instead. Paragraphs without any text are skipped.

    NOTE: ``.//p`` matches descendants at any depth, so a <p> nested inside
    another <p> would contribute its text twice.
    """
    paragraphs = ("".join(p.itertext()) for p in article.findall('.//p'))
    return "".join(f"{paragraph}\n" for paragraph in paragraphs if paragraph)


def parse_articles(root):
    """Return one dict per <article> under *root*.

    Each dict has the keys 'id', 'title', 'published_at' (read from the
    'published-at' attribute) and 'text'. Missing attributes become None.
    """
    return [
        {
            'id': article.get('id'),
            'title': article.get('title'),
            'published_at': article.get('published-at'),
            'text': extract_article_text(article),
        }
        for article in root.findall('.//article')
    ]


def parse_labels(root):
    """Map each article id under *root* to a bool label.

    The label is True only when the 'hyperpartisan' attribute is exactly
    the string 'true'; any other value (or a missing attribute) is False.
    """
    return {
        article.get('id'): article.get('hyperpartisan') == 'true'
        for article in root.findall('.//article')
    }


print(f"Checking article XML file: {article_xml_path}")
print(f"Checking ground truth XML file: {ground_truth_xml_path}")

# Check each file once and reuse the result in both branches
articles_available = os.path.exists(article_xml_path)
ground_truth_available = os.path.exists(ground_truth_xml_path)

if articles_available and ground_truth_available:
    print(f"Parsing articles XML file: {article_xml_path}")
    articles = parse_articles(ET.parse(article_xml_path).getroot())

    # Explicit columns keep the 'id' lookup below valid even with zero articles
    articles_df = pd.DataFrame(articles, columns=ARTICLE_COLUMNS)
    print(f"Parsed {len(articles_df)} articles")

    # Parse the ground truth XML file
    print(f"Parsing ground truth XML file: {ground_truth_xml_path}")
    hyperpartisan_dict = parse_labels(ET.parse(ground_truth_xml_path).getroot())

    # Add hyperpartisan labels to articles DataFrame
    articles_df['hyperpartisan'] = articles_df['id'].map(hyperpartisan_dict)

    print("Added hyperpartisan labels to articles")
    print(f"Label distribution: {articles_df['hyperpartisan'].value_counts()}")

    # Series.map leaves NaN for ids absent from the ground truth; surface
    # that instead of silently writing unlabeled rows to the CSV.
    unlabeled = int(articles_df['hyperpartisan'].isna().sum())
    if unlabeled:
        print(f"Warning: {unlabeled} articles have no ground truth label")

    # Save the combined dataset to CSV
    csv_path = os.path.join(data_dir, "articles_byarticle.csv")
    articles_df.to_csv(csv_path, index=False)
    print(f"Saved {len(articles_df)} articles to {csv_path}")
else:
    if not articles_available:
        print(f"Articles XML file not found at {article_xml_path}")
    if not ground_truth_available:
        print(f"Ground truth XML file not found at {ground_truth_xml_path}")

Checking article XML file: hyperpartisan_data/articles-training-byarticle-20181122.xml
Checking ground truth XML file: hyperpartisan_data/ground-truth-training-byarticle-20181122.xml
Parsing articles XML file: hyperpartisan_data/articles-training-byarticle-20181122.xml
Parsed 645 articles
Parsing ground truth XML file: hyperpartisan_data/ground-truth-training-byarticle-20181122.xml
Added hyperpartisan labels to articles
Label distribution: hyperpartisan
False    407
True     238
Name: count, dtype: int64
Saved 645 articles to hyperpartisan_data/articles_byarticle.csv
