In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Setup
data_dir = "hyperpartisan_data"
xml_file = os.path.join(data_dir, "articles-training-byarticle-20181122.xml")

def parse_xml_to_dataframe(xml_file):
    """Parse the XML file into a pandas DataFrame"""
    print(f"Parsing XML file: {xml_file}")
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    articles = []
    for article in root.findall('.//article'):
        # Extract article attributes
        article_id = article.get('id')
        title = article.get('title')
        published_at = article.get('published-at')
        hyperpartisan = article.get('hyperpartisan')
        
        # Extract article text
        text = ""
        for p in article.findall('.//p'):
            if p.text:
                text += p.text + "\n"
        
        articles.append({
            'id': article_id,
            'title': title,
            'published_at': published_at,
            'hyperpartisan': hyperpartisan == 'true',
            'text': text
        })
    
    print(f"Found {len(articles)} articles in the XML file")
    return pd.DataFrame(articles)

# Check if the XML file exists
if os.path.exists(xml_file):
    # Parse XML to DataFrame
    articles_df = parse_xml_to_dataframe(xml_file)
    
    # Save as CSV for easier handling
    csv_path = os.path.join(data_dir, "articles_byarticle.csv")
    articles_df.to_csv(csv_path, index=False)
    
    print(f"Saved {len(articles_df)} articles to {csv_path}")
    print(f"Label distribution: {articles_df['hyperpartisan'].value_counts()}")
    
    # Now you can proceed with the preprocessing step
    print("\nYou can now run the preprocessing code.")
else:
    print(f"XML file not found at {xml_file}")
    print("Please check the file path and try again.")

Parsing XML file: hyperpartisan_data/articles-training-byarticle-20181122.xml
Found 645 articles in the XML file
Saved 645 articles to hyperpartisan_data/articles_byarticle.csv
Label distribution: hyperpartisan
False    645
Name: count, dtype: int64

You can now run the preprocessing code.


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
