In [2]:
import os
import requests
import tarfile
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from io import BytesIO

# Folder that holds the downloaded archive and everything extracted from it;
# created up front (no error if it is already there).
data_dir = "hyperpartisan_data"
os.makedirs(name=data_dir, exist_ok=True)

def download_semeval_dataset(timeout=60):
    """
    Download the SemEval-2019 Task 4 hyperpartisan news detection datasets

    Fetches the by-article training archive, stores it inside ``data_dir``
    and extracts it there. A non-200 HTTP response is reported with ``print``
    and the function carries on; the by-publisher dataset is deliberately
    not downloaded.

    Args:
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument. Without a timeout a stalled connection would block
            the notebook cell indefinitely.

    Returns:
        None. Progress and failures are reported on stdout.

    Raises:
        requests.RequestException: On connection problems or when the
            timeout expires (not caught here).
        zipfile.BadZipFile: If the downloaded file is not a valid ZIP
            archive (not caught here).
    """
    import zipfile

    print("Downloading SemEval datasets...")

    # Make the function usable on its own, even if the directory was removed
    # after the module-level setup ran.
    os.makedirs(data_dir, exist_ok=True)

    # URLs for the datasets
    by_article_url = "https://zenodo.org/records/1489920/files/articles-training-byarticle-20181122.zip"

    # Download by-article dataset (smaller, manually labeled)
    response = requests.get(by_article_url, timeout=timeout)
    if response.status_code == 200:
        by_article_path = os.path.join(data_dir, "articles-training-byarticle.zip")
        with open(by_article_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded by-article dataset to {by_article_path}")

        # Extract the ZIP file next to the archive
        with zipfile.ZipFile(by_article_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print("Extracted by-article dataset")
    else:
        print(f"Failed to download by-article dataset: Status code {response.status_code}")

    # Note: The by-publisher dataset is very large (754k articles)
    # For initial development, we'll use only the by-article dataset
    print("Skipping by-publisher dataset due to its large size (754k articles)")
    






Downloading SemEval datasets...
Downloaded by-article dataset to hyperpartisan_data/articles-training-byarticle.zip
Extracted by-article dataset
Skipping by-publisher dataset due to its large size (754k articles)
XML file not found at hyperpartisan_data/articles-training-byarticle/articles.xml


In [None]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# Define the data directory
data_dir = "hyperpartisan_data"

# Candidate locations of the extracted articles XML, checked in order
possible_xml_paths = [
    os.path.join(data_dir, "articles-training-byarticle", "articles.xml"),
    os.path.join(data_dir, "articles-training-byarticle-20181122.xml"),
    os.path.join(data_dir, "articles.xml")
]


def parse_xml_to_dataframe(xml_file):
    """Parse the XML file into a pandas DataFrame.

    Every ``<article>`` element below the document root becomes one row.

    Args:
        xml_file: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A DataFrame with the columns ``id``, ``title``, ``published_at``,
        ``hyperpartisan`` (bool) and ``text``. The columns are present even
        when the file contains no ``<article>`` elements.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    columns = ['id', 'title', 'published_at', 'hyperpartisan', 'text']
    root = ET.parse(xml_file).getroot()

    articles = []
    for article in root.findall('.//article'):
        # Extract article text. itertext() is used instead of p.text because
        # p.text stops at the first child element, which would drop
        # everything after (and inside) inline markup such as <a> links.
        paragraphs = article.findall('.//p')
        text = '\n'.join(''.join(p.itertext()) for p in paragraphs)

        articles.append({
            'id': article.get('id'),
            'title': article.get('title'),
            'published_at': article.get('published-at'),
            # Convert to boolean.
            # NOTE(review): a missing `hyperpartisan` attribute also ends up
            # as False here -- confirm the articles file really carries the
            # labels (they may be shipped in a separate ground-truth file).
            'hyperpartisan': article.get('hyperpartisan') == 'true',
            'text': text
        })

    # Explicit columns keep the schema stable for an empty article list.
    return pd.DataFrame(articles, columns=columns)


# Check for the actual XML file: the first existing candidate wins
xml_file = None
for path in possible_xml_paths:
    if os.path.exists(path):
        xml_file = path
        print(f"Found XML file at: {path}")
        break

if xml_file:
    print(f"Parsing XML file: {xml_file}")

    # Parse the XML and save as CSV
    articles_df = parse_xml_to_dataframe(xml_file)
    csv_path = os.path.join(data_dir, "articles_byarticle.csv")
    articles_df.to_csv(csv_path, index=False)

    print(f"Parsed {len(articles_df)} articles and saved to {csv_path}")
    print(f"Label distribution: {articles_df['hyperpartisan'].value_counts()}")

    # Show a sample (only possible when at least one article was parsed)
    if not articles_df.empty:
        print("\nSample article:")
        print(articles_df.iloc[0][['title', 'hyperpartisan']])
        print(articles_df.iloc[0]['text'][:200] + "...")
else:
    print("No XML file found. Please check the extracted contents.")
    # os.listdir would raise if the download cell has not created the folder
    if os.path.isdir(data_dir):
        print("Current files in the data directory:")
        for file in os.listdir(data_dir):
            print(f"- {file}")
    else:
        print(f"Data directory '{data_dir}' does not exist yet.")