In [5]:
import os
import requests
import zipfile
import xml.etree.ElementTree as ET
import pandas as pd

# Create data directory
data_dir = "hyperpartisan_data"
os.makedirs(data_dir, exist_ok=True)

def download_file(url, save_path):
    """Download a file from a URL and save it to the specified path"""
    print(f"Downloading {url}...")
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded to {save_path}")
        return True
    else:
        print(f"Failed to download: {response.status_code}")
        return False

# Define files to download
files_to_download = [
    # Articles by article (training)
    ("https://zenodo.org/records/5776081/files/articles-training-byarticle-20181122.zip",
     os.path.join(data_dir, "articles-training-byarticle.zip")),
    
    # Ground truth for articles by article (training)
    ("https://zenodo.org/records/5776081/files/ground-truth-training-byarticle-20181122.zip",
     os.path.join(data_dir, "ground-truth-training-byarticle.zip"))
]

# Download the files
for url, save_path in files_to_download:
    download_file(url, save_path)

# Extract the ZIP files
for _, save_path in files_to_download:
    if os.path.exists(save_path):
        with zipfile.ZipFile(save_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        print(f"Extracted {save_path}")

# Parse the article XML file
article_xml_path = os.path.join(data_dir, "articles.xml")
if os.path.exists(article_xml_path):
    print(f"Parsing articles XML file: {article_xml_path}")
    
    tree = ET.parse(article_xml_path)
    root = tree.getroot()
    
    articles = []
    for article in root.findall('.//article'):
        # Extract article attributes
        article_id = article.get('id')
        title = article.get('title')
        published_at = article.get('published-at')
        
        # Text will be extracted later
        text = ""
        for p in article.findall('.//p'):
            if p.text:
                text += p.text + "\n"
        
        articles.append({
            'id': article_id,
            'title': title,
            'published_at': published_at,
            'text': text
        })
    
    # Create DataFrame from articles
    articles_df = pd.DataFrame(articles)
    print(f"Parsed {len(articles_df)} articles")
    
    # Parse the ground truth XML file
    ground_truth_xml_path = os.path.join(data_dir, "ground-truth.xml")
    if os.path.exists(ground_truth_xml_path):
        print(f"Parsing ground truth XML file: {ground_truth_xml_path}")
        
        gt_tree = ET.parse(ground_truth_xml_path)
        gt_root = gt_tree.getroot()
        
        hyperpartisan_dict = {}
        for article in gt_root.findall('.//article'):
            article_id = article.get('id')
            hyperpartisan = article.get('hyperpartisan')
            hyperpartisan_dict[article_id] = hyperpartisan == 'true'
        
        # Add hyperpartisan labels to articles DataFrame
        articles_df['hyperpartisan'] = articles_df['id'].map(hyperpartisan_dict)
        
        print(f"Added hyperpartisan labels to articles")
        print(f"Label distribution: {articles_df['hyperpartisan'].value_counts()}")
    else:
        print(f"Ground truth XML file not found at {ground_truth_xml_path}")
    
    # Save the combined dataset to CSV
    csv_path = os.path.join(data_dir, "articles_byarticle.csv")
    articles_df.to_csv(csv_path, index=False)
    print(f"Saved {len(articles_df)} articles to {csv_path}")
else:
    print(f"Articles XML file not found at {article_xml_path}")
    # Try to find XML files in the directory
    xml_files = [f for f in os.listdir(data_dir) if f.endswith('.xml')]
    if xml_files:
        print(f"Found XML files in {data_dir}: {xml_files}")
        print("Please adjust the path in the code to the correct XML file")
    else:
        print(f"No XML files found in {data_dir}")

Downloading https://zenodo.org/records/5776081/files/articles-training-byarticle-20181122.zip...
Downloaded to hyperpartisan_data/articles-training-byarticle.zip
Downloading https://zenodo.org/records/5776081/files/ground-truth-training-byarticle-20181122.zip...
Downloaded to hyperpartisan_data/ground-truth-training-byarticle.zip
Extracted hyperpartisan_data/articles-training-byarticle.zip
Extracted hyperpartisan_data/ground-truth-training-byarticle.zip
Articles XML file not found at hyperpartisan_data/articles.xml
Found XML files in hyperpartisan_data: ['articles-training-byarticle-20181122.xml', 'ground-truth-training-byarticle-20181122.xml']
Please adjust the path in the code to the correct XML file
