# In [None]:
# semeval_download.ipynb
# Purpose: Download and extract the official SemEval 2019 Task 4 data files.

import os
import requests
import zipfile
import time # Added for potential delay between requests

# --- Configuration ---
# Remote location that every archive name below is appended to.
base_url = "https://zenodo.org/records/5776081/files/"

# Local folder receiving both the ZIP archives and their extracted contents.
data_dir = "hyperpartisan_data_official"
os.makedirs(data_dir, exist_ok=True)

# Each entry is (name on the server, name used for the local copy).
training_archives = [
    ("articles-training-byarticle-20181122.zip", "articles-training-byarticle.zip"),
    ("ground-truth-training-byarticle-20181122.zip", "ground-truth-training-byarticle.zip"),
]
test_archives = [
    ("articles-test-byarticle-20181207.zip", "articles-test-byarticle.zip"),
    ("ground-truth-test-byarticle-20181207.zip", "ground-truth-test-byarticle.zip"),
]

# Training archives first, then test archives.
files_to_download = training_archives + test_archives

# --- Functions ---
def download_file(url, save_path):
    """Download a file from a URL and save it to the specified path.

    The body is streamed into a temporary ``<save_path>.part`` file and only
    renamed to ``save_path`` once the transfer has finished and passed the
    size check. An interrupted or truncated download therefore never leaves
    a file at ``save_path`` that a later run would accept via the
    "already exists" shortcut.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path of the finished file.

    Returns:
        True if ``save_path`` already existed or the download completed.
        False if the request failed, the downloaded size did not match the
        announced size, or any other error occurred (errors are printed,
        not raised).
    """
    if os.path.exists(save_path):
        print(f"File already exists: {save_path}")
        return True
    print(f"Downloading {url}...")
    partial_path = f"{save_path}.part"
    block_size = 8192  # 8KB
    try:
        # Using the response as a context manager releases the streamed
        # connection even when an error interrupts the transfer.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Raise an exception for bad status codes
            total_size = int(response.headers.get('content-length', 0))
            # With a Content-Encoding, Content-Length counts encoded bytes
            # while iter_content yields decoded bytes, so the two sizes are
            # not comparable; skip the check in that case.
            if response.headers.get('content-encoding'):
                total_size = 0

            with open(partial_path, 'wb') as f:
                for data in response.iter_content(block_size):
                    f.write(data)

        # Verify file size if possible; a mismatch means a truncated file.
        if total_size != 0 and os.path.getsize(partial_path) != total_size:
            print(f"Warning: Downloaded size mismatch for {save_path}")
            return False

        # Publish the finished file in one step.
        os.replace(partial_path, save_path)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return False
    except Exception as e:
        print(f"An error occurred downloading {url}: {e}")
        return False
    finally:
        # Runs on failure, size mismatch and interruption alike; after a
        # successful rename the partial file no longer exists.
        if os.path.exists(partial_path):
            os.remove(partial_path)

    print(f"Downloaded to {save_path}")
    time.sleep(1)  # Add a small delay between downloads
    return True


def extract_zip(zip_path, extract_to_dir):
    """Unpack the archive at ``zip_path`` into ``extract_to_dir``.

    Returns True when every member was extracted, and False when the
    archive is missing, is not a valid ZIP file, or extraction raised any
    other error. Problems are reported with ``print`` rather than raised.
    """
    archive_missing = not os.path.exists(zip_path)
    if archive_missing:
        print(f"ZIP file not found: {zip_path}")
        return False

    print(f"Extracting {zip_path} to {extract_to_dir}...")
    extracted = False
    try:
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(path=extract_to_dir)
        print(f"Successfully extracted {zip_path}")
        extracted = True
    except zipfile.BadZipFile:
        # The file exists but its contents are not a readable ZIP archive.
        print(f"Error: Bad ZIP file: {zip_path}. Please re-download.")
    except Exception as err:
        print(f"An error occurred extracting {zip_path}: {err}")
    return extracted

# --- Execution ---
print(f"Data will be stored in: {data_dir}")

expected_total = len(files_to_download)

# Stage 1: fetch every archive; download_file returns True for archives that
# were fetched now or were already present.
download_success_count = 0
for remote_name, local_name in files_to_download:
    fetched = download_file(base_url + remote_name, os.path.join(data_dir, local_name))
    download_success_count += int(fetched)

if download_success_count == expected_total:
    print("\nAll downloads completed or files already exist.")
else:
    print("\nWarning: Not all files were downloaded successfully. Please check errors above.")

# Stage 2: unpack every archive into data_dir itself; any directory layout
# stored inside an archive is recreated below that folder.
print("\n--- Starting Extraction ---")
extraction_success_count = 0
for entry in files_to_download:
    archive_path = os.path.join(data_dir, entry[1])
    unpacked = extract_zip(archive_path, data_dir)
    extraction_success_count += int(unpacked)

if extraction_success_count == expected_total:
    print("\nAll extractions completed.")
else:
    print("\nWarning: Not all files were extracted successfully.")

print("\nDownload and extraction process finished.")
print(f"Check the '{data_dir}' directory for the extracted XML files and directories.")