In [3]:
with open('../dataset/8_sidor_links_mp3.txt', 'r', encoding='utf-8') as file:
    urls_text = file.read()
    urls = urls_text.splitlines()
    
urls

['https://8sidor.se/wp-content/uploads/2025/09/250924.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250923.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250922.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250919ny.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250918.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250917.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250916.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250915.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250912.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250911.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250910.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250909.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250908.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250905.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250904.mp3',
 'https://8sidor.se/wp-content/uploads/2025/09/250903.mp3',
 'https://8sidor.se/wp-content/uploads

In [4]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import time
from urllib.parse import urlparse

# Create output directory if it doesn't exist
output_dir = Path('../dataset/8_sidor_audios_full')
output_dir.mkdir(parents=True, exist_ok=True)

def download_mp3(url, output_dir, session=None):
    """Download a single MP3 file"""
    try:
        # Extract filename from URL
        parsed_url = urlparse(url)
        filename = os.path.basename(parsed_url.path)
        
        # Full path for the output file
        output_path = output_dir / filename
        
        # Skip if file already exists
        if output_path.exists():
            return f"Skipped (exists): {filename}"
        
        # Use session if provided, otherwise create new request
        if session:
            response = session.get(url, timeout=30)
        else:
            response = requests.get(url, timeout=30)
        
        response.raise_for_status()
        
        # Write file to disk
        with open(output_path, 'wb') as f:
            f.write(response.content)
        
        return f"Downloaded: {filename} ({len(response.content)} bytes)"
        
    except requests.exceptions.RequestException as e:
        return f"Error downloading {url}: {str(e)}"
    except Exception as e:
        return f"Unexpected error with {url}: {str(e)}"

def download_mp3s_chunked(urls, output_dir, max_workers=10, chunk_size=100):
    """Download MP3s in parallel chunks"""
    
    print(f"Starting download of {len(urls)} files...")
    print(f"Output directory: {output_dir}")
    print(f"Using {max_workers} workers, processing in chunks of {chunk_size}")
    
    downloaded = 0
    skipped = 0
    errors = 0
    
    # Process URLs in chunks to avoid overwhelming the server
    for i in range(0, len(urls), chunk_size):
        chunk_urls = urls[i:i + chunk_size]
        chunk_num = i // chunk_size + 1
        total_chunks = (len(urls) + chunk_size - 1) // chunk_size
        
        print(f"\nProcessing chunk {chunk_num}/{total_chunks} ({len(chunk_urls)} files)...")
        
        # Create a session for this chunk to reuse connections
        with requests.Session() as session:
            session.headers.update({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            })
            
            # Use ThreadPoolExecutor for parallel downloads
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit all downloads for this chunk
                future_to_url = {
                    executor.submit(download_mp3, url, output_dir, session): url 
                    for url in chunk_urls
                }
                
                # Process completed downloads
                for future in as_completed(future_to_url):
                    url = future_to_url[future]
                    try:
                        result = future.result()
                        print(f"  {result}")
                        
                        if result.startswith("Downloaded"):
                            downloaded += 1
                        elif result.startswith("Skipped"):
                            skipped += 1
                        else:
                            errors += 1
                            
                    except Exception as exc:
                        print(f"  Error with {url}: {exc}")
                        errors += 1
        
        # Small delay between chunks to be respectful to the server
        if i + chunk_size < len(urls):
            print(f"  Waiting 2 seconds before next chunk...")
            time.sleep(2)
    
    print(f"\n=== Download Summary ===")
    print(f"Total URLs: {len(urls)}")
    print(f"Downloaded: {downloaded}")
    print(f"Skipped (already exist): {skipped}")
    print(f"Errors: {errors}")
    print(f"Success rate: {((downloaded + skipped) / len(urls) * 100):.1f}%")

# Run the download
download_mp3s_chunked(urls, output_dir, max_workers=8, chunk_size=50)

Starting download of 2935 files...
Output directory: ../dataset/8_sidor_audios_full
Using 8 workers, processing in chunks of 50

Processing chunk 1/59 (50 files)...
  Downloaded: 250916.mp3 (7852116 bytes)
  Downloaded: 250916.mp3 (7852116 bytes)
  Downloaded: 250923.mp3 (9991858 bytes)
  Downloaded: 250919ny.mp3 (6603881 bytes)
  Downloaded: 250923.mp3 (9991858 bytes)
  Downloaded: 250919ny.mp3 (6603881 bytes)
  Downloaded: 250922.mp3 (10669579 bytes)
  Downloaded: 250922.mp3 (10669579 bytes)
  Downloaded: 250912.mp3 (8565572 bytes)
  Downloaded: 250912.mp3 (8565572 bytes)
  Downloaded: 250909.mp3 (6853403 bytes)
  Downloaded: 250909.mp3 (6853403 bytes)
  Downloaded: 250918.mp3 (8030167 bytes)
  Downloaded: 250918.mp3 (8030167 bytes)
  Downloaded: 250911.mp3 (8672152 bytes)
  Downloaded: 250905.mp3 (10633843 bytes)
  Downloaded: 250911.mp3 (8672152 bytes)
  Downloaded: 250905.mp3 (10633843 bytes)
  Downloaded: 250924.mp3 (9778072 bytes)
  Downloaded: 250924.mp3 (9778072 bytes)
  Downl