# Scraping iBestuur articles

#### Importing libraries

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import csv
import json
from tqdm import tqdm
from pathlib import Path

### Step 1: Fetch sitemap urls

In [2]:
# create directory
Path('../Data/iBestuur').mkdir(parents=True, exist_ok=True)

In [3]:
def fetch_sitemap_urls(sitemap_index_url):
    """Fetch a sitemap and return a list of URLs."""
    response = requests.get(sitemap_index_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'xml')
        sitemap_urls = [element.text for element in soup.find_all('loc')]
        return sitemap_urls
    else:
        print('Failed to retrieve the XML Sitemap Index file.')
        return []

### Step 2: Fetch all urls from the different sitemaps

In [4]:
def fetch_urls(sitemap_urls):
    """Fetch all URLs from a list of sitemaps."""
    all_urls = []

    for sitemap_url in sitemap_urls:
        response = requests.get(sitemap_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'xml')
            urls = [element.text for element in soup.find_all('loc')]
            all_urls.extend(urls)
        time.sleep(0.1)  # Be respectful by not hammering the server

    return all_urls

### Step 3: Fetch the content from the URLs

In [5]:
def fetch_content_and_write_to_csv(urls, csv_file_path):
    """Fetch the content from a list of URLs."""
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Check if the file is empty
        file.seek(0, 2) # Go to the end of the file
        if file.tell() == 0:  # If the file is empty, write the header
            writer.writerow(['title', 'subtitle', 'author', 'date', 'content', 'url'])

        for url in tqdm(urls, desc='Scraping Articles', unit=' article'):
            try:
                response = requests.get(url, allow_redirects=True)
            except requests.exceptions.TooManyRedirects:
                with open('../Data/iBestuur/problematic_urls.txt', 'a') as f:
                    f.write(f'{url}\n')
                continue # Skip to the next URL if too many redirects occur
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                script_tag = soup.find('script', id='smg_data')
                data = json.loads(script_tag.string) if script_tag else {}
                
                title = soup.find('h1').text.strip() if soup.find('h1') else 'NaN'
                
                author = data.get('author')
                if not author:
                    author = data.get('partner', 'NaN')
                
                date = data.get('first_published_at', 'NaN')
                
                content = ' '.join([p.text for p in soup.find_all('p')])
                
                if content:
                    writer.writerow([title, 'NaN', author, date, content, url])

### Step 4: Exclude unnecessary URLs

In [6]:
def exclude_urls(urls):
    """Exclude URLs that are not articles."""
    # Exclude pictures (URLs that end with .jpg)
    all_urls = [url for url in urls if not url.endswith('.jpg')]
    
    # Exclude already scraped urls
    try:
        df = pd.read_csv('../Data/iBestuur/ibestuur_articles.csv')
        scraped_urls = df['url'].tolist()
    except FileNotFoundError:
        scraped_urls = []
        
    urls_to_scrape = [url for url in all_urls if url not in scraped_urls]
    
    return urls_to_scrape

### Step 5: Setting up the scraper

In [7]:
def main():
    # Step 1: Fetch sitemap URLs
    sitemap_index_url = 'https://ibestuur.nl/sitemap_index.xml'
    sitemap_urls = fetch_sitemap_urls(sitemap_index_url)
    print(f"Total number of sitemaps: {len(sitemap_urls)}")
    
    # Exclude unnecessary sitemaps (NOW ONLY ARTICLES ARE INCLUDED !!!)
    sitemap_urls = [url for url in sitemap_urls if 'artikel' in url]
    print(f"Total number of sitemaps with articles: {len(sitemap_urls)}")

    # Step 2: Fetch all URLs from the different sitemaps
    all_urls = fetch_urls(sitemap_urls)
    
    # Step 4: Exclude unnecessary URLs
    all_urls = exclude_urls(all_urls)
    print(f"Total number of articles that will be scraped: {len(all_urls)}")

    # Step 3: Fetch the content from the URLs
    csv_file_path = '../Data/iBestuur/ibestuur_articles.csv'
    fetch_content_and_write_to_csv(all_urls, csv_file_path)

### Step 6: Scrape

In [8]:
if __name__ == '__main__':
    main()

Total number of sitemaps: 36
Total number of sitemaps with articles: 6
Total number of articles that will be scraped: 1281


Scraping Articles: 100%|██████████| 1281/1281 [13:37<00:00,  1.57 article/s]


In [9]:
df = pd.read_csv('../Data/iBestuur/ibestuur_articles.csv')

In [10]:
# Select rows with NaN values in the 'author' column
df[df['author'].isna()]

Unnamed: 0,title,subtitle,author,date,content,url
11,,,,,(�HB\��<�O�Û���<پ��l�3������\t*pZ�e~��6�(�...,https://ibestuur-uploads.storage.googleapis.co...


In [11]:
len(df)

644