# Scraping Binnenlands Bestuur articles

No sitemap is yet available for [Binnenlands Bestuur](https://www.binnenlandsbestuur.nl/). Therefore, the following code is a template for scraping articles from a website that does have a sitemap. The code from scraping_ibestuur.ipynb can be used for Binnenlands Bestuur once a sitemap is available.

#### Importing libraries

In [1]:
import csv

from bs4 import BeautifulSoup
import requests
from requests.exceptions import ChunkedEncodingError
import pandas as pd
import time
import os
from tqdm import tqdm
from pathlib import Path

#### Step 0: Global settings

In [2]:
# create directory
Path("../Data/BinnenlandsBestuur").mkdir(parents=True, exist_ok=True)

In [3]:
base_url = 'https://www.binnenlandsbestuur.nl'
csv_file_path = '../Data/BinnenlandsBestuur/binnenlandsbestuur_articles.csv'

#### Step 1: Initialize CSV file

In [4]:
def initialize_csv():
    """
    Initializes the CSV file with the headers
    """
    if not os.path.exists(csv_file_path):
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=['title', 'subtitle', 'author', 'date', 'content', 'url'])
            writer.writeheader()

#### Step 2: Is URL already scraped?

In [5]:
def url_already_scraped(url):
    """
    Checks if the URL has already been scraped
    """
    if not os.path.exists(csv_file_path):
        return False
    with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['url'] == url:
                return True
    return False

#### Step 3: Fetch relevant urls

In [6]:
def fetch_urls(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all articles on the page
    article_links = soup.find_all('a', href=True)
    
    # Filter out the relevant articles
    full_links = []
    for link in article_links:
        if '/digitaal/' in link['href']:
            if link['href'].startswith('http') or link['href'].startswith('www'):
                full_links.append(link['href'])
            else:
                full_links.append(base_url + link['href'])
    return full_links

#### Step 4: Scrape URLs

In [7]:
def scrape_article(article_url):
    """
    Scrapes the article at the given URL
    """
    # Article should be a article in the digitaal category
    if '/digitaal/' not in article_url:
        print(f'Skipping non-digitaal URL: {article_url}')
        return None 
    
    try_count = 0
    max_tries = 2
    
    while try_count < max_tries:
        try: 
            response = requests.get(article_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Check if the url is an article
            if not soup.find('h1', class_='c-article-header__heading'):
                print(f'Skipping non-article URL: {article_url}')
                return None
            
            # Extract the relevant information
            article_data = extract_article_data(soup, article_url)
            
            if article_data:
                return article_data
                
        except Exception as e:
            print(f'Failed to fetch article: {article_url}, {e}')
            try_count += 1
            print(f'Try {try_count}/{max_tries}')
            time.sleep(1)

    # Write to problematic URLs file
    with open('../Data/BinnenlandsBestuur/problematic_urls.txt', 'a') as f:
        f.write(article_url + '\n')
    return None

In [8]:
def extract_article_data(soup, article_url):
    """
    Extracts the relevant data from the article page
    """
    title = soup.find('h1', class_='c-article-header__heading').text.strip()
    
    subtitle_div = soup.find('p', class_='c-article-header__lead')
    subtitle = subtitle_div.get_text(strip=True) if subtitle_div else 'No subtitle'
    
    author_div = soup.find('div', class_='c-meta__item c-meta__item--author')
    author = author_div.find('a').get_text(strip=True) if author_div else 'No author'
    
    date_div = soup.find('div', class_='c-meta__item c-meta__item--publicationDate')
    date = date_div.get_text(strip=True) if date_div else 'No date'
    
    content_div = soup.find('div', class_='b-article__container o-container')
    paragraphs = content_div.find_all('p') if content_div else []
    content = ' '.join([p.get_text(strip=True) for p in paragraphs])
    
    # Ensure we have the essential information before returning
    if not title or not content:
        print(f'Skipping article due to missing title or content: {article_url}')
        return None
    
    return {
        'title': title,
        'subtitle': subtitle,
        'author': author,
        'date': date,
        'content': content,
        'url': article_url
    }

#### Step 5: Setting up the scraper

In [9]:
def main():
    initialize_csv()
    new_articles = set([base_url])
    visited_articles = set() # Keep track of visited articles this run 
    iteration = 0
    
    while new_articles:
        print(f"Iteration {iteration+1}:")
        current_articles = set(new_articles)

        potential_new_articles = set()
        for link in tqdm(current_articles, desc='Collecting new URLs'):
            additional_links = fetch_urls(link)
            potential_new_articles.update(additional_links)
            visited_articles.add(link)
                
        # Filter out URLs that have already been scraped
        unscraped_articles = [link for link in potential_new_articles if not url_already_scraped(link)]
        
        # Now, scrape the articles that haven't been scraped yet
        for link in tqdm(unscraped_articles, desc='Scraping articles'):
            article_data = scrape_article(link)
            if article_data: # Directly save to CSV if successfully scraped
                with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
                    writer = csv.DictWriter(csvfile, fieldnames=['title', 'subtitle', 'author', 'date', 'content', 'url'])
                    writer.writerow(article_data)

        # Update new_articles for the next iteration: consider all potential new articles that haven't been visited yet
        new_articles = potential_new_articles - visited_articles
        iteration += 1
                
    print('Data collection complete!')

### Step 5: Scrape

In [10]:
if __name__ == '__main__':
    main()

Iteration 1:


Collecting new URLs: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
Scraping articles: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]


Iteration 2:


Collecting new URLs: 100%|██████████| 5/5 [00:01<00:00,  2.58it/s]
Scraping articles: 100%|██████████| 24/24 [00:28<00:00,  1.17s/it]


Iteration 3:


Collecting new URLs: 100%|██████████| 24/24 [00:10<00:00,  2.37it/s]
Scraping articles: 100%|██████████| 27/27 [00:38<00:00,  1.41s/it]


Iteration 4:


Collecting new URLs: 100%|██████████| 27/27 [00:10<00:00,  2.51it/s]
Scraping articles: 100%|██████████| 24/24 [00:35<00:00,  1.49s/it]


Iteration 5:


Collecting new URLs: 100%|██████████| 24/24 [00:09<00:00,  2.58it/s]
Scraping articles: 100%|██████████| 31/31 [00:46<00:00,  1.49s/it]


Iteration 6:


Collecting new URLs: 100%|██████████| 31/31 [00:12<00:00,  2.44it/s]
Scraping articles:   6%|▋         | 1/16 [00:00<00:14,  1.06it/s]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/advies-833-miljoen-euro-extra-nodig-voor-cyberweerbaarheid  


Scraping articles:  69%|██████▉   | 11/16 [00:14<00:05,  1.13s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/europese-commissie-vraagt-om-input-voor-digital-resilience-act 


Scraping articles: 100%|██████████| 16/16 [00:20<00:00,  1.30s/it]


Iteration 7:


Collecting new URLs: 100%|██████████| 16/16 [00:06<00:00,  2.41it/s]
Scraping articles:   5%|▌         | 1/19 [00:00<00:15,  1.19it/s]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/kennispartners


Scraping articles:  26%|██▋       | 5/19 [00:05<00:15,  1.11s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/dossiers


Scraping articles:  47%|████▋     | 9/19 [00:13<00:17,  1.77s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/opinie


Scraping articles:  68%|██████▊   | 13/19 [00:20<00:11,  1.89s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/achtergrond


Scraping articles: 100%|██████████| 19/19 [00:28<00:00,  1.51s/it]


Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/nieuws
Iteration 8:


Collecting new URLs: 100%|██████████| 19/19 [00:06<00:00,  2.73it/s]
Scraping articles:   3%|▎         | 1/34 [00:00<00:18,  1.80it/s]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/kennispartners


Scraping articles:  38%|███▊      | 13/34 [00:17<00:25,  1.24s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/dossiers


Scraping articles:  62%|██████▏   | 21/34 [00:27<00:13,  1.05s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/opinie


Scraping articles:  88%|████████▊ | 30/34 [00:40<00:05,  1.29s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/achtergrond


Scraping articles:  91%|█████████ | 31/34 [00:41<00:03,  1.14s/it]

Skipping non-article URL: https://www.binnenlandsbestuur.nl/digitaal/nieuws


Scraping articles: 100%|██████████| 34/34 [00:45<00:00,  1.35s/it]


Iteration 9:


Collecting new URLs: 100%|██████████| 29/29 [00:11<00:00,  2.58it/s]
Scraping articles: 0it [00:00, ?it/s]

Data collection complete!





In [11]:
pd.read_csv(csv_file_path)

Unnamed: 0,title,subtitle,author,date,content,url
0,Hoe Capgemini steeds dominanter wordt bij IT-o...,Uit een inventarisatie van iBestuur van grote ...,Sjoerd Hartholt,02 april 2024,Uit een inventarisatie van iBestuur van grote ...,https://www.binnenlandsbestuur.nl/digitaal/cap...
1,AI Act nog onbekend bij veel gemeenten,Een kwart van de gemeenten heeft reeds actie o...,Marjolein van Trigt,01 april 2024,Een kwart van de gemeenten heeft reeds actie o...,https://www.binnenlandsbestuur.nl/digitaal/gro...
2,Waarom security van sluizen en gemalen een zor...,De IBD schakelt experts in om security van IAC...,Sjoerd Hartholt,29 maart 2024,De IBD schakelt experts in om security van IAC...,https://www.binnenlandsbestuur.nl/digitaal/hoe...
3,Brussel neemt laatste horde richting Europese ...,De verordening treedt naar verwachting eind ap...,Sjoerd Hartholt,29 maart 2024,De verordening treedt naar verwachting eind ap...,https://www.binnenlandsbestuur.nl/digitaal/laa...
4,Historische dag voor de AI Act,Vandaag stemt het Europarlement over de AI Act...,Marjolein van Trigt,13 maart 2024,Vandaag stemt het Europarlement over de AI Act...,https://www.binnenlandsbestuur.nl/digitaal/tek...
...,...,...,...,...,...,...
163,Overheid: bemoei je met AI,De overheid heeft ook een taak op het gebied v...,Marjolein van Trigt,18 augustus 2023,De overheid heeft ook een taak op het gebied v...,https://www.binnenlandsbestuur.nl/digitaal/bem...
164,Maak van de Leefomgeving uw zaak,De themadag is gratis en is gericht op beleids...,Centric,25 juni 2015,De themadag is gratis en is gericht op beleids...,https://www.binnenlandsbestuur.nl/digitaal/cen...
165,Een kleine stap van Napoleon naar gegevensbesc...,Hopelijk doorloopt gegevensbescherming dezelfd...,Marjolein Louwerse,22 januari 2024,Hopelijk doorloopt gegevensbescherming dezelfd...,https://www.binnenlandsbestuur.nl/digitaal/wat...
166,Digitale lessen uit de vorige eeuw,Politiek gezien viel aan automatisering niets ...,Margo ter Bekke,30 september 2022,Politiek gezien viel aan automatisering niets ...,https://www.binnenlandsbestuur.nl/digitaal/ess...
