In [1]:
from pathlib import Path
import scrapy
import csv
import os

class RDWSpider(scrapy.Spider):
    """Crawl the RDW APK handbook and save its law texts and FAQ entries.

    Starting from the handbook's category pages, the spider follows every
    link found inside ``.forward-list`` elements. For each page it visits:

    * the text of ``.law-section`` is written to ``data/laws/<path>.txt`` and
      registered in ``data/laws-mapping.csv`` (columns: title, filename);
    * every ``.faq ul li`` item is appended to ``data/faq.csv``
      (columns: question, answer).

    Both CSV files are recreated at the start of every crawl. All paths are
    relative to the current working directory.
    """

    # NOTE(review): "quotes" looks like a leftover from the Scrapy tutorial; it
    # is kept unchanged because the spider name is part of the public interface.
    name = "quotes"

    # Output locations, kept in one place so the writers and reset_data agree.
    laws_dir = "data/laws"
    laws_mapping_file = "data/laws-mapping.csv"
    faq_file = "data/faq.csv"

    def start_requests(self):
        """Reset the output files, then request every handbook category page."""
        self.reset_data()
        urls = [
            "https://apk-handboek.rdw.nl/personenautos",
            "https://apk-handboek.rdw.nl/bedrijfsautos-licht",
            "https://apk-handboek.rdw.nl/bedrijfsautos-zwaar",
            "https://apk-handboek.rdw.nl/driewielige-motorrijtuigen",
            "https://apk-handboek.rdw.nl/aanhangwagens",
            "https://apk-handboek.rdw.nl/landbouwvoertuigen",
            "https://apk-handboek.rdw.nl/overige-regelgeving"
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Store any law/FAQ content on the page and follow its forward links.

        ``parse_law`` and ``parse_faq`` are plain methods called for their
        file-writing side effects; only the follow-up requests are yielded.
        """
        if response.css(".law-section"):
            self.parse_law(response)

        if response.css(".faq"):
            self.parse_faq(response)

        if response.css(".forward-list"):
            links = response.css(".forward-list a")
            yield from response.follow_all(links, callback=self.parse)

    def parse_law(self, response):
        """Write the page's law text to its own file and record it in the mapping CSV.

        The file name and title are derived from the last three ``/``-separated
        parts of the response URL.
        """
        laws = self.squeeze(response.css('.law-section *::text').getall())

        # dict.fromkeys drops duplicate segments while keeping URL order; a set
        # would order them differently from run to run (string hashing is
        # randomised per process), making file names unstable. Empty parts
        # (trailing slash, the "//" after the scheme) are skipped.
        segments = list(dict.fromkeys(
            segment for segment in response.url.split("/")[-3:] if segment
        ))
        path = "-".join(segments)
        filename = f"{self.laws_dir}/{path}.txt"
        title = " ".join(segments)

        with open(filename, 'w', encoding="utf-8") as f:
            f.write(laws)

        with self._open_csv(self.laws_mapping_file, 'a') as f:
            csv.writer(f).writerow([title, filename])

        self.log(f"Saved file {filename}")

    def parse_faq(self, response):
        """Append one (question, answer) row to the FAQ CSV per FAQ list item."""
        items = response.css(".faq ul li")

        with self._open_csv(self.faq_file, 'a') as f:
            writer = csv.writer(f)

            for item in items:
                question = self.squeeze(item.css("h2 button::text").getall())
                answer = self.squeeze(item.css("div *::text").getall())
                writer.writerow([question, answer])

    def reset_data(self):
        """Create the output directories and recreate both CSV files with headers."""
        # Creates "data" as well as "data/laws", so a fresh checkout works.
        os.makedirs(self.laws_dir, exist_ok=True)

        # Mode 'w' already truncates, so no explicit truncate() is needed.
        with self._open_csv(self.laws_mapping_file, 'w') as f:
            csv.writer(f).writerow(["title", "filename"])

        with self._open_csv(self.faq_file, 'w') as f:
            csv.writer(f).writerow(["question", "answer"])

    def squeeze(self, items):
        """Join text fragments with single spaces.

        Each fragment is stripped; fragments that are empty after stripping
        (whitespace-only text nodes) are dropped so they do not produce runs
        of consecutive spaces.
        """
        stripped = (item.strip() for item in items)
        return " ".join(fragment for fragment in stripped if fragment)

    def _open_csv(self, path, mode):
        """Open ``path`` for the csv module: UTF-8, no newline translation."""
        # newline="" is what the csv module documents for files it writes to;
        # without it, rows can end up with extra blank lines on some platforms.
        return open(path, mode, newline="", encoding="utf-8")

In [2]:
from scrapy.crawler import CrawlerProcess

# Run the spider inside this kernel: create a crawler process with default
# settings, schedule RDWSpider on it, then start the crawl.
# The spider writes to relative "data/..." paths, so output lands under the
# kernel's current working directory.
# NOTE(review): per the Scrapy docs, start() blocks until the crawl finishes,
# and the underlying Twisted reactor cannot be restarted — re-running this cell
# in the same kernel is expected to fail (ReactorNotRestartable); restart the
# kernel before crawling again.
process = CrawlerProcess()
process.crawl(RDWSpider)
process.start()

2025-01-09 13:43:33 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-01-09 13:43:33 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.11.0, Python 3.11.8 | packaged by conda-forge | (main, Feb 16 2024, 20:53:32) [GCC 12.3.0], pyOpenSSL 24.0.0 (OpenSSL 3.4.0 22 Oct 2024), cryptography 42.0.5, Platform Linux-6.8.0-51-generic-x86_64-with-glibc2.35
2025-01-09 13:43:33 [scrapy.addons] INFO: Enabled addons:
[]
2025-01-09 13:43:33 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2025-01-09 13:43:33 [scrapy.extensions.telnet] INFO: Telnet Password: b84aa71dcd16b07d
2025-01-09 13:43:33 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-01-09 13:43:33 [scrapy.crawler] INFO: Overridden settings:
{}
2025-01-09