In [1]:
from pathlib import Path
import scrapy
import csv
import os

class RDWSpider(scrapy.Spider):
    """Crawl the RDW APK handbook and dump its regulations and FAQ under ``data/``.

    Outputs (all relative to the current working directory):

    * ``data/laws/<filename>.txt`` - squeezed text of one ``.law-section`` page.
    * ``data/laws.csv`` - one row per top-level list item of a law page
      (columns: title, snippet, filename, article).
    * ``data/faq.csv`` - one row per FAQ entry (columns: question, answer).
    """

    # NOTE(review): "quotes" looks like a leftover from the Scrapy tutorial;
    # left unchanged because the name is the spider's public identifier.
    name = "quotes"

    def start_requests(self):
        """Reset the output files, then request every handbook entry page."""
        self.reset_data()
        urls = [
            "https://apk-handboek.rdw.nl/personenautos",
            "https://apk-handboek.rdw.nl/bedrijfsautos-licht",
            "https://apk-handboek.rdw.nl/bedrijfsautos-zwaar",
            "https://apk-handboek.rdw.nl/driewielige-motorrijtuigen",
            "https://apk-handboek.rdw.nl/aanhangwagens",
            "https://apk-handboek.rdw.nl/landbouwvoertuigen",
            "https://apk-handboek.rdw.nl/veelgestelde-vragen"
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Dispatch on page type.

        Pages with a ``.forward-list`` are index pages whose links are followed
        recursively; ``.law-section`` and ``.faq`` pages are written to disk.
        """
        if response.css(".forward-list"):
            links = response.css(".forward-list a")
            yield from response.follow_all(links, callback=self.parse)
        elif response.css(".law-section"):
            self.parse_law(response)
        elif response.css(".faq"):
            self.parse_faq(response)

    def parse_law(self, response):
        """Persist one law page as a context file plus rows in ``data/laws.csv``."""
        # Build the file name from the last three URL segments, dropping
        # repeated segments. dict.fromkeys keeps the URL order; the previous
        # list(set(...)) ordered segments by string hash, so the same page could
        # get a different file name on every run.
        segments = list(dict.fromkeys(response.url.split("/")[-3:]))
        filename = "-".join(segments) + ".txt"
        context = self.squeeze(response.css(".law-section *::text").getall())

        # Kept on the platform default encoding on purpose: read_context() in
        # this notebook reads these files back without an explicit encoding.
        with open("data/laws/" + filename, 'w') as f:
            f.write(context)

        # Write each regulation and methods of inspection (moi) as separate entries to csv.
        article = response.css(".law-section__number::text").get()
        # The first two breadcrumb text nodes are skipped.
        title = self.squeeze(response.css("#breadcrumbs li *::text").getall()[2:])

        with self._open_csv("data/laws.csv", 'a') as f:
            writer = csv.writer(f)

            for law in response.css(".law-section > ol > li"):
                text = self.squeeze(law.css("*::text").getall())

                if text:
                    writer.writerow([title, text, filename, article])

        self.log(f"Saved file {filename}")

    def parse_faq(self, response):
        """Append every question/answer pair of a FAQ page to ``data/faq.csv``."""
        items = response.css(".faq ul li")

        with self._open_csv("data/faq.csv", 'a') as f:
            writer = csv.writer(f)

            for item in items:
                question = self.squeeze(item.css("h2 button::text").getall())
                answer = self.squeeze(item.css("div *::text").getall())

                writer.writerow([question, answer])

    def reset_data(self):
        """Create the output directories and start both CSV files with a fresh header."""
        # Without this a fresh checkout fails with FileNotFoundError on the
        # first open(); creating data/laws also creates data/.
        os.makedirs("data/laws", exist_ok=True)

        # Mode 'w' already truncates, so no explicit truncate() is needed.
        with self._open_csv("data/laws.csv", 'w') as f:
            csv.writer(f).writerow(["title", "snippet", "filename", "article"])

        with self._open_csv("data/faq.csv", 'w') as f:
            csv.writer(f).writerow(["question", "answer"])

    def squeeze(self, items):
        """Strip every string in ``items`` and join them with single spaces."""
        return " ".join(item.strip() for item in items)

    def _open_csv(self, path, mode):
        """Open a CSV file the way the ``csv`` module expects.

        ``newline=''`` stops the platform from translating the writer's own line
        endings; UTF-8 is what ``pandas.read_csv`` assumes by default when the
        notebook reads these files back.
        """
        return open(path, mode, newline="", encoding="utf-8")

In [2]:
from scrapy.crawler import CrawlerProcess

# Run the spider inside the notebook kernel with default settings.
# NOTE(review): per the Scrapy docs start() blocks until the crawl is done and
# the Twisted reactor cannot be restarted, so re-running this cell presumably
# needs a kernel restart first - confirm.
process = CrawlerProcess()
process.crawl(RDWSpider)
process.start()

2025-01-13 20:40:04 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-01-13 20:40:04 [scrapy.utils.log] INFO: Versions: lxml 5.3.0.0, libxml2 2.12.9, cssselect 1.2.0, parsel 1.9.1, w3lib 2.2.1, Twisted 24.11.0, Python 3.11.8 | packaged by conda-forge | (main, Feb 16 2024, 20:53:32) [GCC 12.3.0], pyOpenSSL 24.0.0 (OpenSSL 3.4.0 22 Oct 2024), cryptography 42.0.5, Platform Linux-6.8.0-51-generic-x86_64-with-glibc2.35
2025-01-13 20:40:04 [scrapy.addons] INFO: Enabled addons:
[]
2025-01-13 20:40:04 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2025-01-13 20:40:04 [scrapy.extensions.telnet] INFO: Telnet Password: 0f6f23cb5e280f10
2025-01-13 20:40:04 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2025-01-13 20:40:04 [scrapy.crawler] INFO: Overridden settings:
{}
2025-01-13

In [69]:
import pandas as pd
from datasets import Dataset
import random
import string

# Dutch prompt templates; title_to_question() picks one at random and fills
# the {title} placeholder. The last two are sentence stems rather than
# questions.
question_templates = [
    "Welke eisen gelden er voor {title}?",
    "Wat zegt de wet over {title}?",
    "Hoe wordt {title} gekeurd?",
    "Welke wijze van keuren gebruikt men voor {title}?",
    "Over {title} is de regelgeving als volgt: ",
    "{title} keurt men door: "
]

# SQuAD format:
# {
#   'id': str,
#   'title': str,
#   'context': str,
#   'question': str,
#   'answers': {
#     'text': list(str),
#     'answer_start': list(int)
#   }
# }

def dedup_faq():
    """Rewrite ``data/faq.csv`` in place with exact duplicate rows removed.

    Only the ``question`` and ``answer`` columns are written back, without the
    DataFrame index.
    """
    faq = (
        pd.read_csv("data/faq.csv")
        .drop_duplicates()
        # The original called reindex() with no arguments, which is a no-op;
        # renumbering the surviving rows is what was intended.
        .reset_index(drop=True)
    )
    faq.to_csv("data/faq.csv", columns=["question", "answer"], index=False)

def title_to_question(title):
    """Fill ``title`` into one randomly chosen entry of ``question_templates``."""
    return random.choice(question_templates).format(title=title)

def read_context(filename):
    """Return the full text of ``data/laws/<filename>``."""
    context_path = "data/laws/" + filename
    with open(context_path) as source:
        contents = source.read()
    return contents

def row_to_squad(row):
    """Convert one row of ``data/laws.csv`` into a SQuAD-style record.

    Args:
        row: a Series with ``title``, ``snippet`` and ``filename`` entries; its
            ``name`` (the row label) becomes the record id.

    Returns:
        A Series with ``id``, ``title``, ``context``, ``question`` and
        ``answers`` (``{'text': [str], 'answer_start': [int]}``). When the
        answer cannot be located in the context file, ``context`` is set to
        the sentinel ``'NONE'`` so the caller can filter the row out.
    """
    qid = row.name
    title = row["title"]
    context = read_context(row["filename"])
    question = title_to_question(title)

    # Strip leading whitespace, digits and dots - presumably the list
    # numbering ("1. ") in front of each snippet.
    answer = row["snippet"].lstrip(string.whitespace + string.digits + '.')
    answer_start = context.find(answer)

    # str.find returns -1 on a miss; offset 0 is a valid hit (the answer opens
    # the context) and used to be discarded by the old "> 0" check. An empty
    # answer trivially "matches" at 0, so it is still rejected explicitly.
    if not answer or answer_start < 0:
        context = 'NONE'
        answer_start = 0

    answers = {
        'text': [answer],
        'answer_start': [answer_start]
    }

    return pd.Series({
        'id': qid,
        'title': title,
        'context': context,
        'question': question,
        'answers': answers
    })

def append_laws(df):
    """Append the SQuAD records built from ``data/laws.csv`` to ``df``.

    Every CSV row is converted with ``row_to_squad``; rows it marked with the
    ``'NONE'`` context sentinel are dropped. Only columns present in both
    frames are kept (``join="inner"``).

    Args:
        df: the frame to extend.

    Returns:
        A new DataFrame with the usable law records appended.
    """
    data = pd.read_csv("data/laws.csv")

    if data.empty:
        # apply() on an empty frame hands back the CSV columns, so the
        # "context" lookup below would raise KeyError after a failed scrape.
        qa = pd.DataFrame(columns=["id", "title", "context", "question", "answers"])
    else:
        qa = data.apply(row_to_squad, axis=1)
        qa = qa[qa["context"] != "NONE"]

    print(f'Appending laws {len(qa)}/{len(data)}')
    return pd.concat([df, qa], join="inner")

def append_faq(df):
    """Append the deduplicated rows of ``data/faq.csv`` to ``df``.

    The concat uses ``join="inner"``, so only columns shared by ``df`` and the
    FAQ frame (``question``/``answer``) survive.
    """
    # OUT OF DATE: Needs context, start answer idx etc.
    dedup_faq()

    faq = pd.read_csv("data/faq.csv").drop_duplicates()
    print('Appending FAQ #', len(faq))

    combined = pd.concat([df, faq], join="inner")
    return combined

def create_qa():
    """Build the question-answering frame and save it as a HF dataset.

    Starts from an empty frame with the SQuAD columns, appends the law records
    and writes the result to ``data/qa.hf``.

    Returns:
        The pandas DataFrame the dataset was built from.
    """
    df = pd.DataFrame([], columns=["id", "title", "context", "question", "answers"])
    df = append_laws(df)

    # append_laws filters rows, so the index is no longer a plain RangeIndex
    # and from_pandas would store it as an extra "__index_level_0__" column.
    # The row label is already kept in "id", so drop the index instead.
    data = Dataset.from_pandas(df, preserve_index=False)
    data.save_to_disk("data/qa.hf")

    return df

# Build the dataset (create_qa also saves it to data/qa.hf) and preview the
# first rows.
df = create_qa()
df.head()

Appending laws 1490/1492


Saving the dataset (0/1 shards):   0%|          | 0/1490 [00:00<?, ? examples/s]

2025-01-13 14:19:31 [fsspec.local] DEBUG: open file: /home/jovyan/Udacity/RDW_Chatbot/data/qa.hf/data-00000-of-00001.arrow
2025-01-13 14:19:31 [fsspec.local] DEBUG: open file: /home/jovyan/Udacity/RDW_Chatbot/data/qa.hf/state.json
2025-01-13 14:19:31 [fsspec.local] DEBUG: open file: /home/jovyan/Udacity/RDW_Chatbot/data/qa.hf/dataset_info.json


Unnamed: 0,id,title,context,question,answers
0,0,Bedrijfsauto's zwaar Stuurinrichting...,Onderdelen stuurinrichting Artikel 5.3.29 A...,Welke wijze van keuren gebruikt men voor ...,{'text': ['De bestuurde wielen van bedrijfsaut...
1,1,Bedrijfsauto's zwaar Stuurinrichting...,Onderdelen stuurinrichting Artikel 5.3.29 A...,Wat zegt de wet over Bedrijfsauto's zwaar...,{'text': ['Bij draaiing van het stuurwiel tot ...
2,2,Bedrijfsauto's zwaar Stuurinrichting...,Onderdelen stuurinrichting Artikel 5.3.29 A...,Hoe wordt Bedrijfsauto's zwaar Stuur...,{'text': ['De voor de overbrenging van de stuu...
3,3,Bedrijfsauto's zwaar Stuurinrichting...,Onderdelen stuurinrichting Artikel 5.3.29 A...,Bedrijfsauto's zwaar Stuurinrichting...,{'text': ['Stofhoezen van het stuurhuis en de ...
4,4,Bedrijfsauto's zwaar Stuurinrichting...,Onderdelen stuurinrichting Artikel 5.3.29 A...,Over Bedrijfsauto's zwaar Stuurinric...,{'text': ['Koppelingen moeten een zichtbaar sp...
