# Scrape NvvP website voor vergoedingen 2025

In [1]:
import json
import unicodedata
from textwrap import dedent
from pathlib import Path
from copy import deepcopy

from selenium import webdriver
from bs4 import BeautifulSoup

In [12]:
# Year label for this run; it names the output folder and the TablePress file.
YEAR = "2025"

# Page that is scraped.
page_url = "https://www.podotherapie.nl/vergoedingen/"

# Configure Chrome to run headless (no browser window).
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Start the browser and load the page.
driver = webdriver.Chrome(options=options)
driver.get(page_url)

In [3]:
# Parse the HTML currently held by the browser with Python's built-in parser.
# NOTE(review): the driver is not closed anywhere in the visible part of this
# notebook -- confirm whether a driver.quit() is needed once scraping is done.
rendered_html = driver.page_source
soup = BeautifulSoup(rendered_html, 'html.parser')

In [4]:
# The per-provider entries are the <article> elements nested inside the
# second <article> found on the page.
main_article = soup.find_all('article')[1]
articles = main_article.find_all('article')

In [5]:
def html_to_json(part):
    """Convert an HTML table element into a list of row records.

    Args:
        part: A parsed table element exposing ``find`` / ``find_all`` (as a
            BeautifulSoup tag does).

    Returns:
        A list with one entry per ``<tr>`` that contains at least one
        ``<td>``:

        * If the table has a ``<thead>`` with ``<th>`` cells, each entry is a
          dict mapping the lower-cased, stripped header text to the stripped
          cell text of the same column. Rows with fewer cells than headers
          get ``""`` for the missing columns; cells beyond the last header
          are ignored.
        * Otherwise each entry is a list of the stripped cell texts.
    """
    thead = part.find("thead")
    headers = (
        [th.text.strip().lower() for th in thead.find_all("th")] if thead else []
    )

    data = []
    for row in part.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            # Header rows (only <th>) and empty rows carry no data.
            continue

        texts = [cell.text.strip() for cell in cells]
        if headers:
            # Pad short rows (e.g. caused by colspan) so every record has
            # every header key instead of failing with an IndexError.
            texts += [""] * (len(headers) - len(texts))
            data.append(dict(zip(headers, texts)))
        else:
            data.append(texts)

    return data

In [6]:
# Build one record per provider, keyed by the text of the article's button.
# Providers with a table get its parsed rows; the others get the text of the
# article's first paragraph.
providers = {}

for article in articles:
    provider_name = article.find('button').text
    table = article.find('table')

    if not table:
        # No table on the page for this provider: keep the free-text note.
        providers[provider_name] = {
            'has_table': False,
            'info': article.find('p').text,
        }
        continue

    providers[provider_name] = {
        'has_table': True,
        'products': html_to_json(table),
    }

In [7]:
# Display how many providers were collected.
len(providers)

43

In [8]:
# Output folder named after YEAR. exist_ok=True makes the call a no-op when
# the folder is already there, replacing the separate exists() check and the
# window between checking and creating.
path = Path(f'./{YEAR}/')
path.mkdir(exist_ok=True)

In [9]:
# Store the raw scrape result as indented JSON in the output folder.
scrape_file = path / 'nvvp_tarieven_scrape.json'
scrape_file.write_text(json.dumps(providers, indent=4))

## Three column format

In [13]:
# Flatten the per-provider data into three-column rows
# (verzekeraar / pakket / vergoeding). The first entry is the header row.
table_press = [{
    'verzekeraar': 'Verzekeraar',
    'pakket': 'Pakket',
    'vergoeding': 'Vergoeding',
}]

for provider, data in providers.items():
    # Only take the ASR special case when there really are table rows to
    # work with; otherwise fall through to the generic branches instead of
    # failing on a missing 'products' key or an empty list.
    has_products = data['has_table'] and bool(data['products'])

    if provider == "ASR (De Amersfoortse)" and has_products:
        # The last row of this table is not emitted as a row of its own: its
        # text is appended to the 'vergoeding' of every other row.
        # Unpacking leaves data['products'] untouched, so no copy is needed.
        *products, info = data['products']
        additional_info = f"\n{info['pakket']}: {info['vergoeding']}"
        for product in products:
            table_press.append({
                'verzekeraar': provider,
                'pakket': product['pakket'],
                # additional_info already starts with a newline, so the note
                # ends up separated from the amount by an empty line.
                'vergoeding': product['vergoeding'] + f"\n{additional_info}",
            })

    elif data['has_table']:
        for product in data['products']:
            table_press.append({
                'verzekeraar': provider,
                'pakket': product['pakket'],
                'vergoeding': product['vergoeding'],
            })
    else:
        # Providers without a table get a single row carrying their note.
        table_press.append({
            'verzekeraar': provider,
            'pakket': '-',
            'vergoeding': data['info'],
        })

with open(path / f'vergoedingen_{YEAR}_tablepress_3col.json', 'w') as f:
    json.dump(table_press, f, indent=4)

In [11]:
# Display the generated three-column rows for a visual check.
table_press

[{'verzekeraar': 'Verzekeraar',
  'pakket': 'Pakket',
  'vergoeding': 'Vergoeding'},
 {'verzekeraar': 'Aevitea',
  'pakket': 'Laef!1',
  'vergoeding': 'Geen vergoeding'},
 {'verzekeraar': 'Aevitea',
  'pakket': 'Laef!2',
  'vergoeding': '€\u202f70,- (inclusief steunzolen) per kalenderjaar'},
 {'verzekeraar': 'Aevitea',
  'pakket': 'Laef!3',
  'vergoeding': '€\u202f100,- (inclusief steunzolen) per kalenderjaar'},
 {'verzekeraar': 'Aevitea',
  'pakket': 'Laef!4',
  'vergoeding': '€\u202f150,- (inclusief steunzolen) per kalenderjaar'},
 {'verzekeraar': 'Anderzorg',
  'pakket': 'Flex',
  'vergoeding': 'Geen vergoeding'},
 {'verzekeraar': 'Anderzorg',
  'pakket': 'Extra',
  'vergoeding': 'Geen vergoeding'},
 {'verzekeraar': 'Aon Vitaal (Zilveren Kruis)',
  'pakket': 'Basis Vitaal',
  'vergoeding': 'Geen vergoeding'},
 {'verzekeraar': 'Aon Vitaal (Zilveren Kruis)',
  'pakket': 'Vitaal 1',
  'vergoeding': 'Geen vergoeding'},
 {'verzekeraar': 'Aon Vitaal (Zilveren Kruis)',
  'pakket': 'Vitaal 