# Scrape NvvP website voor vergoedingen

In [1]:
import json
import unicodedata
from textwrap import dedent

from selenium import webdriver
from bs4 import BeautifulSoup

In [2]:
# Reimbursements ("vergoedingen") page that this notebook scrapes.
page_url = "https://www.podotherapie.nl/vergoedingen/"

# Configure Chrome with the 'headless' switch.
options = webdriver.ChromeOptions()
options.add_argument('headless')

# Start the browser with those options and load the page.
driver = webdriver.Chrome(options=options)
driver.get(page_url)

In [3]:
# Parse the HTML currently held by the browser with the built-in parser.
soup = BeautifulSoup(markup=driver.page_source, features='html.parser')

In [4]:
# Every <h2 class="sluiter"> element; the loop further down reads one
# provider name from each of them.
h2_tags = soup.find_all(name='h2', attrs={'class': 'sluiter'})

In [5]:
def remove_unicode(input_string) -> str:
    """Return *input_string* reduced to plain ASCII.

    The text is NFKD-normalized first, so accented letters and compatibility
    characters are split into a base character plus marks. Everything that
    still cannot be encoded as ASCII afterwards is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_string)
    ascii_bytes = decomposed.encode('ascii', errors='ignore')
    return ascii_bytes.decode()


def html_to_json(part):
    """Convert a two-column HTML table into a ``{name: information}`` dict.

    Args:
        part: Parsed table element. It must offer ``find`` and ``find_all``,
            and its cells must offer ``text`` (in this notebook it is the
            BeautifulSoup ``<table>`` tag of one provider).

    Returns:
        dict mapping the stripped text of the first ``<td>`` of each data row
        to the stripped text of its second ``<td>``. An empty table gives
        ``{}``.

    Raises:
        ValueError: if a data row does not hold exactly two ``<td>`` cells.
    """
    thead = part.find("thead")
    has_header_cells = bool(thead.find_all("th")) if thead else False

    data = []
    for row in part.find_all("tr"):
        items = [cell.text.strip() for cell in row.find_all("td")]
        # Rows without <td> cells (such as a <th>-only header row) carry no
        # data and are left out.
        if items:
            data.append(items)

    # When the table has <th> cells inside a <thead>, every <td> row is a data
    # row. Otherwise the first <td> row is skipped, exactly as before
    # (presumably it holds the column titles).
    if not has_header_cells:
        data = data[1:]

    return {name: information for name, information in data}

In [6]:
# Maps each provider name to either its package table ('products') or its
# free-text note ('info'); 'has_table' tells which of the two is present.
providers = {}

for h2_tag in h2_tags:
    provider_name = remove_unicode(h2_tag.text)
    info_div = h2_tag.find_next_sibling("div")
    if not info_div:
        # Heading without a following <div> sibling: nothing to record.
        continue

    table = info_div.find('table')
    if table:
        products = html_to_json(table)
        providers[provider_name] = {
            'has_table': True,
            'products': products,
        }
        continue

    # No table: take the text of the first <p>. find() returns None when the
    # <div> has no <p>, so fall back to the text of the <div> itself instead
    # of failing on None.text.
    info = info_div.find('p')
    if info is None:
        info = info_div
    providers[provider_name] = {
        'has_table': False,
        'info': info.text.strip(),
    }

    
    

In [7]:
# Sanity check: number of entries in the providers dict.
len(providers)

0

In [223]:
# Save the scraped providers dict as indented JSON.
with open('nvvp_tarieven_scrape.json', 'w') as f:
    f.write(json.dumps(providers, indent=4))

## Two column format

In [257]:
# Two-column export: the first entry is the header row, followed by one row
# per package (or one row per provider when it has no table).
table_press = [{'pakket_naam': 'Verzekeringspakket', 'vergoeding': 'Vergoeding'}]

for provider, data in providers.items():
    if not data['has_table']:
        # Provider without a table: a single row holding its free-text info.
        table_press.append({'pakket_naam': provider, 'vergoeding': data['info']})
        continue

    # Provider with a table: one row per package, labelled "<provider> - <package>".
    table_press.extend(
        {'pakket_naam': f'{provider} - {product}', 'vergoeding': info}
        for product, info in data['products'].items()
    )

with open('vergoedingen_2023_tablepress_2col.json', 'w') as f:
    f.write(json.dumps(table_press, indent=4))

## Three column format

In [261]:
# Three-column export: the first entry is the header row, followed by one row
# per (provider, package) pair.
table_press = [{'verzekeraar': 'Verzekeraar', 'pakket': 'Pakket', 'vergoeding': 'Vergoeding'}]

for provider, data in providers.items():
    if not data['has_table']:
        # Provider without a table: '-' stands in for the package name.
        table_press.append({'verzekeraar': provider, 'pakket': '-', 'vergoeding': data['info']})
        continue

    table_press.extend(
        {'verzekeraar': provider, 'pakket': product, 'vergoeding': info}
        for product, info in data['products'].items()
    )

with open('vergoedingen_2023_tablepress_3col.json', 'w') as f:
    f.write(json.dumps(table_press, indent=4))

In [239]:
def create_html_object(name, data):
    """Render one provider as an HTML ``<div class="verzekeraar">`` snippet.

    Args:
        name: Provider name, shown in the ``<h4>`` title.
        data: Provider entry. With ``data['has_table']`` true,
            ``data['products']`` maps package name to reimbursement text and
            is rendered as a table; otherwise ``data['info']`` is rendered as
            a paragraph.

    Returns:
        The HTML snippet without leading or trailing whitespace.
    """
    from html import escape

    def text(value):
        # The values are plain text, so &, < and > must be escaped before they
        # are placed inside element content; quotes are fine there.
        return escape(str(value), quote=False)

    if data['has_table']:
        table_rows = ['<tr><th scope="col">Pakket</th><th scope="col">Vergoeding</th></tr>']
        table_rows.extend(
            f'<tr><th scope="row">{text(pakket)}</th><td>{text(pakket_info)}</td></tr>'
            for pakket, pakket_info in data['products'].items()
        )
        table_data = ''.join(table_rows)
        info = f'<table class="verzekeraar-vergoeding-table">{table_data}</table>'
    else:
        info = f'<p class="verzekeraar-vergoeding-info">{text(data["info"])}</p>'

    # Dedent the template BEFORE filling it in: a name or info text containing
    # a newline would otherwise change the common indentation and leave the
    # markup over-indented.
    template = dedent("""\
    <div class="verzekeraar">
        <h4 class="verzekeraar-titel">{name}</h4>
        <div class="verzekeraar-vergoeding">
            {info}
        </div>
    </div>
    """)

    return template.format(name=text(name), info=info).strip()

In [240]:
# Concatenate the HTML snippet of every provider, in dict order, with nothing
# in between.
html_objects = "".join(
    create_html_object(provider, provider_data)
    for provider, provider_data in providers.items()
)

In [243]:
# Write the combined HTML to disk. The content holds non-ASCII characters
# such as the euro sign, so the encoding is fixed to UTF-8 instead of being
# left to the platform's locale default.
with open('vergoedingen_verzekeraars.html', 'w', encoding='utf-8') as f:
    f.write(html_objects)

In [244]:
# Print the generated HTML so it can be inspected in the notebook output.
print(html_objects)

<div class="verzekeraar">
    <h4 class="verzekeraar-titel">Aevitea</h4>
    <div class="verzekeraar-vergoeding">
        <table class="verzekeraar-vergoeding-table"><tr><th scope="col">Pakket</th><th scope="col">Vergoeding</th></tr><tr><th scope="row">Prettig</th><td>Geen vergoeding</td></tr><tr><th scope="row">Prima</th><td>€ 70,- (inclusief steunzolen) per kalenderjaar</td></tr><tr><th scope="row">Populair</th><td>€ 100,- (inclusief steunzolen) per kalenderjaar</td></tr><tr><th scope="row">Ruim</th><td>€ 150,- (inclusief steunzolen) per kalenderjaar</td></tr></table>
    </div>
</div><div class="verzekeraar">
    <h4 class="verzekeraar-titel">Anderzorg</h4>
    <div class="verzekeraar-vergoeding">
        <table class="verzekeraar-vergoeding-table"><tr><th scope="col">Pakket</th><th scope="col">Vergoeding</th></tr><tr><th scope="row">Extra</th><td>Voetzorg € 150,- totaal per jaar</td></tr></table>
    </div>
</div><div class="verzekeraar">
        <h4 class="verzekeraar-titel">Aon I