In [3]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL of the EPPO Global Database page
url = 'https://gd.eppo.int/PPPUse/3CRGK'

# Send a GET request to the webpage. requests applies no timeout by default,
# so pass one explicitly (in seconds) to keep this cell from hanging forever
# when the server does not answer.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise requests.HTTPError on 4xx/5xx responses

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Function to recursively extract hierarchy
def extract_hierarchy(ul_tag, parent_url='', base_url='https://gd.eppo.int'):
    """Convert a nested ``<ul>`` element into a list of node dictionaries.

    Every direct ``<li>`` child of ``ul_tag`` that contains an ``<a href=...>``
    becomes ``{'text': ..., 'url': ...}``. When that ``<li>`` also has a direct
    ``<ul>`` child, the nested list is processed recursively and stored under
    the ``'children'`` key. ``<li>`` elements without any link are skipped.

    Args:
        ul_tag: Element exposing the BeautifulSoup ``find``/``find_all`` API.
        parent_url: URL of the node that owns ``ul_tag``. Kept for backward
            compatibility and forwarded on recursion; it does not influence
            the returned data.
        base_url: Base against which relative ``href`` values are resolved.

    Returns:
        list[dict]: The extracted nodes, in document order.
    """
    hierarchy = []
    for li in ul_tag.find_all('li', recursive=False):
        # NOTE(review): this lookup searches all descendants, so an <li> that
        # has no link of its own would pick up the first link of a nested
        # item - confirm every item on the page carries its own link.
        a_tag = li.find('a', href=True)
        if a_tag is None:
            continue

        node = {
            'text': a_tag.get_text(strip=True),
            # urljoin leaves absolute URLs untouched and correctly resolves
            # root-relative ("/x"), path-relative ("x") and protocol-relative
            # ("//host/x") hrefs; plain string prefixing only handled "/x".
            'url': urljoin(base_url, a_tag['href']),
        }

        # Only a <ul> that is a direct child of this <li> belongs to this node.
        nested_ul = li.find('ul', recursive=False)
        if nested_ul is not None:
            node['children'] = extract_hierarchy(nested_ul, node['url'], base_url)

        hierarchy.append(node)
    return hierarchy

# Locate the <ul> element that holds the tree (adjust the class or id as needed)
main_ul = soup.find('ul', attrs={'class': 'tree'})
children = []
if not main_ul:
    print('No hierarchy found on the page.')
else:
    hierarchy = extract_hierarchy(main_ul)
    # Echo every top-level node, then keep all of them as the root's children
    for item in hierarchy:
        print(item)
    children.extend(hierarchy)

# Wrap the extracted nodes in a single root entry describing the page itself
root = {
    'text': 'Crop groups(3CRGK)',
    'url': 'https://gd.eppo.int/PPPUse/3CRGK',
    'children': children,
}

{'text': 'amenity grassland(3AMGC)', 'url': 'https://gd.eppo.int/taxon/3AMGC'}
{'text': 'arable crops(3ARAC)', 'url': 'https://gd.eppo.int/taxon/3ARAC', 'children': [{'text': 'Amaranthus cruentus(AMACR)', 'url': 'https://gd.eppo.int/taxon/AMACR'}, {'text': 'Baptisia tinctoria(BAPTI)', 'url': 'https://gd.eppo.int/taxon/BAPTI'}, {'text': 'beet crops(3BEEC)', 'url': 'https://gd.eppo.int/taxon/3BEEC', 'children': [{'text': 'Beta vulgaris subsp. vulgaris var. altissima(BEAVA)', 'url': 'https://gd.eppo.int/taxon/BEAVA'}, {'text': 'Beta vulgaris subsp. vulgaris var. crassa(BEAVC)', 'url': 'https://gd.eppo.int/taxon/BEAVC'}]}, {'text': 'brassica arable crops(3BRAC)', 'url': 'https://gd.eppo.int/taxon/3BRAC', 'children': [{'text': 'Brassica napus subsp. rapifera(BRSNA)', 'url': 'https://gd.eppo.int/taxon/BRSNA'}, {'text': 'Brassica oleracea var. medullosa(BRSOM)', 'url': 'https://gd.eppo.int/taxon/BRSOM'}, {'text': 'Camelina alyssum(CMAAL)', 'url': 'https://gd.eppo.int/taxon/CMAAL'}, {'text': '

In [5]:
# Pretty-printer for the nested node structure
def display_tree(hierarchy, level=0):
    """Print every node as an indented bullet, descending into children.

    Each node is written as ``- <text> (<url>)`` and indented by four spaces
    per nesting ``level``. Nodes that have a ``'children'`` key are followed
    by their children, printed one level deeper.
    """
    for entry in hierarchy:
        padding = " " * (level * 4)
        print(f"{padding}- {entry['text']} ({entry['url']})")
        if 'children' not in entry:
            continue
        display_tree(entry['children'], level + 1)

# Print the whole tree; display_tree expects a list of nodes, so the single
# root node is wrapped in a one-element list.
display_tree([root]) 

- Crop groups(3CRGK) (https://gd.eppo.int/PPPUse/3CRGK)
    - amenity grassland(3AMGC) (https://gd.eppo.int/taxon/3AMGC)
    - arable crops(3ARAC) (https://gd.eppo.int/taxon/3ARAC)
        - Amaranthus cruentus(AMACR) (https://gd.eppo.int/taxon/AMACR)
        - Baptisia tinctoria(BAPTI) (https://gd.eppo.int/taxon/BAPTI)
        - beet crops(3BEEC) (https://gd.eppo.int/taxon/3BEEC)
            - Beta vulgaris subsp. vulgaris var. altissima(BEAVA) (https://gd.eppo.int/taxon/BEAVA)
            - Beta vulgaris subsp. vulgaris var. crassa(BEAVC) (https://gd.eppo.int/taxon/BEAVC)
        - brassica arable crops(3BRAC) (https://gd.eppo.int/taxon/3BRAC)
            - Brassica napus subsp. rapifera(BRSNA) (https://gd.eppo.int/taxon/BRSNA)
            - Brassica oleracea var. medullosa(BRSOM) (https://gd.eppo.int/taxon/BRSOM)
            - Camelina alyssum(CMAAL) (https://gd.eppo.int/taxon/CMAAL)
            - Camelina sativa(CMASA) (https://gd.eppo.int/taxon/CMASA)
            - mustard crops(3