In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the EPPO Global Database page
url = 'https://gd.eppo.int/PPPUse/3CRGK'

# Send a GET request to the webpage.
# An explicit timeout is passed because requests waits indefinitely by
# default, which would hang the cell if the server never answers.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise on HTTP 4xx/5xx instead of parsing an error page

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Function to recursively extract hierarchy
def extract_hierarchy(ul_tag, parent_url=''):
    """Convert a nested ``<ul>``/``<li>`` structure into a list of node dicts.

    Only the direct ``<li>`` children of ``ul_tag`` are visited at each level.
    An ``<li>`` without any ``<a href=...>`` is skipped together with whatever
    is nested under it.

    Args:
        ul_tag: Element exposing ``find_all('li', recursive=False)``; in this
            notebook it is a BeautifulSoup ``<ul>`` tag.
        parent_url: URL of the parent node. It is passed along on recursion
            but is not read by the function; kept so existing calls still work.

    Returns:
        list[dict]: One dict per anchor-bearing ``<li>`` with the keys
        ``'text'``, ``'eppo'``, ``'taxo'`` and ``'url'``, plus ``'children'``
        when the ``<li>`` directly contains a nested ``<ul>``.
    """
    hierarchy = []
    for li in ul_tag.find_all('li', recursive=False):
        # Extract the text and URL
        a_tag = li.find('a', href=True)
        if not a_tag:
            continue

        node_text = a_tag.get_text(strip=True)
        # The label is expected to end with "(CODE)"; split on the LAST "("
        # so parentheses inside the name itself are left alone.
        name, paren, tail = node_text.rpartition('(')
        if paren:
            eppo = tail.split(')')[0]
        else:
            # No "(" at all: keep the whole label as the name instead of
            # chopping off its last character, and leave the code empty.
            name, eppo = node_text, ''

        # Codes that start with "3" are classified as non-taxonomic.
        taxonomic = 'non-taxonomic' if eppo.startswith('3') else 'taxonomic'

        node_url = a_tag['href']
        # If the URL is relative, construct the full URL
        if node_url.startswith('/'):
            node_url = 'https://gd.eppo.int' + node_url

        node = {'text': name, 'eppo': eppo, 'taxo': taxonomic, 'url': node_url}
        hierarchy.append(node)

        # Check for a nested list directly below this <li>
        nested_ul = li.find('ul', recursive=False)
        if nested_ul:
            node['children'] = extract_hierarchy(nested_ul, node_url)
            # A node that owns a nested list is always labelled non-taxonomic,
            # regardless of what its code looks like.
            node['taxo'] = 'non-taxonomic'
    return hierarchy

# Locate the main <ul> tag that holds the hierarchy
main_ul = soup.find('ul', {'class': 'tree'})  # Adjust the class or id as needed
children = []
if not main_ul:
    print('No hierarchy found on the page.')
else:
    hierarchy = extract_hierarchy(main_ul)
    # Echo every top-level entry, then collect them as the root's children
    for item in hierarchy:
        print(item)
    children.extend(hierarchy)

# Synthetic root node that wraps everything scraped from the page
root = {
    'text': 'crop groups',
    'eppo': '3CRGK',
    'taxo': 'non-taxonomic',
    'url': 'https://gd.eppo.int/PPPUse/3CRGK',
    'children': children,
}

{'text': 'amenity grassland', 'eppo': '3AMGC', 'taxo': 'non-taxonomic', 'url': 'https://gd.eppo.int/taxon/3AMGC'}
{'text': 'arable crops', 'eppo': '3ARAC', 'taxo': 'non-taxonomic', 'url': 'https://gd.eppo.int/taxon/3ARAC', 'children': [{'text': 'Amaranthus cruentus', 'eppo': 'AMACR', 'taxo': 'taxonomic', 'url': 'https://gd.eppo.int/taxon/AMACR'}, {'text': 'Baptisia tinctoria', 'eppo': 'BAPTI', 'taxo': 'taxonomic', 'url': 'https://gd.eppo.int/taxon/BAPTI'}, {'text': 'beet crops', 'eppo': '3BEEC', 'taxo': 'non-taxonomic', 'url': 'https://gd.eppo.int/taxon/3BEEC', 'children': [{'text': 'Beta vulgaris subsp. vulgaris var. altissima', 'eppo': 'BEAVA', 'taxo': 'taxonomic', 'url': 'https://gd.eppo.int/taxon/BEAVA'}, {'text': 'Beta vulgaris subsp. vulgaris var. crassa', 'eppo': 'BEAVC', 'taxo': 'taxonomic', 'url': 'https://gd.eppo.int/taxon/BEAVC'}]}, {'text': 'brassica arable crops', 'eppo': '3BRAC', 'taxo': 'non-taxonomic', 'url': 'https://gd.eppo.int/taxon/3BRAC', 'children': [{'text': 'Bra

In [286]:
# Function to display the tree structure
def display_tree(hierarchy, level=0):
    """Print the tree as an indented bullet list, one node per line.

    Args:
        hierarchy: List of node dicts carrying ``'text'``, ``'eppo'`` and
            ``'url'``; a node may also carry a ``'children'`` list.
        level: Depth of ``hierarchy``; every level adds four spaces of indent.
    """
    padding = " " * (level * 4)
    deeper = level + 1
    for entry in hierarchy:
        line = f"- {entry['text']} ({entry['eppo']}) ({entry['url']})"
        print(padding + line)
        if 'children' not in entry:
            continue
        display_tree(entry['children'], deeper)

# Wrap root in a list so it is printed as the single top-level entry.
display_tree([root]) 

- crop groups (3CRGK) (https://gd.eppo.int/PPPUse/3CRGK)
    - amenity grassland (3AMGC) (https://gd.eppo.int/taxon/3AMGC)
    - arable crops (3ARAC) (https://gd.eppo.int/taxon/3ARAC)
        - Amaranthus cruentus (AMACR) (https://gd.eppo.int/taxon/AMACR)
        - Baptisia tinctoria (BAPTI) (https://gd.eppo.int/taxon/BAPTI)
        - beet crops (3BEEC) (https://gd.eppo.int/taxon/3BEEC)
            - Beta vulgaris subsp. vulgaris var. altissima (BEAVA) (https://gd.eppo.int/taxon/BEAVA)
            - Beta vulgaris subsp. vulgaris var. crassa (BEAVC) (https://gd.eppo.int/taxon/BEAVC)
        - brassica arable crops (3BRAC) (https://gd.eppo.int/taxon/3BRAC)
            - Brassica napus subsp. rapifera (BRSNA) (https://gd.eppo.int/taxon/BRSNA)
            - Brassica oleracea var. medullosa (BRSOM) (https://gd.eppo.int/taxon/BRSOM)
            - Camelina alyssum (CMAAL) (https://gd.eppo.int/taxon/CMAAL)
            - Camelina sativa (CMASA) (https://gd.eppo.int/taxon/CMASA)
            - mu

In [287]:
def count_leaves(hierarchy):
    """Count childless nodes whose URL does not end in a group-style code.

    A node without a ``'children'`` key counts as a leaf unless the last path
    segment of its ``'url'`` starts with ``'3'`` (a subgroup listed without
    children). Nodes with ``'children'`` contribute the count of their subtree.

    Args:
        hierarchy: List of node dicts, each with a ``'url'`` entry.

    Returns:
        int: Number of counted leaves in the whole subtree.
    """
    total = 0
    for entry in hierarchy:
        if 'children' in entry:
            total += count_leaves(entry['children'])
            continue
        last_segment = entry['url'].split('/')[-1]
        # Childless subgroups (code starting with "3") are not counted.
        if not last_segment.startswith('3'):
            total += 1
    return total

# Count the leaves below the root; childless "3..." subgroups are not counted.
count_leaves(root['children'])

575

In [288]:
def remove_subgroups_at_leaf(hierarchy):
    """Drop childless subgroup nodes from the tree, in place.

    A node is dropped when it has no ``'children'`` key and the last path
    segment of its ``'url'`` starts with ``'3'``. Nodes that have a
    ``'children'`` key are always kept (even if pruning empties that list)
    and their subtree is pruned recursively.

    Args:
        hierarchy: List of node dicts, each with a ``'url'`` entry. The list
            is modified in place.

    Returns:
        list: The same ``hierarchy`` list object, after pruning.
    """
    kept = []
    for node in hierarchy:
        if 'children' in node:
            node['children'] = remove_subgroups_at_leaf(node['children'])
            kept.append(node)
        elif not node['url'].split('/')[-1].startswith('3'):
            kept.append(node)
    # Build the survivors first and swap them in afterwards: removing from the
    # list while iterating over it skips the element that follows each removal.
    hierarchy[:] = kept
    return hierarchy

# Prune childless subgroups below the root (the list is modified in place),
# then print the resulting tree.
remove_subgroups_at_leaf(root['children'])
display_tree([root]) 

- crop groups (3CRGK) (https://gd.eppo.int/PPPUse/3CRGK)
    - arable crops (3ARAC) (https://gd.eppo.int/taxon/3ARAC)
        - Amaranthus cruentus (AMACR) (https://gd.eppo.int/taxon/AMACR)
        - Baptisia tinctoria (BAPTI) (https://gd.eppo.int/taxon/BAPTI)
        - beet crops (3BEEC) (https://gd.eppo.int/taxon/3BEEC)
            - Beta vulgaris subsp. vulgaris var. altissima (BEAVA) (https://gd.eppo.int/taxon/BEAVA)
            - Beta vulgaris subsp. vulgaris var. crassa (BEAVC) (https://gd.eppo.int/taxon/BEAVC)
        - brassica arable crops (3BRAC) (https://gd.eppo.int/taxon/3BRAC)
            - Brassica napus subsp. rapifera (BRSNA) (https://gd.eppo.int/taxon/BRSNA)
            - Brassica oleracea var. medullosa (BRSOM) (https://gd.eppo.int/taxon/BRSOM)
            - Camelina alyssum (CMAAL) (https://gd.eppo.int/taxon/CMAAL)
            - Camelina sativa (CMASA) (https://gd.eppo.int/taxon/CMASA)
            - mustard crops (3MUSC) (https://gd.eppo.int/taxon/3MUSC)
            

In [289]:
# replace parents with single child with the child
def remove_single_child(hierarchy):
    """Collapse every parent that has exactly one child into that child.

    Subtrees are processed first, so a chain of single-child parents collapses
    to its innermost node in one call. The list is modified in place.

    Args:
        hierarchy: List of node dicts; nodes may carry a ``'children'`` list.

    Returns:
        list: The same ``hierarchy`` list object, after collapsing.
    """
    # enumerate() gives the slot to overwrite directly; looking it up with
    # list.index() would deep-compare dicts and rescan the list every time.
    for position, node in enumerate(hierarchy):
        if 'children' not in node:
            continue
        node['children'] = remove_single_child(node['children'])
        if len(node['children']) == 1:
            hierarchy[position] = node['children'][0]
    return hierarchy

In [290]:
# remove repeated siblings
def remove_repeated_siblings(hierarchy):
    """Remove duplicate entries from every node's ``'children'`` list, in place.

    Two children are duplicates when their dicts compare equal. For each group
    of duplicates the last occurrence keeps its position. Only ``'children'``
    lists are de-duplicated; the ``hierarchy`` list passed in is not.

    Args:
        hierarchy: List of node dicts; nodes may carry a ``'children'`` list.

    Returns:
        list: The same ``hierarchy`` list object.
    """
    for node in hierarchy:
        if 'children' not in node:
            continue
        children = remove_repeated_siblings(node['children'])
        unique = []
        # Walk backwards so the last copy of each duplicate is the survivor.
        # The result is collected in a separate list because removing items
        # from a list while iterating over it skips elements and can leave
        # duplicates behind.
        for child in reversed(children):
            if child not in unique:
                unique.append(child)
        unique.reverse()
        children[:] = unique
        node['children'] = children
    return hierarchy

In [291]:
def compare(node1, node2):
    """Return True when two trees have the same shape and the same codes.

    Nodes match when their ``'eppo'`` values are equal, both have or both lack
    a ``'children'`` key, and (if present) their children lists have the same
    length and match pairwise in order. Other keys are ignored.

    Args:
        node1: First node dict (must contain ``'eppo'``).
        node2: Second node dict (must contain ``'eppo'``).

    Returns:
        bool: Whether the two trees match.
    """
    if node1['eppo'] != node2['eppo']:
        return False

    has_kids1 = 'children' in node1
    has_kids2 = 'children' in node2
    if has_kids1 != has_kids2:
        return False
    if not has_kids1:
        # Two leaves with the same code.
        return True

    kids1, kids2 = node1['children'], node2['children']
    if len(kids1) != len(kids2):
        return False
    return all(compare(left, right) for left, right in zip(kids1, kids2))

In [292]:
from copy import deepcopy
# Seed the "previous" snapshot with a childless stub of the root so the first
# comparison against the real tree fails and the loop body runs.
oldroot = {
    'text': 'crop groups',
    'eppo': '3CRGK',
    'taxo': 'non-taxonomic',
    'url': 'https://gd.eppo.int/PPPUse/3CRGK',
}

# Repeat both simplification passes until a pass leaves the tree unchanged.
while not compare(oldroot, root):
    # Snapshot before mutating: both passes edit root's subtree in place.
    oldroot = deepcopy(root)
    remove_single_child(root['children'])
    remove_repeated_siblings(root['children'])

display_tree([root])

- crop groups (3CRGK) (https://gd.eppo.int/PPPUse/3CRGK)
    - arable crops (3ARAC) (https://gd.eppo.int/taxon/3ARAC)
        - Amaranthus cruentus (AMACR) (https://gd.eppo.int/taxon/AMACR)
        - Baptisia tinctoria (BAPTI) (https://gd.eppo.int/taxon/BAPTI)
        - beet crops (3BEEC) (https://gd.eppo.int/taxon/3BEEC)
            - Beta vulgaris subsp. vulgaris var. altissima (BEAVA) (https://gd.eppo.int/taxon/BEAVA)
            - Beta vulgaris subsp. vulgaris var. crassa (BEAVC) (https://gd.eppo.int/taxon/BEAVC)
        - brassica arable crops (3BRAC) (https://gd.eppo.int/taxon/3BRAC)
            - Brassica napus subsp. rapifera (BRSNA) (https://gd.eppo.int/taxon/BRSNA)
            - Brassica oleracea var. medullosa (BRSOM) (https://gd.eppo.int/taxon/BRSOM)
            - Camelina alyssum (CMAAL) (https://gd.eppo.int/taxon/CMAAL)
            - Camelina sativa (CMASA) (https://gd.eppo.int/taxon/CMASA)
            - mustard crops (3MUSC) (https://gd.eppo.int/taxon/3MUSC)
            

In [293]:
# write the tree to a json file 
import json

def write_tree_to_json(hierarchy, file=None):
    """Dump ``hierarchy`` as JSON with a 4-space indent into ``file``.

    Args:
        hierarchy: JSON-serializable object (here, the tree of node dicts).
        file: Writable text file object. When it is ``None`` (or otherwise
            falsy) nothing is written.

    Returns:
        None
    """
    if not file:
        # No usable target supplied: nothing to do.
        return
    json.dump(hierarchy, file, indent=4)
        
# Serialize the tree rooted at `root` to a JSON file; the file is opened for
# writing (replacing any existing content) and closed when the block exits.
with open('/workdir/eppo_crops_tree.json', 'w') as file:
    write_tree_to_json(root, file=file)
    