# OpenFoodFacts - Analysis of Taxonomies

- packaging shapes taxonomy
- packaging materials taxonomy
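- categories taxonomy (too big to display: the graph is sent to the rendering service as one URL, and this taxonomy exceeds the URL length limit)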

In [1]:
!pip install python-slugify



In [2]:
from urllib.request import urlopen
from slugify import slugify

In [3]:
# Taxonomy file to analyse. Files available under the same directory:
#   "packaging_shapes", "packaging_materials", "categories"
taxonomy_name = "packaging_materials"
url = (
    "https://github.com/openfoodfacts/openfoodfacts-server/raw/main/taxonomies/"
    f"{taxonomy_name}.txt"
)

In [4]:
# Download the selected taxonomy file. `txt` holds the raw response body as
# bytes; it is decoded where it is parsed.
with urlopen(url) as response:
    txt = response.read()

## Parse taxonomy and create a list of edges between the categories (nodes)

In [5]:
# Parse the taxonomy text into nodes and Mermaid edge lines.
#
# Input format handled here: entries are blocks separated by blank lines. A
# block may list its parents on lines starting with '<' (either '<en:Parent'
# or '< en:Parent'), and names the entry on its 'en:' line (the first
# comma-separated item is used as the entry name).
#
# Results (module-level names):
#   nodes: entry name (with 'en:' prefix) -> list of its parent names
#   edges: Mermaid lines, '  parent-slug --- child-slug' for each parent, or
#          just '  child-slug' for an entry without parents
#   dupes: one message per entry name that was seen more than once
parents = []
dupes = []
taxonomy = []  # not used in this cell; kept in case other cells refer to it
nodes = {}
edges = []

# Count lines from 1 so the duplicate reports match what an editor shows.
for line_no, line in enumerate(txt.decode('UTF-8').splitlines(), start=1):

    if not line.strip():
        # A blank line ends the current block, so its parents no longer apply.
        parents = []

    elif line.startswith('<'):
        # Parent reference; tolerate optional whitespace after the '<'.
        parent = line[1:].strip()
        if parent.startswith('en:'):
            parents.append(parent)

    elif line.startswith('en:'):
        first_name = line.split(',')[0].strip()

        # check for duplicates
        if first_name in nodes:
            dupes.append(f"'{first_name}' in line {line_no}")

        # Store a snapshot so later changes to `parents` cannot alter this entry.
        nodes[first_name] = list(parents)

        # first_name[3:] / p[3:] drop the 'en:' language prefix before slugifying.
        slug = slugify(first_name[3:])
        if parents:
            for p in parents:
                edges.append(f"  {slugify(p[3:])} --- {slug}")
        else:
            # Entry without parents: emit it as a standalone node.
            edges.append(f"  {slug}")

print("#edges:", len(edges))

#edges: 163


## Visualize Taxonomy Graph with Mermaid

In [6]:
# https://mermaid.js.org/config/Tutorials.html#jupyter-integration-with-mermaid-js
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
    """Render a Mermaid graph definition as an image via mermaid.ink.

    The graph source is UTF-8 encoded, base64 encoded and appended to the
    mermaid.ink image URL; the resulting image is shown with IPython's
    ``display``.

    Args:
        graph: Mermaid source text (e.g. a 'graph LR' line followed by edges).

    Returns:
        None. The image is displayed as a side effect.
    """
    graphbytes = graph.encode("utf8")
    # Use the URL-safe alphabet: the standard one can emit '+' and '/', which
    # have their own meaning inside a URL path and can break the request.
    base64_bytes = base64.urlsafe_b64encode(graphbytes)
    base64_string = base64_bytes.decode("ascii")
    display(Image(url="https://mermaid.ink/img/" + base64_string))

# CAUTION: the whole graph is sent as one very long base64-encoded URL
# parameter, so a graph with too many edges exceeds the URL length limit and
# will not render.
graph_lines = ['graph LR', *edges]
graph = '\n'.join(graph_lines)
mm(graph)