In [47]:
import json
import re

# DATA LOADING

In [48]:
# Location of the merged mushroom dataset, relative to this notebook.
file_path = '../data/mushroom_data_merged.json'

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, and the dataset contains non-ASCII characters that would then be
# decoded differently (or fail to decode) on machines whose default is not UTF-8.
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# OVERVIEW

In [49]:
# How many records does the dataset hold?
print(f"Number of entries in the dataset: {len(data)}")

# Union of the top-level keys used by any record
unique_keys = {key for record in data for key in record.keys()}
print(f"Unique keys in the dataset: {unique_keys}")

# Union of the keys used inside any record's 'mycomorphbox' (records without one are ignored)
mycomorphbox_keys = {
    key
    for record in data
    if 'mycomorphbox' in record
    for key in record['mycomorphbox'].keys()
}
print(f"Unique keys in 'mycomorphbox': {mycomorphbox_keys}")

Number of entries in the dataset: 1161
Unique keys in the dataset: {'id', 'taxonomy', 'wiki_links', 'views_all_time', 'speciesbox', 'article', 'mycomorphbox', 'mushroom', 'text'}
Unique keys in 'mycomorphbox': {'name', 'hymeniumType', 'sporePrintColor', 'whichGills', 'stipeCharacter', 'howEdible', 'ecologicalType', 'capShape'}


In [50]:

morphology_attributes = [
    'sporePrintColor',
    'stipeCharacter',
    'name',
    'capShape',
    'hymeniumType',
    'howEdible',
    'ecologicalType',
    'whichGills',
]

# For each morphology attribute, gather the distinct raw values found across
# all records that have a 'mycomorphbox' containing that attribute.
for attr_name in morphology_attributes:
    attribute_options = {
        record['mycomorphbox'][attr_name]
        for record in data
        if 'mycomorphbox' in record and attr_name in record['mycomorphbox']
    }
    print(f"{len(attribute_options)} Unique options for '{attr_name}': {attribute_options}")

60 Unique options for 'sporePrintColor': {'Spore print is brown', 'Spore print is purple', 'Spore print is brown\n   to reddish-brown', 'Spore print is cream\n   to white', 'Spore print is white\n   to cream', None, 'Spore print is yellow\n   to ochre', 'Spore print is purple\n   to brown', 'Spore print is green', 'Spore print is purple-brown', 'Spore print is ochre\n   to brown', 'Spore print is reddish-brown', 'Spore print is brown\n   to purple-brown', 'Spore print is tan', 'Spore print is cream\n   to buff', 'Spore print is salmon\n   to reddish-brown', 'Spore print is salmon', 'Spore print is yellow\n   to buff', 'Spore print is yellow', 'Spore print is olive\n   to brown', 'Spore print is pinkish-brown\n   to reddish-brown', 'Spore print is white', 'Spore print is blackish-brown\n   to black', 'Spore print is yellow\n   to cream', 'Spore print is purple-black', 'Spore print is yellow\n   to olive', 'Spore print is black\n   to brown', 'Spore print is brown\n   to yellow-brown', '

In [51]:
# Count the records that carry a 'mycomorphbox' at all
mycomorphbox_count = len([record for record in data if 'mycomorphbox' in record])
print(f"Number of entries with 'mycomorphbox': {mycomorphbox_count}")

# For each morphology attribute, count the records whose 'mycomorphbox' contains it
for attr_name in morphology_attributes:
    count = 0
    for record in data:
        if 'mycomorphbox' not in record:
            continue
        if attr_name in record['mycomorphbox']:
            count += 1
    print(f"Number of entries with '{attr_name}': {count}")

Number of entries with 'mycomorphbox': 1101
Number of entries with 'sporePrintColor': 1101
Number of entries with 'stipeCharacter': 1101
Number of entries with 'name': 1101
Number of entries with 'capShape': 1101
Number of entries with 'hymeniumType': 1101
Number of entries with 'howEdible': 1101
Number of entries with 'ecologicalType': 1101
Number of entries with 'whichGills': 1101


# ATTRIBUTE CLEANING 

In [52]:
# We need to clean all the morphology attributes so we can use them for the network later.
# We start with sporePrintColor: each raw string is turned into a list of colour words,
# e.g. 'Spore print is yellow\n   to ochre' -> ['yellow', 'ochre']. None becomes [].

attribute = "sporePrintColor"
for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue
    # Remove "Spore print is" from the string
    cleaned_color = value.replace("Spore print is ", "").strip()
    # Remove newline characters
    cleaned_color = cleaned_color.replace("\n", "").strip()

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    cleaned_color = [x.strip() for x in re.split(r',|-| to | ', cleaned_color) if x.strip()]
    box[attribute] = cleaned_color

In [53]:
# We need to clean all the morphology attributes so we can use them for the network later.
# Now we do capShape: each raw string is turned into a list of shape words; None becomes [].
attribute = 'capShape'
for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue

    # Remove "Cap is" from the string
    value = value.replace("Cap is ", "").strip()
    # Remove newline characters
    value = value.replace("\n", "").strip()

    # Remove "or". It is swapped for a single space rather than deleted, because
    # deleting it would glue single-spaced neighbours together ("a or b" -> "ab").
    # Surplus spaces are harmless: the split below drops empty pieces.
    value = value.replace(" or ", " ")

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]

    box[attribute] = value

In [54]:
# We need to clean all the morphology attributes so we can use them for the network later.
# Now we do stipeCharacter: each raw string is turned into a list of words; None becomes [].

attribute = 'stipeCharacter'

# Matches the connectives "and"/"or" unless the match lies strictly inside a longer
# word (a letter on both sides). A plain str.replace("or", "") would also delete the
# "or" in words such as "cortina". Connectives that touch a word on one side only
# are still removed, exactly as the plain replacement did.
connective = re.compile(r'(?<![A-Za-z])(?:and|or)|(?:and|or)(?![A-Za-z])')

for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue
    # Remove the leading phrases and the newline characters
    value = value.replace("Stipe is ", "").strip()
    value = value.replace("Stipe has a ", "").strip()
    value = value.replace("\n", "").strip()
    # Remove the multi-word connective phrases first, then the bare connectives
    value = value.replace("or is", "").strip()
    value = value.replace("or has a", "").strip()
    value = connective.sub("", value).strip()

    # Drop "Lacks a" / "lacks a" and everything after it
    value = re.split(r'[Ll]acks a', value)[0].strip()

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]

    box[attribute] = value

In [55]:
# We need to clean all the morphology attributes so we can use them for the network later.
# Now we do hymeniumType: each raw string is turned into a list of words; None becomes [].
attribute = 'hymeniumType'
for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue

    # Remove "Hymenium is" from the string
    value = value.replace("Hymenium is ", "").strip()
    # Remove "or". Deleting it (no replacement character) glues single-spaced
    # neighbours together; the "irregularnot applicable" literal below depends on that.
    value = value.replace(" or ", "")

    # Remove "Hymenium attachment is irregularnot applicable"
    value = value.replace("Hymenium attachment is irregularnot applicable", "").strip()
    # Remove "Hymenium attachment is not applicable"
    value = value.replace("Hymenium attachment is not applicable", "").strip()

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]

    box[attribute] = value

In [56]:
# We need to clean all the morphology attributes so we can use them for the network later.
# Now we do howEdible: each raw string is turned into a list of edibility labels; None becomes [].
attribute = 'howEdible'
for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue
    # Remove "Edibility is" from the string
    value = value.replace("Edibility is ", "").strip()
    # Remove "unknown"
    value = value.replace("unknown", "").strip()
    # Pad "edible" with spaces so it becomes its own token, but leave "inedible" alone
    value = re.sub(r'(?<!in)(edible)', ' edible ', value)
    # Pad "poisonous" with spaces
    value = value.replace("poisonous", " poisonous ")
    # Pad "psychoactive" with spaces
    value = value.replace("psychoactive", " psychoactive ")
    # Remove "can cause" together with the letters, spaces and commas that follow it
    value = re.sub(r'can cause [a-zA-Z ,]+', '', value)

    # Join multi-word labels with underscores so the split below keeps them whole
    value = value.replace("not recommended", "not_recommended")

    # Pad "choice" with spaces
    value = value.replace("choice", " choice ")

    # Replace "too hard to eat" with "too_hard_to_eat"
    value = value.replace("too hard to eat", "too_hard_to_eat")

    # Remove "or" and "but". These are plain substring removals; none of the labels
    # handled above contains either letter sequence.
    # NOTE(review): any other label containing "or"/"but" would be altered - confirm against the data.
    value = value.replace("or", "")
    value = value.replace("but", "")

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]

    box[attribute] = value

In [57]:
# We need to clean all the morphology attributes so we can use them for the network later.
# Now we do whichGills: the raw value is kept as-is but wrapped in a one-element list; None becomes [].
attribute = 'whichGills'
for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already wrapped by an earlier run of this cell. Without this guard a
        # re-run would silently nest the list one level deeper each time.
        continue
    if value is None:
        box[attribute] = []
        continue
    box[attribute] = [value]

In [58]:
# Clean ecologicalType: each raw string is turned into a list of words; None becomes [].
attribute = "ecologicalType"

for entry in data:
    if 'mycomorphbox' not in entry:
        continue
    box = entry['mycomorphbox']
    value = box[attribute]
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing on list.replace().
        continue
    if value is None:
        box[attribute] = []
        continue
    # Remove "Ecology is" from the string
    value = value.replace("Ecology is ", "").strip()

    # Remove "or". It is swapped for a single space rather than deleted, because
    # deleting it would glue single-spaced neighbours together ("a or b" -> "ab").
    # Surplus spaces are harmless: the split below drops empty pieces.
    value = value.replace(" or ", " ")

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]
    box[attribute] = value

In [59]:
# Clean conservation_status (stored under 'speciesbox'): the raw string is split into
# a list of words; a missing or None value becomes [].
attribute = "conservation_status"

for entry in data:
    if 'speciesbox' not in entry:
        continue
    box = entry['speciesbox']
    # .get() so that a speciesbox without this key is treated like a None value
    # instead of raising KeyError.
    value = box.get(attribute)
    if isinstance(value, list):
        # Already cleaned by an earlier run of this cell. Skipping makes the cell
        # safe to re-run instead of failing inside re.split().
        continue
    if value is None:
        box[attribute] = []
        continue

    # Split by comma, -, " to ", " " and keep the non-empty pieces as a list
    value = [x.strip() for x in re.split(r',|-| to | ', value) if x.strip()]
    box[attribute] = value

# NETWORK CREATION

In [60]:
import networkx as nx
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [61]:
# Make a network using networkx, where each node is a mushroom from the dataset.
# Only entries that have both a "mycomorphbox" and a "speciesbox" become nodes.
# The following attributes from the data are added to each node:
# ["mycomorphbox"]["sporePrintColor"]
# ["mycomorphbox"]["howEdible"]
# ["mycomorphbox"]["stipeCharacter"]
# ["mycomorphbox"]["capShape"]
# ["mycomorphbox"]["hymeniumType"]
# ["mycomorphbox"]["ecologicalType"]
# ["mycomorphbox"]["whichGills"]
# ["speciesbox"]["conservation_status"]
# ["speciesbox"]["image"]
# ["mushroom"]
# ["id"]
# ["views_all_time"]
# ["article"]
# ["text"]
# ["wiki_links"] (stored on the node as "wikilinks")
# Taxonomic information (Genus ... Species) is also included for the purposes of future work.

G = nx.Graph()

for entry in data:
    if "mycomorphbox" not in entry or "speciesbox" not in entry:
        continue
    node_id = entry.get("id")

    # Extract attributes safely: `or {}` also covers a key that is present but
    # holds None, so the .get() calls below cannot fail on it.
    myco = entry.get("mycomorphbox") or {}
    speciesbox = entry.get("speciesbox") or {}
    taxonomy = entry.get("taxonomy") or {}

    attrs = {
        "mushroom": entry.get("mushroom"),
        "id": node_id,
        "views_all_time": entry.get("views_all_time"),
        "article": entry.get("article"),
        "sporePrintColor": myco.get("sporePrintColor"),
        "howEdible": myco.get("howEdible"),
        "stipeCharacter": myco.get("stipeCharacter"),
        "capShape": myco.get("capShape"),
        "hymeniumType": myco.get("hymeniumType"),
        "ecologicalType": myco.get("ecologicalType"),
        "whichGills": myco.get("whichGills"),
        "conservation_status": speciesbox.get("conservation_status"),
        "image": speciesbox.get("image"),
        "Genus": taxonomy.get("Genus"),
        "Family": taxonomy.get("Family"),
        "Order": taxonomy.get("Order"),
        "Class": taxonomy.get("Class"),
        "Division": taxonomy.get("Division"),
        "Species": taxonomy.get("Species"),
        "text": entry.get("text"),
        "wikilinks": entry.get("wiki_links")
    }

    G.add_node(node_id, **attrs)

# Example: inspect attributes for one node. Prefer the first dataset entry; if it
# was skipped above (or the dataset is empty) fall back to the first node in the
# graph instead of raising KeyError/IndexError.
first_id = data[0].get("id") if data else None
example_id = first_id if first_id in G else next(iter(G.nodes), None)
if example_id in G:
    print(G.nodes[example_id])


{'mushroom': 'Russula maculata', 'id': 1, 'views_all_time': 9897, 'article': 'https://en.wikipedia.org/wiki/Russula_maculata', 'sporePrintColor': ['yellow', 'ochre'], 'howEdible': [], 'stipeCharacter': ['bare'], 'capShape': ['convex', 'depressed'], 'hymeniumType': ['adnexed', 'free'], 'ecologicalType': ['mycorrhizal'], 'whichGills': ['Gills on hymenium'], 'conservation_status': [], 'image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/1996-02-15_Russula_maculata_Qu%C3%A9l._%26_Roze_117.jpg/250px-1996-02-15_Russula_maculata_Qu%C3%A9l._%26_Roze_117.jpg', 'Genus': 'Russula', 'Family': 'Russulaceae', 'Order': 'Russulales', 'Class': 'Agaricomycetes', 'Division': 'Basidiomycota', 'Species': 'R.\xa0maculata', 'text': 'Russula maculata is a species of mushroom in the genus Russula . [ 1 ] Its cap ranges from 4–10 centimetres ( 1 + 1 ⁄ 2 –4\xa0in) wide, with hues varying from whitish to red. [ 2 ] It is difficult to distinguish reliably and its edibility is unknown. [ 2 ]\nThis Ru

In [62]:
import pickle
# Serialize the graph G to a pickle file in the current working directory.
# No edges have been added to G in the code shown above.
with open('mushroom_network_no_edges.pkl', mode='wb') as pickle_file:
    pickle.dump(G, pickle_file)