In [109]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import linear_kernel
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [110]:
# Set seed
np.random.seed(42)

In [111]:
file_path = "../mushroom_network_no_edges.pkl"

with open(file_path, 'rb') as f:
    data = pd.read_pickle(f)

In [112]:
# Print number of nodes
print(f"Number of nodes: {data.number_of_nodes()}")

# Print number of edges
print(f"Number of edges: {data.number_of_edges()}")

# Print node attributes
print("Node attributes:")
count = 0
for node, attrs in data.nodes(data=True):
    print(f"Node {node}: {attrs}")
    count += 1
    if count >= 5:
        break

Number of nodes: 1101
Number of edges: 0
Node attributes:
Node 1: {'mushroom': 'Russula maculata', 'id': 1, 'views_all_time': 9897, 'article': 'https://en.wikipedia.org/wiki/Russula_maculata', 'sporePrintColor': ['yellow', 'ochre'], 'howEdible': [], 'stipeCharacter': ['bare'], 'capShape': ['convex', 'depressed'], 'hymeniumType': ['adnexed', 'free'], 'ecologicalType': ['mycorrhizal'], 'whichGills': ['Gills on hymenium'], 'conservation_status': [], 'image': 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/1996-02-15_Russula_maculata_Qu%C3%A9l._%26_Roze_117.jpg/250px-1996-02-15_Russula_maculata_Qu%C3%A9l._%26_Roze_117.jpg', 'Genus': 'Russula', 'Family': 'Russulaceae', 'Order': 'Russulales', 'Class': 'Agaricomycetes', 'Division': 'Basidiomycota', 'Species': 'R.\xa0maculata', 'text': 'Russula maculata is a species of mushroom in the genus Russula . [ 1 ] Its cap ranges from 4–10 centimetres ( 1 + 1 ⁄ 2 –4\xa0in) wide, with hues varying from whitish to red. [ 2 ] It is difficult to

In [113]:
# Get all nodes into a dataframe with their attributes
df = pd.DataFrame.from_dict(dict(data.nodes(data=True)), orient='index')

df = df.reset_index(drop=True)

In [114]:
tax_cols = ['Division', 'Class', 'Order', 'Family', 'Genus', 'Species']

morph_cols = ['sporePrintColor', 'howEdible', 'stipeCharacter', 'capShape', 
              'hymeniumType', 'ecologicalType', 'whichGills']

text_col = ['text']

wiki_cols = ['article', 'wikilinks']


# TAXONOMIC NETWORK 
Two mushrooms in the same **Genus** are very close (siblings).
Two mushrooms in the same **Family** are moderately close (cousins).
Two mushrooms in the same **Order** are distantly related.
Two mushrooms in the same **Class** (like *Agaricomycetes*) are basically strangers who just happen to live in the same city (too broad to be useful).

### The Strategy: "Weighted Hierarchical Overlap"

We will assign points based on how deep the match goes:
*   **Match on Genus:** +1.0 points (Strongest link)
*   **Match on Family:** +0.5 points
*   **Match on Order:** +0.2 points
*   **Match on Class/Division:** 0 points (Ignore these, or your graph will become a giant "hairball" where everything connects to everything).

In [116]:
# 1. Weights
W_GENUS = 1.0
W_FAMILY = 0.5
W_ORDER  = 0.2

print("\nEncoding matrices...")

# 2. Get Dummies (Sparse matrices)
# We access .values to get raw numpy arrays (safer)
genus_mat = pd.get_dummies(df['Genus']).astype(int).values
family_mat = pd.get_dummies(df['Family']).astype(int).values
order_mat = pd.get_dummies(df['Order']).astype(int).values

# 3. Dot Products (Numpy)
# Matrix (N x Categories) dot Transpose (Categories x N) = (N x N)
print("Calculating interactions...")
sim_genus = np.dot(genus_mat, genus_mat.T)
sim_family = np.dot(family_mat, family_mat.T)
sim_order = np.dot(order_mat, order_mat.T)

# 4. Weighted Sum
# Note: Since they are nested (Genus implies Family implies Order), 
# we don't just sum them, we want the highest match to dictate the score.
# Score = 1.0 (Same Genus) + 0.5 (Same Family) + 0.2 (Same Order)
taxonomic_scores = (sim_genus * W_GENUS) + (sim_family * W_FAMILY) + (sim_order * W_ORDER)

# Normalize
max_score = W_GENUS + W_FAMILY + W_ORDER
taxonomic_scores = taxonomic_scores / max_score

# Zero out diagonal
np.fill_diagonal(taxonomic_scores, 0)

# Convert back to DataFrame for easier lookup
tax_sim_df = pd.DataFrame(taxonomic_scores, index=df.index, columns=df.index)

print(f"Taxonomic Matrix Ready: {tax_sim_df.shape}")


Encoding matrices...
Calculating interactions...
Taxonomic Matrix Ready: (1101, 1101)


In [117]:
G_tax = nx.Graph()

print("Adding nodes...")
for idx, row in df.iterrows():
    G_tax.add_node(
        idx,
        label=row['mushroom'],
        views=row['views_all_time'],
        genus=row['Genus'],
        family=row['Family'],
        order=row['Order'],
        division=row['Division'],
        class_tax=row['Class'],
        species=row['Species']
    )


Adding nodes...


In [118]:
# We defined weights: Genus(1.0) + Family(0.5) + Order(0.2) / 1.7
# Score for Family match only = (0.5 + 0.2) / 1.7 = ~0.41

# A smart threshold is 0.3 or 0.4. 
# This cuts out the weak "Order" links but keeps "Family" connections.
THRESHOLD = 0.40

print(f"Adding edges (Threshold > {THRESHOLD})...")

# 1. Get the upper triangle of the matrix (excluding diagonal)
#    This prevents adding edge (A,B) and (B,A) twice.
matrix_values = tax_sim_df.values
rows, cols = np.where(np.triu(matrix_values, k=1) > THRESHOLD)

# 2. Build edge list efficiently
edges_to_add = []
for r, c in zip(rows, cols):
    weight = matrix_values[r, c]
    edges_to_add.append((r, c, weight))

# 3. Add to graph
G_tax.add_weighted_edges_from(edges_to_add)

# GRAPH SUMMARY
print("-" * 30)
print(f"Nodes: {G_tax.number_of_nodes()}")
print(f"Edges: {G_tax.number_of_edges()}")
print(f"Density: {nx.density(G_tax):.4f}")
print(f"Connected Components: {nx.number_connected_components(G_tax)}")

# Check degree distribution snippet
degrees = [d for n, d in G_tax.degree()]
print(f"Avg Degree: {np.mean(degrees):.2f}")

Adding edges (Threshold > 0.4)...
------------------------------
Nodes: 1101
Edges: 55248
Density: 0.0912
Connected Components: 87
Avg Degree: 100.36


In [119]:
print(len(df['Order'].unique()))
print(len(df['Family'].unique()))


20
87


In [120]:
# Check neighbors of Amanita muscaria (if it exists in the graph)
target_name = "Amanita muscaria"
target_idx = df[df['mushroom'] == target_name].index[0]

if target_idx in G_tax:
    neighbors = list(G_tax.neighbors(target_idx))
    print(f"\nNode {target_name} has {len(neighbors)} connections.")
    print("Top 3 connected neighbors:")
    
    # Sort neighbors by edge weight
    sorted_neighbors = sorted(neighbors, key=lambda n: G_tax[target_idx][n]['weight'], reverse=True)[:3]
    
    for n in sorted_neighbors:
        print(f"- {df.loc[n, 'mushroom']} (Weight: {G_tax[target_idx][n]['weight']:.2f})")



Node Amanita muscaria has 72 connections.
Top 3 connected neighbors:
- Amanita verna (Weight: 1.00)
- Amanita muscaria var. guessowii (Weight: 1.00)
- Amanita ocreata (Weight: 1.00)


# MORPHOLOGICAL NETWORK

In [121]:
# 1. Define Attributes and Weights

# We use weights from the gini coefficients calculated earlier.

morph_config = {
    'sporePrintColor': 0.6831,
    'howEdible': 0.4118,
    'stipeCharacter': 0.3991,
    'capShape': 0.5414,
    'hymeniumType': 0.6856,
    'ecologicalType': 0.6876,
    'conservation_status': 0.6627
}

total = 0
for i in morph_config:
    total += morph_config[i]
print(f"Total weight: {total}")

Total weight: 4.0713


In [122]:
print("Building Morphological Layer with Smart Weights...")

# Initialize accumulators
total_weighted_sim = np.zeros((len(df), len(df)))
total_weight_denom = 0

mlb = MultiLabelBinarizer()

# 2. Loop through each attribute
for col, weight in morph_config.items():
    print(f"Processing {col:<15} (Weight: {weight})")
    
    # A. Clean Data: Ensure every entry is a list (handle NaNs)
    # If it's NaN, make it empty list []
    clean_series = df[col].apply(lambda x: x if isinstance(x, list) else [])
    
    # B. Binarize (One-Hot Encoding for Lists)
    # Result: Matrix where columns are unique values (e.g., 'convex', 'flat')
    feature_mat = mlb.fit_transform(clean_series)
    
    # C. Calculate Similarity for this specific trait
    # Determine if they share the trait
    sim_mat = cosine_similarity(feature_mat)
    
    # D. Add to weighted total
    total_weighted_sim += (sim_mat * weight)
    total_weight_denom += weight

# 3. Normalize (0 to 1)
# We divide by the sum of weights so the final score is between 0 and 1
morph_sim_matrix = total_weighted_sim / total_weight_denom

# 4. Remove Self-Loops
np.fill_diagonal(morph_sim_matrix, 0)

# Convert to DataFrame for easier inspection
morph_sim_df = pd.DataFrame(morph_sim_matrix, index=df.index, columns=df.index)

print(f"\nMorphology Matrix Ready.")
print(f"Shape: {morph_sim_df.shape}")
print(f"Max Score: {morph_sim_df.max().max():.4f}")
print(f"Avg Score: {morph_sim_df.values[morph_sim_df.values > 0].mean():.4f}")

# --- 5. Sanity Check: Who connects to Amanita muscaria based on LOOKS? ---
target_name = "Amanita muscaria"
try:
    target_idx = df[df['mushroom'] == target_name].index[0]
    
    # Get top matches
    matches = morph_sim_df.iloc[target_idx].sort_values(ascending=False).head(10)
    
    print(f"\n--- Top Morphological Lookalikes for {target_name} ---")
    for idx, score in matches.items():
        name = df.iloc[idx]['mushroom']
        edibility = df.iloc[idx]['howEdible']
        print(f"Score {score:.2f} | {name} (Edible: {edibility})")
        
except IndexError:
    print(f"Could not find {target_name}")

Building Morphological Layer with Smart Weights...
Processing sporePrintColor (Weight: 0.6831)
Processing howEdible       (Weight: 0.4118)
Processing stipeCharacter  (Weight: 0.3991)
Processing capShape        (Weight: 0.5414)
Processing hymeniumType    (Weight: 0.6856)
Processing ecologicalType  (Weight: 0.6876)
Processing conservation_status (Weight: 0.6627)

Morphology Matrix Ready.
Shape: (1101, 1101)
Max Score: 0.8989
Avg Score: 0.2651

--- Top Morphological Lookalikes for Amanita muscaria ---
Score 0.86 | Amanita porphyria (Edible: ['inedible'])
Score 0.86 | Amanita flavoconia (Edible: [])
Score 0.84 | Amanita persicina (Edible: ['poisonous', 'psychoactive'])
Score 0.84 | Amanita muscaria var. guessowii (Edible: ['poisonous', 'psychoactive'])
Score 0.84 | Amanita regalis (Edible: ['poisonous', 'psychoactive'])
Score 0.81 | Amanita abrupta (Edible: ['poisonous'])
Score 0.81 | Amanita smithiana (Edible: ['poisonous'])
Score 0.81 | Amanita proxima (Edible: ['poisonous'])
Score 0.81 

In [123]:
# --- Build the Morphological Graph ---
G_morph = nx.Graph()

# Add nodes
for idx, row in df.iterrows():
    G_morph.add_node(
        idx,
        label=row['mushroom'],
        views=row['views_all_time'],
        genus=row['Genus']
    )

# Add edges with Threshold
# Since your max score is 1.0, a threshold of 0.6 means "Sharing >60% of traits"
THRESH_MORPH = 0.60 

# Use numpy for speed
matrix_values = morph_sim_df.values
# Upper triangle to avoid duplicates
rows, cols = np.where(np.triu(matrix_values, k=1) > THRESH_MORPH)

edges_to_add = []
for r, c in zip(rows, cols):
    weight = matrix_values[r, c]
    edges_to_add.append((r, c, weight))

G_morph.add_weighted_edges_from(edges_to_add)

print(f"Morphology Graph Built.")
print(f"Nodes: {G_morph.number_of_nodes()}")
print(f"Edges: {G_morph.number_of_edges()}")
print(f"Density: {nx.density(G_morph):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_morph.degree()]):.2f}")

Morphology Graph Built.
Nodes: 1101
Edges: 7391
Density: 0.0122
Avg Degree: 13.43


# TEXT NETWORK


In [124]:
import re
from sklearn.feature_extraction import text

# 1. Text Cleaning Function (Kept same)
def clean_wiki_text(text):
    if not isinstance(text, str):
        return ""
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    text = re.sub(r'This .*? article is a stub\.', '', text)
    text = re.sub(r'You can help Wikipedia by expanding it\.', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 2. Add Custom Stop Words
my_stop_words = [
    # Wikipedia / Academic
    'citation needed', 'citation', 'needed', 'commonly', 'known', 'common', 'name', 
    'described', 'reported', 'according', 'published', 'stub', 'article', 'wikipedia',
    'usually', 'generally', 'typically', 'often', 'although', 'though', 'may', 'can',
    'include', 'including', 'includes', 'contain', 'contains', 'containing', 'like', 'help',
    
    # Generic Biology
    'mushroom', 'mushrooms', 'species', 'fungus', 'fungi', 'genus', 'variety', 
    'specimen', 'fruit', 'fruiting', 'body', 'bodies', 'group', 'member', 'type',
    'found', 'grow', 'growing', 'grows', 'occur', 'occurs', 'occurring', 
    'distributed', 'distribution', 'range', 'native',
    
    # Generic Anatomy
    'cap', 'stem', 'stipe', 'gill', 'gills', 'spore', 'spores', 'flesh', 'surface', 
    'appearance', 'shape', 'color', 'colour', 'look', 'looks', 'similar', 'shape', 'shaped', 'veil', 'base', 
    
    # Units / Measurements
    'cm', 'mm', 'μm', 'um', 'g', 'mg', 'μg', 'centimeter', 'millimeter', 'inch', 
    'diameter', 'wide', 'width', 'long', 'length', 'thick', 'thickness', 'size', 
    'small', 'large', 'high', 'low', 'amount', 'level',
    
    # Numbers and time
    'one', 'two', 'three', 'four', 'first', 'second', 'year', 'month', 'day', 'time',
    'spring', 'summer', 'autumn', 'fall', 'winter',
    
    # Abbreviations / Single letters
    'l', 'c', 'p', 'sp', 'spp'
]

# Add A-Z single letters just to be safe (removes middle initials)
my_stop_words.extend(list('abcdefghijklmnopqrstuvwxyz'))

geo_stop_words = [
    'north', 'south', 'east', 'west', 
    'northern', 'southern', 'eastern', 'western',
    'america', 'american', 'europe', 'european', 'asia', 'asian', 
    'africa', 'african', 'australia', 'australian',
    'china', 'chinese', 'japan', 'japanese', 'mexico', 'mexican',
    'united', 'states', 'kingdom', 'canada', 'canadian', 'zealand',
    'world', 'temperate', 'tropical', 'hemisphere', 'region', 'regions',
    'distribution', 'distributed', 'native', 'introduced'
]

# 3. Combine Static Lists
# Start with standard English and add yours
final_stop_words = list(text.ENGLISH_STOP_WORDS) + my_stop_words + geo_stop_words

# 4. Dynamic Stop Words (The Fix)

# A. Taxonomy (Included 'mushroom' column here)
tax_cols = ['Division', 'Class', 'Order', 'Family', 'Genus', 'Species', 'mushroom']

for col in tax_cols:
    # Get unique values, drop NaNs
    unique_vals = df[col].dropna().unique()
    
    for val in unique_vals:
        s = str(val).lower()
        # THE FIX: Replace non-alphanumeric chars (-, ., [, ]) with space
        # This turns "auricula-judae" into "auricula judae"
        s_clean = re.sub(r'[^a-z0-9]', ' ', s)
        # Split by whitespace to get individual tokens
        words = s_clean.split()
        final_stop_words.extend(words)

# B. Morphology
morph_cols = ['sporePrintColor', 'howEdible', 'stipeCharacter', 'capShape', 
              'hymeniumType', 'ecologicalType', 'whichGills']

for col in morph_cols:
    # Explode lists first, then get unique
    unique_vals = df[col].explode().dropna().unique()
    
    for val in unique_vals:
        if isinstance(val, str):
            s = val.lower()
            # Apply same regex fix just in case (e.g. "blue-green")
            s_clean = re.sub(r'[^a-z0-9]', ' ', s)
            words = s_clean.split()
            final_stop_words.extend(words)

# 5. Final Deduplication
# This reduces the list size significantly and speeds up TF-IDF
final_stop_words = list(set(final_stop_words))

print(f"Stop word list compiled. Total unique words to ignore: {len(final_stop_words)}")

# 6. Apply Text Cleaning
df['clean_text'] = df['text'].apply(clean_wiki_text)

Stop word list compiled. Total unique words to ignore: 1929


In [125]:
tfidf = TfidfVectorizer(
    stop_words=final_stop_words, 
    max_df=0.5,           # Ignore words in >50% of docs
    min_df=2,             # Ignore unique words (typos)
    strip_accents='unicode'
)

# Fit and Transform
tfidf_matrix = tfidf.fit_transform(df['clean_text'])
feature_names = np.array(tfidf.get_feature_names_out())

# --- 6. Verify Results ---
print("\n--- Top Keywords for the Most Popular Mushrooms ---")
top_mushrooms = df.sort_values('views_all_time', ascending=False).head(3)

for idx, row in top_mushrooms.iterrows():
    matrix_row_idx = df.index.get_loc(idx)
    row_vector = tfidf_matrix[matrix_row_idx]
    sorted_indices = np.argsort(row_vector.data)[::-1][:20] # Top 20
    top_words = feature_names[row_vector.indices[sorted_indices]]
    
    print(f"\n{row['mushroom']}:")
    print(", ".join(top_words))


--- Top Keywords for the Most Popular Mushrooms ---

Amanita muscaria:
muscimol, fly, ibotenic, use, acid, agaric, siberia, effects, urine, peoples, theory, used, food, wasson, poisoning, shamans, reindeer, muscarine, santa, alternative

Agaricus bisporus:
manure, bispora, psalliota, commercial, cultivation, worldwide, composted, paris, micrograms, farm, spawn, varieties, immature, cultivated, 1925, spored, marketed, renamed, history, fields

Psilocybe cubensis:
psilocybin, cattle, psilocin, bulk, illegal, grain, method, flush, weight, potency, users, growth, doses, cakes, dung, effects, cow, grams, magic, decriminalized


*   **Amanita muscaria:** "Shamans", "Siberia", "Fly", "Muscimol". This captures the **cultural and chemical** essence perfectly.
*   **Agaricus bisporus:** "Commercial", "Manure", "Cultivated". This captures the **industrial** essence.
*   **Psilocybe cubensis:** "Illegal", "Cattle", "Dung", "Grain". This captures the **usage and habitat**.

This confirms that your Text Layer will add a dimension that Taxonomy and Morphology completely missed: **Human Context.**

In [126]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# 1. Clean the Similarity Matrix
# We need to act on the cosine_sim matrix you calculated
text_sim_matrix = cosine_sim.copy()

# A. Remove Self-Loops (Diagonal)
np.fill_diagonal(text_sim_matrix, 0)

# B. The "Stub" Filter
# If two articles are >95% identical, they are likely boilerplate copies. 
# We remove them to prevent false "strong" connections.
count_stubs = np.sum(text_sim_matrix > 0.95)
print(f"Pruning {count_stubs} edges that look like boilerplate stubs (>0.95 similarity)...")
text_sim_matrix[text_sim_matrix > 0.95] = 0

# 2. Build the Graph
G_text = nx.Graph()

# Add Nodes
for idx, row in df.iterrows():
    G_text.add_node(
        idx,
        label=row['mushroom'],
        views=row['views_all_time'],
        genus=row['Genus']
    )

Pruning 9158 edges that look like boilerplate stubs (>0.95 similarity)...


In [128]:
# 3. Add Edges with a Strict Threshold
# 0.30 is a standard cutoff for TF-IDF similarity to imply "Topical Relevance"
THRESH_TEXT = 0.4

rows, cols = np.where(np.triu(text_sim_matrix, k=1) > THRESH_TEXT)

edges_to_add = []
for r, c in zip(rows, cols):
    weight = text_sim_matrix[r, c]
    edges_to_add.append((r, c, weight))

G_text.add_weighted_edges_from(edges_to_add)

# 4. Health Check
print("-" * 30)
print(f"Text Graph Built (Threshold > {THRESH_TEXT})")
print(f"Nodes: {G_text.number_of_nodes()}")
print(f"Edges: {G_text.number_of_edges()}")
print(f"Density: {nx.density(G_text):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_text.degree()]):.2f}")


------------------------------
Text Graph Built (Threshold > 0.4)
Nodes: 1101
Edges: 5689
Density: 0.0094
Avg Degree: 10.33


# WIKI LINKS NETWORK

In [129]:
import ast

In [130]:
import pandas as pd
import networkx as nx
import ast

def build_wiki_matrix_and_graph(df):
    """
    1. Builds the network (Graph)
    2. Converts it to a binary Adjacency Matrix (DataFrame)
    """
    
    # Build the Graph
    def parse_links(x):
        if isinstance(x, str):
            try:
                return ast.literal_eval(x)
            except (ValueError, SyntaxError):
                return []
        return x if isinstance(x, list) else []

    print("Parsing wikilinks...")
    df['wikilinks_parsed'] = df['wikilinks'].apply(parse_links)

    valid_urls = set(df['article'])
    url_to_name = pd.Series(df.mushroom.values, index=df.article).to_dict()

    G = nx.Graph()
    
    # Add nodes explicitly from the dataframe to ensure order
    G.add_nodes_from(df['mushroom'])

    print("Adding edges...")
    for index, row in df.iterrows():
        source_mushroom = row['mushroom']
        links = row['wikilinks_parsed']
        
        for link in links:
            target_url = link.get('url')
            if target_url in valid_urls:
                target_mushroom = url_to_name[target_url]
                if source_mushroom != target_mushroom:
                    G.add_edge(source_mushroom, target_mushroom)

    #Convert to Matrix
    print("Converting to Matrix...")
    adj_matrix = nx.to_numpy_array(G, nodelist=df['mushroom'])
    
    # Convert to DataFrame (looks just like your tax_sim_df)
    wiki_links_sim_df = pd.DataFrame(
        adj_matrix,
        index=df.index,
        columns=df.index
    )

    return G, wiki_links_sim_df

In [131]:
# --- USAGE ---
print("Processing Wiki Data...")
G_wiki_links, wiki_links_sim_df = build_wiki_matrix_and_graph(df)

print(f"Graph nodes: {G_wiki_links.number_of_nodes()}")
print(f"Graph edges: {G_wiki_links.number_of_edges()}")
print(f"Matrix shape: {wiki_links_sim_df.shape}")

Processing Wiki Data...
Parsing wikilinks...
Adding edges...
Converting to Matrix...
Graph nodes: 1101
Graph edges: 6039
Matrix shape: (1101, 1101)


# ALL NETWORK LAYERS OVERVIEW

In [132]:

# Print taxonomy graph health check
print("-" * 30)
print(f"Taxonomy Graph Built (Threshold > {THRESHOLD})")
print(f"Nodes: {G_tax.number_of_nodes()}")
print(f"Edges: {G_tax.number_of_edges()}")
print(f"Density: {nx.density(G_tax):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_tax.degree()]):.2f}")

# Print morphology graph health check
print("-" * 30)
print(f"Morphology Graph Built (Threshold > {THRESH_MORPH})")
print(f"Nodes: {G_morph.number_of_nodes()}")
print(f"Edges: {G_morph.number_of_edges()}")
print(f"Density: {nx.density(G_morph):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_morph.degree()]):.2f}")

# Print taxonomy graph health check
print("-" * 30)
print(f"Text Graph Built (Threshold > {THRESH_TEXT})")
print(f"Nodes: {G_text.number_of_nodes()}")
print(f"Edges: {G_text.number_of_edges()}")
print(f"Density: {nx.density(G_text):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_text.degree()]):.2f}")

# Print wiki links graph health check
print("-" * 30)
print(f"Wikipedia Links Graph Built")
print(f"Nodes: {G_wiki_links.number_of_nodes()}")
print(f"Edges: {G_wiki_links.number_of_edges()}")
print(f"Density: {nx.density(G_wiki_links):.4f}")
print(f"Avg Degree: {np.mean([d for n, d in G_wiki_links.degree()]):.2f}")


------------------------------
Taxonomy Graph Built (Threshold > 0.4)
Nodes: 1101
Edges: 55248
Density: 0.0912
Avg Degree: 100.36
------------------------------
Morphology Graph Built (Threshold > 0.6)
Nodes: 1101
Edges: 7391
Density: 0.0122
Avg Degree: 13.43
------------------------------
Text Graph Built (Threshold > 0.4)
Nodes: 1101
Edges: 5689
Density: 0.0094
Avg Degree: 10.33
------------------------------
Wikipedia Links Graph Built
Nodes: 1101
Edges: 6039
Density: 0.0100
Avg Degree: 10.97


# COMBINE NETWORKS 

In [133]:
from networkx.algorithms import community

In [135]:
import numpy as np
import pandas as pd
import networkx as nx

print("Rebuilding Composite Matrix (No Thresholds)...")

# 1. Align Matrices (Ensure these are the raw, un-thresholded versions)
M_tax = tax_sim_df.values
M_morph = morph_sim_df.values
M_text = text_sim_matrix
M_wiki_links = wiki_links_sim_df.values

# 2. Weighted Sum (Adjust weights if desired)
W_TAX = 0.10
W_MORPH = 0.25
W_TEXT = 0.40
W_WIKI_LINKS = 0.25

M_final = (M_tax * W_TAX) + (M_morph * W_MORPH) + (M_text * W_TEXT) + (M_wiki_links * W_WIKI_LINKS)

# 3. Zero out ONLY the diagonal
np.fill_diagonal(M_final, 0)

print(f"Matrix ready. Max value: {M_final.max():.4f}")
print(f"Non-zero count: {np.count_nonzero(M_final)}")

Rebuilding Composite Matrix (No Thresholds)...
Matrix ready. Max value: 0.8186
Non-zero count: 1127944


In [136]:
print("Building k-NN Backbone (k=10)...")

G_backbone = nx.Graph()

# 1. Add Nodes
for idx, row in df.iterrows():
    G_backbone.add_node(
        idx,
        label=row['mushroom'],
        views=row['views_all_time'],
        genus=row['Genus'],
        family=row['Family']
    )

# 2. Add k-Nearest Neighbors
k = 10
rows_count, cols_count = M_final.shape

for r in range(rows_count):
    # Get all scores for mushroom r
    row_scores = M_final[r]
    
    # Find indices of top k scores
    # argsort sorts ascending, so we take the last k
    top_indices = np.argsort(row_scores)[::-1][:k]
    
    for c in top_indices:
        weight = row_scores[c]
        # Only add if there is ANY similarity (weight > 0)
        if weight > 0:
            G_backbone.add_edge(r, c, weight=weight)

print(f"k-NN Graph Edges: {G_backbone.number_of_edges()}")
print(f"k-NN Components: {nx.number_connected_components(G_backbone)}")

Building k-NN Backbone (k=10)...
k-NN Graph Edges: 8280
k-NN Components: 1


In [137]:
# Check for "Supernodes"
degrees = dict(G_final.degree())
sorted_degrees = sorted(degrees.items(), key=lambda x: x[1], reverse=True)

print("Top 5 Most Connected Mushrooms:")
for name, deg in sorted_degrees[:5]:
    print(f"{df.loc[name, 'mushroom']}: {deg} connections")

print(f"\nMax Degree: {sorted_degrees[0][1]}")
print(f"Avg Degree: {np.mean(list(degrees.values())):.2f}")

Top 5 Most Connected Mushrooms:
Gymnopilus austrosapineus: 104 connections
Gymnopilus nashii: 102 connections
Gymnopilus austropicreus: 102 connections
Gymnopilus aurantiobrunneus: 101 connections
Gymnopilus avellanus: 101 connections

Max Degree: 104
Avg Degree: 15.04


In [138]:
import community.community_louvain as community_louvain
import pandas as pd

# Set seed for reproducibility
np.random.seed(42)

# 1. Run Louvain (Same as before)
print("--- Searching for the 'Natural' Community Structure ---")
resolutions = [0.6, 0.8, 1.0, 1.2, 1.4]
results = []

for r in resolutions:
    partition_dict = community_louvain.best_partition(G_final, weight='weight', resolution=r)
    num_clusters = len(set(partition_dict.values()))
    modularity_score = community_louvain.modularity(partition_dict, G_final, weight='weight')
    results.append((r, num_clusters, modularity_score, partition_dict))

# Pick Winner
best_result = max(results, key=lambda x: x[2]) 
best_partition = best_result[3]
print(f"Winner: Resolution {best_result[0]} with {best_result[1]} Clusters.")

# 2. Map Partition to DataFrame
df['community_louvain'] = df.index.map(best_partition)

# 3. Aggregation: Find Dominant FAMILY instead of Genus
natural_stats = df.groupby('community_louvain').agg({
    'views_all_time': ['count', 'mean'],
    
    # LOGIC CHANGE: Look at 'Family' column, calculate Mode (most frequent)
    'Family': lambda x: x.mode()[0] if not x.mode().empty else "Mixed/Unknown",
    
    # We still keep the flagship mushroom for reference, but we won't name the group by it
    'mushroom': lambda x: df.loc[x.index[df.loc[x.index, 'views_all_time'].argmax()], 'mushroom']
})

# Flatten and Rename
natural_stats.columns = ['Size', 'Avg_Views', 'Dominant_Family', 'Flagship_Mushroom']
natural_stats = natural_stats.sort_values('Avg_Views', ascending=False)

print("\n--- Top Communities by Family ---")
print(natural_stats.head(10))

# 4. Create a Helper Dictionary for the Plot
# Format: {0: "Amanitaceae", 1: "Strophariaceae", ...}
cluster_name_map = natural_stats['Dominant_Family'].to_dict()

--- Searching for the 'Natural' Community Structure ---
Winner: Resolution 1.0 with 17 Clusters.

--- Top Communities by Family ---
                   Size      Avg_Views   Dominant_Family  \
community_louvain                                          
4                    71  243565.042254       Amanitaceae   
5                   116  209301.456897  Hymenogastraceae   
7                    52  174966.442308       Agaricaceae   
9                   182  143250.368132        Mycenaceae   
11                   25  129411.480000   Cantharellaceae   
13                   30  126072.166667        Phallaceae   
15                   26  111814.730769     Morchellaceae   
12                   78   82807.012821        Boletaceae   
10                   47   80081.000000       Geastraceae   
14                   33   53548.090909        Suillaceae   

                       Flagship_Mushroom  
community_louvain                         
4                       Amanita muscaria  
5                 

In [139]:
# Add community IDs to graph nodes
nx.set_node_attributes(G_final, best_partition, 'community_louvain')
nx.set_node_attributes(G_final, cluster_name_map, 'community_name')
print("Node attributes updated with community info.")

Node attributes updated with community info.


In [140]:
# Save graph as pickle
with open("mushroom_network_final.pkl", 'wb') as f:
    pd.to_pickle(G_final, f)

In [141]:
# Save community dataframe
df.to_csv("mushroom_communities_louvain_final.csv", index=False)

### METRICS

In [142]:
import networkx as nx
import numpy as np
import plotly.graph_objects as go
import plotly.colors as pc
from fa2 import ForceAtlas2

# --- 0. COMPATIBILITY PATCH ---
if not hasattr(nx, 'to_scipy_sparse_matrix'):
    nx.to_scipy_sparse_matrix = nx.to_scipy_sparse_array

# --- 1. PREPARATION ---
partition = best_partition
node_ids = list(G_final.nodes())

# Calculate Log Views for sizing
all_views = [df.loc[n, 'views_all_time'] for n in node_ids]
log_views = np.log1p(all_views)
min_log, max_log = min(log_views), max(log_views)

# --- 2. CALCULATE DOMINANT FAMILIES ---
# We do this before plotting to have the data ready for the legend
cluster_families = {}
unique_comms = sorted(list(set(partition.values())))

for comm in unique_comms:
    # Get all nodes belonging to this community
    comm_nodes = [n for n in node_ids if partition[n] == comm]
    
    # Extract the 'Family' column, drop N/A, find the most common (mode)
    families = df.loc[comm_nodes, 'Family'].dropna()
    
    if not families.empty:
        # Get the most frequent family
        top_family = families.mode()[0]
    else:
        top_family = "Unknown Family"
        
    cluster_families[comm] = top_family

# --- 3. LAYOUT (ANTI-HAIRBALL) ---
print("Calculating Layout with Strong Repulsion...")

forceatlas2 = ForceAtlas2(
    outboundAttractionDistribution=True,
    linLogMode=False,        
    adjustSizes=False,       
    edgeWeightInfluence=0.1, # Weak edge influence = better separation
    jitterTolerance=1.0,
    barnesHutOptimize=True,
    barnesHutTheta=1.2,
    scalingRatio=100.0,      # High repulsion
    gravity=0.05,            # Low gravity
    verbose=True
)

pos = forceatlas2.forceatlas2_networkx_layout(G_final, iterations=2500)

# --- 4. DATA VECTORIZATION ---
x_nodes = [pos[n][0] for n in node_ids]
y_nodes = [pos[n][1] for n in node_ids]

# Sizing
node_sizes = np.interp(
    [np.log1p(df.loc[n, 'views_all_time']) for n in node_ids], 
    (min_log, max_log), 
    (5, 55)
)

# Coloring
palette = pc.qualitative.Dark24 + pc.qualitative.Light24
color_map = {comm: palette[i % len(palette)] for i, comm in enumerate(unique_comms)}
node_colors = [color_map[partition[n]] for n in node_ids]

# Hover Text
hover_texts = []
def format_attr(val):
    if val is None: return "N/A"
    if isinstance(val, list): return ", ".join([str(x).strip().title() for x in val]) if val else "Unknown"
    s_val = str(val)
    return "N/A" if s_val.lower() == 'nan' else s_val.title()

for n in node_ids:
    row = df.loc[n]
    comm_id = partition[n]
    fam = cluster_families.get(comm_id, "Unknown")
    
    txt = (
        f"<span style='font-size:16px'><b>{row['mushroom'].upper()}</b></span><br>"
        f"<span style='color: #444; font-style: italic'>{row.get('Genus', 'Unknown')} {row.get('Species', '')}</span><br>"
        f"<br>Views: <b>{row['views_all_time']:,}</b><br>"
        f"Cluster Family: <b>{fam}</b> (C{comm_id})<br>"
        f"Edibility: {format_attr(row.get('howEdible'))}"
    )
    hover_texts.append(txt)

# --- 5. PLOTTING ---
fig = go.Figure()

# A. Edges
edge_x, edge_y = [], []
for u, v in G_final.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

fig.add_trace(go.Scattergl(
    x=edge_x, y=edge_y,
    mode='lines',
    line=dict(width=0.3, color="#bbbbbb"), # Slightly darker gray for better visibility
    opacity=0.3,
    hoverinfo='skip',
    showlegend=False
))

# B. Nodes
fig.add_trace(go.Scattergl(
    x=x_nodes, y=y_nodes,
    mode='markers',
    marker=dict(
        size=node_sizes, 
        color=node_colors, 
        opacity=0.9, 
        # VISUAL IMPROVEMENT: Dark thin border makes nodes "pop"
        line=dict(width=0.8, color="#FFFFFF") 
    ),
    text=hover_texts,
    hovertemplate="%{text}<extra></extra>",
    showlegend=False
))

# D. Legend (Using Families + Avg Views)
cluster_stats = {}
for n in node_ids:
    c = partition[n]
    views = df.loc[n, 'views_all_time']
    if c not in cluster_stats: cluster_stats[c] = {'total_views': 0, 'count': 0}
    cluster_stats[c]['total_views'] += views
    cluster_stats[c]['count'] += 1

sorted_clusters = sorted(
    cluster_stats.keys(), 
    key=lambda x: cluster_stats[x]['total_views'] / cluster_stats[x]['count'], 
    reverse=True
)

def human_format(num):
    num = float(num)
    if num >= 1_000_000: return f"{num/1_000_000:.1f}M"
    if num >= 1_000: return f"{num/1_000:.0f}K"
    return str(int(num))

for comm in sorted_clusters:
    count = cluster_stats[comm]['count']
    if count >= 5: # Filter small noise
        avg_v = cluster_stats[comm]['total_views'] / count
        avg_str = human_format(avg_v)
        
        # USE THE FAMILY NAME HERE
        fam_name = cluster_families.get(comm, "Mixed")
        
        legend_name = f"<b>C{comm}: {fam_name}</b> (Avg: {avg_str})" 
        
        fig.add_trace(go.Scatter(
            x=[None], y=[None], 
            mode='markers',
            marker=dict(size=10, color=color_map[comm], line=dict(width=1, color='white')),
            legendgroup=str(comm),
            showlegend=True,
            name=legend_name
        ))

# E. Final Layout
fig.update_layout(
    title=dict(
        text="<b>Mushroom Network Analysis</b><br><sub>Coloured by community</sub>",
        y=0.96, x=0.5, xanchor='center', yanchor='top',
        font=dict(size=24)
    ),
    width=1200, 
    height=1000, 
    plot_bgcolor='white',
    xaxis=dict(visible=False), 
    yaxis=dict(visible=False),
    margin=dict(t=80, b=150, l=20, r=20),
    legend=dict(
        orientation="h",
        yanchor="top",
        y=-0.02,
        xanchor="center",
        x=0.5,
        font=dict(size=11),
        bgcolor="rgba(255,255,255,0.9)",
        bordercolor="#e2e2e2",
        borderwidth=1,
        itemsizing='constant'
    )
)
fig.write_html("mushroom_network.html")

fig.show()


Calculating Layout with Strong Repulsion...


100%|██████████| 2500/2500 [00:12<00:00, 200.64it/s]


BarnesHut Approximation  took  5.38  seconds
Repulsion forces  took  5.73  seconds
Gravitational forces  took  0.09  seconds
Attraction forces  took  0.45  seconds
AdjustSpeedAndApplyForces step  took  0.47  seconds


In [None]:
import pandas as pd

# CONFIGURATION
TOP_N_TO_INSPECT = 5  # How many top clusters do you want to look at?
SHOW_TOP_SPECIES = 20 # How many species to list per cluster?

# 1. RANK CLUSTERS
# Group by community and calculate statistics
comm_stats = df.groupby('community_louvain')['views_all_time'].agg(['mean', 'count'])
comm_stats = comm_stats.sort_values('mean', ascending=False)

# Get the IDs of the top N clusters
top_ids = comm_stats.head(TOP_N_TO_INSPECT).index.tolist()

print(f"--- INSPECTING TOP {TOP_N_TO_INSPECT} POPULARITY CLUSTERS ---\n")

# 2. GENERATE DOSSIERS
for rank, cluster_id in enumerate(top_ids, 1):
    # Get data for this cluster
    subset = df[df['community_louvain'] == cluster_id]
    
    # Calculate Metadata
    avg_views = subset['views_all_time'].mean()
    total_views = subset['views_all_time'].sum()
    size = len(subset)
    
    # Identify Flagship (Most viewed)
    flagship_idx = subset['views_all_time'].idxmax()
    flagship_name = subset.loc[flagship_idx, 'mushroom']
    
    # Identify Dominant Genus (The most common Genus in this group)
    dominant_genus = subset['Genus'].mode()[0] if not subset['Genus'].mode().empty else "Mixed"
    
    # Get list of top species in this cluster
    top_species = subset.sort_values('views_all_time', ascending=False).head(SHOW_TOP_SPECIES)
    
    # --- PRINT REPORT ---
    print("="*60)
    print(f"RANK #{rank} | CLUSTER ID: {cluster_id}")
    print(f"Theme Hint: The '{dominant_genus}' Cluster")
    print("-" * 60)
    print(f"• Flagship Species: {flagship_name}")
    print(f"• Average Views:    {int(avg_views):,}")
    print(f"• Cluster Size:     {size} species")
    print(f"• Dominant Genus:   {dominant_genus}")
    print("-" * 60)
    print(f"Top {SHOW_TOP_SPECIES} Species in this Cluster:")
    
    for idx, row in top_species.iterrows():
        print(f"  - {row['mushroom']} ({row['views_all_time']:,} views)")
        
    if size > SHOW_TOP_SPECIES:
        print(f"  ... and {size - SHOW_TOP_SPECIES} more.")
    print("\n")

--- INSPECTING TOP 5 POPULARITY CLUSTERS ---

RANK #1 | CLUSTER ID: 4
Theme Hint: The 'Amanita' Cluster
------------------------------------------------------------
• Flagship Species: Amanita muscaria
• Average Views:    243,565
• Cluster Size:     71 species
• Dominant Genus:   Amanita
------------------------------------------------------------
Top 20 Species in this Cluster:
  - Amanita muscaria (8,460,394 views)
  - Amanita phalloides (3,492,246 views)
  - Amanita caesarea (619,866 views)
  - Amanita virosa (468,221 views)
  - Amanita pantherina (451,932 views)
  - Amanita bisporigera (422,913 views)
  - Amanita ocreata (403,790 views)
  - Amanita verna (249,951 views)
  - Amanita muscaria var. guessowii (222,652 views)
  - Amanita jacksonii (199,338 views)
  - Amanita vaginata (145,170 views)
  - Amanita flavoconia (128,171 views)
  - Amanita gemmata (125,305 views)
  - Amanita citrina (118,962 views)
  - Amanita regalis (99,940 views)
  - Amanita fulva (96,373 views)
  - Amanita