# Exploratory Data Analysis

This notebook contains the exploratory data analysis for the experiment `exp_20251125`.

In this version, the interaction matrix is generated with tighter control so that the proportions of null and negative interactions are preserved. The workflow also removes the species selected for extinction before simulating extinctions, and the keystoneness calculation has been updated so that $p_i$ in the $(1 - p_i)$ factor is the relative abundance of species $i$ prior to the perturbation.

In [None]:
# | eval: false

import pandas as pd
import os

# Section: Generate-paths
# Root folder of the experiment; the three data folders below are its direct children.
EXP_DIR = '/mnt/data/sur/users/mrivera/Controls/exp_20251125'
TGT_DIR, RAW_DIR, POST_DIR = (
    os.path.join(EXP_DIR, subdir)
    for subdir in ("GNN-targets", "raw-ODEs", "Post-exts")
)


import pyarrow.feather as ft
import numpy as np

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool

def read_data(row):
    """Assemble the per-species table for a single simulation.

    Parameters
    ----------
    row : mapping
        One record of the simulation-parameters table. Only ``row["id"]``
        (inserted into the feather file names) and ``row["keys"]`` (1-based
        position of the species to label ``"key"``) are read.

    Returns
    -------
    pandas.DataFrame
        The ``new_ext``, ``BC_diss`` and ``K_s`` columns of
        ``tgt_<id>.feather`` (the latter two rounded to 5 decimals) plus
        ``rel_freq`` and ``label`` (``"key"`` for the selected species,
        ``"other"`` for the rest).

    Raises
    ------
    IndexError
        If ``row["keys"]`` does not address a row of the target table.
    """
    sim_id = row["id"]              # not named `id`, to avoid shadowing the builtin
    key_idx = row["keys"] - 1       # "keys" is 1-based; convert to zero-indexed
    # Load target metrics
    df = ft.read_table(
        os.path.join(TGT_DIR, f'tgt_{sim_id}.feather'),
        columns=['new_ext', 'BC_diss', 'K_s'],
    ).to_pandas()
    df['BC_diss'] = df['BC_diss'].round(5)
    df['K_s'] = df['K_s'].round(5)
    # Section: Load relative frequency BEFORE perturbation
    # Only the last column of the raw ODE table is used.
    # NOTE(review): presumably that column is the final pre-perturbation state - confirm.
    freq = ft.read_table(os.path.join(RAW_DIR, f'O_{sim_id}.feather')).column(-1).to_numpy()
    freq_sum = freq.sum()
    # A zero total yields all-zero frequencies rather than a division by zero.
    df['rel_freq'] = freq / freq_sum if freq_sum != 0 else freq * 0
    # Section: Add labels
    # Size the label vector from the data instead of assuming 30 species.
    n_species = len(df)
    # Reject out-of-range positions explicitly: a negative index would otherwise
    # wrap around and silently label the wrong species.
    if not 0 <= key_idx < n_species:
        raise IndexError(
            f"keys={row['keys']} is outside 1..{n_species} for simulation {sim_id}"
        )
    labels = ["other"] * n_species
    labels[key_idx] = "key"
    df['label'] = labels
    return df

#  Load-data
# One record per simulation; read_data() reads the "id" and "keys" fields of each.
data = pd.read_csv(f'{EXP_DIR}/Simulation-parameters.tsv', sep="\t")
# to_dict("records") keeps every column's own dtype. iterrows() builds one Series
# per row, which upcasts mixed int/float columns to float - an integer id would
# then be formatted as e.g. "7.0" in the file names, and "keys" would become a
# float that cannot be used as a list index.
rows = data.to_dict("records")

with ProcessPoolExecutor() as ex:           # uses all cores automatically
    results = list(ex.map(read_data, rows))

# Stack the per-simulation tables into one frame with a fresh running index.
combined = pd.concat(results, ignore_index=True)

# Generate Violin plot

Violin plots of keystoneness ($K_s$) and Bray-Curtis dissimilarity, split by species type (key vs. other).

In [None]:
# | eval: false

import seaborn as sns
import matplotlib.pyplot as plt

# Start from a clean slate. plt.clf() is avoided on purpose: with no open figure
# it first creates an empty default-size one (via gcf) that then lingers next to
# the real figure created below.
plt.close("all")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: distribution of K_s for each species type.
sns.violinplot(data=combined, x="K_s", y="label", hue="label", palette="Set1", legend=False, ax=ax1)
ax1.set_ylabel("Species type")
ax1.set_xlabel("Keystoness (K_s)")

# Right panel: distribution of BC_diss for each species type.
sns.violinplot(data=combined, x="BC_diss", y="label", hue="label", palette="Set2", legend=False, ax=ax2)
ax2.set_ylabel("Species type")
ax2.set_xlabel("Bray-Curtis")

fig.suptitle('Controls EDA', fontsize=14)
# Figure-level calls so layout and saving always target this figure, whatever
# pyplot currently considers the "current" one.
fig.tight_layout()
fig.savefig('/mnt/data/sur/users/mrivera/Plots/Violin_20251125.png', format='png', dpi=300, bbox_inches='tight')

In [None]:

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Reload the PNG written by the plotting cell and render it as a bare image
# (no ticks, no frame).
img = mpimg.imread('/mnt/data/sur/users/mrivera/Plots/Violin_20251125.png')
plt.figure(figsize=(10, 6))
image_ax = plt.gca()
image_ax.imshow(img)
image_ax.axis('off')
plt.show()

# Generate extinctions violin plot

In [None]:

# | eval: false

# Start from a clean slate. plt.clf() followed by plt.figure() is avoided: with
# no open figure, clf() creates an extra empty figure that stays open beside
# the one actually drawn on.
plt.close("all")
ext_fig, ext_ax = plt.subplots(figsize=(6, 4))
# Distribution of new_ext for each species type.
sns.violinplot(data=combined, x="new_ext", y="label", hue="label", palette="Set1", legend=False, ax=ext_ax)
ext_ax.set_ylabel("Species type")
ext_ax.set_xlabel("Extinctions")
# Dashed vertical guides make the extinction counts easier to read off.
ext_ax.grid(True, axis="x", linestyle="--", linewidth=0.6, alpha=0.6)
ext_ax.set_title("Extinction distribution")
ext_fig.tight_layout()
ext_fig.savefig('/mnt/data/sur/users/mrivera/Plots/extinctions_20251125.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Reload the PNG written by the plotting cell and render it as a bare image
# (no ticks, no frame).
img = mpimg.imread('/mnt/data/sur/users/mrivera/Plots/extinctions_20251125.png')
plt.figure(figsize=(10, 6))
image_ax = plt.gca()
image_ax.imshow(img)
image_ax.axis('off')
plt.show()