## Exploratory data analysis

This code performs an exploratory data analysis of the metrics calculated from the extinction events. The analyzed metrics include:

* Number of new extinctions (new_ext)

* Bray–Curtis dissimilarity (BC_diss)

* Keystoneness (K_s)

* Time to stability after extinctions (ext_ts)

In [None]:
# Load data
import pandas as pd
import os

# Section: Generate-paths
# Everything is resolved relative to the folder of one training experiment.
exp_dir = "/mnt/data/sur/users/mrivera/Train-sims/4379fd40-9f0a"
tgt_dir = os.path.join(exp_dir, "GNN-targets")            # per-simulation target files
data_path = os.path.join(exp_dir, "parameters-sims.tsv")  # one row per simulation

#  Load-data
# Read the tab-separated parameter table, then keep the ids of the simulations
# that were run with 20 and with 100 species.
data = pd.read_csv(data_path, sep="\t")
species_count = data['n_species']
d20 = data.loc[species_count == 20, 'id']
d100 = data.loc[species_count == 100, 'id']

# 20-species
We will plot the distribution of the number of extinctions separately for each community size (20 and 100 species).


In [None]:

import pyarrow.feather as ft
import numpy as np
from multiprocessing import Pool
from sys import getsizeof

def read_data(id):
    """Load three metric columns from the target file of one simulation.

    Parameters
    ----------
    id
        Simulation identifier; the file read is ``tgt_<id>.feather`` inside
        ``tgt_dir``.

    Returns
    -------
    tuple
        ``(new_ext, BC_diss, K_s)`` as pandas objects. ``BC_diss`` and
        ``K_s`` are rounded to 5 decimals; ``new_ext`` is returned as read.
    """
    table = ft.read_table(os.path.join(tgt_dir, f'tgt_{id}.feather'))
    # Convert each requested column to pandas once, then round the two
    # real-valued metrics.
    columns = {name: table[name].to_pandas() for name in ('new_ext', 'BC_diss', 'K_s')}
    return columns['new_ext'], round(columns['BC_diss'], 5), round(columns['K_s'], 5)

# Generate function for extracting data
def par_dat(ids):
    """Read the targets of many simulations in parallel and pool them.

    Parameters
    ----------
    ids : iterable
        Simulation identifiers; each one is handed to ``read_data`` in a
        pool of 8 worker processes.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ext, Bc, Ks)``: for each metric, the vectors returned by
        ``read_data`` joined end to end into one flat array. Three empty
        arrays are returned when ``ids`` is empty or when the module is not
        running as ``__main__``.
    """
    ids = list(ids)
    # The __main__ check is kept from the original code. Presumably it stops
    # worker processes that re-import this module (spawn start method) from
    # opening pools of their own -- confirm before removing it.
    if __name__ != '__main__' or not ids:
        return np.empty(0), np.empty(0), np.empty(0)

    with Pool(processes=8) as pool:
        results = pool.map(read_data, ids)

    # Transpose [(ext, Bc, Ks), ...] into one group per metric and concatenate
    # each group directly, so simulations whose files have different lengths
    # do not have to fit into a rectangular array first.
    ext, Bc, Ks = (
        np.concatenate([np.asarray(vector) for vector in group])
        for group in zip(*results)
    )
    return ext, Bc, Ks

# Pool the three metrics over every simulation, once per community size.
ext20, Bc20, Ks20 = par_dat(d20)
ext100, Bc100, Ks100 = par_dat(d100)

# Sanity check: the pooled vector is compared against 20 values per
# 20-species simulation.
n_ext20, n_sims20 = len(ext20), len(d20)
print(f" >> The number of extinctions for 20 species is {n_ext20} it should be 20*{n_sims20}={20 * n_sims20}")


## Extinctions distribution
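We define a helper that keeps every extinction count accounting for more than 5% of the observations and groups the remaining counts under "other". We then use it to compare the distribution of the number of extinctions between the 20- and 100-species data with pie charts.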

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distinct numbers of extinctions and how often each one occurs, computed the
# same way for both community sizes.
(labels20, counts20), (labels100, counts100) = (
    np.unique(values, return_counts=True) for values in (ext20, ext100)
)

def pie_dat(labels, counts, threshold=0.05):
    """Collapse rare categories into a single 'other' slice for a pie chart.

    Parameters
    ----------
    labels : array-like
        Category labels, aligned with ``counts``.
    counts : array-like
        Number of observations per category.
    threshold : float, optional
        A category is kept only if its share of the total is strictly greater
        than this fraction (default 0.05, i.e. 5%).

    Returns
    -------
    pie_counts : numpy.ndarray
        Counts of the kept categories followed by the summed count of all the
        others.
    pie_labels : numpy.ndarray
        Labels of the kept categories as strings, followed by ``'other'``.
        The ``'other'`` entry is always present, even when its count is 0.
    """
    labels = np.asarray(labels)
    counts = np.asarray(counts)

    total = counts.sum()
    if total > 0:
        keep = counts / total > threshold
    else:
        # Nothing to share out: every category (if any) goes to 'other'
        # instead of dividing by zero.
        keep = np.zeros(counts.shape, dtype=bool)

    # Labels become strings because 'other' is appended to them.
    pie_labels = np.append(labels[keep].astype(str), 'other')
    pie_counts = np.append(counts[keep], counts[~keep].sum())
    return pie_counts, pie_labels

# Slice sizes and labels for each community size
pie_20, pie_labels20 = pie_dat(labels=labels20, counts=counts20)
pie_100, pie_labels100 = pie_dat(labels=labels100, counts=counts100)

# Two pie charts side by side sharing one pastel palette
plt.clf()  # Clear the entire figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
colors = sns.color_palette('pastel')

# One entry per panel: axes, slice sizes, slice labels, title
pie_panels = (
    (ax1, pie_20, pie_labels20, 'Number of extinctions in 20 species data'),
    (ax2, pie_100, pie_labels100, 'Number of extinctions in 100 species data'),
)
for panel_ax, panel_sizes, panel_labels, panel_title in pie_panels:
    panel_ax.pie(
        panel_sizes,
        labels=panel_labels,
        autopct='%1.1f%%',
        colors=colors,
        wedgeprops={'edgecolor': 'black', 'linewidth': 0.5},
        startangle=90,
    )
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.savefig('/mnt/data/sur/users/mrivera/Plots/ext-pie.png', dpi=300)  # Saves as PNG

In [None]:
# Distribution of extinctions histogram

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import gaussian_kde

# Get log-counts with 20 species.
# The bin edges sit at -0.5, 0.5, ..., 20.5 so that every integer number of
# extinctions 0..20 falls in the centre of its own bin and lines up with b1.
# (An integer `bins=21` would instead cut the observed data range into 21
# equal slices, which do not match the x positions in b1.)
b1 = np.arange(0, 21, 1)
ct1, _ = np.histogram(ext20, bins=np.arange(22) - 0.5)
ct1_log = np.log1p(ct1)  # log(count + 1) to avoid log(0)
mask = ct1_log > 0  # not used by the plot below

# Get log-counts with 100 species (same construction, from the 100-species data)
b2 = np.arange(0, 101, 1)
ct2, _ = np.histogram(ext100, bins=np.arange(102) - 0.5)
ct2_log = np.log1p(ct2)  # log(count + 1) to avoid log(0)

# Create the plot
plt.clf()  # Clear the entire figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Plot on first subplot
ax1.plot(b1, ct1_log, marker='o', linestyle='-', color='blue')
ax1.set_xlabel('Number of extinctions')
ax1.set_ylabel('log(Frequency + 1)')
ax1.set_title('Distribution with 20 species')
ax1.set_xlim(0, 20)

# Plot on second subplot
ax2.plot(b2, ct2_log, marker='o', linestyle='-', color='red')
ax2.set_xlabel('Number of extinctions')
ax2.set_ylabel('log(Frequency + 1)')
ax2.set_title('Distribution with 100 species')
ax2.set_xlim(0, 100)

plt.tight_layout()
plt.savefig('/mnt/data/sur/users/mrivera/Plots/exts-distr.png', dpi=300)
plt.close()

# Correlation between variables



In [None]:

import numpy as np 
from scipy.stats import pearsonr

def correlate(var1, var2, name1, name2, specs, alpha=0.05):
    """Print and return the Pearson correlation between two metrics.

    Parameters
    ----------
    var1, var2 : array-like
        The two samples, passed straight to ``scipy.stats.pearsonr``.
    name1, name2 : str
        Names used for the two samples in the printed message.
    specs : int
        Number of species of the data set, reported in the message.
    alpha : float, optional
        Significance level; the pair is reported as correlated when the
        p-value is less than or equal to it (default 0.05).

    Returns
    -------
    tuple
        ``(corr, pvalue)`` as given by ``pearsonr``.

    Notes
    -----
    The wording depends on the p-value only, so a very small ``r`` is still
    reported as "correlated" when the p-value is below ``alpha``.
    """
    corr, pvalue = pearsonr(var1, var2)
    verdict = "is correlated" if pvalue <= alpha else "is NOT correlated"
    # Both outcomes report the species count, so lines from the 20- and
    # 100-species runs can be told apart.
    print(f'>> {name1} {verdict} with {name2} with a r={corr} for {specs} species. Statistical significance of: pval={pvalue}')
    return corr, pvalue

# Same three pairwise comparisons for each community size: 20 species first,
# then 100 species.
for n_sp, ext_v, bc_v, ks_v in (
    (20, ext20, Bc20, Ks20),
    (100, ext100, Bc100, Ks100),
):
    correlate(ext_v, bc_v, "Extinctions", "Bray-Curtis", n_sp)
    correlate(ext_v, ks_v, "Extinctions", "Keystoneness", n_sp)
    correlate(ks_v, bc_v, "Keystoneness", "Bray-Curtis", n_sp)


Run with 20 species
- Extinctions is correlated with Bray-Curtis with a r=0.06158719265577826 for 20 species. Statistical significance of: pval=3.869123886395461e-106
- Extinctions is correlated with Keystoneness with a r=0.06170291131837201 for 20 species. Statistical significance of: pval=1.5666279402111278e-106
- Keystoneness is correlated with Bray-Curtis with a r=0.9907854367974938 for 20 species. Statistical significance of: pval=0.0

Run with 100 species
- Extinctions is correlated with Bray-Curtis with a r=0.5444984960220364 for 100 species. Statistical significance of: pval=0.0
- Extinctions is correlated with Keystoneness with a r=0.54385381297123 for 100 species. Statistical significance of: pval=0.0
- Keystoneness is correlated with Bray-Curtis with a r=0.998427694691392 for 100 species. Statistical significance of: pval=0.0

# Distribution of keystoneness and Bray-Curtis

To determine the optimal number of bins, we can use the Freedman–Diaconis rule, which sets the bin width to $h = 2\,\mathrm{IQR}/n^{1/3}$. However, the interquartile range (IQR) is equal to zero in this case, so the bin width would be zero and the rule cannot be applied. Therefore, we use Scott’s rule, $h = 3.5\,\sigma/n^{1/3}$, as an alternative.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Calculate optimal number of bins
def bin_calc(x):
    """Return the number of histogram bins for ``x`` given by Scott's rule.

    The bin width is ``h = 3.5 * std(x) / n ** (1/3)`` and the number of bins
    is the data range divided by ``h``, rounded up.

    Parameters
    ----------
    x : array-like
        One-dimensional sample.

    Returns
    -------
    int
        Number of bins, never less than 1. Data without spread (all values
        equal, or a width that cannot be computed) gives 1.

    Raises
    ------
    ValueError
        If ``x`` is empty.
    """
    x = np.asarray(x, dtype=float)
    n = x.size
    if n == 0:
        raise ValueError("bin_calc() needs at least one observation")

    spread = x.max() - x.min()
    h_scott = 3.5 * np.std(x) / (n ** (1/3))
    # A zero (or NaN) width would make the division below blow up; a single
    # bin is all such data can fill anyway.
    if not (h_scott > 0 and spread > 0):
        return 1
    return max(1, int(np.ceil(spread / h_scott)))

# Compute the number of bins for keystoneness (Scott's rule, see bin_calc)
x1 = np.round(Ks20, 5)
bins1 = bin_calc(x1)
x2 = np.round(Ks100, 5)
bins2 = bin_calc(x2)

# Generate the histogram for keystoness
plt.clf()  # Clear the entire figure
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.hist(x1, bins=bins1, density=True, histtype='step',  color='blue', label='S20')  # 'step' draws only the outline
ax1.hist(x2, bins=bins2, density=True, histtype='step', color='red', label='S100')  # 'step' draws only the outline
ax1.set_xlabel('Keystoness')
ax1.set_ylabel('Density')  # density=True normalises the histogram to unit area
ax1.set_title('Keystoness distribution')
ax1.legend()  # per-axes legend: plt.legend() would only reach the last axes

# Compute the number of bins for Bray-Curtis and draw the second panel
x1 = np.round(Bc20, 5)
x2 = np.round(Bc100, 5)
bins1 = bin_calc(x1)
bins2 = bin_calc(x2)
ax2.hist(x1, bins=bins1, density=True, histtype='step',  color='blue', label='S20')  # 'step' draws only the outline
ax2.hist(x2, bins=bins2, density=True, histtype='step', color='red', label='S100')  # 'step' draws only the outline
ax2.set_xlabel('Bray-Curtis')
ax2.set_ylabel('Density')
ax2.set_title('Bray-Curtis distribution')
ax2.legend()

plt.tight_layout()
plt.savefig('/mnt/data/sur/users/mrivera/Plots/both-distr.png', dpi = 300)  # Saves as PNG
plt.close()

In [None]:
# Find the number of bins using the Freedman-Diaconis rule:
# bin width h = 2 * IQR / n^(1/3)
q75, q25 = np.percentile(Ks20, [75, 25])            # quartile
iqr = q75 - q25                                     # interquartile
n = len(Ks20)
h = 2 * iqr / (n ** (1/3))

# Number of bins
if h > 0:
    bins = max(1, int((Ks20.max() - Ks20.min()) / h))
else:
    # With a zero IQR the bin width is zero and the rule has no answer
    # (dividing by it cannot be converted to an integer), so fall back to
    # Scott's rule via bin_calc.
    bins = bin_calc(Ks20)

# Kernel density estimation

We will use Kernel Density Estimation (KDE) to estimate the probability density function of our data, based on the available observations.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import norm
from scipy.stats import binned_statistic

def pdf_kde(x, n_bins=1200, grid_size=1200):
    """Plot a Gaussian kernel density estimate of ``x`` and return the figure.

    The observations are first reduced to at most ``n_bins`` values by
    averaging runs of consecutive observations (bins are taken over the
    observation index). The KDE is fitted to those averages and evaluated on
    an even grid over [0, 1].

    Parameters
    ----------
    x : array-like
        One-dimensional sample.
    n_bins : int, optional
        Number of index bins used for the averaging step (default 1200).
    grid_size : int, optional
        Number of grid points on [0, 1] where the density is evaluated
        (default 1200).

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding the density curve.
    """
    x = np.asarray(x, dtype=float)

    # Divide data into bins (by position in the sample) and average each bin.
    # NOTE(review): a KDE of block averages is narrower than a KDE of the raw
    # values -- confirm this down-sampling is the intended estimator.
    block_means, _, _ = binned_statistic(
        x=np.arange(x.size), values=x, statistic='mean', bins=n_bins
    )
    # Index bins that received no observation come back as NaN; drop them so
    # they do not reach the KDE.
    block_means = block_means[np.isfinite(block_means)]

    # Compute KDE
    # assumes the metric lies in [0, 1] -- TODO confirm for every input used
    x_range = np.linspace(0, 1, grid_size)
    kde = gaussian_kde(block_means)
    # Evaluate density at those x points
    density = kde(x_range)

    # Generate plot: the density must be drawn against the grid it was
    # evaluated on, not against the (unsorted) bin averages.
    fig = plt.figure(figsize=(10, 6))
    plt.plot(x_range, density, 'r-', linewidth=2, label='KDE (smoothed density)')
    plt.legend()
    plt.grid(True, alpha=0.3)
    return fig

# Draw the KDE of the 20-species keystoneness values, write it to disk and
# release the figure.
fig = pdf_kde(x=Ks20)
fig.savefig('/mnt/data/sur/users/mrivera/Plots/plot.png', dpi=300)  # Saves as PNG
plt.close(fig)

In [None]:
# Normal histogram

# Inputs of this cell. `x` and `bins_scott` only exist as local names inside
# other functions, so they are defined here.
# NOTE(review): Ks20 is presumed from the 'Keystoness-20-species' axis label
# below -- confirm this is the intended data.
x = np.asarray(Ks20)
bins_scott = bin_calc(x)

# Compute histogram and express each bin as a percentage of the total
counts, bin_edges = np.histogram(x, bins=bins_scott, density=True)
percentages = counts / counts.sum() * 100
# 'steps-mid' centres each step on the x value it is given, so the bin
# centres (not the left edges) are the matching positions.
bin_centres = (bin_edges[:-1] + bin_edges[1:]) / 2

# Plot as a line, on a figure of its own
plt.figure()
plt.plot(bin_centres, percentages, drawstyle='steps-mid', color='blue', linewidth=2, label='Histogram line')
plt.xlabel('Keystoness-20-species')
plt.ylabel('Percentages')
plt.title('Probability density function')
plt.legend()
# NOTE(review): this is the same output file the KDE cell writes, so it
# overwrites that plot -- confirm this is intended.
plt.savefig('/mnt/data/sur/users/mrivera/Plots/plot.png', dpi = 300)  # Saves as PNG
# Closing releases the figure. The original followed this with plt.clf(),
# which only created a new empty figure and is omitted here.
plt.close()