## Exploratory data analysis

This code performs an exploratory data analysis of the metrics calculated from the extinction events. The analyzed metrics include:

* Number of new extinctions (new_ext)

* Bray–Curtis dissimilarity (BC_diss)

* Keystoneness (K_s)

* Time to stability after extinctions (ext_ts)

In [None]:
# Load data
import pandas as pd
import os

# Section: Generate-paths
# Everything for this experiment lives under one root folder: the per-simulation
# target files and the table of simulation parameters.
exp_dir = "/mnt/data/sur/users/mrivera/Train-sims/4379fd40-9f0a"
tgt_dir = os.path.join(exp_dir, "GNN-targets")
data_path = os.path.join(exp_dir, "parameters-sims.tsv")

#  Load-data
# The parameter table is tab-separated, which is what read_table assumes by default.
data = pd.read_table(data_path)
# Simulation ids split by community size (20 vs. 100 species).
d20 = data.loc[data['n_species'] == 20, 'id']
d100 = data.loc[data['n_species'] == 100, 'id']

# 20-species
We will plot the distribution of extinctions separately for each community size (20 and 100 species).

In [None]:

import pyarrow.feather as ft
import numpy as np
from multiprocessing import Pool
from sys import getsizeof

def read_data(id):
    """Read the target file of one simulation and return three of its columns.

    The file ``tgt_<id>.feather`` is looked up inside ``tgt_dir``.

    Parameters
    ----------
    id
        Simulation identifier used to build the file name. The name shadows
        the ``id`` builtin but is kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(new_ext, BC_diss, K_s)`` as pandas objects; ``BC_diss`` and
        ``K_s`` are rounded to 5 decimals.
    """
    target_file = os.path.join(tgt_dir, f'tgt_{id}.feather')
    table = ft.read_table(target_file)
    extinctions = table['new_ext'].to_pandas()
    bray_curtis = table['BC_diss'].to_pandas().round(5)
    keystoneness = table['K_s'].to_pandas().round(5)
    return extinctions, bray_curtis, keystoneness

# Read many target files in parallel and pool their metrics
def par_dat(ids):
    """Load the metrics of several simulations in parallel and pool them.

    Each id is passed to ``read_data`` in a pool of 8 worker processes; the
    per-simulation columns are then concatenated into one flat vector per
    metric.

    Parameters
    ----------
    ids : iterable
        Simulation identifiers, one per target file.

    Returns
    -------
    tuple
        ``(ext, Bc, Ks)`` as 1-D numpy arrays. Empty arrays are returned when
        ``ids`` is empty. When the module is not running as ``__main__`` no
        work is done and three empty lists are returned.
    """
    ext, Bc, Ks = [], [], []
    # Guard kept from the original code: the pool is only started when this
    # runs as the main module (true in a notebook or a script), presumably so
    # that re-imported worker processes do not start pools of their own.
    if __name__ != '__main__':
        return ext, Bc, Ks

    with Pool(processes=8) as pool:
        results = pool.map(read_data, ids)

    # Nothing to unpack when no ids were given.
    if not results:
        return np.array([]), np.array([]), np.array([])

    # Transpose [(ext, Bc, Ks), ...] into one tuple of pieces per metric and
    # concatenate the pieces directly. Stacking them with np.array first
    # breaks as soon as two files have a different number of rows.
    ext_parts, bc_parts, ks_parts = zip(*results)
    ext = np.concatenate(ext_parts)
    Bc = np.concatenate(bc_parts)
    Ks = np.concatenate(ks_parts)
    return ext, Bc, Ks

# Pool the metrics per community size. The check printed below expects the
# 20-species vector to hold 20 values for every 20-species simulation.
ext20, Bc20, Ks20 = par_dat(d20)
ext100, Bc100, Ks100 = par_dat(d100)
# Memory footprint in MiB, if needed: getsizeof(ext100) / (1024 ** 2)
# Distinct extinction counts seen in the 100-species data, stored as dict keys.
uniques = dict.fromkeys(ext100)
print(f" >> The number of extinctions for 20 species is {len(ext20)} it should be 20*{len(d20)}={20 * len(d20)}")


## Extinctions distribution
We define a helper that groups rare extinction counts into a single category, so that the distributions of the number of extinctions can be compared between community sizes.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count how often each distinct number of extinctions occurs, per community size.
# np.unique returns the distinct values and, with return_counts=True, their frequencies.
labels20, counts20 = np.unique(ext20, return_counts=True)
labels100, counts100 = np.unique(ext100, return_counts=True)

def pie_dat(labels, counts, threshold=0.05):
    """Collapse rare categories into a single 'other' slice for a pie chart.

    Parameters
    ----------
    labels : array-like
        Category labels, aligned with ``counts``.
    counts : array-like
        Number of observations per category.
    threshold : float, default 0.05
        Categories whose share of the total is at or below this value are
        summed into the 'other' slice.

    Returns
    -------
    pie_counts : numpy.ndarray
        Counts of the kept categories followed by the 'other' total.
    pie_labels : numpy.ndarray
        Labels of the kept categories followed by ``'other'``. Numeric labels
        become strings because NumPy promotes the array when the string is
        appended.

    Notes
    -----
    Counts come first in the returned pair. The 'other' slice is always
    present, even when its total is 0.
    """
    # Accept plain lists as well as arrays.
    labels = np.asarray(labels)
    counts = np.asarray(counts)

    # Share of each category, and the two sides of the cut-off.
    rel = counts / counts.sum()
    frequent = rel > threshold
    rare = rel <= threshold

    pie_labels = np.append(labels[frequent], 'other')
    pie_counts = np.append(counts[frequent], counts[rare].sum())
    return pie_counts, pie_labels

# Generate pie chart data: counts and labels with rare categories grouped as 'other'
pie_20, pie_labels20 = pie_dat(labels = labels20, counts = counts20)
pie_100, pie_labels100 = pie_dat(labels = labels100, counts = counts100)

# Generate the pie chart on a fresh figure with one panel per community size.
# No plt.clf() beforehand: it would act on whatever figure happens to be
# current (or create a blank one), not on the figure built here.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
colors = sns.color_palette('pastel')

# First pie chart: 20-species data
ax1.pie(pie_20, labels = pie_labels20, autopct='%1.1f%%', colors = colors,
        wedgeprops={'edgecolor': 'black', 'linewidth': 0.5},
        startangle=90,
        )
ax1.set_title('Number of extinctions in 20 species data')

# Second pie chart: 100-species data
ax2.pie(pie_100, labels=pie_labels100, autopct='%1.1f%%', colors = colors,
        wedgeprops={'edgecolor': 'black', 'linewidth': 0.5},
        startangle=90,
        )
ax2.set_title('Number of extinctions in 100 species data')

# Lay out and save through the figure handle so the right figure is written
# regardless of which figure is current.
fig.tight_layout()
fig.savefig('/mnt/data/sur/users/mrivera/Plots/plot.png', dpi = 300)  # Saves as PNG

# Correlation between variables

In [None]:
# ext20, Bc20, Ks20 = par_dat(ids=d20)
# ext100, Bc100, Ks100 = par_dat(ids=d100)
import numpy as np 
from scipy.stats import pearsonr

def correlate(var1, var2, name1, name2):
    """Print the Pearson correlation between two variables and its verdict.

    Parameters
    ----------
    var1, var2 : array-like
        The two samples handed to ``scipy.stats.pearsonr``.
    name1, name2 : str
        Names used for the variables in the printed report.

    The correlation is reported as significant when p <= 0.05, together with
    its direction; otherwise it is reported as not significant. Nothing is
    returned.
    """
    corr, pvalue = pearsonr(var1, var2)

    # Not significant: report and stop.
    if not (pvalue <= 0.05):
        print(f'>> {name1} is NOT significantly correlated with {name2}')
        print(f'   r = {corr:.3f}, p = {pvalue} (not significant)')
        return

    # Significant: the sign of r gives the direction.
    if corr > 0:
        direction = "positively"
    else:
        direction = "negatively"
    print(f'>> {name1} is {direction} correlated with {name2}')
    print(f'   r = {corr:.3f}, p = {pvalue}')

# Run analyses
# Pairwise Pearson correlations between the three metrics, using the 20-species data only.
correlate(ext20, Bc20, "Extinctions", "Bray-Curtis")
correlate(ext20, Ks20, "Extinctions", "Keystoneness")
correlate(Ks20, Bc20, "Keystoneness", "Bray-Curtis")


# Distribution of keystoneness

In [None]:
# Number of histogram bins from the Freedman–Diaconis rule:
# bin width = 2 * IQR * n^(-1/3).
# NOTE(review): `x` was never assigned at notebook scope (it only exists as a
# local inside read_data), so this cell raised a NameError. It is bound here to
# the 20-species keystoneness values, following the section title — confirm
# that this is the intended vector.
x = Ks20
q25, q75 = np.percentile(x, [25, 75])
bin_width = 2 * (q75 - q25) * len(x) ** (-1/3)
# A zero IQR gives a zero bin width; fall back to a single bin instead of
# dividing by zero, and never return fewer than one bin.
bins = max(1, round((x.max() - x.min()) / bin_width)) if bin_width > 0 else 1