## Exploratory data analysis

This code performs an exploratory data analysis of the metrics calculated from the extinction events. The analyzed metrics include:

* Number of new extinctions (new_ext)

* Bray–Curtis dissimilarity (BC_diss)

* Keystoneness (K_s)

* Time to stability after extinctions (ext_ts)

In [None]:
# Load data
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Section: experiment paths
# Both the per-simulation target files and the parameters table are located
# relative to the experiment directory.
exp_dir = "/mnt/data/sur/users/mrivera/Train-sims/4379fd40-9f0a"
tgt_dir, data_path = (
    os.path.join(exp_dir, leaf)
    for leaf in ("GNN-targets", "parameters-sims.tsv")
)

# Section: load the tab-separated parameters table, then keep the `id` column
# of the rows whose `n_species` is 20 and 100 respectively.
data = pd.read_csv(data_path, sep="\t")
d20 = data.loc[data['n_species'] == 20, 'id']
d100 = data.loc[data['n_species'] == 100, 'id']

In [None]:
# We will plot the distribution of extinctions separately for each number of species.
import pyarrow.feather as ft
import numpy as np
from multiprocessing import Pool

def read_data(id):
    """Load the metrics stored in one target feather file.

    Reads ``tgt_<id>.feather`` from ``tgt_dir`` and extracts three columns.

    Args:
        id: Identifier interpolated into the file name. (The name shadows the
            builtin ``id``; it is kept because it is the function's existing
            parameter name.)

    Returns:
        A 3-tuple ``(ext, Bc, Ks)``:
        the ``new_ext`` column as-is, and the base-10 logarithm of the
        ``BC_diss`` and ``K_s`` columns, each rounded to 2 decimals.
    """
    feather_path = os.path.join(tgt_dir, f'tgt_{id}.feather')
    table = ft.read_table(feather_path)

    new_extinctions = table['new_ext'].to_pandas()
    # Both dissimilarity-type columns get the same log10 + rounding treatment.
    log_bc, log_ks = (
        round(np.log10(table[column].to_pandas()), 2)
        for column in ('BC_diss', 'K_s')
    )
    return new_extinctions, log_bc, log_ks

# NOTE: a stray `read_data(id)` call used to sit here. No variable named `id`
# is assigned in the visible cells, so it passed the *builtin* `id` function
# and tried to open a non-existent file; it has been removed.
ext20, Bc20, Ks20 = [], [], []

# Drop arrays left over from a previous run of this cell so their memory can
# be released before loading again. A bare `del` raises NameError on a fresh
# kernel, hence the tolerant pop.
for _stale_name in ('ext', 'Bc', 'Ks'):
    globals().pop(_stale_name, None)

if __name__ == '__main__':
    # Read every target file of the 20-species simulations with 8 workers.
    with Pool(processes=8) as pool:
        results = pool.map(read_data, d20)

    if results:
        # `results` is a list of (ext, Bc, Ks) tuples; transpose it into one
        # group per metric and flatten each group into a single 1-D array.
        # Concatenating the per-file vectors directly (instead of stacking
        # them into a 2-D array first) also works when files hold a different
        # number of rows.
        ext, Bc, Ks = (
            np.concatenate([np.asarray(part) for part in parts])
            for parts in zip(*results)
        )
    else:
        # No ids selected: keep the names defined so later cells do not fail
        # with a NameError.
        ext, Bc, Ks = (np.array([]) for _ in range(3))


In [None]:
# Histogram of the number of extinctions
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# NOTE(review): the placeholder random sample that used to be drawn here was
# overwritten immediately by `x = ext`, so it has been removed. The seed call
# is kept in case later cells rely on a seeded global RNG - confirm.
np.random.seed(42)

# Data to plot: the concatenated `new_ext` values loaded above.
x = ext

# The y-axis is labelled "Log-freq", so draw the counts on a log scale
# (`log=True`); previously the axis was linear despite the label.
plt.hist(x, bins=20, color='steelblue', edgecolor='black', alpha=0.7, log=True)
plt.title("Number of extinctions post perturbation")
plt.ylabel("Log-freq")
plt.xlabel("Number of extinctions")
plt.xticks(range(1,11))
plt.grid(True, alpha=0.3, linestyle='--')
plt.tight_layout()
plt.savefig('/mnt/data/sur/users/mrivera/Plots/plot.png', dpi = 300)  # Saves as PNG



In [None]:
# Number of histogram bins from the Freedman-Diaconis rule:
#     bin width = 2 * IQR * n^(-1/3)
q25, q75 = np.percentile(x, [25, 75])
bin_width = 2 * (q75 - q25) * len(x) ** (-1/3)

if bin_width > 0:
    # Never go below one bin, even when the data range is narrower than half
    # a bin width (which would otherwise round down to 0).
    bins = max(1, round((x.max() - x.min()) / bin_width))
else:
    # The IQR is zero (e.g. more than half of the values are identical), so
    # the rule is undefined: dividing by a zero width gives inf and `round`
    # raises OverflowError. Fall back to Sturges' rule, log2(n) + 1.
    bins = max(1, int(np.ceil(np.log2(len(x)) + 1)))