# Exploratory Data Analysis: control group



## Generate data

In [None]:
# | eval: false

import pandas as pd
import os

# Section: Generate-paths
# EXP_DIR is the experiment root; the three sub-folders below are where the
# cells of this notebook look for the tgt_*, O_* and E_* feather files.
EXP_DIR = '/mnt/data/sur/users/mrivera/Controls/exp_20251125'
TGT_DIR, RAW_DIR, POST_DIR = (
    os.path.join(EXP_DIR, subdir)
    for subdir in ("GNN-targets", "raw-ODEs", "Post-exts")
)


In [None]:
# | eval: false

import pyarrow.feather as ft
import numpy as np

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool

def read_data(row):
    """Build the per-species table for one simulation.

    NOTE: a later cell defines another function with the same name; once that
    cell has run, it replaces this one.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record of the simulation parameters. Must provide ``"id"`` (used to
        build the feather file names) and ``"keys"`` (1-based position of the
        keystone species).

    Returns
    -------
    pandas.DataFrame
        Columns ``new_ext``, ``BC_diss`` and ``K_s`` from the target file (the
        last two rounded to 5 decimals), plus ``rel_freq`` and ``label``
        (``"key"`` at the keystone position, ``"other"`` elsewhere).
    """
    sim_id = row["id"]
    # Cast before indexing: a row taken from ``DataFrame.iterrows`` may carry the
    # key as a float, which cannot be used as a list index.
    sk_idx = int(row["keys"]) - 1   # Convert to zero-indexed

    # Load target metrics
    targets = ft.read_table(
        os.path.join(TGT_DIR, f'tgt_{sim_id}.feather'),
        columns=['new_ext', 'BC_diss', 'K_s'],
    )
    df = targets.to_pandas()
    df['BC_diss'] = df['BC_diss'].round(5)
    df['K_s'] = df['K_s'].round(5)

    # Section: Load relative frequency BEFORE perturbation
    # The last column of the raw table is normalised to sum to 1; an all-zero
    # column is kept as zeros to avoid dividing by zero.
    raw = ft.read_table(os.path.join(RAW_DIR, f'O_{sim_id}.feather'))
    freq = raw.column(-1).to_numpy()
    freq_sum = freq.sum()
    df['rel_freq'] = freq / freq_sum if freq_sum != 0 else freq * 0

    # Section: Add labels
    # Sized from the table itself instead of a fixed 30 entries.
    labels = ["other"] * len(df)
    labels[sk_idx] = "key"
    df['label'] = labels
    return df

#  Load-data
data = pd.read_csv(f'{EXP_DIR}/Simulation-parameters.tsv', sep="\t")

# One plain dict per simulation. `iterrows()` would hand out a Series per row,
# which does not preserve column dtypes (integer ids/keys can come back as
# floats); dict records keep every value in its column's own type and still
# support the row["id"] / row["keys"] lookups done by `read_data`.
rows = data.to_dict("records")

# `Executor.map` returns results in the order of `rows`.
with ProcessPoolExecutor() as ex:           # uses all cores automatically
    results = list(ex.map(read_data, rows))

# Stack the per-simulation tables into a single frame with a fresh index.
combined = pd.concat(results, ignore_index=True)


In [None]:
# | eval: false

import pyarrow.feather as ft
import numpy as np

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import Pool

# Scratch values for stepping through the body of `read_data` by hand on the
# first simulation.
# NOTE(review): this used to read `ids_vector[0]`, a name that no cell in this
# notebook defines (NameError on a fresh kernel). The 'id' column of `data` is
# assumed to be the intended source — confirm.
id = data.loc[0, 'id']
sk = data.loc[0, 'keys']
#row = rows[0]

def _key_rel_freq(path, idx):
    """Return entry ``idx`` of the last column of the feather table at ``path``, divided by that column's sum."""
    freq = ft.read_table(path).column(-1).to_numpy()
    total = freq.sum()
    # Scalar fallback: the result is always a single number, including when the
    # column sums to zero (an array here would break downstream plotting).
    return freq[idx] / total if total != 0 else 0.0


def read_data(row):
    """Split one simulation's metrics into keystone and non-keystone parts.

    Parameters
    ----------
    row : mapping or pandas.Series
        One record of the simulation parameters. Must provide ``'id'`` (used to
        build the feather file names) and ``'keys'`` (1-based position of the
        keystone species).

    Returns
    -------
    tuple
        ``(ext, Bc, Ks, key_Brel, key_Arel, key_ext, key_ks, key_bc)`` where
        ``ext``/``Bc``/``Ks`` are the ``new_ext``/``BC_diss``/``K_s`` arrays with
        the keystone entry removed (``Bc`` and ``Ks`` rounded to 5 decimals),
        ``key_Brel``/``key_Arel`` are the keystone's relative frequency read
        from the raw and post-perturbation files, and ``key_ext``/``key_ks``/
        ``key_bc`` are the keystone's own metric values.
    """
    sim_id = row['id']
    # Integer cast: a row produced by `DataFrame.iterrows` may hold the key as a
    # float, which would break both the array index and the `-S{sk}` file name.
    sk = int(row['keys'])
    sk_idx = sk - 1             # Convert to zero-indexed

    # Load target metrics
    targets = ft.read_table(
        os.path.join(TGT_DIR, f'tgt_{sim_id}.feather'),
        columns=['new_ext', 'BC_diss', 'K_s'],
    )
    ext = targets['new_ext'].to_numpy()
    Bc = np.round(targets['BC_diss'].to_numpy(), 5)
    Ks = np.round(targets['K_s'].to_numpy(), 5)

    # Section: Load relative frequency BEFORE perturbation
    key_Brel = _key_rel_freq(os.path.join(RAW_DIR, f'O_{sim_id}.feather'), sk_idx)
    # Section: Load relative frequency AFTER perturbation
    key_Arel = _key_rel_freq(os.path.join(POST_DIR, f'E_{sim_id}-S{sk}.feather'), sk_idx)

    # Section: Extract keystone values (before the entry is dropped below)
    key_ext = ext[sk_idx]
    key_bc = Bc[sk_idx]
    key_ks = Ks[sk_idx]

    # Remove keystone index from arrays
    ext = np.delete(ext, sk_idx)
    Bc = np.delete(Bc, sk_idx)
    Ks = np.delete(Ks, sk_idx)
    return ext, Bc, Ks, key_Brel, key_Arel, key_ext, key_ks, key_bc

# One plain dict per simulation. `iterrows()` would hand out a Series per row,
# which does not preserve column dtypes (integer ids/keys can come back as
# floats); dict records keep every value in its column's own type.
rows = data.to_dict("records")

# `Executor.map` returns results in the order of `rows`.
with ProcessPoolExecutor() as ex:           # uses all cores automatically
    results = list(ex.map(read_data, rows))

# Transpose the list of per-simulation tuples into one list per returned field.
ext, Bc, Ks, key_Brel, key_Arel, key_ext, key_ks, key_bc = map(list, zip(*results))

# Flatten the non-keystone arrays, simulation after simulation.
other_ext = np.concatenate(ext)
other_bc  = np.concatenate(Bc)
other_ks  = np.concatenate(Ks)

# Not-keystone
other_data = {
    'extinctions': other_ext,
    'BrayCurtis': other_bc,
    'keystone': other_ks
}

# Keystone-species (one value per simulation in every entry)
key_data = {
    'extinctions': key_ext,
    'BrayCurtis': key_bc,
    'keystone': key_ks,
    'Before-rel': key_Brel,
    'After-rel': key_Arel,
}


## Testing order of parallelization

I think the issue may come from how the data is returned, so I’ll make a dummy function to check if the order changes.

*conclusion:* `ProcessPoolExecutor()` returns data in order; it doesn’t change it.

In [None]:
from concurrent.futures import ProcessPoolExecutor

def dummy(x, y, z):
    """Return ``x`` squared, ``y`` squared and ``z`` unchanged, as a tuple."""
    x_squared = x * x
    y_squared = y * y
    return x_squared, y_squared, z


# Three aligned inputs; the letter travels with its numbers so any reordering
# by the executor would show up in the printed result.
x = list(range(1, 5))
y = [10 * value for value in x]
z = list('abcd')

with ProcessPoolExecutor() as ex:           # uses all cores automatically
    results = [item for item in ex.map(dummy, x, y, z)]

print(results)

## Histogram relative frequency

Histogram of the keystone species' relative frequency before the community perturbation.

In [None]:
# | eval: false

import matplotlib.pyplot as plt

# No `plt.clf()` here: with no open figure it would create an extra, empty
# figure in addition to the one made by `plt.subplots`.
fig, ax = plt.subplots(figsize=(12, 6))

# `key_data` has no 'relative' entry; the pre-perturbation relative frequency
# of the keystone species is stored under 'Before-rel'.
ax.hist(key_data['Before-rel'], bins=20, color='#3498DB', alpha=0.6, edgecolor='black', linewidth=0.5)

# Styling
ax.set_xlabel('Relative frequency', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=14, fontweight='bold')
ax.set_title('Relative frequency of keystone specie', fontsize=16, fontweight='bold', pad=20)

# Grid
ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.7)

# Add background color
ax.set_facecolor('#F8F9FA')

plt.tight_layout()
plt.savefig('/mnt/data/sur/users/mrivera/Plots/KeystoneRel-controls.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Read the saved histogram PNG back from disk and render it as a bare image
# (no ticks or frame).
img = mpimg.imread('/mnt/data/sur/users/mrivera/Plots/KeystoneRel-controls.png')
plt.figure(figsize=(10, 6))
plt.imshow(img)
plt.gca().set_axis_off()
plt.show()

## Scatter Plot relative frequency before vs after perturbation

I want to examine the growth of my keystone species in the community before perturbation and after perturbation.

In [None]:

# | eval: false

import numpy as np
import matplotlib.pyplot as plt

# Keystone relative frequency, one value per community, before and after the
# perturbation.
before = key_data['Before-rel']
after = key_data['After-rel']

# One x position per community (1-based), sized from the data instead of a
# fixed 1000 so the plot also works for a different number of simulations.
x_coords = list(range(1, len(before) + 1))

# No `plt.clf()` here: with no open figure it would create an extra, empty
# figure in addition to the one made by `plt.figure`.
plt.figure(figsize=(12, 6))

# Draw points
plt.scatter(x_coords, before, alpha=0.6, s=50, color='blue', label='Before')
plt.scatter(x_coords, after, alpha=0.6, s=50, color='red', label='After')

# Labels
plt.xlabel('Community', fontsize=12, fontweight='bold')
plt.ylabel('Relative-freq', fontsize=12, fontweight='bold')
plt.title('Keystone specie relative frequency', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
# The two series are time points of the same species, so the legend is titled
# accordingly (it previously read 'Species Type').
plt.legend(loc='upper right', frameon=True, shadow=True, fontsize=12, title='Perturbation', title_fontsize=12)
plt.savefig('/mnt/data/sur/users/mrivera/Plots/keystone_relative.png', dpi=300)

In [None]:
# Element-wise change in the keystone's relative frequency for every community.
# A positive entry means the value before the perturbation was the larger one.
before = np.array(key_data['Before-rel'])
after = np.array(key_data['After-rel'])

result = np.subtract(before, after)
print(result)

## Barplot extinctions

In [None]:
# | eval: false

import matplotlib.pyplot as plt
import numpy as np

# Two side-by-side panels: ax1 and ax2. No `plt.clf()` beforehand: with no open
# figure it would create an extra, empty figure next to this one.
fig, (ax1, ax2) = plt.subplots(1, 2)

# Log-scaled histogram counts used by the two bar plots
def gen_counts(data, n_values=30):
    """Count how often each integer value occurs and log-scale the counts.

    Parameters
    ----------
    data : array-like
        Values to count (here: number of extinctions).
    n_values : int, optional
        Number of distinct integer values ``0 .. n_values - 1`` that each get
        their own bin. Defaults to 30.

    Returns
    -------
    counts : numpy.ndarray
        ``log1p`` of the count in each bin (``n_values`` entries).
    bins : numpy.ndarray
        Bin edges ``0, 1, ..., n_values``; ``bins[:-1]`` is the value each
        count belongs to.
    """
    # np.histogram closes the last bin on the right, so the edges must run one
    # past the largest value; with edges 0..29 the values 28 and 29 would be
    # counted together in a single [28, 29] bin.
    edges = np.arange(0, n_values + 1, 1)
    counts, bins = np.histogram(data, bins=edges)
    counts = np.log1p(counts)
    return counts, bins

# Keystone species counts (left panel)
ct, bin = gen_counts(key_data['extinctions'])
ax1.bar(bin[:-1], height=ct, width=0.5, align='center', label='Keystone', color='red')
ax1.legend(loc='upper right')

# Other species counts (right panel)
ct, bin = gen_counts(other_data['extinctions'])
ax2.bar(bin[:-1], height=ct, width=0.5, align='center', label='Others', color='blue')
ax2.legend(loc='upper right')

# Axis labels: bar heights are the log1p-scaled counts returned by gen_counts
ax1.set_ylabel('log-freq')
ax2.set_ylabel('log-freq')

ax1.set_xlabel('# of extinctions')
ax2.set_xlabel('# of extinctions')
fig.suptitle('Control data EDA', fontsize=14)

# Save once, after both panels are complete
plt.savefig('/mnt/data/sur/users/mrivera/Plots/boxplots-controls.png', format='png')

## Keystoneness and Bray-Curtis

In [None]:
# | eval: false

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Violin plot of the log1p-transformed 'keystone' metric: non-keystone species
# (position 1) against keystone species (position 2).
# No `plt.clf()` here: with no open figure it would create an extra, empty
# figure in addition to the one made by `plt.subplots`.
fig, ax = plt.subplots(figsize=(10, 6))
data1 = np.log1p(other_data['keystone'])
data2 = np.log1p(key_data['keystone'])
parts = ax.violinplot([data1, data2],
                      positions=[1, 2],
                      showmeans=True,
                      showmedians=True)

ax.set_xticks([1, 2])
ax.set_xticklabels(['Others', 'Keystone species'])
# The plotted values are log1p of the 'keystone' entries, not extinction counts
ax.set_ylabel('log(1 + Keystoness)')
plt.savefig('/mnt/data/sur/users/mrivera/Plots/Violin-controls.png', format='png')

In [None]:
# | eval: false

import matplotlib.pyplot as plt

plt.clf()
fig, ax = plt.subplots(figsize=(12, 6))

# Canvas look: light background with a faint dashed grid
ax.set_facecolor('#F8F9FA')
ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.7)

# Two semi-transparent histograms drawn on the same axes so the distributions
# of the 'keystone' metric can be compared directly
ax.hist(
    other_data['keystone'],
    bins=50,
    color='#3498DB',
    alpha=0.6,
    edgecolor='black',
    linewidth=0.5,
    label='Others',
)
ax.hist(
    key_data['keystone'],
    bins=50,
    color='#E74C3C',
    alpha=0.6,
    edgecolor='black',
    linewidth=0.5,
    label='Keystone',
)

# Titles and axis text
ax.set_title('Keystoness Comparison: Keystone vs Other Species', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Keystoness', fontsize=14, fontweight='bold')
ax.set_ylabel('Frequency', fontsize=14, fontweight='bold')

# Legend (needs the labelled histograms above)
ax.legend(loc='upper right', frameon=True, shadow=True, fontsize=12, title='Species Type', title_fontsize=12)

fig.tight_layout()
fig.savefig('/mnt/data/sur/users/mrivera/Plots/histogram-controls.png', format='png', dpi=300, bbox_inches='tight')

In [None]:
# | eval: false

import numpy as np
import matplotlib.pyplot as plt

# 'keystone' metric of the non-keystone species and of each community's keystone.
# NOTE: this rebinds `data`, which earlier cells use for the simulation-parameters
# DataFrame; re-run the loading cell before re-running those cells.
data = other_data['keystone']
data2 = key_data['keystone']

# x = 1-based community index.
# `data` is laid out in runs of 29 consecutive values per community
# (NOTE(review): presumably 30 species minus the keystone — confirm), so every
# index is repeated 29 times; `data2` has one value per community, so its x
# positions are sized from the data instead of a fixed 1000.
x_coords = np.repeat(range(1, len(data)//29 + 2), 29)[:len(data)]
x2_coords = list(range(1, len(data2) + 1))

# No `plt.clf()` here: with no open figure it would create an extra, empty
# figure in addition to the one made by `plt.figure`.
plt.figure(figsize=(12, 6))
plt.scatter(x_coords, data, alpha=0.6, s=50, color='blue', label='others')
plt.scatter(x2_coords, data2, alpha=0.6, s=50, color='red', label='key')
plt.xlabel('Community', fontsize=12, fontweight='bold')
plt.ylabel('Keystoness', fontsize=12, fontweight='bold')
plt.title('Controls keystoness distribution', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend(loc='upper right', frameon=True, shadow=True, fontsize=12, title='Species Type', title_fontsize=12)
plt.savefig('/mnt/data/sur/users/mrivera/Plots/keystone_scatter.png', dpi=300)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Display the scatter plot written by the preceding cell.
# NOTE(review): this used to open 'grouped_scatter.png', a file no cell in this
# notebook writes; the preceding cell saves 'keystone_scatter.png' — confirm
# this is the intended image.
img = mpimg.imread('/mnt/data/sur/users/mrivera/Plots/keystone_scatter.png')
plt.figure(figsize=(10, 6))
plt.imshow(img)
plt.axis('off')
plt.show()

## Bray-Curtis dissimilarity

In [None]:
# | eval: false

import numpy as np
import matplotlib.pyplot as plt

# 'BrayCurtis' values of the non-keystone species and of each community's keystone.
# NOTE: this rebinds `data`, which earlier cells use for the simulation-parameters
# DataFrame; re-run the loading cell before re-running those cells.
data = other_data['BrayCurtis']
data2 = key_data['BrayCurtis']

# x = 1-based community index.
# `data` is laid out in runs of 29 consecutive values per community
# (NOTE(review): presumably 30 species minus the keystone — confirm), so every
# index is repeated 29 times; `data2` has one value per community, so its x
# positions are sized from the data instead of a fixed 1000.
x_coords = np.repeat(range(1, len(data)//29 + 2), 29)[:len(data)]
x2_coords = list(range(1, len(data2) + 1))

# No `plt.clf()` here: with no open figure it would create an extra, empty
# figure in addition to the one made by `plt.figure`.
plt.figure(figsize=(12, 6))
plt.scatter(x_coords, data, alpha=0.6, s=50, color='blue', label='others')
plt.scatter(x2_coords, data2, alpha=0.6, s=50, color='red', label='key')
plt.xlabel('Community', fontsize=12, fontweight='bold')
plt.ylabel('Bray-Curtis', fontsize=12, fontweight='bold')
plt.title('Controls distribution', fontsize=16, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.legend(loc='upper right', frameon=True, shadow=True, fontsize=12, title='Species Type', title_fontsize=12)
plt.savefig('/mnt/data/sur/users/mrivera/Plots/braycurtis_scatter.png', dpi=300)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Read the saved Bray-Curtis scatter PNG back from disk and render it as a bare
# image (no ticks or frame).
img = mpimg.imread('/mnt/data/sur/users/mrivera/Plots/braycurtis_scatter.png')
plt.figure(figsize=(10, 6))
plt.imshow(img)
plt.gca().set_axis_off()
plt.show()