In [1]:
import sys
import os
# Add the 'project' directory (the parent of the current working directory) to
# the import path. The membership check keeps the cell idempotent: re-running it
# does not append the same entry again.
project_dir = os.path.abspath('..')
if project_dir not in sys.path:
    sys.path.append(project_dir)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display

In [3]:
# Parameters whose initial ("_i") and final ("_f") columns are compared below
pars_to_compare = [
    'z',
    '1-kap',
    'v',
    'p_M',
    'E_Hb',
    'E_Hp',
]

# Load results

In [4]:
# Result file to analyse. Alternative input files kept for reference:
#   '../data/raw/estimation_from_AmP_pars.csv'
#   '../data/estimation_runs/full_estimation_from_AmP_pars_subset_test_set.csv'
df = pd.read_csv('../data/estimation_runs/run_train_val_sets_until_minimum.csv', index_col=0)
df = df.rename_axis('species')

# Species with execution_time == 0 have not been run yet; keep only the others
already_run = df['execution_time'] > 0
df = df.loc[already_run]

# Add the complement of kap for both the initial (_i) and final (_f) values
for suffix in ('_i', '_f'):
    df['1-kap' + suffix] = 1 - df['kap' + suffix]

# Cast the flag columns to bool
for flag in ('convergence', 'error'):
    df[flag] = df[flag].astype(bool)

# A logged error message marks the run as failed even when 'error' is False,
# unless the message only reports that the maximum execution time was exceeded
max_execution_time_error_message = "Maximum execution time exceeded"
has_message = df['error_message'].notna()
is_timeout = df['error_message'] == max_execution_time_error_message
df.loc[has_message & ~is_timeout, 'error'] = True
df


Unnamed: 0_level_0,init_loss,z_i,kap_i,v_i,p_M_i,E_Hb_i,E_Hp_i,final_loss,z_f,kap_f,...,E_Hb_f,E_Hp_f,convergence,n_runs,n_iter,execution_time,error,error_message,1-kap_i,1-kap_f
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sepiella_inermis,0.466196,0.444090,0.73830,0.025902,882.4408,56.74000,16330.0000,0.422621,0.459775,0.746540,...,51.553179,17601.332536,True,7,3488,312.367851,False,,0.26170,0.253460
Peucaea_botterii,0.131787,1.701900,0.97544,0.048308,584.5174,177.20000,1937.0000,0.109257,0.909771,0.965543,...,63.534347,511.215271,False,39,19500,1006.346506,False,,0.02456,0.034457
Molothrus_bonariensis,0.142471,1.986500,0.82804,0.053568,687.9868,2176.00000,30820.0000,0.142409,1.982540,0.828091,...,2179.890185,30682.237689,False,1,500,54.134099,False,,0.17196,0.171909
Xiphophorus_maculatus,0.348674,0.964880,0.57892,0.022653,287.8024,12.04000,1646.0000,0.348674,0.964848,0.578890,...,12.040168,1646.300480,False,1,500,23.895588,False,,0.42108,0.421110
Cnidoglanis_macrocephalus,0.031688,9.231700,0.53392,0.025475,30.9836,148.30000,579500.0000,0.028419,8.944826,0.500467,...,162.263736,599751.616484,True,7,3472,175.172382,False,,0.46608,0.499533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Moina_weismanni,0.129960,0.036359,0.46386,0.025982,2795.7862,0.01609,0.1052,0.129959,0.036362,0.463765,...,0.016111,0.105354,False,1,500,29.738363,False,,0.53614,0.536235
Nipponia_nippon,0.107336,3.848800,0.92208,0.026915,1337.3902,2755.00000,94010.0000,0.107239,3.837297,0.924466,...,2630.105986,89634.062162,False,1,500,40.660928,False,,0.07792,0.075534
Lutjanus_kasmira,0.000000,0.000000,0.00000,0.000000,0.0000,0.00000,0.0000,0.000000,0.000000,0.000000,...,0.000000,0.000000,False,0,0,0.334265,True,The logical indices contain a true value outsi...,1.00000,1.000000
Micrathene_whitneyi,0.094604,2.353000,0.87902,0.032293,199.0872,2252.00000,32310.0000,0.045576,3.382121,0.874165,...,4928.151490,22668.047077,False,8,4000,163.745420,False,,0.12098,0.125835


In [5]:
def _parse_complex_string(value):
    """Convert a string such as '0.5+0.1i' (imaginary unit 'i') to a Python complex.

    Non-string values are returned unchanged. All whitespace is removed first
    because complex() rejects spaces around the central sign ('0.5 + 0.1i').
    Raises ValueError if the cleaned string is still not a valid complex literal.
    """
    if not isinstance(value, str):
        return value
    text = ''.join(value.split())
    # Swap only the trailing imaginary unit so other letters (e.g. in 'inf') stay intact
    if text.endswith('i'):
        text = text[:-1] + 'j'
    return complex(text)


def _flag_imaginary_loss(frame, column, message):
    """Mark rows of `frame` whose `column` value has a non-zero imaginary part.

    If the column has object dtype, its string entries are first parsed into
    complex numbers. `frame` is modified in place: flagged rows get
    error=True and error_message=message. Returns the boolean mask of flagged rows.
    """
    if frame[column].dtype == np.dtype('O'):
        frame[column] = frame[column].apply(_parse_complex_string)
    # astype(bool) keeps the mask usable as a boolean indexer even for an empty frame
    mask = frame[column].apply(lambda x: isinstance(x, complex) and x.imag != 0).astype(bool)
    frame.loc[mask, 'error'] = True
    frame.loc[mask, 'error_message'] = message
    return mask


# Flag species whose final or initial loss is not a real number
_flag_imaginary_loss(df, 'final_loss', 'Final loss has imaginary part')
imaginary_loss_mask = _flag_imaginary_loss(df, 'init_loss', 'Initial loss has imaginary part')

In [6]:
# Report how many of the loaded species converged and how many are flagged with an error
n_species = len(df)
n_converged = df['convergence'].sum()
n_failed = df['error'].sum()
print(f"{n_converged} out of {n_species} species converged ({n_converged/n_species*100:.2f}%)")
print(f"{n_failed} out of {n_species} species threw an error ({n_failed/n_species*100:.2f}%)")

126 out of 1926 species converged (6.54%)
15 out of 1926 species threw an error (0.78%)


In [7]:
# Frequency of each distinct logged error message (rows without a message are not counted)
error_message_counts = df['error_message'].value_counts()
error_message_counts

error_message
Maximum execution time exceeded                                                                       44
The logical indices contain a true value outside of the array bounds.                                  7
Folder for species "Deania_calcea" does not exist.                                                     1
Unable to perform assignment because the left and right sides have a different number of elements.     1
Out of memory.                                                                                         1
Arrays have incompatible sizes for this operation.                                                     1
Folder for species "Anadontostoma_chacunda" does not exist.                                            1
Dimensions of arrays being concatenated are not consistent.                                            1
Index in position 2 exceeds array bounds.                                                              1
predict_filter                           

In [8]:
# Runs that logged a message, ordered by message text, with iteration count and run time
columns_to_show = ['error_message', 'n_iter', 'execution_time']
df.loc[df['error_message'].notna(), columns_to_show].sort_values(by='error_message')

Unnamed: 0_level_0,error_message,n_iter,execution_time
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Macropus_rufus,Arrays have incompatible sizes for this operat...,0,1.327052
Saxicola_rubicola,Dimensions of arrays being concatenated are no...,0,315.972959
Anadontostoma_chacunda,"Folder for species ""Anadontostoma_chacunda"" do...",0,0.112043
Deania_calcea,"Folder for species ""Deania_calcea"" does not ex...",0,0.111158
Lepisma_saccharina,Index in position 2 exceeds array bounds.,0,0.772026
Ameiurus_nebulosus,Maximum execution time exceeded,0,54000.0
Thryssa_aestuaria,Maximum execution time exceeded,0,54000.0
Brevoortia_patronus,Maximum execution time exceeded,0,54000.0
Cynoglossus_canariensis,Maximum execution time exceeded,0,54000.0
Cyprinodon_bovinus,Maximum execution time exceeded,0,54000.0


In [13]:
# Print, one per line and sorted by name, the species whose run exceeded the time limit
timed_out = df['error_message'] == 'Maximum execution time exceeded'
timed_out_species = df.index[timed_out].sort_values()
print(*timed_out_species, sep='\n')

Ameiurus_melas
Ameiurus_nebulosus
Aphanopus_intermedius
Blicca_bjoerkna
Brevoortia_patronus
Cetengraulis_edentulus
Channa_argus
Chasmistes_liorus
Coilia_brachygnathus
Cyclonaias_asperata
Cynoglossus_canariensis
Cynoglossus_senegalensis
Cyprinodon_bovinus
Decapterus_macrosoma
Dionda_diaboli
Diplodus_annularis
Diplodus_sargus
Engraulis_anchoita
Etheostoma_zonistium
Faustina_faustina
Fusconaia_cuneolus
Gymnocharacinus_bergii
Lepturacanthus_savala
Lutjanus_synagris
Macruronus_novaezelandiae
Margaritifera_margaritifera
Mene_maculata
Menidia_extensa
Mercenaria_mercenaria
Morone_saxatilis
Mytilus_edulis
Oligosarcus_hepsetus
Oncorhynchus_nerka
Parvaspina_collina
Piaractus_brachypomus
Pomoxis_nigromaculatus
Pseudoplatystoma_fasciatum
Ptychocheilus_oregonensis
Rastrelliger_kanagurta
Sardinella_aurita
Stichopus_vastus
Synodontis_membranaceus
Thryssa_aestuaria
Truncilla_truncata


In [None]:
# Species to inspect individually. Names must match the index labels exactly
# (no surrounding whitespace); a name that does not match is silently dropped
# by the index intersection used to look the species up.
species_list = [
    'Diplectrum_formosum',
    'Turdus_merula',
    'Rhombosolea_plebeia',
    'Paranotothenia_magellanica',
    'Macquaria_ambigua',
    'Gallotia_galloti',
]
# Rows of the results table for those listed species that are present in its index
selected_species = df.index.intersection(species_list)
df.loc[selected_species]

# Analysis 

In [None]:
# Keep only the species whose run logged no error message
clean_species = df.index[df['error_message'].isna()]
par_gap_cols = [f"{p}_gap" for p in pars_to_compare]
gap_df = pd.DataFrame(index=clean_species, columns=['loss_diff', 'loss_gap', *par_gap_cols])

# Absolute and percentage decrease of the loss (values are aligned on the species index)
loss_drop = df['init_loss'] - df['final_loss']
gap_df['loss_diff'] = loss_drop
gap_df['loss_gap'] = loss_drop / df['init_loss'] * 100

# Absolute change of every compared parameter, relative to its initial value
for p, gap_col in zip(pars_to_compare, par_gap_cols):
    initial, final = df[f"{p}_i"], df[f"{p}_f"]
    gap_df[gap_col] = (initial - final).abs() / initial

# Sum of the relative parameter changes, used as a single distance measure
gap_df['par_dist_metric'] = gap_df[par_gap_cols].sum(axis=1)
gap_df.sort_index()

In [None]:
# Gap metrics for those listed species that are present in gap_df
selected_gap_species = gap_df.index.intersection(species_list)
gap_df.loc[selected_gap_species]

## Loss improvement

In [None]:
# Histogram of the absolute loss decrease (init_loss - final_loss) on a log-scaled x axis
fig, ax = plt.subplots()
sns.histplot(gap_df['loss_diff'], bins=10, kde=False, ax=ax, log_scale=True)
# 'loss_diff' is an absolute difference; the percentage improvement lives in 'loss_gap',
# so the axis is labelled accordingly
ax.set_xlabel('Absolute improvement in loss (initial - final)')

In [None]:
# Count the species whose percentage loss improvement stays below the cutoff
loss_gap_cutoff = 10  # in percentage
below_cutoff = gap_df['loss_gap'].lt(loss_gap_cutoff)
n_species_below_cutoff = below_cutoff.sum()
share_below_cutoff = n_species_below_cutoff / len(gap_df) * 100
print(f"There are {n_species_below_cutoff} ({share_below_cutoff:.2f} %) species with a loss improvement < {loss_gap_cutoff} %")

In [None]:
# Number of species whose loss improved by more than 10 %
gap_df['loss_gap'].gt(10).sum()

## Worst species

In [None]:
# Print the species whose loss improved by more than 80 % as single-quoted names,
# wrapped between the lines "nm = { ..." and "};"
large_improvement = gap_df.index[gap_df['loss_gap'] > 80]
print("nm = { ...")
for species_name in large_improvement:
    print(f"'{species_name}'")
print("};")

In [None]:
# List every species with its percentage loss improvement, largest first
ranked_loss_gap = gap_df['loss_gap'].sort_values(ascending=False)
for species_name, gap_pct in ranked_loss_gap.items():
    print(f"{species_name}: {gap_pct:.2f}%")

## Parameter differences

In [None]:
# Distribution of the summed relative parameter changes, on a log-scaled x axis with a KDE overlay
par_distance = gap_df['par_dist_metric']
fig, ax = plt.subplots()
sns.histplot(par_distance, ax=ax, bins=10, kde=True, log_scale=True)
ax.set_xlabel('Parameter distance metric')

In [None]:
# One histogram per compared parameter, laid out row by row on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
flat_axes = axes.ravel()
for i, par in enumerate(pars_to_compare):
    ax = flat_axes[i]
    par_gap = gap_df[f"{par}_gap"]
    sns.histplot(par_gap, ax=ax, bins=10, kde=True, log_scale=True)
    ax.set_xlabel(f'Relative difference in {par}')
    ax.set_ylabel('')

In [None]:
# Summary statistics of the loss and parameter gap columns
gap_summary = gap_df.describe()
gap_summary

In [None]:
# Percentage loss improvement against the parameter distance metric, both axes log-scaled
ax = sns.scatterplot(x='loss_gap', y='par_dist_metric', data=gap_df)
ax.set(xscale='log', yscale='log')
ax.set_xlabel('Relative improvement in loss (%)')
ax.set_ylabel('Parameter distance metric')

## Specific cases to investigate