In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
import os

# Add the 'project' directory to the path
sys.path.append(os.path.abspath('..'))

from project_code.data.preprocess_data import encode_taxonomy, encode_eco_codes, drop_species_with_missing_values, save_types_of_col, split_and_save_dataset
from project_code.data.imputation import *

In [3]:
deb_models = ['std', 'stf', 'stx', 'abj']
parameter_cols = ['p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp']
taxonomy_cols = ['family', 'order', 'class', 'phylum']
ecocode_cols = ['climate', 'ecozone', 'habitat', 'embryo', 'migrate', 'food', 'gender', 'reprod']
age_data_cols = ['ab', 'ah', 'aj', 'ax', 'ap', 'am']
time_since_birth_data_cols = ['tg', 'tb', 'tj', 'tx', 'tp']
weight_data_cols = ['Wwb', 'Wwj', 'Wwx', 'Wwp', 'Wwi']
length_data_cols = ['Lb', 'Lj', 'Lx', 'Lp', 'Li']
other_cols = ['d_V', 'Ri', 'T_typical', 't_0', 'model']

In [4]:
def print_missing_values_per_column(df, percentage=True):
    for c in df.columns:
        n_missing = pd.isna(df[c]).sum()
        if n_missing:
            if percentage:
                print(f"{n_missing/len(df)*100:.1f} % missing values in column {c}")
            else:
                print(f"{n_missing} missing values in column {c}")

# Loading raw data

In [5]:
processed_dataset_save_folder = '../data/processed/'

In [6]:
raw_data = pd.read_csv('../data/raw/dataset_matlab_20250324.csv', index_col=0)
raw_data.dropna(how='all', inplace=True)
raw_data.index.name = 'species'
for col in raw_data.columns:
    if 'estim_' in col:
        raw_data[col] = raw_data[col].astype('bool')
raw_data

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_k_J,estim_E_Hb,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,False,True,True,True,0.09,278.15,1.0,,abj,2.5
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,False,True,True,True,0.20,291.15,1.0,153.832271,abj,2.5
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,False,True,True,True,0.20,301.05,1.0,,abj,2.8
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,False,True,True,True,0.09,288.15,1.0,,abj,2.1
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,False,True,True,True,0.21,291.15,1.0,,abj,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,False,True,0.20,290.25,1.0,,std,2.5
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,True,True,False,True,0.28,314.75,1.0,,std,2.5
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,True,True,False,True,0.28,314.75,1.0,,std,2.5
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,False,True,True,True,0.09,280.15,1.0,,abj,2.5


In [7]:
raw_data['model'].value_counts()

model
abj    2518
std    2261
stx     733
stf      51
abp      16
ssj      12
hex      11
hep       9
hax       6
asj       4
sbp       4
Name: count, dtype: int64

## Load DEB model predictions 

In [8]:
mat_level_deb_predictions_df = pd.read_csv('../data/deb_model_predictions/metamorphosis_predictions.csv', index_col=0)
mat_level_deb_predictions_df.dropna(how='all', inplace=True)
mat_level_deb_predictions_df.index.name = 'species'
mat_level_deb_predictions_df

Unnamed: 0_level_0,L_b,L_j,L_p,L_i,a_b,a_j,a_p,success,execution_time,error,error_message
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abatus_cordatus,0.088737,0.213679,0.970146,5.665792,12.717214,39.513908,155.726259,1,0.318058,,
Abbottina_rivularis,0.069836,0.107126,0.830806,3.713308,12.510862,25.950153,262.793966,1,0.317563,,
Ablennes_hians,0.054240,0.111324,3.539144,22.303548,8.068393,15.995400,384.730232,1,0.317325,,
Abra_segmentum,0.014976,0.039564,0.321871,0.805831,3.363052,19.586638,168.217503,1,0.317728,,
Abralia_trigonura,0.021490,0.240470,0.398092,1.189914,7.724631,38.213879,47.266564,1,0.436186,,
...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.121635,,
Zosterops_lateralis,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.122307,,
Zosterops_virens,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.123072,,
Zygochlamys_patagonica,0.012070,0.017142,1.094880,2.103472,2.060789,3.184606,286.577016,1,0.123843,,


In [9]:
# Drop species for which an error occurred
mat_level_deb_predictions_df = mat_level_deb_predictions_df[mat_level_deb_predictions_df['error_message'].isna()]
# Drop species for which mat levels were not computed
mat_level_deb_predictions_df['success'] = mat_level_deb_predictions_df['success'].astype('bool')
mat_level_deb_predictions_df = mat_level_deb_predictions_df[mat_level_deb_predictions_df['success']]
mat_level_deb_predictions_df

Unnamed: 0_level_0,L_b,L_j,L_p,L_i,a_b,a_j,a_p,success,execution_time,error,error_message
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abatus_cordatus,0.088737,0.213679,0.970146,5.665792,12.717214,39.513908,155.726259,True,0.318058,,
Abbottina_rivularis,0.069836,0.107126,0.830806,3.713308,12.510862,25.950153,262.793966,True,0.317563,,
Ablennes_hians,0.054240,0.111324,3.539144,22.303548,8.068393,15.995400,384.730232,True,0.317325,,
Abra_segmentum,0.014976,0.039564,0.321871,0.805831,3.363052,19.586638,168.217503,True,0.317728,,
Abralia_trigonura,0.021490,0.240470,0.398092,1.189914,7.724631,38.213879,47.266564,True,0.436186,,
...,...,...,...,...,...,...,...,...,...,...,...
Zingel_asper,0.072276,1.339119,1.339186,4.327728,15.007628,275.414168,275.418625,True,0.141881,,
Zoarces_americanus,0.200904,0.200904,2.305288,9.833431,24.765926,24.765926,381.590306,True,0.143275,,
Zoarces_elongatus,0.100263,0.846983,1.072204,8.840436,19.698527,227.983173,254.310758,True,0.143960,,
Zoarces_viviparus,0.391950,0.409293,1.218718,4.070480,41.297071,43.593709,162.143198,True,0.109038,,


In [10]:
print_missing_values_per_column(raw_data)

52.2 % missing values in column E_Hj
69.0 % missing values in column E_Hx
0.2 % missing values in column E_Hp
64.2 % missing values in column migrate
44.0 % missing values in column ab
98.4 % missing values in column ah
99.2 % missing values in column aj
100.0 % missing values in column ax
95.8 % missing values in column ap
0.9 % missing values in column am
87.0 % missing values in column tg
99.8 % missing values in column tb
96.0 % missing values in column tj
69.2 % missing values in column tx
52.6 % missing values in column tp
13.8 % missing values in column Wwb
97.9 % missing values in column Wwj
94.0 % missing values in column Wwx
62.7 % missing values in column Wwp
7.9 % missing values in column Wwi
77.0 % missing values in column Lb
94.1 % missing values in column Lj
99.7 % missing values in column Lx
37.0 % missing values in column Lp
29.9 % missing values in column Li
18.6 % missing values in column Ri
97.4 % missing values in column Ni
92.3 % missing values in column GSI
99.6 

# Preprocessing

In [11]:
df = raw_data.copy()

## Remove species where 'z' (or p_Am) is not estimated

In [12]:
df = df.loc[df['estim_z']]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_k_J,estim_E_Hb,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,False,True,True,True,0.09,278.15,1.0,,abj,2.5
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,False,True,True,True,0.20,291.15,1.0,153.832271,abj,2.5
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,False,True,True,True,0.20,301.05,1.0,,abj,2.8
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,False,True,True,True,0.09,288.15,1.0,,abj,2.1
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,False,True,True,True,0.21,291.15,1.0,,abj,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,False,True,0.20,290.25,1.0,,std,2.5
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,True,True,False,True,0.28,314.75,1.0,,std,2.5
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,True,True,False,True,0.28,314.75,1.0,,std,2.5
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,False,True,True,True,0.09,280.15,1.0,,abj,2.5


## Drop species that were not estimated until the loss function minimum

In [13]:
species_not_in_minimum = ['Alasmidonta_heterodon', 'Ameiurus_melas', 'Ameiurus_nebulosus', 'Betta_splendens', 'Cyprinodon_variegatus', 'Dionda_diaboli', 'Stichopus_vastus', 'Teredo_navalis']
df = df.drop(species_not_in_minimum)

## Fill missing data with DEB model predictions

In [14]:
df['s_M'] = mat_level_deb_predictions_df['L_j'] / mat_level_deb_predictions_df['L_b']
df['estim_s_M'] = df[['estim_p_M', 'estim_v', 'estim_kap', 'estim_k_J', 'estim_E_Hb', 'estim_E_Hj']].any(axis=1)
df.loc[df['model'].isin(['std', 'stx', 'stf']), 's_M'] = 1
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


In [15]:
df[(df['model'] == 'abj') & (df['s_M'].isna())]

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


## Remove species with invalid data or parameter sets

In [16]:
# Remove species with incorrect maturity values
df = df[df.apply(check_column_values_increase, axis=1, args=(['E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'],))]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


## Fill missing weight data from length data

In [17]:
df = df.apply(impute_weight_and_length_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


In [18]:
# Check that all weights are increasing
df = df[df.apply(check_column_values_increase, args=(WEIGHT_COLS,), axis=1,)]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


## Fill missing weight puberty for Aves species

In [19]:
df = df.apply(impute_weight_at_puberty_for_aves, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


## Fill missing age data with time since birth data

In [20]:
df = df.apply(impute_age_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


In [21]:
# Check that all ages are increasing
df = df[df.apply(check_column_values_increase, args=(AGE_COLS,), axis=1,)]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


In [22]:
discarded_species = raw_data.loc[raw_data.index.difference(df.index), ['model', 'class', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'] + LENGTH_COLS + WEIGHT_COLS + AGE_COLS]
discarded_species['lengths_check'] = discarded_species.apply(check_column_values_increase, args=(LENGTH_COLS,), axis=1)
discarded_species['mat_levels_check'] = discarded_species.apply(check_column_values_increase, args=(['E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'],), axis=1)
discarded_species['weights_check'] = discarded_species.apply(check_column_values_increase, args=(WEIGHT_COLS,), axis=1)
discarded_species['ages_check'] = discarded_species.apply(check_column_values_increase, args=(AGE_COLS,), axis=1)
discarded_species[discarded_species['model'].isin(deb_models)]

Unnamed: 0_level_0,model,class,E_Hb,E_Hj,E_Hx,E_Hp,Lb,Lj,Lx,Lp,...,Wwi,ab,aj,ax,ap,am,lengths_check,mat_levels_check,weights_check,ages_check
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alasmidonta_heterodon,abj,Bivalvia,9.0932e-08,1.148115e-07,,0.1310444,0.02,0.03,,2.0,...,3.5,14.586587,,,,5807.852507,True,True,True,True
Ameiurus_melas,abj,Actinopterygii,0.4315,0.4457,,95700.0,,,,45.0,...,3970.0,2.030681,,,,2181.186007,True,True,True,True
Ameiurus_nebulosus,abj,Actinopterygii,1.521145,2115.45,,41077.29,,,,25.0,...,2080.0,5.766593,,,,2730.392029,True,True,True,True
Amphioctopus_aegina,abj,Cephalopoda,0.004817,6.444,,622.6,,,,,...,156.0,7.635703,,,,66.760246,True,True,True,True
Amphioctopus_fangsiao,abj,Cephalopoda,36.16,321.6,,18810.0,,,,,...,100.0,,,,,113.898186,True,True,True,True
Betta_splendens,abj,Actinopterygii,0.0006470643,0.1929314,,21.70404,,,,4.0,...,4.7,1.210874,,,,1767.875565,True,True,True,True
Caretta_caretta_MED,std,Reptilia,22320.0,,,58580000.0,4.04,,,65.75,...,87000.0,28.735889,,,,61.632854,True,True,True,True
Ceriodaphnia_dubia,std,Branchiopoda,0.00528555,,,0.02385632,0.036,,,0.063,...,3.9e-05,1.265535,,,,37.333294,True,True,True,True
Ceriodaphnia_lacustris,std,Branchiopoda,0.004472559,,,0.02730684,,,,,...,2.8e-05,2.0,,,,93.0,True,True,True,True
Chromis_chromis,abj,Actinopterygii,0.04375,0.7261,,6676.0,0.29,0.7,,5.28,...,22.92,4.405199,14.961052,,606.753784,3285.0,True,True,True,True


## Fill missing reproduction rate data with total reproduction

In [23]:
df = df.apply(impute_reproduction_rate_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,estim_E_Hj,estim_E_Hp,d_V,T_typical,f,t_0,model,completeness,s_M,estim_s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,True,True,0.09,278.15,1.0,,abj,2.5,2.408004,True
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,True,True,0.20,291.15,1.0,153.832271,abj,2.5,1.533968,True
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,True,True,0.20,301.05,1.0,,abj,2.8,2.052446,True
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,True,True,0.09,288.15,1.0,,abj,2.1,2.641807,True
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,True,True,0.21,291.15,1.0,,abj,2.3,11.189711,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,False,True,0.20,290.25,1.0,,std,2.5,1.000000,True
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,False,True,0.28,314.75,1.0,,std,2.5,1.000000,True
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,True,True,0.09,280.15,1.0,,abj,2.5,1.420209,True


## Statistics on imputed values

## Handling acceleration and weaning

In [24]:
df['metamorphosis'] = df['model'] == 'abj'
df['weaning'] = df['model'] == 'stx'
df['foetus'] = (df['model'] == 'stf') | (df['model'] == 'stx')
#df.drop(columns=['model'], inplace=True)

## Handle taxonomic info

In [25]:
# Create genus taxon from species name
df['genus'] = df.index.map(lambda s: s.split('_')[0])

## Save dataset after imputation

In [26]:
df.to_csv('../data/interim/filled_data.csv', index=True)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,f,t_0,model,completeness,s_M,estim_s_M,metamorphosis,weaning,foetus,genus
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.777120,0.027220,13.844900,2393.823700,5.047000e-06,0.487600,6.941000,,1403.000000,...,1.0,,abj,2.5,2.408004,True,True,False,False,Abatus
Abbottina_rivularis,53.617532,0.971489,0.022091,21.517909,5226.313884,2.648805e-07,0.052719,0.192017,,128.863865,...,1.0,153.832271,abj,2.5,1.533968,True,True,False,False,Abbottina
Ablennes_hians,231.831215,0.996902,0.022324,21.267776,5227.569153,1.476023e-09,0.002605,0.022575,,955.472281,...,1.0,,abj,2.8,2.052446,True,True,False,False,Ablennes
Abra_segmentum,8.266674,0.930000,0.020926,25.204100,2349.963100,7.505000e-07,0.000601,0.011460,,10.940000,...,1.0,,abj,2.1,2.641807,True,True,False,False,Abra
Abralia_trigonura,109.484284,0.980810,0.009515,1009.810800,5492.358000,1.526000e-10,0.001511,2.628000,,14.090000,...,1.0,,abj,2.3,11.189711,True,True,False,False,Abralia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.856433,0.977551,0.020240,17.132204,5231.466494,1.513136e-07,0.003956,,,1576.172765,...,1.0,,std,2.5,1.000000,True,False,False,False,Zosterisessor
Zosterops_lateralis,2594.395445,0.964558,0.021701,2905.710028,7322.263049,2.751121e-12,41.924067,,226.319058,482.019783,...,1.0,,std,2.5,1.000000,True,False,False,False,Zosterops
Zosterops_virens,1036.789660,0.991031,0.028112,850.429303,7321.292242,3.050644e-12,17.252003,,81.773428,232.181719,...,1.0,,std,2.5,1.000000,True,False,False,False,Zosterops
Zygochlamys_patagonica,64.787878,0.943100,0.020238,41.254100,2342.387800,4.982000e-08,0.000251,0.000722,,423.675000,...,1.0,,abj,2.5,1.420209,True,True,False,False,Zygochlamys


# Processed dataset

## Define options

In [27]:
taxonomy_class_options = {
    'class': ['Aves', 'Actinopterygii', 'Reptilia', 'Chondrichthyes', 'Amphibia',
       'Mammalia', 'Bivalvia', 'Branchiopoda', 'Malacostraca'],
}
taxonomy_include_other = {
    'class': True
}



In [28]:
eco_code_options = {
    'climate': ['A', 'B', 'C', 'D', 'E'],
    'habitat': ['T', 'F', 'S', 'M'],
    'migrate': ['T'],
    'food': ['P', 'O', 'H', 'C'],
}

eco_code_include_other = {
    'climate': False,
    'habitat': False,
    'migrate': False,
    'food': True,
}


## Bijection datasets

In [29]:
bijection_df_cols = [
    'ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwj', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    'p_Am', 'kap', 'v', 'p_M', 'E_Hb', 'E_Hj', 'E_Hp', 'E_Hp', 'k_J',
    'estim_k_J',
    ]

bijection_df = pd.DataFrame(index=df.index, columns=bijection_df_cols)

# Copy columns
for col in bijection_df.columns:
    if col in df.columns:
        bijection_df[col] = df.loc[bijection_df.index, col].copy()
    
# Fill missing values in E_Hj with E_Hb
bijection_df['E_Hj'] = bijection_df['E_Hj'].fillna(bijection_df['E_Hb'])
# Drop rows with missing values
bijection_df.dropna(how='any', inplace=True)
bijection_df


Unnamed: 0_level_0,ab,ap,am,d_V,Wwb,Wwj,Wwp,Wwi,Ri,T_typical,...,p_Am,kap,v,p_M,E_Hb,E_Hj,E_Hp,E_Hp,k_J,estim_k_J
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acipenser_sturio,19.298951,5110.000000,70327.868765,0.20,1.242000e-02,0.335340,12420.000000,460000.000000,934.195805,293.15,...,203.520289,0.582210,0.089593,1.762110,46.948675,417.937892,1.552973e+07,1.552973e+07,0.002000,False
Albula_vulpes,1.581919,461.920420,4619.204196,0.20,1.555388e-03,0.019759,112.513602,8100.000000,12989.250411,298.15,...,177.961277,0.735510,0.171210,6.839500,5.592000,5.592000,6.464000e+05,6.464000e+05,0.000391,True
Ambystoma_maculatum,100.932371,2971.606295,15487.606686,0.28,4.200000e-03,0.040086,1.392868,12.840000,0.206617,290.15,...,43.208061,0.962031,0.021461,19.418802,0.925835,7.935076,2.474301e+02,2.474301e+02,0.002000,False
Amphiuma_means,126.753359,1340.260927,8191.176087,0.28,5.500000e-01,0.714050,148.600000,1042.000000,0.692208,295.15,...,218.839797,0.773919,0.011105,25.617124,284.263071,357.289702,1.195124e+05,1.195124e+05,0.002000,False
Amphiura_filiformis,3.537313,3964.986695,33041.555790,0.09,5.236000e-07,0.023550,4.775600,5.846500,502.254805,284.15,...,14.389156,0.991349,0.030418,41.505380,0.000009,0.001285,1.877911e+02,1.877911e+02,0.002000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Spisula_solidissima,2.000000,713.241551,17123.842953,0.09,1.021111e-06,0.000091,372.202965,67131.317643,360000.000000,287.15,...,20.520282,0.376331,0.018315,7.802868,0.002180,0.282065,2.973635e+03,2.973635e+03,0.001749,True
Stagnicola_palustris,12.000000,64.000000,360.000000,0.15,5.970000e-05,0.014040,0.130000,0.246700,23.041513,293.15,...,83.459491,0.893968,0.018692,989.734175,0.017575,6.622195,1.204815e+02,1.204815e+02,0.002000,False
Trachycephalus_resinifictrix,4.847953,384.027118,5561.294417,0.28,1.800000e-03,0.393000,14.479971,71.000000,9.889784,297.15,...,116.207608,0.947670,0.041766,30.165060,0.519353,93.543268,6.650420e+03,6.650420e+03,0.002000,False
Urechis_caupo,2.209990,728.191553,4839.877089,0.04,8.200000e-06,0.000200,56.000000,160.000000,82646.726892,290.15,...,86.674826,0.500000,0.029563,12.260133,0.001724,0.044838,2.540135e+04,2.540135e+04,0.002000,False


In [30]:
dataset_name = 'bijection_input'
types_of_col = {
 'ab': ['input', 'log', 'scale', 'quantile'],
 'am': ['input', 'log', 'scale', 'quantile'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale', 'quantile'],
 'Wwp': ['input', 'log', 'scale', 'quantile'],
 'Wwi': ['input', 'log', 'scale', 'quantile'],
 'Ri': ['input', 'log', 'scale', 'quantile'],
 'T_typical': ['input', 'scale', 'quantile'],
 'metamorphosis': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale', 'quantile'],
 'kap': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 'v': ['output', 'log', 'scale', 'quantile'],
 'p_M': ['output', 'log', 'scale', 'quantile'],
 'E_Hb': ['output', 'log', 'scale', 'quantile'],
 'E_Hbj': ['output', 'log', 'scale', 'quantile'],
 'E_Hp': ['output', 'log', 'scale', 'quantile'],
 'k_J': ['output', 'log', 'scale', 'quantile'],
 's_M': ['output', 'log', 'scale', 'quantile'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=eco_code_dummy_cols, inplace=True)

    
#split_and_save_dataset(df=nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


NameError: name 'include_taxonomy' is not defined

## No age at puberty

In [32]:
nap_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    'p_Am', 'kap', 'v', 'p_M', 'E_Hb', 'E_Hp', 'k_J', 's_M',
    'E_Hj', 'E_Hbj',
    ]

# Include metamorphosis datasets
include_metamorphosis_data = False
if include_metamorphosis_data:
    nap_df_cols.extend(['aj', 'Wwj'])
    
# Include weaning datasets
include_weaning_data = False
if include_weaning_data:
    nap_df_cols.extend(['ax', 'Wwx'])
    
# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nap_df_cols.extend(taxonomy_cols)
# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nap_df_cols.extend(ecocode_cols)

# Copy columns
nap_df = pd.DataFrame(index=df.index, columns=nap_df_cols)
nap_df.index.name = 'species'
for col in nap_df_cols:
    if col in df.columns:
        nap_df[col] = df[col].copy()

# Drop species with 'stf' model (too few samples to train model)
nap_df = nap_df[df['model'].isin(['std', 'stx', 'abj'])]
#nap_df.drop(columns=['foetus'], inplace=True)

# Encode taxonomy
if include_taxonomy:
    nap_df, taxonomy_dummy_cols = encode_taxonomy(nap_df, categories=taxonomy_class_options, include_other_col=taxonomy_include_other)
    nap_df.drop(columns=taxonomy_cols, inplace=True)

# Encode eco-codes
if include_eco_codes:
    nap_df, eco_code_dummy_cols = encode_eco_codes(nap_df, categories=eco_code_options, include_other_col=eco_code_include_other)
    nap_df.drop(columns=ecocode_cols, inplace=True)

# Create a single intermediate maturity level (metamorphosis or birth if missing)
nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[nap_df['metamorphosis'], f'E_Hj'])
nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[~nap_df['metamorphosis'], f'E_Hb'])
#nap_df['E_Hj'] = nap_df['E_Hj'].fillna(0)
nap_df.drop(columns=[f"E_Hj"], inplace=True)

# Drop rows with missing data
nap_df.dropna(how='any', axis=0, inplace=True)
#nap_df = drop_species_with_missing_values(nap_df)

nap_df

  nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[nap_df['metamorphosis'], f'E_Hj'])


Unnamed: 0_level_0,ab,am,d_V,Wwb,Wwp,Wwi,Ri,T_typical,metamorphosis,p_Am,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbottina_rivularis,18.092767,1761.029277,0.20,0.000520,0.870,79.0,7.772727,291.15,True,53.617532,...,False,True,False,False,False,False,False,False,False,True
Ablennes_hians,1.954569,2675.316655,0.20,0.000520,144.000,4200.0,179.418014,301.05,True,231.831215,...,False,False,False,True,False,False,False,False,False,True
Abramis_brama,15.680398,6603.859788,0.20,0.002800,319.000,6050.0,2271.398921,291.15,True,469.510089,...,False,True,False,False,False,False,False,False,True,True
Abroscopus_superciliaris,2.245595,443.899620,0.28,0.780000,6.175,6.5,0.072088,314.75,False,852.227338,...,True,False,False,False,False,False,False,False,False,True
Acanthis_flammea,2.041450,541.557536,0.28,1.300000,13.490,14.2,0.225276,314.75,False,3277.710396,...,True,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,2.653885,519.362555,0.28,3.100000,32.015,33.7,0.090110,314.75,False,4208.751230,...,True,False,False,False,False,False,False,False,True,True
Zootoca_vivipara,38.066843,4600.151458,0.30,0.190000,2.200,5.0,0.018635,286.85,False,520.970257,...,True,False,False,False,False,False,False,False,False,True
Zosterisessor_ophiocephalus,8.588355,2910.838943,0.20,0.000065,20.760,276.5,100.486494,290.25,False,90.856433,...,False,False,False,True,False,False,False,False,False,True
Zosterops_lateralis,2.449740,532.679544,0.28,1.100000,11.875,12.5,0.067583,314.75,False,2594.395445,...,True,False,False,False,False,False,False,False,True,True


In [33]:
dataset_name = 'no_pub_age'
types_of_col = {
 'ab': ['input', 'log', 'scale', 'quantile'],
 'am': ['input', 'log', 'scale', 'quantile'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale', 'quantile'],
 'Wwp': ['input', 'log', 'scale', 'quantile'],
 'Wwi': ['input', 'log', 'scale', 'quantile'],
 'Ri': ['input', 'log', 'scale', 'quantile'],
 'T_typical': ['input', 'scale', 'quantile'],
 'metamorphosis': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale', 'quantile'],
 'kap': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 'v': ['output', 'log', 'scale', 'quantile'],
 'p_M': ['output', 'log', 'scale', 'quantile'],
 'E_Hb': ['output', 'log', 'scale', 'quantile'],
 'E_Hbj': ['output', 'log', 'scale', 'quantile'],
 'E_Hp': ['output', 'log', 'scale', 'quantile'],
 'k_J': ['output', 'log', 'scale', 'quantile'],
 's_M': ['output', 'log', 'scale', 'quantile'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=eco_code_dummy_cols, inplace=True)

    
#split_and_save_dataset(df=nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


In [34]:
df.loc[nap_df.index, :].to_csv(f'{processed_dataset_save_folder}/no_pub_age_taxonomy_ecocodes/no_dummies.csv', index=True, float_format='%.6e')

## Final dataset (no pub age subset)

Predicts  $1-\kappa$, $s_H = E_H^b/E_H^p $ and $s_p = \dot{k}_J E_H^p [\dot{p}_M]^2 / \kappa^2 (1-\kappa) \{ \dot{p}_{Am} \}^3$, as well as $\{ \dot{p}_{Am} \}$, $\dot{v}$, $E_H^p$ and $\dot{k}_J$. $[E_G]$ is predicted from a theoretical equation in the bijection.



In [48]:
final_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    's_p_M', '1-kap', 'v', 'p_M', 's_Hb_p', 's_Hb_j', 's_Hj_p', 'E_Hp', 'k_J', '1/s_M',
    'kap', 'E_Hb', 'E_Hj', 'p_Am', 's_M',
    'estim_p_M', 'estim_v', 'estim_kap', 'estim_k_J', 'estim_E_Hb', 'estim_E_Hj', 'estim_E_Hp', 'estim_s_M', 
    ]

# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    final_df_cols.extend(taxonomy_dummy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    final_df_cols.extend(eco_code_dummy_cols)

final_df = pd.DataFrame(index=nap_df.index, columns=final_df_cols)

# Copy columns
for col in final_df.columns:
    if col in df.columns:
        final_df[col] = df.loc[final_df.index, col].copy()
    elif col in nap_df.columns:
        final_df[col] = nap_df.loc[final_df.index, col].copy()
    
# Compute ratio columns
final_df['1-kap'] = 1 - final_df['kap']
final_df['estim_1-kap'] = final_df['estim_kap']

final_df['1/s_M'] = 1 / final_df['s_M']
final_df['estim_1/s_M'] = final_df['estim_s_M']

final_df['s_p_M'] = final_df['k_J'] * final_df['E_Hp'] * final_df['p_M'] ** 2 / final_df['p_Am'] ** 3 / final_df['kap'] ** 2 / (1 - final_df['kap']) / final_df['s_M'] ** 3
final_df['estim_s_p_M'] = True
final_df['estim_p_Am'] = True

# Compute maturity ratios
final_df.loc[final_df['metamorphosis'], 's_Hb_j'] = final_df.loc[final_df['metamorphosis'], 'E_Hb'] / final_df.loc[final_df['metamorphosis'],'E_Hj'] 
final_df['estim_s_Hb_j'] = final_df['estim_E_Hb'] & final_df['metamorphosis']
final_df.loc[final_df['metamorphosis'], 's_Hj_p'] = final_df.loc[final_df['metamorphosis'], 'E_Hj'] / final_df.loc[final_df['metamorphosis'], 'E_Hp']
final_df['estim_s_Hj_p'] = final_df['estim_E_Hj'] & final_df['metamorphosis']
final_df.loc[~final_df['metamorphosis'], 's_Hb_p'] = final_df.loc[~final_df['metamorphosis'], 'E_Hb'] / final_df.loc[~final_df['metamorphosis'],'E_Hp'] 
final_df['estim_s_Hb_p'] = final_df['estim_E_Hb'] & ~final_df['metamorphosis']
# Fill missing values in maturity ratios
final_df.loc[~final_df['metamorphosis'], 's_Hb_j'] = 1 
final_df.loc[~final_df['metamorphosis'], 's_Hj_p'] = final_df.loc[~final_df['metamorphosis'], 'E_Hb'] / final_df.loc[~final_df['metamorphosis'], 'E_Hp']
final_df.loc[final_df['metamorphosis'], 's_Hb_p'] = final_df.loc[final_df['metamorphosis'], 'E_Hb'] / final_df.loc[final_df['metamorphosis'],'E_Hp'] 
final_df.drop(columns=['kap', 'E_Hb', 'E_Hj', 'p_Am', 's_M'], inplace=True)
final_df.columns

Index(['ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri', 'T_typical',
       'metamorphosis', 's_p_M', '1-kap', 'v', 'p_M', 's_Hb_p', 's_Hb_j',
       's_Hj_p', 'E_Hp', 'k_J', '1/s_M', 'estim_p_M', 'estim_v', 'estim_kap',
       'estim_k_J', 'estim_E_Hb', 'estim_E_Hj', 'estim_E_Hp', 'estim_s_M',
       'class_other', 'class_Aves', 'class_Actinopterygii', 'class_Reptilia',
       'class_Chondrichthyes', 'class_Amphibia', 'class_Mammalia',
       'class_Bivalvia', 'class_Branchiopoda', 'class_Malacostraca',
       'climate_A', 'climate_B', 'climate_C', 'climate_D', 'climate_E',
       'habitat_T', 'habitat_F', 'habitat_S', 'habitat_M', 'migrate_T',
       'food_other', 'food_P', 'food_O', 'food_H', 'food_C', 'estim_1-kap',
       'estim_1/s_M', 'estim_s_p_M', 'estim_p_Am', 'estim_s_Hb_j',
       'estim_s_Hj_p', 'estim_s_Hb_p'],
      dtype='object')

In [49]:
dataset_name = 'final'
types_of_col = {
    'ab': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'am': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'd_V': {'classes': ['input'], 'qualifiers': []},
    'Wwb': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Wwp': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Wwi': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Ri': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'T_typical': {'classes': ['input'], 'qualifiers': ['scale', 'quantile']},

    'p_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    '1-kap': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    'v': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    's_p_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    's_Hb_p': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    's_Hb_j': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    's_Hj_p': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    'E_Hp': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    'k_J': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    '1/s_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},

    'metamorphosis': {'classes': ['input', 'mask'], 'qualifiers': ['boolean']},
}

for col in final_df.columns:
    if 'estim_' in col:
        types_of_col[col] = {'classes': ['mask'], 'qualifiers': ['boolean']}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = {'classes': ['input'], 'qualifiers': ['boolean']}

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = {'classes': ['input'], 'qualifiers': ['boolean']}


split_and_save_dataset(df=final_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1647
Validation set size: 353
Test set size: 353


In [50]:
list_of_estim_cols = [col for col in final_df.columns if 'estim_' in col]
final_df[list_of_estim_cols].sum()

estim_p_M       2293
estim_v         2115
estim_kap       2275
estim_k_J       1156
estim_E_Hb      2352
estim_E_Hj       848
estim_E_Hp      2348
estim_s_M       2353
estim_1-kap     2275
estim_1/s_M     2353
estim_s_p_M     2353
estim_p_Am      2353
estim_s_Hb_j     777
estim_s_Hj_p     777
estim_s_Hb_p    1575
dtype: int64

In [43]:
final_df['metamorphosis'].value_counts() / len(final_df)

metamorphosis
False    0.669783
True     0.330217
Name: count, dtype: float64

## Final dataset with $[\dot{p}_M]$ instead of $\dot{k}_J$ (no pub age subset)

Predicts  $1-\kappa$, $s_H = E_H^b/E_H^p $ and $s_p = \dot{k}_J E_H^p [\dot{p}_M]^2 / \kappa^2 (1-\kappa) \{ \dot{p}_{Am} \}^3$, as well as $\{ \dot{p}_{Am} \}$, $\dot{v}$, $[\dot{p}_M]$ and $E_H^p$. $[E_G]$ is predicted from a theoretical equation in the bijection.



In [None]:
final_no_k_J_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    'p_Am', '1-kap', 'v', 'p_M', 's_Hb_bj', 's_Hbj_p', 'E_Hp', 's_p_M', '1/s_M',
    'kap', 'E_Hb', 'E_Hbj', 'k_J', 's_M',
    'estim_k_J',
    ]

# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    final_no_k_J_df_cols.extend(taxonomy_dummy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    final_no_k_J_df_cols.extend(eco_code_dummy_cols)

final_no_k_J_df = pd.DataFrame(index=nap_df.index, columns=final_no_k_J_df_cols)

# Copy columns
for col in final_no_k_J_df.columns:
    if col in df.columns:
        final_no_k_J_df[col] = df.loc[final_no_k_J_df.index, col].copy()
    elif col in nap_df.columns:
        final_no_k_J_df[col] = nap_df.loc[final_no_k_J_df.index, col].copy()
    
# Compute ratio columns
final_no_k_J_df['1-kap'] = 1 - final_no_k_J_df['kap']
final_no_k_J_df['1/s_M'] = 1 / final_no_k_J_df['s_M']
final_no_k_J_df['s_Hb_bj'] = final_no_k_J_df['E_Hb'] / final_no_k_J_df['E_Hbj'] 
final_no_k_J_df['s_Hbj_p'] = final_no_k_J_df['E_Hbj'] / final_no_k_J_df['E_Hp']
final_no_k_J_df['s_p_M'] = final_no_k_J_df['k_J'] * final_no_k_J_df['E_Hp'] * final_no_k_J_df['p_M'] ** 2 / final_no_k_J_df['p_Am'] ** 3 / final_no_k_J_df['kap'] ** 2 / (1 - final_no_k_J_df['kap']) / final_no_k_J_df['s_M'] ** 3
final_no_k_J_df.drop(columns=['kap', 'E_Hb', 'E_Hbj', 'k_J', 's_M'], inplace=True)
final_df.columns

Index(['ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri', 'T_typical',
       'metamorphosis', 'p_Am', '1-kap', 'v', 's_p_M', 's_Hb_bj', 's_Hbj_p',
       'E_Hp', 'k_J', '1/s_M', 'estim_k_J', 'class_other', 'class_Aves',
       'class_Actinopterygii', 'class_Reptilia', 'class_Chondrichthyes',
       'class_Amphibia', 'class_Mammalia', 'class_Bivalvia',
       'class_Branchiopoda', 'class_Malacostraca', 'climate_A', 'climate_B',
       'climate_C', 'climate_D', 'climate_E', 'habitat_T', 'habitat_F',
       'habitat_S', 'habitat_M', 'migrate_T', 'food_other', 'food_P', 'food_O',
       'food_H', 'food_C', 'data_split'],
      dtype='object')

In [None]:
dataset_name = 'final_no_k_J'

types_of_col = {
    'ab': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'am': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'd_V': {'classes': ['input'], 'qualifiers': []},
    'Wwb': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Wwp': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Wwi': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'Ri': {'classes': ['input'], 'qualifiers': ['log', 'scale', 'quantile']},
    'T_typical': {'classes': ['input'], 'qualifiers': ['scale', 'quantile']},

    'p_Am': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    '1-kap': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    'v': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    'p_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    's_Hb_bj': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    's_Hbj_p': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    'E_Hp': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'quantile']},
    's_p_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},
    '1/s_M': {'classes': ['output'], 'qualifiers': ['log', 'scale', 'bounded01', 'quantile']},

    'metamorphosis': {'classes': ['input', 'mask'], 'qualifiers': ['boolean']},
    'estim_k_J': {'classes': ['mask'], 'qualifiers': ['boolean']},
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = {'classes': ['input'], 'qualifiers': ['boolean']}

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = {'classes': ['input'], 'qualifiers': ['boolean']}


split_and_save_dataset(df=final_no_k_J_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1654
Validation set size: 354
Test set size: 355


## Biologist approach (Taxonomy & Maximum weight)

## Biologist approach (Taxonomy & Maximum weight) (Only species in no_pub_age)

In [None]:
bio_nap_df_cols = [
    'Wwi', 'd_V',
    'genus', 'family', 'order', 'class', 'phylum',
    'p_Am', 'kap', 'v', 's_p_M', 'E_Hb', 'E_Hj', 'E_Hp', 'k_J', 's_M',
    'p_M',
    'metamorphosis', 'estim_k_J',
    #'E_G',
    #'E_Hx',
]

bio_nap_df = pd.DataFrame(index=nap_df.index, columns=bio_nap_df_cols)
bio_nap_df.index.name = 'species'
for col in bio_nap_df_cols:
    if col in df.columns:
        bio_nap_df[col] = df[col].copy()
# Fill in E_Hx and E_Hj values for species that do not define them
#bio_nap_df['E_Hx'] = bio_nap_df['E_Hx'].fillna(bio_nap_df['E_Hb'])
bio_nap_df['E_Hj'] = bio_nap_df['E_Hj'].fillna(bio_nap_df['E_Hb'])
# Add small increment to make sure maturities always increase
bio_nap_df.loc[bio_nap_df['E_Hj'] == bio_nap_df['E_Hb'], 'E_Hj'] *= (1 + 1e-10)

# Transform p_M into s_p_M to ensure puberty is reached even when k_J is not estimated
bio_nap_df['s_p_M'] = bio_nap_df['k_J'] * bio_nap_df['E_Hp'] * bio_nap_df['p_M'] ** 2 / bio_nap_df['p_Am'] ** 3 / bio_nap_df['kap'] ** 2 / (1 - bio_nap_df['kap']) / bio_nap_df['s_M'] ** 3
bio_nap_df.drop(columns=['p_M'], inplace=True)

bio_nap_df.dropna(how='any', inplace=True)
bio_nap_df

Unnamed: 0_level_0,Wwi,d_V,genus,family,order,class,phylum,p_Am,kap,v,s_p_M,E_Hb,E_Hj,E_Hp,k_J,s_M,metamorphosis,estim_k_J
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abbottina_rivularis,79.0,0.20,Abbottina,Leuciscidae,Cypriniformes,Actinopterygii,Chordata,53.617532,0.971489,0.022091,0.007971,0.052719,0.192017,128.863865,0.002000,1.533968,True,False
Ablennes_hians,4200.0,0.20,Ablennes,Belonidae,Beloniformes,Actinopterygii,Chordata,231.831215,0.996902,0.022324,0.002606,0.002605,0.022575,955.472281,0.002000,2.052446,True,False
Abramis_brama,6050.0,0.20,Abramis,Leuciscidae,Cypriniformes,Actinopterygii,Chordata,469.510089,0.621643,0.014534,0.019651,0.549493,0.549493,242260.145127,0.001563,1.000000,True,True
Abroscopus_superciliaris,6.5,0.28,Abroscopus,Cisticolidae,Passeriformes,Aves,Chordata,852.227338,0.947562,0.031666,0.861730,103.106259,103.106259,1507.584831,0.030212,1.000000,False,True
Acanthis_flammea,14.2,0.28,Acanthis,Fringillidae,Passeriformes,Aves,Chordata,3277.710396,0.878973,0.026571,0.868034,173.945982,173.945982,2067.451493,0.132496,1.000000,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,33.7,0.28,Zonotrichia,Passerellidae,Passeriformes,Aves,Chordata,4208.751230,0.971028,0.024713,0.716507,71.498558,71.498559,645.380991,0.155661,1.000000,False,True
Zootoca_vivipara,5.0,0.30,Zootoca,Lacertidae,Squamata,Reptilia,Chordata,520.970257,0.726984,0.023570,0.076189,221.078931,221.078931,6327.201533,0.001997,1.000000,False,False
Zosterisessor_ophiocephalus,276.5,0.20,Zosterisessor,Gobiidae,Gobiiformes,Actinopterygii,Chordata,90.856433,0.977551,0.020240,0.057507,0.003956,0.003956,1576.172765,0.002000,1.000000,False,False
Zosterops_lateralis,12.5,0.28,Zosterops,Zosteropidae,Passeriformes,Aves,Chordata,2594.395445,0.964558,0.021701,0.842116,41.924067,41.924067,482.019783,0.119146,1.000000,False,True


In [None]:
bio_nap_df.loc[bio_nap_df['E_Hj'] == bio_nap_df['E_Hb'], ['E_Hb', 'E_Hj']]

Unnamed: 0_level_0,E_Hb,E_Hj
species,Unnamed: 1_level_1,Unnamed: 2_level_1


In [None]:
bio_nap_df.loc['Homo_sapiens']

Wwi                      68000.0
d_V                          0.3
genus                       Homo
family                 Hominidae
order                   Primates
class                   Mammalia
phylum                  Chordata
p_Am                  118.424731
kap                     0.793895
v                       0.031964
s_p_M                    0.68008
E_Hb              4840627.319891
E_Hj              4840627.320375
E_Hp             86709745.228651
k_J                     0.000254
s_M                          1.0
metamorphosis              False
estim_k_J                   True
Name: Homo_sapiens, dtype: object

In [None]:
dataset_name = 'biologist_no_pub_age'
types_of_col = {
    'Wwi': {'classes': ['input'], 'qualifiers': ['log']},
    'd_V': {'classes': ['input'], 'qualifiers': []},
    'genus': {'classes': ['input'], 'qualifiers': ['category']},
    'family': {'classes': ['input'], 'qualifiers': ['category']},
    'order': {'classes': ['input'], 'qualifiers': ['category']},
    'class': {'classes': ['input'], 'qualifiers': ['category']},
    'phylum': {'classes': ['input'], 'qualifiers': ['category']},
    'p_Am': {'classes': ['output'], 'qualifiers': []},
    'kap': {'classes': ['output'], 'qualifiers': []},
    'v': {'classes': ['output'], 'qualifiers': []},
    's_p_M': {'classes': ['output'], 'qualifiers': []},
    #'E_G': {'classes': ['output'], 'qualifiers': []},
    'E_Hb': {'classes': ['output'], 'qualifiers': []},
    #'E_Hx': {'classes': ['output'], 'qualifiers': []},
    'E_Hj': {'classes': ['output'], 'qualifiers': []},
    'E_Hp': {'classes': ['output'], 'qualifiers': []},
    'k_J': {'classes': ['input', 'output'], 'qualifiers': ['log']},
    's_M': {'classes': ['output'], 'qualifiers': []},
    'metamorphosis': {'classes': ['input', 'mask'], 'qualifiers': ['boolean']},
    'estim_k_J': {'classes': ['input', 'mask'], 'qualifiers': ['boolean']},
}

split_and_save_dataset(df=bio_nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1654
Validation set size: 354
Test set size: 355
