In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import sys
import os

# Add the 'project' directory to the path
sys.path.append(os.path.abspath('..'))

from project_code.data.preprocess_data import encode_taxonomy, encode_eco_codes, drop_species_with_missing_values, save_types_of_col, split_and_save_dataset
from project_code.data.imputation import *

In [3]:
deb_models = ['std', 'stf', 'stx', 'abj']
parameter_cols = ['p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp']
taxonomy_cols = ['family', 'order', 'class', 'phylum']
ecocode_cols = ['climate', 'ecozone', 'habitat', 'embryo', 'migrate', 'food', 'gender', 'reprod']
age_data_cols = ['ab', 'ah', 'aj', 'ax', 'ap', 'am']
time_since_birth_data_cols = ['tg', 'tb', 'tj', 'tx', 'tp']
weight_data_cols = ['Wwb', 'Wwj', 'Wwx', 'Wwp', 'Wwi']
length_data_cols = ['Lb', 'Lj', 'Lx', 'Lp', 'Li']
other_cols = ['d_V', 'Ri', 'T_typical', 't_0', 'model']

In [4]:
def print_missing_values_per_column(df, percentage=True):
    for c in df.columns:
        n_missing = pd.isna(df[c]).sum()
        if n_missing:
            if percentage:
                print(f"{n_missing/len(df)*100:.1f} % missing values in column {c}")
            else:
                print(f"{n_missing} missing values in column {c}")

# Loading raw data

In [5]:
processed_dataset_save_folder = '../data/processed/'

In [6]:
raw_data = pd.read_csv('../data/raw/dataset_matlab.csv', index_col=0)
raw_data.dropna(how='all', inplace=True)
raw_data.index.name = 'species'
raw_data

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ri,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,,0.07,,0.09,278.15,1.0,,abj,2.5
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,7.772727,,,,0.20,291.15,1.0,149.8737,abj,2.5
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,179.418014,,,,0.20,301.05,1.0,,abj,2.8
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,48.959720,,,,0.09,288.15,1.0,,abj,2.1
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,,11000.0,,,0.21,291.15,1.0,,abj,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,100.486494,,,,0.20,290.25,1.0,,std,2.5
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,0.067583,,,,0.28,314.75,1.0,,std,2.5
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,0.045055,,,,0.28,314.75,1.0,,std,2.5
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,,0.09,280.15,1.0,,abj,2.5


In [7]:
raw_data['model'].value_counts()

model
abj    2259
std    2213
stx     732
stf      51
abp      15
ssj      12
hex      11
hep       7
hax       6
asj       4
sbp       4
Name: count, dtype: int64

## Load DEB model predictions 

In [8]:
mat_level_deb_predictions_df = pd.read_csv('../data/deb_model_predictions/metamorphosis_predictions.csv', index_col=0)
mat_level_deb_predictions_df.dropna(how='all', inplace=True)
mat_level_deb_predictions_df.index.name = 'species'
mat_level_deb_predictions_df

Unnamed: 0_level_0,L_b,L_j,L_p,L_i,a_b,a_j,a_p,success,execution_time,error,error_message
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abatus_cordatus,0.088737,0.213678,1.202151,5.665785,12.717214,39.514929,196.620665,1,0.581737,,
Abbottina_rivularis,0.065971,0.075683,0.783173,3.517721,12.143220,15.500865,274.468552,1,0.581278,,
Ablennes_hians,0.057128,0.115561,3.695223,22.581461,8.437935,16.928854,424.096102,1,0.580210,,
Abra_segmentum,0.014976,0.039564,0.350306,0.805831,3.363052,19.586637,187.816776,1,0.582677,,
Abralia_trigonura,0.021490,0.240470,0.398083,1.189914,7.724631,38.213879,47.267600,1,0.570245,,
...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.126701,,
Zosterops_lateralis,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.128840,,
Zosterops_virens,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0.130270,,
Zygochlamys_patagonica,0.012070,0.017142,1.102825,2.103472,2.060789,3.184608,290.105836,1,0.131553,,


In [9]:
# Drop species for which an error occurred
mat_level_deb_predictions_df = mat_level_deb_predictions_df[mat_level_deb_predictions_df['error_message'].isna()]
# Drop species for which mat levels were not computed
mat_level_deb_predictions_df['success'] = mat_level_deb_predictions_df['success'].astype('bool')
mat_level_deb_predictions_df = mat_level_deb_predictions_df[mat_level_deb_predictions_df['success']]
mat_level_deb_predictions_df

Unnamed: 0_level_0,L_b,L_j,L_p,L_i,a_b,a_j,a_p,success,execution_time,error,error_message
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Abatus_cordatus,0.088737,0.213678,1.202151,5.665785,12.717214,39.514929,196.620665,True,0.581737,,
Abbottina_rivularis,0.065971,0.075683,0.783173,3.517721,12.143220,15.500865,274.468552,True,0.581278,,
Ablennes_hians,0.057128,0.115561,3.695223,22.581461,8.437935,16.928854,424.096102,True,0.580210,,
Abra_segmentum,0.014976,0.039564,0.350306,0.805831,3.363052,19.586637,187.816776,True,0.582677,,
Abralia_trigonura,0.021490,0.240470,0.398083,1.189914,7.724631,38.213879,47.267600,True,0.570245,,
...,...,...,...,...,...,...,...,...,...,...,...
Zingel_asper,0.072276,1.339119,1.339186,4.327728,15.007628,275.414169,275.418626,True,0.132351,,
Zoarces_americanus,0.196003,0.196431,2.265815,9.388288,24.659708,24.724283,377.738521,True,0.137951,,
Zoarces_elongatus,0.100263,0.846983,2.775295,8.840436,19.698527,227.983176,482.359656,True,0.140167,,
Zoarces_viviparus,0.391950,0.409291,1.467455,4.070460,41.297071,43.594209,206.540478,True,0.142653,,


In [10]:
print_missing_values_per_column(raw_data)

54.2 % missing values in column E_Hj
67.2 % missing values in column E_Hx
0.2 % missing values in column E_Hp
0.0 % missing values in column climate
0.0 % missing values in column ecozone
0.0 % missing values in column habitat
0.0 % missing values in column embryo
64.8 % missing values in column migrate
0.0 % missing values in column food
0.0 % missing values in column gender
0.0 % missing values in column reprod
44.1 % missing values in column ab
98.3 % missing values in column ah
99.1 % missing values in column aj
100.0 % missing values in column ax
95.6 % missing values in column ap
0.9 % missing values in column am
86.3 % missing values in column tg
99.8 % missing values in column tb
95.8 % missing values in column tj
67.4 % missing values in column tx
49.9 % missing values in column tp
14.5 % missing values in column Wwb
97.8 % missing values in column Wwj
93.7 % missing values in column Wwx
66.1 % missing values in column Wwp
8.3 % missing values in column Wwi
75.8 % missing valu

# Preprocessing

In [11]:
df = raw_data.copy()

## Fill missing data with DEB model predictions

In [12]:
df['s_M'] = mat_level_deb_predictions_df['L_j'] / mat_level_deb_predictions_df['L_b']
df.loc[df['model'].isin(['std', 'stx', 'stf']), 's_M'] = 1
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


## Remove species with invalid data or parameter sets

In [13]:
# Remove species with incorrect maturity values
df = df[df.apply(check_column_values_increase, axis=1, args=(['E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'],))]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


## Fill missing weight data from length data

In [14]:
df = df.apply(impute_weight_and_length_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


In [15]:
# Check that all weights are increasing
df = df[df.apply(check_column_values_increase, args=(WEIGHT_COLS,), axis=1,)]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


## Fill missing weight puberty for Aves species

In [16]:
df = df.apply(impute_weight_at_puberty_for_aves, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


## Fill missing age data with time since birth data

In [17]:
df = df.apply(impute_age_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


In [18]:
# Check that all ages are increasing
df = df[df.apply(check_column_values_increase, args=(AGE_COLS,), axis=1,)]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


In [19]:
discarded_species = raw_data.loc[raw_data.index.difference(df.index), ['model', 'class', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'] + LENGTH_COLS + WEIGHT_COLS + AGE_COLS]
discarded_species['lengths_check'] = discarded_species.apply(check_column_values_increase, args=(LENGTH_COLS,), axis=1)
discarded_species['mat_levels_check'] = discarded_species.apply(check_column_values_increase, args=(['E_Hb', 'E_Hj', 'E_Hx', 'E_Hp'],), axis=1)
discarded_species['weights_check'] = discarded_species.apply(check_column_values_increase, args=(WEIGHT_COLS,), axis=1)
discarded_species['ages_check'] = discarded_species.apply(check_column_values_increase, args=(AGE_COLS,), axis=1)
discarded_species[discarded_species['model'].isin(deb_models)]

Unnamed: 0_level_0,model,class,E_Hb,E_Hj,E_Hx,E_Hp,Lb,Lj,Lx,Lp,...,Wwi,ab,aj,ax,ap,am,lengths_check,mat_levels_check,weights_check,ages_check
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Eurycea_multiplicata,std,Amphibia,1.881,233.9,,233.9,0.95,3.3,,3.4,...,2.0,48.169773,,,,5333.196715,True,False,True,True
Leipoa_ocellata,std,Aves,83520.0,,105.6,1469000.0,,,,,...,1830.0,15.298767,,,,371.426172,True,False,True,True
Lepidochelys_olivacea,std,Reptilia,4785.0,,,25540000.0,,,,,...,40000.0,22.356697,,,6205.0,18250.0,True,True,False,True
Leucoraja_erinacea,std,Chondrichthyes,5871.0,,,1222000.0,9.7,,,40.7,...,910.0,360.719563,,,6278.993284,6278.993284,True,True,True,False
Melopsittacus_undulatus,std,Aves,112.3,,1898.0,1898.0,,,,,...,50.0,3.878755,,,,984.514042,True,False,True,True
Microtus_arvalis,stx,Mammalia,98.21,,230.0,1171.0,,,,,...,50.0,,,,,,True,True,False,True
Percopsis_omiscomaycus,std,Actinopterygii,5.175,,,2747.0,0.8,,,5.4,...,89.6,10.530534,,,1172.131146,1172.131146,True,True,True,False
Pontoporia_blainvillei,stx,Mammalia,325100.0,,2114000.0,7910000.0,70.0,,105.0,134.0,...,32000.0,,,,,1689.15648,False,True,False,True
Pseudis_paradoxa,std,Amphibia,0.5089,28390.0,,28390.0,,4.79,,4.8,...,29.9,,,,,2322.6084,True,False,True,True
Squalus_suckleyi,std,Chondrichthyes,5930.0,,,282400.0,25.0,,,90.0,...,950.0,1773.20075,,,,46899.874914,True,True,False,True


## Fill missing reproduction rate data with total reproduction

In [20]:
df = df.apply(impute_reproduction_rate_data, axis=1)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Ni,GSI,NR,d_V,T_typical,f,t_0,model,completeness,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.07,,0.09,278.15,1.0,,abj,2.5,2.408001
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,0.20,291.15,1.0,149.8737,abj,2.5,1.147220
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,0.20,301.05,1.0,,abj,2.8,2.022866
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.09,288.15,1.0,,abj,2.1,2.641807
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,11000.0,,,0.21,291.15,1.0,,abj,2.3,11.189711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,0.20,290.25,1.0,,std,2.5,1.000000
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,0.28,314.75,1.0,,std,2.5,1.000000
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,,,0.09,280.15,1.0,,abj,2.5,1.420209


## Statistics on imputed values

## Handling acceleration and weaning

In [21]:
df['metamorphosis'] = df['model'] == 'abj'
df['weaning'] = df['model'] == 'stx'
df['foetus'] = (df['model'] == 'stf') | (df['model'] == 'stx')
#df.drop(columns=['model'], inplace=True)

## Handle taxonomic info

In [22]:
# Create genus taxon from species name
df['genus'] = df.index.map(lambda s: s.split('_')[0])

## Save dataset after imputation

In [23]:
df.to_csv('../data/interim/filled_data.csv', index=True)
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,T_typical,f,t_0,model,completeness,s_M,metamorphosis,weaning,foetus,genus
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,278.15,1.0,,abj,2.5,2.408001,True,False,False,Abatus
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,291.15,1.0,149.8737,abj,2.5,1.147220,True,False,False,Abbottina
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,301.05,1.0,,abj,2.8,2.022866,True,False,False,Ablennes
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,288.15,1.0,,abj,2.1,2.641807,True,False,False,Abra
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,291.15,1.0,,abj,2.3,11.189711,True,False,False,Abralia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,290.25,1.0,,std,2.5,1.000000,False,False,False,Zosterisessor
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,314.75,1.0,,std,2.5,1.000000,False,False,False,Zosterops
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,314.75,1.0,,std,2.5,1.000000,False,False,False,Zosterops
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,280.15,1.0,,abj,2.5,1.420209,True,False,False,Zygochlamys


# Processed dataset

## Define options

In [24]:
taxonomy_class_options = {
    'class': ['Aves', 'Actinopterygii', 'Reptilia', 'Chondrichthyes', 'Amphibia',
       'Mammalia', 'Bivalvia', 'Branchiopoda', 'Malacostraca'],
}
taxonomy_include_other = {
    'class': True
}



In [25]:
eco_code_options = {
    'climate': ['A', 'B', 'C', 'D', 'E'],
    'habitat': ['T', 'F', 'S', 'M'],
    'migrate': ['T'],
    'food': ['P', 'O', 'H', 'C'],
}

eco_code_include_other = {
    'climate': False,
    'habitat': False,
    'migrate': False,
    'food': True,
}


## No age at puberty

In [26]:
nap_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    'p_Am', 'kap', 'v', 'p_M', 'E_Hb', 'E_Hp', 'k_J', 's_M',
    'E_Hj', 'E_Hbj',
    ]

# Include metamorphosis datasets
include_metamorphosis_data = False
if include_metamorphosis_data:
    nap_df_cols.extend(['aj', 'Wwj'])
    
# Include weaning datasets
include_weaning_data = False
if include_weaning_data:
    nap_df_cols.extend(['ax', 'Wwx'])
    
# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nap_df_cols.extend(taxonomy_cols)
# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nap_df_cols.extend(ecocode_cols)

# Copy columns
nap_df = pd.DataFrame(index=df.index, columns=nap_df_cols)
nap_df.index.name = 'species'
for col in nap_df_cols:
    if col in df.columns:
        nap_df[col] = df[col].copy()

# Drop species with 'stf' model (too few samples to train model)
nap_df = nap_df[df['model'].isin(['std', 'stx', 'abj'])]
#nap_df.drop(columns=['foetus'], inplace=True)

# Encode taxonomy
if include_taxonomy:
    nap_df, taxonomy_dummy_cols = encode_taxonomy(nap_df, categories=taxonomy_class_options, include_other_col=taxonomy_include_other)
    nap_df.drop(columns=taxonomy_cols, inplace=True)

# Encode eco-codes
if include_eco_codes:
    nap_df, eco_code_dummy_cols = encode_eco_codes(nap_df, categories=eco_code_options, include_other_col=eco_code_include_other)
    nap_df.drop(columns=ecocode_cols, inplace=True)

# Create a single intermediate maturity level (metamorphosis or birth if missing)
nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[nap_df['metamorphosis'], f'E_Hj'])
nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[~nap_df['metamorphosis'], f'E_Hb'])
nap_df.drop(columns=[f"E_Hj"], inplace=True)

# Drop rows with missing data
nap_df.dropna(how='any', axis=0, inplace=True)
#nap_df = drop_species_with_missing_values(nap_df)

nap_df

  nap_df[f"E_Hbj"] = nap_df[f"E_Hbj"].fillna(nap_df.loc[nap_df['metamorphosis'], f'E_Hj'])


Unnamed: 0_level_0,ab,am,d_V,Wwb,Wwp,Wwi,Ri,T_typical,metamorphosis,p_Am,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbottina_rivularis,18.092767,1761.029277,0.20,0.000520,0.870,79.0,7.772727,291.15,True,75.309251,...,False,True,False,False,False,False,False,False,False,True
Ablennes_hians,1.954569,2675.316655,0.20,0.000520,144.000,4200.0,179.418014,301.05,True,201.398502,...,False,False,False,True,False,False,False,False,False,True
Abramis_brama,15.680398,6603.859788,0.20,0.002800,319.000,6050.0,2271.398921,291.15,True,401.092069,...,False,True,False,False,False,False,False,False,True,True
Abroscopus_superciliaris,2.245595,443.899620,0.28,0.780000,6.175,6.5,0.072088,314.75,False,663.006069,...,True,False,False,False,False,False,False,False,False,True
Acanthis_flammea,2.041450,541.557536,0.28,1.300000,13.490,14.2,0.225276,314.75,False,1294.709603,...,True,False,False,False,False,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,2.653885,519.362555,0.28,3.100000,32.015,33.7,0.090110,314.75,False,1476.816557,...,True,False,False,False,False,False,False,False,True,True
Zootoca_vivipara,38.066843,4600.151458,0.30,0.190000,2.200,5.0,0.018635,286.85,False,516.242968,...,True,False,False,False,False,False,False,False,False,True
Zosterisessor_ophiocephalus,8.588355,2910.838943,0.20,0.000065,20.760,276.5,100.486494,290.25,False,90.838193,...,False,False,False,True,False,False,False,False,False,True
Zosterops_lateralis,2.449740,532.679544,0.28,1.100000,11.875,12.5,0.067583,314.75,False,872.827510,...,True,False,False,False,False,False,False,False,True,True


In [27]:
dataset_name = 'no_pub_age'
types_of_col = {
 'ab': ['input', 'log', 'scale', 'quantile'],
 'am': ['input', 'log', 'scale', 'quantile'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale', 'quantile'],
 'Wwp': ['input', 'log', 'scale', 'quantile'],
 'Wwi': ['input', 'log', 'scale', 'quantile'],
 'Ri': ['input', 'log', 'scale', 'quantile'],
 'T_typical': ['input', 'scale', 'quantile'],
 'metamorphosis': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale', 'quantile'],
 'kap': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 'v': ['output', 'log', 'scale', 'quantile'],
 'p_M': ['output', 'log', 'scale', 'quantile'],
 'E_Hb': ['output', 'log', 'scale', 'quantile'],
 'E_Hbj': ['output', 'log', 'scale', 'quantile'],
 'E_Hp': ['output', 'log', 'scale', 'quantile'],
 'k_J': ['output', 'log', 'scale', 'quantile'],
 's_M': ['output', 'log', 'scale', 'quantile'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=eco_code_dummy_cols, inplace=True)

    
split_and_save_dataset(df=nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1587
Validation set size: 340
Test set size: 341


In [28]:
df.loc[nap_df.index, :].to_csv(f'{processed_dataset_save_folder}/no_pub_age_taxonomy_ecocodes/no_dummies.csv', index=True, float_format='%.6e')

## Final dataset (no pub age subset)

Predicts  $1-\kappa$, $s_H = E_H^b/E_H^p $ and $s_p = \dot{k}_J E_H^p [\dot{p}_M]^2 / \kappa^2 (1-\kappa) \{ \dot{p}_{Am} \}^3$, as well as $\{ \dot{p}_{Am} \}$, $\dot{v}$, $E_H^p$, $\dot{k}_J$ and $\ddot{h}_a$. $[E_G]$ is predicted from a theoretical equation in the bijection.



In [29]:
final_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis',
    'p_Am', '1-kap', 'v', 's_p_M', 's_Hb_bj', 's_Hbj_p', 'E_Hp', 'k_J', '1/s_M',
    'kap', 'E_Hb', 'E_Hbj', 'p_M', 's_M'
    ]

# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    final_df_cols.extend(taxonomy_dummy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    final_df_cols.extend(eco_code_dummy_cols)

final_df = pd.DataFrame(index=nap_df.index, columns=final_df_cols)

# Copy columns
for col in final_df.columns:
    if col in df.columns:
        final_df[col] = df.loc[final_df.index, col].copy()
    elif col in nap_df.columns:
        final_df[col] = nap_df.loc[final_df.index, col].copy()
    
# Compute ratio columns
final_df['1-kap'] = 1 - final_df['kap']
final_df['1/s_M'] = 1 / final_df['s_M']
final_df['s_Hb_bj'] = final_df['E_Hb'] / final_df['E_Hbj'] 
final_df['s_Hbj_p'] = final_df['E_Hbj'] / final_df['E_Hp']
final_df['s_p_M'] = final_df['k_J'] * final_df['E_Hp'] * final_df['p_M'] ** 2 / final_df['p_Am'] ** 3 / final_df['kap'] ** 2 / (1 - final_df['kap']) / final_df['s_M'] ** 3
final_df.drop(columns=['kap', 'E_Hb', 'E_Hbj', 'p_M', 's_M'], inplace=True)
final_df.columns

Index(['ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri', 'T_typical',
       'metamorphosis', 'p_Am', '1-kap', 'v', 's_p_M', 's_Hb_bj', 's_Hbj_p',
       'E_Hp', 'k_J', '1/s_M', 'class_other', 'class_Aves',
       'class_Actinopterygii', 'class_Reptilia', 'class_Chondrichthyes',
       'class_Amphibia', 'class_Mammalia', 'class_Bivalvia',
       'class_Branchiopoda', 'class_Malacostraca', 'climate_A', 'climate_B',
       'climate_C', 'climate_D', 'climate_E', 'habitat_T', 'habitat_F',
       'habitat_S', 'habitat_M', 'migrate_T', 'food_other', 'food_P', 'food_O',
       'food_H', 'food_C'],
      dtype='object')

In [30]:
dataset_name = 'final'
types_of_col = {
 'ab': ['input', 'log', 'scale', 'quantile'],
 'am': ['input', 'log', 'scale', 'quantile'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale', 'quantile'],
 'Wwp': ['input', 'log', 'scale', 'quantile'],
 'Wwi': ['input', 'log', 'scale', 'quantile'],
 'Ri': ['input', 'log', 'scale', 'quantile'],
 'T_typical': ['input', 'scale', 'quantile'],
 'metamorphosis': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale', 'quantile'],
 '1-kap': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 'v': ['output', 'log', 'scale', 'quantile'],
 's_p_M': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 's_Hb_bj': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 's_Hbj_p': ['output', 'log', 'scale', 'bounded01', 'quantile'],
 'E_Hp': ['output', 'log', 'scale', 'quantile'],
 'k_J': ['output', 'log', 'scale', 'quantile'],
 '1/s_M': ['output', 'log', 'scale', 'bounded01', 'quantile'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']


split_and_save_dataset(df=final_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1587
Validation set size: 340
Test set size: 341


## Biologist approach (Taxonomy & Maximum weight)

## Biologist approach (Taxonomy & Maximum weight) (Only species in no_pub_age)

In [31]:
bio_nap_df_cols = [
    'Wwi',
    'genus', 'family', 'order', 'class', 'phylum',
    'metamorphosis',
    'p_Am', 'kap', 'v', 'p_M', 'E_G', 'E_Hb', 'E_Hx', 'E_Hj', 'E_Hp', 'k_J', 's_M',
]

bio_nap_df = pd.DataFrame(index=nap_df.index, columns=bio_nap_df_cols)
bio_nap_df.index.name = 'species'
for col in bio_nap_df_cols:
    if col in df.columns:
        bio_nap_df[col] = df[col].copy()
# Fill in E_Hx and E_Hj values for species that do not define them
bio_nap_df['E_Hx'] = bio_nap_df['E_Hx'].fillna(bio_nap_df['E_Hb'])
bio_nap_df['E_Hj'] = bio_nap_df['E_Hj'].fillna(bio_nap_df['E_Hb'])

bio_nap_df.dropna(how='any', inplace=True)
bio_nap_df

Unnamed: 0_level_0,Wwi,genus,family,order,class,phylum,metamorphosis,p_Am,kap,v,p_M,E_G,E_Hb,E_Hx,E_Hj,E_Hp,k_J,s_M
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Abbottina_rivularis,79.0,Abbottina,Leuciscidae,Cypriniformes,Actinopterygii,Chordata,True,75.309251,0.97055,0.020221,23.8370,5215.8135,0.045900,0.045900,0.069550,89.34,0.002000,1.147220
Ablennes_hians,4200.0,Ablennes,Belonidae,Beloniformes,Actinopterygii,Chordata,True,201.398502,0.99640,0.022797,17.9765,5237.1062,0.003540,0.003540,0.029360,1083.00,0.002000,2.022866
Abramis_brama,6050.0,Abramis,Leuciscidae,Cypriniformes,Actinopterygii,Chordata,True,401.092069,0.66367,0.016416,23.3484,5228.9640,0.548400,0.548400,0.548500,236600.00,0.001245,1.000060
Abroscopus_superciliaris,6.5,Abroscopus,Cisticolidae,Passeriformes,Aves,Chordata,False,663.006069,0.95822,0.034790,533.0640,7316.5423,96.030000,631.400000,96.030000,1472.00,0.021838,1.000000
Acanthis_flammea,14.2,Acanthis,Fringillidae,Passeriformes,Aves,Chordata,False,1294.709603,0.92908,0.037653,907.6351,7320.7246,211.900000,1348.000000,211.900000,3305.00,0.037090,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,33.7,Zonotrichia,Passerellidae,Passeriformes,Aves,Chordata,False,1476.816557,0.97417,0.041453,834.8830,7333.1130,188.800000,817.400000,188.800000,2151.00,0.033836,1.000000
Zootoca_vivipara,5.0,Zootoca,Lacertidae,Squamata,Reptilia,Chordata,False,516.242968,0.72515,0.023765,344.4866,7837.0592,226.400000,226.400000,226.400000,6375.00,0.001997,1.000000
Zosterisessor_ophiocephalus,276.5,Zosterisessor,Gobiidae,Gobiiformes,Actinopterygii,Chordata,False,90.838193,0.97755,0.020245,17.1271,5231.3551,0.003958,0.003958,0.003958,1577.00,0.002000,1.000000
Zosterops_lateralis,12.5,Zosterops,Zosteropidae,Passeriformes,Aves,Chordata,False,872.827510,0.97558,0.035919,617.3516,7324.4923,76.790000,456.800000,76.790000,1199.00,0.024996,1.000000


In [32]:
dataset_name = 'biologist_no_pub_age'
types_of_col = {
 'Wwi': ['input', 'log', ],
 'genus': ['input', 'category'],
 'family': ['input', 'category'],
 'order': ['input', 'category'],
 'class': ['input', 'category'],
 'phylum': ['input', 'category'],
 'metamorphosis': ['input', 'boolean'],
 'p_Am': ['output', ],
 'kap': ['output', ],
 'v': ['output', ],
 'p_M': ['output', ],
 'E_G': ['output', ],
 'E_Hb': ['output', ],
 'E_Hx': ['output', ],
 'E_Hj': ['output', ],
 'E_Hp': ['output', ],
 'k_J': ['output', ],
 's_M': ['output', ],
}

split_and_save_dataset(df=bio_nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['metamorphosis'], save_folder=processed_dataset_save_folder)


Train set size: 1587
Validation set size: 340
Test set size: 341


In [33]:
all([sp in bio_nap_df.index for sp in final_df.index])

True