In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json

In [2]:
deb_models = ['std', 'stf', 'stx', 'abj']
parameter_cols = ['p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp']
info_cols = ['family', 'order', 'class', 'phylum']
age_data_cols = ['ab', 'ah', 'aj', 'ax', 'ap', 'am']
time_since_birth_data_cols = ['tg', 'tb', 'tj', 'tx', 'tp']
weight_data_cols = ['Wwb', 'Wwj', 'Wwx', 'Wwp', 'Wwi']
length_data_cols = ['Lb', 'Lj', 'Lx', 'Lp', 'Li']
other_cols = ['d_V', 'Ri', 'T_typical', 't_0', 'model']

In [3]:
def print_missing_values_per_column(df):
    for c in df.columns:
        n_missing = pd.isna(df[c]).sum()
        if n_missing:
            print(f"{n_missing} missing values in column {c}")

# Loading dataset

In [4]:
raw_data = pd.read_csv('../data/raw/dataset_matlab.csv', index_col=0)
raw_data.dropna(how='all', inplace=True)
raw_data.index.name = 'species'
raw_data

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lb,Lj,Lx,Lp,Li,d_V,Ri,T_typical,t_0,model
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.276,,1.90,8.0,0.09,,278.15,,abj
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,4.50,18.9,0.20,7.772727,291.15,149.8737,abj
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,47.40,140.0,0.20,73.373736,301.05,,abj
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.90,1.9,0.09,48.959720,288.15,,abj
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,0.08,,,1.40,4.0,0.21,,291.15,,abj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,12.65,29.9,0.20,100.486494,290.25,,std
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,,,0.28,0.067583,314.75,,std
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,,,0.28,0.045055,314.75,,std
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,0.060,,,7.0,0.09,,280.15,,abj


In [5]:
print_missing_values_per_column(raw_data)

2726 missing values in column E_Hj
3511 missing values in column E_Hx
2050 missing values in column ab
4564 missing values in column ah
4601 missing values in column aj
4640 missing values in column ax
4413 missing values in column ap
51 missing values in column am
3913 missing values in column tg
4631 missing values in column tb
4450 missing values in column tj
3514 missing values in column tx
2018 missing values in column tp
739 missing values in column Wwb
4547 missing values in column Wwj
4304 missing values in column Wwx
3379 missing values in column Wwp
403 missing values in column Wwi
3402 missing values in column Lb
4334 missing values in column Lj
4623 missing values in column Lx
2052 missing values in column Lp
1634 missing values in column Li
741 missing values in column Ri
3007 missing values in column t_0


# Preprocessing

In [6]:
df = raw_data.copy()

## Remove species with invalid data or parameter sets

In [7]:
# Drop rows with values less than 0
df = df[((df.select_dtypes(include='number') >= 0) | (df.select_dtypes(include='number').isna())).all(axis=1)]
# Remove species with incorrect maturity values
df = df[( (df['E_Hb'] < df['E_Hx']) | df[['E_Hb', 'E_Hx']].isna().any(axis=1) )]
df = df[( (df['E_Hb'] < df['E_Hj']) | df[['E_Hb', 'E_Hj']].isna().any(axis=1) )]
df = df[( (df['E_Hb'] < df['E_Hp']) )]
df = df[( (df['E_Hx'] < df['E_Hp']) | df[['E_Hp', 'E_Hx']].isna().any(axis=1) )]
df = df[( (df['E_Hj'] < df['E_Hp']) | df[['E_Hp', 'E_Hj']].isna().any(axis=1) )]
#nwp_df = nwp_df[~(nwp_df['E_Hbjx'] < nwp_df['E_Hb']) & ~(nwp_df['E_Hp'] < nwp_df['E_Hbjx'])]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lb,Lj,Lx,Lp,Li,d_V,Ri,T_typical,t_0,model
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.276,,1.90,8.0,0.09,,278.15,,abj
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,4.50,18.9,0.20,7.772727,291.15,149.8737,abj
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,47.40,140.0,0.20,73.373736,301.05,,abj
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.90,1.9,0.09,48.959720,288.15,,abj
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,0.08,,,1.40,4.0,0.21,,291.15,,abj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,12.65,29.9,0.20,100.486494,290.25,,std
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,,,0.28,0.067583,314.75,,std
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,,,0.28,0.045055,314.75,,std
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,0.060,,,7.0,0.09,,280.15,,abj


## Fill missing weight data from length data

In [8]:
for species, row in df.iterrows():
    length_vars_exist = [~np.isnan(row[ldv]) for ldv in length_data_cols]
    weight_vars_exist = [~np.isnan(row[wdv]) for wdv in weight_data_cols]
    both_vars_exist = [l and w for l, w in zip(length_vars_exist, weight_vars_exist)]
    # Check if we have a length-weight pair to make correspondences
    if not any(both_vars_exist) or sum(length_vars_exist) <= 1:
        continue
    for i in range(len(length_data_cols) - 1, -1, -1):
        # Continue if the weight at the life stage already exists
        if weight_vars_exist[i]:
            continue
        elif length_vars_exist[i]:
            for j in range(0, len(length_data_cols), 1):
                # Find the closest life stage with both weight and length data
                if both_vars_exist[(i + j) % len(length_data_cols)]:
                    df.loc[species,weight_data_cols[i]] = (row[weight_data_cols[j]] * np.power(row[length_data_cols[i]] / row[length_data_cols[j]], 3))


## Fill missing age data with time since birth data

In [9]:
for species in df.index.values:
    data = df.loc[species]
    # If no data on age at birth 'ab'
    if data.isna()['ab']:
        # Infer it from gestation time 'tg'
        if ~data.isna()['tg']:
            # If diapause exists take into account for calculating age at birth
            if data.isna()['t_0']:
                df.at[species, 'ab'] = df.at[species, 'tg']
            else:
                df.at[species, 'ab'] = df.at[species, 'tg'] + df.at[species, 't_0']
        # Infer from age at hatch 'ah' and time since hatch at birth 'tb'
        if ~data.isna()['ah'] and ~data.isna()['tb']:
            df.at[species, 'ab'] = df.at[species, 'ah'] + df.at[species, 'tb']  
    # Compute age at maturity levels from time since birth data
    for mat in ('j', 'x', 'p'):
        if data.isna()[f'a{mat}'] and not data.isna()[f't{mat}']:
            df.at[species, f'a{mat}'] = df.at[species, f't{mat}'] + df.at[species, 'ab']       
        

In [10]:
# Remove species with data on age at maturity levels not in increasing order
df = df[( (df['ab'] < df['ax']) | df[['ab', 'ax']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['aj']) | df[['ab', 'aj']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['ap']) | df[['ab', 'ap']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['am']) | df[['ab', 'am']].isna().any(axis=1) )]
df = df[( (df['ax'] < df['ap']) | df[['ax', 'ap']].isna().any(axis=1) )]
df = df[( (df['ax'] < df['am']) | df[['ax', 'am']].isna().any(axis=1) )]
df = df[( (df['aj'] < df['ap']) | df[['aj', 'ap']].isna().any(axis=1) )]
df = df[( (df['aj'] < df['am']) | df[['aj', 'am']].isna().any(axis=1) )]
df = df[( (df['ap'] < df['am']) | df[['ap', 'am']].isna().any(axis=1) )]

## Handling acceleration and weaning

In [11]:
df['metamorphosis'] = df['model'] == 'abj'
df['weaning'] = df['model'] == 'stx'
df['foetus'] = (df['model'] == 'stf') | (df['model'] == 'stx')
#df.drop(columns=['model'], inplace=True)

# Handle taxonomic info

In [12]:
# NOT IMPLEMENTED

In [13]:
print_missing_values_per_column(df)

2698 missing values in column E_Hj
3477 missing values in column E_Hx
1306 missing values in column ab
4518 missing values in column ah
4415 missing values in column aj
3467 missing values in column ax
2070 missing values in column ap
50 missing values in column am
3864 missing values in column tg
4581 missing values in column tb
4403 missing values in column tj
3468 missing values in column tx
1997 missing values in column tp
198 missing values in column Wwb
4430 missing values in column Wwj
4254 missing values in column Wwx
3340 missing values in column Wwp
397 missing values in column Wwi
3375 missing values in column Lb
4292 missing values in column Lj
4573 missing values in column Lx
2031 missing values in column Lp
1616 missing values in column Li
729 missing values in column Ri
2996 missing values in column t_0


In [14]:
df.to_csv('../data/interim/filled_data.csv', index=True)

In [15]:
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lp,Li,d_V,Ri,T_typical,t_0,model,metamorphosis,weaning,foetus
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,1.90,8.0,0.09,,278.15,,abj,True,False,False
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,4.50,18.9,0.20,7.772727,291.15,149.8737,abj,True,False,False
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,47.40,140.0,0.20,73.373736,301.05,,abj,True,False,False
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,0.90,1.9,0.09,48.959720,288.15,,abj,True,False,False
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,1.40,4.0,0.21,,291.15,,abj,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,12.65,29.9,0.20,100.486494,290.25,,std,False,False,False
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,0.28,0.067583,314.75,,std,False,False,False
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,0.28,0.045055,314.75,,std,False,False,False
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,7.0,0.09,,280.15,,abj,True,False,False


# Processed dataset

In [16]:
def split_and_save_dataset(df, dataset_name, train_percentage=0.70, val_percentage=0.15, test_percentage=0.15, seed=42, stratify=None):
    
    # First split to separate train and temp (test + val)
    if stratify is not None:
        stratify_array = stratify.loc[df.index]
    else: 
        stratify_array = None
    train_df, temp_df = train_test_split(df, test_size=(1 - train_percentage), random_state=seed, stratify=stratify_array)
    
    # Calculate the percentage of temp that should be test and val
    val_test_ratio = val_percentage / (val_percentage + test_percentage)
    
    # Split temp into validation and test sets
    if stratify is not None:
        stratify_array = stratify.loc[temp_df.index]
    else: 
        stratify_array = None
    val_df, test_df = train_test_split(temp_df, test_size=(1 - val_test_ratio), random_state=seed, stratify=stratify_array)
    
    
    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")
    
    # Save datasets
    df.to_csv(f'../data/processed/{dataset_name}.csv', index=True, float_format='%.6e')
    train_df.to_csv(f'../data/processed/{dataset_name}_train.csv', index=True, float_format='%.6e')
    val_df.to_csv(f'../data/processed/{dataset_name}_val.csv', index=True, float_format='%.6e')
    test_df.to_csv(f'../data/processed/{dataset_name}_test.csv', index=True, float_format='%.6e')


In [37]:
def save_types_of_col(types_of_col, dataset_name):
    with open(f'../data/processed/{dataset_name}_types_of_col.json', "w") as json_file:
        json.dump(types_of_col, json_file, indent=4)
            

## Bijection input datasets

In [18]:
bijection_input_df_cols = [
    'ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwj', 'Wwx', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning', 'foetus', 
    'p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp',
    ]
bijection_input_df = df[bijection_input_df_cols].copy()
bijection_input_df['Wwj'] = bijection_input_df['Wwj'].fillna(bijection_input_df['Wwx']).fillna(bijection_input_df['Wwb'])
bijection_input_df['E_Hj'] = bijection_input_df['E_Hj'].fillna(bijection_input_df['E_Hx']).fillna(bijection_input_df['E_Hb'])
bijection_input_df.rename(columns={'Wwj':'Wwbjx', 'E_Hj':'E_Hbjx'}, inplace=True)
bijection_input_df.drop(columns=['Wwx', 'E_Hx'], inplace=True)
bijection_input_df.index.name = 'species'
# Drop rows with missing data
bijection_input_df.dropna(how='any', axis=0, inplace=True)
# Drop species with 'stf' model (too few samples to train model)
bijection_input_df = bijection_input_df[df['model'] != 'stf']
bijection_input_df.drop(columns=['foetus'], inplace=True)

bijection_input_df

  bijection_input_df = bijection_input_df[df['model'] != 'stf']


Unnamed: 0_level_0,ab,ap,am,d_V,Wwb,Wwbjx,Wwp,Wwi,Ri,T_typical,...,weaning,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hbjx,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,15.680398,1336.452355,6603.859788,0.20,2.800000e-03,2.800000e-03,319.00,6050.0,2271.398921,291.15,...,False,401.092069,0.66367,0.016416,23.3484,5228.9640,4.116000e-08,0.548400,0.548500,236600.00
Achoerodus_viridis,2.195824,1095.000000,12775.000000,0.20,2.604167e-04,2.604167e-04,70.00,3600.0,9589.041096,293.15,...,False,148.179353,0.52154,0.099329,5.6975,5238.8833,2.801000e-10,0.306500,0.306600,310500.00
Acipenser_persicus,13.670977,3640.522705,14235.000000,0.20,9.765937e-02,9.765937e-02,28000.00,70000.0,301.300000,293.15,...,False,562.047321,0.56571,0.043759,12.3264,5242.1846,8.726000e-10,52.330000,52.330000,36130000.00
Acipenser_ruthenus,12.845273,4094.430716,22548.134817,0.20,1.400000e-03,1.400000e-03,292.00,6400.0,184.279721,288.15,...,False,221.446235,0.78040,0.051789,11.8758,5279.7477,1.871000e-08,0.986800,0.986800,274000.00
Actinonaias_ligamentina,22.479227,2201.146591,15679.269617,0.09,5.300000e-07,5.300000e-07,25.00,1017.0,2663.703640,288.15,...,False,35.950640,0.98335,0.003881,14.8638,2354.2993,9.080000e-10,0.000004,0.000009,57.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Varanus_komodoensis,22.408378,1514.304606,18940.255192,0.30,9.000000e+01,9.000000e+01,18000.00,87000.0,0.151998,298.15,...,False,634.343602,0.97120,0.127960,17.0958,7870.0241,6.071000e-11,12730.000000,12730.000000,2481000.00
Wallabia_bicolor,30.551071,154.814712,1094.805523,0.30,6.100000e-01,3.850000e+02,7000.00,14625.0,0.021129,309.65,...,True,543.141214,0.70265,0.028538,23.8679,7841.1164,1.235000e-11,560.400000,392500.000000,8185000.00
Xantusia_vigilis,56.949093,1334.449093,3978.500000,0.30,2.300000e-01,2.300000e-01,0.95,1.3,0.005479,293.15,...,False,64.250393,0.41186,0.017897,26.8706,7828.9786,4.052000e-09,1812.000000,1812.000000,15170.00
Xiphias_gladius,1.898303,1642.500000,4015.000000,0.20,1.661124e-03,1.661124e-03,74000.00,650000.0,41095.890411,293.15,...,False,1651.206087,0.89926,0.038368,50.1257,5215.6372,5.065000e-08,0.087410,0.201000,12260000.00


In [19]:
split_and_save_dataset(df=bijection_input_df, dataset_name='bijection_input', stratify=df['model'])

Train set size: 200
Validation set size: 43
Test set size: 44


In [20]:
dataset_name = 'bijection_input'
train_df = pd.read_csv(f'../data/processed/{dataset_name}_train.csv', index_col=0)
val_df = pd.read_csv(f'../data/processed/{dataset_name}_val.csv', index_col=0)
test_df = pd.read_csv(f'../data/processed/{dataset_name}_test.csv', index_col=0)

print('\n train')
train_df = train_df.join(df['model'], how='left')
print(train_df['model'].value_counts() / len(train_df))

print('\n val')
val_df = val_df.join(df['model'], how='left')
print(val_df['model'].value_counts() / len(val_df))

print('\n test')
test_df = test_df.join(df['model'], how='left')
print(test_df['model'].value_counts() / len(test_df))


 train
std    0.445
abj    0.375
stx    0.180
Name: model, dtype: float64

 val
std    0.441860
abj    0.372093
stx    0.186047
Name: model, dtype: float64

 test
std    0.454545
abj    0.363636
stx    0.181818
Name: model, dtype: float64


## Replaced weight at puberty for age at weaning/metamorphosis

In [21]:
nwp_df_cols = [
    'ab', 'aj', 'ax', 'ap', 'am', 'd_V', 'Wwb', 'Wwj', 'Wwx', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning', 'foetus', 
    'p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp',
    ]
nwp_df = df[nwp_df_cols].copy()

nwp_df['aj'] = nwp_df['aj'].fillna(nwp_df['ax']).fillna(nwp_df['ab'])
nwp_df['Wwj'] = nwp_df['Wwj'].fillna(nwp_df['Wwx']).fillna(nwp_df['Wwb'])
nwp_df['E_Hj'] = nwp_df['E_Hj'].fillna(nwp_df['E_Hx']).fillna(nwp_df['E_Hb'])

nwp_df.rename(columns={'Wwj':'Wwbjx', 'E_Hj':'E_Hbjx', 'aj': 'abjx'}, inplace=True)
nwp_df.drop(columns=['Wwx', 'E_Hx', 'ax'], inplace=True)
nwp_df.index.name = 'species'

# Drop rows with missing data
nwp_df.dropna(how='any', axis=0, inplace=True)
# Drop species with 'stf' model (too few samples to train model)
nwp_df = nwp_df[df['model'] != 'stf']
nwp_df.drop(columns=['foetus'], inplace=True)

nwp_df

  nwp_df = nwp_df[df['model'] != 'stf']


Unnamed: 0_level_0,ab,abjx,ap,am,d_V,Wwb,Wwbjx,Wwi,Ri,T_typical,...,weaning,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hbjx,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,15.680398,15.680398,1336.452355,6603.859788,0.20,0.0028,0.0028,6050.0,2271.398921,291.15,...,False,401.092069,0.66367,0.016416,23.3484,5228.9640,4.116000e-08,0.5484,0.5485,236600.0
Abroscopus_superciliaris,2.245595,3.948224,7.353481,443.899620,0.28,0.7800,0.7800,6.5,0.072088,314.75,...,False,663.006069,0.95822,0.034790,533.0640,7316.5423,4.147000e-12,96.0300,631.4000,1472.0
Acanthis_flammea,2.041450,3.379230,6.054789,541.557536,0.28,1.3000,1.3000,14.2,0.225276,314.75,...,False,1294.709603,0.92908,0.037653,907.6351,7320.7246,2.215000e-12,211.9000,1348.0000,3305.0
Acanthis_hornemanni,2.245595,3.826607,6.988632,301.851741,0.28,1.3000,1.3000,12.7,0.101374,314.75,...,False,709.593021,0.95793,0.047468,432.1025,7335.3504,8.946000e-13,197.8000,1236.0000,3203.0
Acanthisitta_chloris,8.014527,8.014527,11.097155,468.816211,0.28,1.0000,1.0000,7.0,0.127982,314.15,...,False,3037.650231,0.98132,0.026069,4238.5777,7324.1141,1.013000e-14,69.7600,69.7600,2759.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,2.653885,3.784917,6.046981,519.362555,0.28,3.1000,3.1000,33.7,0.090110,314.75,...,False,1476.816557,0.97417,0.041453,834.8830,7333.1130,3.237000e-12,188.8000,817.4000,2151.0
Zootoca_vivipara,38.066843,38.066843,957.078491,4600.151458,0.30,0.1900,0.1900,5.0,0.018635,286.85,...,False,516.242968,0.72515,0.023765,344.4866,7837.0592,1.533000e-07,226.4000,226.4000,6375.0
Zosterops_lateralis,2.449740,3.909136,6.827928,532.679544,0.28,1.1000,1.1000,12.5,0.067583,314.75,...,False,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.7900,456.8000,1199.0
Zosterops_virens,2.449740,3.544287,5.733381,532.679544,0.28,0.9000,0.9000,11.8,0.045055,314.75,...,False,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.0600,101.4000,294.8


In [38]:
dataset_name = 'no_pub_weight'
split_and_save_dataset(df=nwp_df, dataset_name=dataset_name, stratify=df['model'])
types_of_col = {
 'ab': ['input', 'log', 'scale'],
 'abjx': ['input', 'log', 'scale'],
 'ap': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
 'Wwbjx': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
 'E_G': ['output', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb': ['output', 'log', 'scale'],
 'E_Hbjx': ['output', 'log', 'scale'],
 'E_Hp': ['output', 'log', 'scale'],
}
save_types_of_col(types_of_col, dataset_name=dataset_name)

Train set size: 1555
Validation set size: 333
Test set size: 334


In [23]:
train_df = pd.read_csv(f'../data/processed/{dataset_name}_train.csv', index_col=0)
val_df = pd.read_csv(f'../data/processed/{dataset_name}_val.csv', index_col=0)
test_df = pd.read_csv(f'../data/processed/{dataset_name}_test.csv', index_col=0)

print('\n train')
train_df = train_df.join(df['model'], how='left')
print(train_df['model'].value_counts() / len(train_df))

print('\n val')
val_df = val_df.join(df['model'], how='left')
print(val_df['model'].value_counts() / len(val_df))

print('\n test')
test_df = test_df.join(df['model'], how='left')
print(test_df['model'].value_counts() / len(test_df))


 train
std    0.603215
stx    0.313183
abj    0.083601
Name: model, dtype: float64

 val
std    0.603604
stx    0.312312
abj    0.084084
Name: model, dtype: float64

 test
std    0.604790
stx    0.314371
abj    0.080838
Name: model, dtype: float64


In [24]:
nwp_df.describe()

Unnamed: 0,ab,abjx,ap,am,d_V,Wwb,Wwbjx,Wwi,Ri,T_typical,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hbjx,E_Hp
count,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0
mean,50.158786,74.506933,548.57125,3466.170741,0.273348,6334.644,15308.92,265048.9,1701.589,307.204991,4362.270242,0.895016,0.054585,2050.683139,7143.83588,3.858895e-06,1179318.0,7627159.0,37349430.0
std,199.062386,284.480535,1758.032369,8542.026467,0.043391,87559.44,368139.3,4403030.0,33557.55,9.133495,8497.377954,0.140749,0.056503,5368.758548,1138.462736,8.266893e-05,12754700.0,100258000.0,428885200.0
min,0.003398,0.003398,1.164408,3.796606,0.02,2.3e-13,2.3e-13,8.05e-08,0.0008753557,272.15,0.120759,0.17929,0.000401,1.2973,261.4134,8.939999999999999e-44,9.004e-09,1.193e-08,3.177e-06
25%,3.907404,5.008687,10.86695,551.298354,0.28,1.3,1.5,27.525,0.02773981,302.15,562.60437,0.86005,0.024727,49.218925,7313.211675,5.3195e-14,93.5425,225.85,5058.75
50%,11.595167,16.804174,44.90768,1178.320974,0.28,5.7,9.0,180.5,0.07101325,311.15,1454.483986,0.95636,0.036735,602.67595,7326.35985,2.862e-12,374.2,1540.5,32635.0
75%,42.549474,63.27677,323.843085,2559.673231,0.3,44.0,60.5125,2324.0,0.1802209,313.85,4845.988223,0.988565,0.062159,2084.603425,7837.265975,3.685e-10,3131.0,20372.5,600050.0
max,6854.356708,7149.490895,27594.791037,126351.863286,0.3,2750000.0,17000000.0,160000000.0,1300000.0,315.15,216086.54405,0.99998,0.60285,160859.8216,8291.128,0.00327,437300000.0,3034274000.0,12820000000.0


## Ratios

In [25]:
bi_ratio_df_cols = [
    'ab/m', 'ap/m', 'am', 'd_V', 'Wwb/i', 'Wwbjx/i', 'Wwp/i', 'Wwi', 'Ri',
    'T_typical', 'acceleration', 'weaning', 'foetus',
    'p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb/bjx', 'E_Hbjx/p', 'E_Hp',
    ]
bi_ratio_df = pd.DataFrame(index=bijection_input_df.index, columns=bi_ratio_df_cols)

In [26]:
for col in bi_ratio_df.columns:
    if col in bijection_input_df.columns:
        bi_ratio_df[col] = bijection_input_df[col].copy()
bi_ratio_df['ab/m'] = bijection_input_df['ab'] / bijection_input_df['am'] 
bi_ratio_df['ap/m'] = bijection_input_df['ap'] / bijection_input_df['am'] 
bi_ratio_df['Wwb/i'] = bijection_input_df['Wwb'] / bijection_input_df['Wwi'] 
bi_ratio_df['Wwbjx/i'] = bijection_input_df['Wwbjx'] / bijection_input_df['Wwi'] 
bi_ratio_df['Wwp/i'] = bijection_input_df['Wwp'] / bijection_input_df['Wwi'] 
bi_ratio_df['E_Hb/bjx'] = bijection_input_df['E_Hb'] / bijection_input_df['E_Hbjx'] 
bi_ratio_df['E_Hbjx/p'] = bijection_input_df['E_Hbjx'] / bijection_input_df['E_Hp'] 
bi_ratio_df

Unnamed: 0_level_0,ab/m,ap/m,am,d_V,Wwb/i,Wwbjx/i,Wwp/i,Wwi,Ri,T_typical,...,foetus,p_Am,kap,v,p_M,E_G,h_a,E_Hb/bjx,E_Hbjx/p,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,0.002374,0.202374,6603.859788,0.20,4.628099e-07,4.628099e-07,0.052727,6050.0,2271.398921,291.15,...,,401.092069,0.66367,0.016416,23.3484,5228.9640,4.116000e-08,0.999818,2.318259e-06,236600.00
Achoerodus_viridis,0.000172,0.085714,12775.000000,0.20,7.233796e-08,7.233796e-08,0.019444,3600.0,9589.041096,293.15,...,,148.179353,0.52154,0.099329,5.6975,5238.8833,2.801000e-10,0.999674,9.874396e-07,310500.00
Acipenser_persicus,0.000960,0.255744,14235.000000,0.20,1.395134e-06,1.395134e-06,0.400000,70000.0,301.300000,293.15,...,,562.047321,0.56571,0.043759,12.3264,5242.1846,8.726000e-10,1.000000,1.448381e-06,36130000.00
Acipenser_ruthenus,0.000570,0.181586,22548.134817,0.20,2.187500e-07,2.187500e-07,0.045625,6400.0,184.279721,288.15,...,,221.446235,0.78040,0.051789,11.8758,5279.7477,1.871000e-08,1.000000,3.601460e-06,274000.00
Actinonaias_ligamentina,0.001434,0.140386,15679.269617,0.09,5.211406e-10,5.211406e-10,0.024582,1017.0,2663.703640,288.15,...,,35.950640,0.98335,0.003881,14.8638,2354.2993,9.080000e-10,0.418423,1.580427e-07,57.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Varanus_komodoensis,0.001183,0.079952,18940.255192,0.30,1.034483e-03,1.034483e-03,0.206897,87000.0,0.151998,298.15,...,,634.343602,0.97120,0.127960,17.0958,7870.0241,6.071000e-11,1.000000,5.130996e-03,2481000.00
Wallabia_bicolor,0.027905,0.141408,1094.805523,0.30,4.170940e-05,2.632479e-02,0.478632,14625.0,0.021129,309.65,...,,543.141214,0.70265,0.028538,23.8679,7841.1164,1.235000e-11,0.001428,4.795357e-02,8185000.00
Xantusia_vigilis,0.014314,0.335415,3978.500000,0.30,1.769231e-01,1.769231e-01,0.730769,1.3,0.005479,293.15,...,,64.250393,0.41186,0.017897,26.8706,7828.9786,4.052000e-09,1.000000,1.194463e-01,15170.00
Xiphias_gladius,0.000473,0.409091,4015.000000,0.20,2.555576e-09,2.555576e-09,0.113846,650000.0,41095.890411,293.15,...,,1651.206087,0.89926,0.038368,50.1257,5215.6372,5.065000e-08,0.434876,1.639478e-08,12260000.00


In [29]:
split_and_save_dataset(df=bi_ratio_df, dataset_name='ratio_bijection_input')

Train set size: 200
Validation set size: 43
Test set size: 44


## Ratios (No puberty weight)

In [41]:
nwp_ratio_df_cols = [
    'ab/m', 'abjx/m', 'ap/m', 'am', 'd_V', 'Wwb/i', 'Wwbjx/i', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning',
    'p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb/bjx', 'E_Hbjx/p', 'E_Hp',
    ]
nwp_ratio_df = pd.DataFrame(index=nwp_df.index, columns=nwp_ratio_df_cols)

In [42]:
for col in nwp_ratio_df.columns:
    if col in nwp_df.columns:
        nwp_ratio_df[col] = nwp_df[col].copy()
nwp_ratio_df['ab/m'] = nwp_df['ab'] / nwp_df['am'] 
nwp_ratio_df['abjx/m'] = nwp_df['abjx'] / nwp_df['am'] 
nwp_ratio_df['ap/m'] = nwp_df['ap'] / nwp_df['am'] 
nwp_ratio_df['Wwb/i'] = nwp_df['Wwb'] / nwp_df['Wwi'] 
nwp_ratio_df['Wwbjx/i'] = nwp_df['Wwbjx'] / nwp_df['Wwi'] 
# Guarantees maturity increases are respected
nwp_ratio_df['E_Hb/bjx'] = nwp_df['E_Hb'] / nwp_df['E_Hbjx'] 
nwp_ratio_df['E_Hbjx/p'] = nwp_df['E_Hbjx'] / nwp_df['E_Hp'] 
nwp_ratio_df

Unnamed: 0_level_0,ab/m,abjx/m,ap/m,am,d_V,Wwb/i,Wwbjx/i,Wwi,Ri,T_typical,...,weaning,p_Am,kap,v,p_M,E_G,h_a,E_Hb/bjx,E_Hbjx/p,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,0.002374,0.002374,0.202374,6603.859788,0.20,4.628099e-07,4.628099e-07,6050.0,2271.398921,291.15,...,False,401.092069,0.66367,0.016416,23.3484,5228.9640,4.116000e-08,0.999818,0.000002,236600.0
Abroscopus_superciliaris,0.005059,0.008894,0.016566,443.899620,0.28,1.200000e-01,1.200000e-01,6.5,0.072088,314.75,...,False,663.006069,0.95822,0.034790,533.0640,7316.5423,4.147000e-12,0.152091,0.428940,1472.0
Acanthis_flammea,0.003770,0.006240,0.011180,541.557536,0.28,9.154930e-02,9.154930e-02,14.2,0.225276,314.75,...,False,1294.709603,0.92908,0.037653,907.6351,7320.7246,2.215000e-12,0.157196,0.407867,3305.0
Acanthis_hornemanni,0.007439,0.012677,0.023153,301.851741,0.28,1.023622e-01,1.023622e-01,12.7,0.101374,314.75,...,False,709.593021,0.95793,0.047468,432.1025,7335.3504,8.946000e-13,0.160032,0.385888,3203.0
Acanthisitta_chloris,0.017095,0.017095,0.023671,468.816211,0.28,1.428571e-01,1.428571e-01,7.0,0.127982,314.15,...,False,3037.650231,0.98132,0.026069,4238.5777,7324.1141,1.013000e-14,1.000000,0.025285,2759.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,0.005110,0.007288,0.011643,519.362555,0.28,9.198813e-02,9.198813e-02,33.7,0.090110,314.75,...,False,1476.816557,0.97417,0.041453,834.8830,7333.1130,3.237000e-12,0.230976,0.380009,2151.0
Zootoca_vivipara,0.008275,0.008275,0.208054,4600.151458,0.30,3.800000e-02,3.800000e-02,5.0,0.018635,286.85,...,False,516.242968,0.72515,0.023765,344.4866,7837.0592,1.533000e-07,1.000000,0.035514,6375.0
Zosterops_lateralis,0.004599,0.007339,0.012818,532.679544,0.28,8.800000e-02,8.800000e-02,12.5,0.067583,314.75,...,False,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,0.168104,0.380984,1199.0
Zosterops_virens,0.004599,0.006654,0.010763,532.679544,0.28,7.627119e-02,7.627119e-02,11.8,0.045055,314.75,...,False,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,0.207692,0.343962,294.8


In [43]:
list(nwp_ratio_df.columns)

['ab/m',
 'abjx/m',
 'ap/m',
 'am',
 'd_V',
 'Wwb/i',
 'Wwbjx/i',
 'Wwi',
 'Ri',
 'T_typical',
 'metamorphosis',
 'weaning',
 'p_Am',
 'kap',
 'v',
 'p_M',
 'E_G',
 'h_a',
 'E_Hb/bjx',
 'E_Hbjx/p',
 'E_Hp']

In [44]:
dataset_name = 'ratio_no_pub_weight'
split_and_save_dataset(df=nwp_ratio_df, dataset_name=dataset_name, stratify=df['model'])
types_of_col = {
 'ab/m': ['input', 'bounded01'],
 'abjx/m': ['input', 'bounded01'],
 'ap/m': ['input', 'bounded01'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb/i': ['input', 'bounded01'],
 'Wwbjx/i': ['input', 'bounded01'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
 'E_G': ['output', 'log', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb/bjx': ['output', 'bounded01'],
 'E_Hbjx/p': ['output', 'bounded01'],
 'E_Hp': ['output', 'log', 'scale'],
}
save_types_of_col(types_of_col, dataset_name=dataset_name)


Train set size: 1555
Validation set size: 333
Test set size: 334


In [45]:
nwp_ratio_df.describe()

Unnamed: 0,ab/m,abjx/m,ap/m,am,d_V,Wwb/i,Wwbjx/i,Wwi,Ri,T_typical,p_Am,kap,v,p_M,E_G,h_a,E_Hb/bjx,E_Hbjx/p,E_Hp
count,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0
mean,0.02581661,0.03423379,0.110268,3466.170741,0.273348,0.05793432,0.1015287,265048.9,1701.589,307.204991,4362.270242,0.895016,0.054585,2050.683139,7143.83588,3.858895e-06,0.4897781,0.1686322,37349430.0
std,0.04062718,0.05135564,0.129145,8542.026467,0.043391,0.05766202,0.1366605,4403030.0,33557.55,9.133495,8497.377954,0.140749,0.056503,5368.758548,1138.462736,8.266893e-05,0.4382295,0.1997941,428885200.0
min,7.800613e-07,7.800613e-07,0.000567,3.796606,0.02,8.846154e-11,8.846154e-11,8.05e-08,0.0008753557,272.15,0.120759,0.17929,0.000401,1.2973,261.4134,8.939999999999999e-44,7.637854e-07,8.479263e-09,3.177e-06
25%,0.004548708,0.005609052,0.014855,551.298354,0.28,0.01185033,0.01923871,27.525,0.02773981,302.15,562.60437,0.86005,0.024727,49.218925,7313.211675,5.3195e-14,0.07263278,0.01090602,5058.75
50%,0.009033719,0.01195357,0.069343,1178.320974,0.28,0.04744041,0.0625,180.5,0.07101325,311.15,1454.483986,0.95636,0.036735,602.67595,7326.35985,2.862e-12,0.2442104,0.07341721,32635.0
75%,0.03113163,0.04714396,0.159115,2559.673231,0.3,0.08655669,0.1165049,2324.0,0.1802209,313.85,4845.988223,0.988565,0.062159,2084.603425,7837.265975,3.685e-10,1.0,0.3204559,600050.0
max,0.6978431,0.8,0.88,126351.863286,0.3,0.54,0.9809524,160000000.0,1300000.0,315.15,216086.54405,0.99998,0.60285,160859.8216,8291.128,0.00327,1.0,0.9998959,12820000000.0
