In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json
import os

In [3]:
deb_models = ['std', 'stf', 'stx', 'abj']
parameter_cols = ['p_Am', 'kap', 'v', 'p_M', 'E_G', 'h_a', 'E_Hb', 'E_Hj', 'E_Hx', 'E_Hp']
taxonomy_cols = ['family', 'order', 'class', 'phylum']
ecocode_cols = ['climate', 'ecozone', 'habitat', 'embryo', 'migrate', 'food', 'gender', 'reprod']
age_data_cols = ['ab', 'ah', 'aj', 'ax', 'ap', 'am']
time_since_birth_data_cols = ['tg', 'tb', 'tj', 'tx', 'tp']
weight_data_cols = ['Wwb', 'Wwj', 'Wwx', 'Wwp', 'Wwi']
length_data_cols = ['Lb', 'Lj', 'Lx', 'Lp', 'Li']
other_cols = ['d_V', 'Ri', 'T_typical', 't_0', 'model']

In [4]:
def print_missing_values_per_column(df, percentage=True):
    for c in df.columns:
        n_missing = pd.isna(df[c]).sum()
        if n_missing:
            if percentage:
                print(f"{n_missing/len(df)*100:.1f} % missing values in column {c}")
            else:
                print(f"{n_missing} missing values in column {c}")

# Loading dataset

In [29]:
raw_data = pd.read_csv('../data/raw/dataset_matlab.csv', index_col=0)
raw_data.dropna(how='all', inplace=True)
raw_data.index.name = 'species'
raw_data

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lb,Lj,Lx,Lp,Li,d_V,Ri,T_typical,t_0,model
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.276,,1.90,8.0,0.09,,278.15,,abj
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,4.50,18.9,0.20,7.772727,291.15,149.8737,abj
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,47.40,140.0,0.20,73.373736,301.05,,abj
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.90,1.9,0.09,48.959720,288.15,,abj
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,0.08,,,1.40,4.0,0.21,,291.15,,abj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,12.65,29.9,0.20,100.486494,290.25,,std
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,,,0.28,0.067583,314.75,,std
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,,,0.28,0.045055,314.75,,std
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,0.060,,,7.0,0.09,,280.15,,abj


In [30]:
raw_data['model'].value_counts()

model
std    2103
abj    1756
stx     732
stf      51
Name: count, dtype: int64

In [31]:
print_missing_values_per_column(raw_data)

58.7 % missing values in column E_Hj
75.6 % missing values in column E_Hx
0.0 % missing values in column climate
0.0 % missing values in column ecozone
0.0 % missing values in column habitat
0.0 % missing values in column embryo
66.3 % missing values in column migrate
0.0 % missing values in column food
0.0 % missing values in column gender
0.0 % missing values in column reprod
44.2 % missing values in column ab
98.3 % missing values in column ah
99.1 % missing values in column aj
100.0 % missing values in column ax
95.1 % missing values in column ap
1.1 % missing values in column am
84.3 % missing values in column tg
99.8 % missing values in column tb
95.9 % missing values in column tj
75.7 % missing values in column tx
43.5 % missing values in column tp
15.9 % missing values in column Wwb
98.0 % missing values in column Wwj
92.7 % missing values in column Wwx
72.8 % missing values in column Wwp
8.7 % missing values in column Wwi
73.3 % missing values in column Lb
93.4 % missing value

# Preprocessing

In [144]:
df = raw_data.copy()

## Remove species with invalid data or parameter sets

In [145]:
# Drop rows with values less than 0
df = df[((df.select_dtypes(include='number') >= 0) | (df.select_dtypes(include='number').isna())).all(axis=1)]
# Remove species with incorrect maturity values
df = df[( (df['E_Hb'] < df['E_Hx']) | df[['E_Hb', 'E_Hx']].isna().any(axis=1) )]
df = df[( (df['E_Hb'] < df['E_Hj']) | df[['E_Hb', 'E_Hj']].isna().any(axis=1) )]
df = df[( (df['E_Hb'] < df['E_Hp']) )]
df = df[( (df['E_Hx'] < df['E_Hp']) | df[['E_Hp', 'E_Hx']].isna().any(axis=1) )]
df = df[( (df['E_Hj'] < df['E_Hp']) | df[['E_Hp', 'E_Hj']].isna().any(axis=1) )]
#nwp_df = nwp_df[~(nwp_df['E_Hbjx'] < nwp_df['E_Hb']) & ~(nwp_df['E_Hp'] < nwp_df['E_Hbjx'])]
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lb,Lj,Lx,Lp,Li,d_V,Ri,T_typical,t_0,model
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,,0.276,,1.90,8.0,0.09,,278.15,,abj
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,,,,4.50,18.9,0.20,7.772727,291.15,149.8737,abj
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,,,,47.40,140.0,0.20,73.373736,301.05,,abj
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,,,,0.90,1.9,0.09,48.959720,288.15,,abj
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,0.08,,,1.40,4.0,0.21,,291.15,,abj
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,,,,12.65,29.9,0.20,100.486494,290.25,,std
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,,,,0.28,0.067583,314.75,,std
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,,,,0.28,0.045055,314.75,,std
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,0.060,,,7.0,0.09,,280.15,,abj


## Fill missing weight data from length data

In [146]:
n_imputed_values = {c: 0 for c in weight_data_cols+age_data_cols}
#species = 'Ziphius_cavirostris'
#for species, row in [[species, df.loc[species]]]:
for species, row in df.iterrows():
    length_vars_exist = [~np.isnan(row[ldv]) for ldv in length_data_cols]
    weight_vars_exist = [~np.isnan(row[wdv]) for wdv in weight_data_cols]
    both_vars_exist = [l and w for l, w in zip(length_vars_exist, weight_vars_exist)]
    # Check if we have a length-weight pair to make correspondences
    if not any(both_vars_exist) or sum(length_vars_exist) <= 1:
        continue
    for i in range(len(length_data_cols) - 1, -1, -1):
        # Continue if the weight at the life stage already exists
        if weight_vars_exist[i]:
            continue
        elif length_vars_exist[i]:
            for j in range(1, len(length_data_cols), 1):
                # Find a life stage with both weight and length data
                b = (i - j) % len(length_data_cols)
                if both_vars_exist[b]:
                    w_col = weight_data_cols[i]
                    df.at[species, w_col] = (row[weight_data_cols[b]] * np.power(row[length_data_cols[i]] / row[length_data_cols[b]], 3))
                    n_imputed_values[w_col] += 1
                    break
                    

{'Wwb': 532, 'Wwj': 208, 'Wwx': 7, 'Wwp': 1184, 'Wwi': 69, 'ab': 0, 'ah': 0, 'aj': 0, 'ax': 0, 'ap': 0, 'am': 0}
{'Wwb': 532, 'Wwj': 208, 'Wwx': 7, 'Wwp': 1184, 'Wwi': 69, 'ab': 0, 'ah': 0, 'aj': 0, 'ax': 0, 'ap': 0, 'am': 0}


## Fill missing age data with time since birth data

In [147]:
for species in df.index.values:
    data = df.loc[species]
    # If no data on age at birth 'ab'
    if data.isna()['ab']:
        # Infer it from gestation time 'tg'
        if not data.isna()['tg']:
            # If diapause exists take into account for calculating age at birth
            if data.isna()['t_0']:
                df.at[species, 'ab'] = df.at[species, 'tg']
                n_imputed_values['ab'] += 1
            else:
                df.at[species, 'ab'] = df.at[species, 'tg'] + df.at[species, 't_0']
                n_imputed_values['ab'] += 1

        # Infer from age at hatch 'ah' and time since hatch at birth 'tb'
        elif not data.isna()['ah'] and not data.isna()['tb']:
            df.at[species, 'ab'] = df.at[species, 'ah'] + df.at[species, 'tb']
            n_imputed_values['ab'] += 1

    # Compute age at maturity levels from time since birth data
    if not data.isna()['ab']:
        for mat in ('j', 'x', 'p'):
            if data.isna()[f'a{mat}'] and not data.isna()[f't{mat}']:
                df.at[species, f'a{mat}'] = df.at[species, f't{mat}'] + df.at[species, 'ab']  
                n_imputed_values[f'a{mat}'] += 1

print(n_imputed_values)

{'Wwb': 532, 'Wwj': 208, 'Wwx': 7, 'Wwp': 1184, 'Wwi': 69, 'ab': 738, 'ah': 0, 'aj': 135, 'ax': 400, 'ap': 1585, 'am': 0}


In [148]:
# Remove species with data on age at maturity levels not in increasing order
df = df[( (df['ab'] < df['ax']) | df[['ab', 'ax']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['aj']) | df[['ab', 'aj']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['ap']) | df[['ab', 'ap']].isna().any(axis=1) )]
df = df[( (df['ab'] < df['am']) | df[['ab', 'am']].isna().any(axis=1) )]
df = df[( (df['ax'] < df['ap']) | df[['ax', 'ap']].isna().any(axis=1) )]
df = df[( (df['ax'] < df['am']) | df[['ax', 'am']].isna().any(axis=1) )]
df = df[( (df['aj'] < df['ap']) | df[['aj', 'ap']].isna().any(axis=1) )]
df = df[( (df['aj'] < df['am']) | df[['aj', 'am']].isna().any(axis=1) )]
df = df[( (df['ap'] < df['am']) | df[['ap', 'am']].isna().any(axis=1) )]

## Handling acceleration and weaning

In [149]:
df['metamorphosis'] = df['model'] == 'abj'
df['weaning'] = df['model'] == 'stx'
df['foetus'] = (df['model'] == 'stf') | (df['model'] == 'stx')
#df.drop(columns=['model'], inplace=True)

# Handle taxonomic info

In [150]:
# NOT IMPLEMENTED

In [151]:
print_missing_values_per_column(df)

58.8 % missing values in column E_Hj
75.7 % missing values in column E_Hx
0.0 % missing values in column climate
0.0 % missing values in column ecozone
0.0 % missing values in column habitat
0.0 % missing values in column embryo
66.3 % missing values in column migrate
0.0 % missing values in column food
0.0 % missing values in column gender
0.0 % missing values in column reprod
28.4 % missing values in column ab
98.4 % missing values in column ah
96.2 % missing values in column aj
91.3 % missing values in column ax
60.9 % missing values in column ap
1.1 % missing values in column am
84.1 % missing values in column tg
99.8 % missing values in column tb
95.9 % missing values in column tj
75.5 % missing values in column tx
43.5 % missing values in column tp
4.3 % missing values in column Wwb
93.5 % missing values in column Wwj
92.5 % missing values in column Wwx
47.1 % missing values in column Wwp
7.2 % missing values in column Wwi
73.5 % missing values in column Lb
93.5 % missing values 

In [152]:
df.to_csv('../data/interim/filled_data.csv', index=True)

In [153]:
df

Unnamed: 0_level_0,p_Am,kap,v,p_M,E_G,h_a,E_Hb,E_Hj,E_Hx,E_Hp,...,Lp,Li,d_V,Ri,T_typical,t_0,model,metamorphosis,weaning,foetus
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abatus_cordatus,41.918449,0.77712,0.027220,13.8449,2393.8237,5.047000e-06,0.487600,6.941000,,1403.000,...,1.90,8.0,0.09,,278.15,,abj,True,False,False
Abbottina_rivularis,75.309251,0.97055,0.020221,23.8370,5215.8135,3.547000e-07,0.045900,0.069550,,89.340,...,4.50,18.9,0.20,7.772727,291.15,149.8737,abj,True,False,False
Ablennes_hians,201.398502,0.99640,0.022797,17.9765,5237.1062,1.614000e-09,0.003540,0.029360,,1083.000,...,47.40,140.0,0.20,73.373736,301.05,,abj,True,False,False
Abra_segmentum,8.266674,0.93000,0.020926,25.2041,2349.9631,7.505000e-07,0.000601,0.011460,,10.940,...,0.90,1.9,0.09,48.959720,288.15,,abj,True,False,False
Abralia_trigonura,109.484284,0.98081,0.009515,1009.8108,5492.3580,1.526000e-10,0.001511,2.628000,,14.090,...,1.40,4.0,0.21,,291.15,,abj,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zosterisessor_ophiocephalus,90.838193,0.97755,0.020245,17.1271,5231.3551,1.513000e-07,0.003958,,,1577.000,...,12.65,29.9,0.20,100.486494,290.25,,std,False,False,False
Zosterops_lateralis,872.827510,0.97558,0.035919,617.3516,7324.4923,2.674000e-12,76.790000,,456.8,1199.000,...,,,0.28,0.067583,314.75,,std,False,False,False
Zosterops_virens,940.612495,0.99060,0.031028,734.2559,7319.8077,3.005000e-12,21.060000,,101.4,294.800,...,,,0.28,0.045055,314.75,,std,False,False,False
Zygochlamys_patagonica,64.787878,0.94310,0.020238,41.2541,2342.3878,4.982000e-08,0.000251,0.000722,,423.675,...,,7.0,0.09,,280.15,,abj,True,False,False


# Processed dataset

In [154]:
processed_dataset_save_folder = '../data/processed'

In [155]:
abj_columns = ['E_Hj', 'Wwj', 'aj']
stx_columns = ['E_Hx', 'Wwx', 'ax']

def drop_species_with_missing_values(df):

    # abj models
    subset = [col for col in df.columns if col not in stx_columns]
    #print(len(subset), subset)
    abj_df = df[df['metamorphosis']].dropna(subset=subset)

    # stx models
    subset = [col for col in df.columns if col not in abj_columns]
    #print(len(subset), subset)
    stx_df = df[df['weaning']].dropna(subset=subset)
    
    # std models
    subset = [col for col in df.columns if col not in abj_columns and col not in stx_columns]
    #print(len(subset), subset)
    std_df = df[~df['weaning'] & ~df['metamorphosis']].dropna(subset=subset)

    return pd.concat([abj_df, std_df, stx_df], axis=0)

In [156]:
def save_types_of_col(types_of_col, dataset_folder):
    with open(f'{dataset_folder}/types_of_col.json', "w") as json_file:
        json.dump(types_of_col, json_file, indent=4)

def split_and_save_dataset(df, dataset_name, types_of_col, train_percentage=0.70, val_percentage=0.15, test_percentage=0.15, seed=42, stratify=None):
    
    # First split to separate train and temp (test + val)
    if stratify is not None:
        stratify_array = stratify.loc[df.index]
    else: 
        stratify_array = None
    train_df, temp_df = train_test_split(df, test_size=(1 - train_percentage), random_state=seed, stratify=stratify_array)
    
    # Calculate the percentage of temp that should be test and val
    val_test_ratio = val_percentage / (val_percentage + test_percentage)
    
    # Split temp into validation and test sets
    if stratify is not None:
        stratify_array = stratify.loc[temp_df.index]
    else: 
        stratify_array = None
    val_df, test_df = train_test_split(temp_df, test_size=(1 - val_test_ratio), random_state=seed, stratify=stratify_array)
    
    
    print(f"Train set size: {len(train_df)}")
    print(f"Validation set size: {len(val_df)}")
    print(f"Test set size: {len(test_df)}")
    
    # Save datasets
    
    dataset_folder = os.path.join(processed_dataset_save_folder, dataset_name)
    if not os.path.exists(dataset_folder):
        os.makedirs(dataset_folder)
    df.to_csv(os.path.join(dataset_folder, f'{dataset_name}.csv'), index=True, float_format='%.6e')
    train_df.to_csv(os.path.join(dataset_folder, f'train.csv'), index=True, float_format='%.6e')
    val_df.to_csv(os.path.join(dataset_folder, f'val.csv'), index=True, float_format='%.6e')
    test_df.to_csv(os.path.join(dataset_folder, f'test.csv'), index=True, float_format='%.6e')
    # Save dataset info
    save_types_of_col(types_of_col, dataset_folder)


In [157]:
taxonomy_class_options = {
    'class': ['Actinopterygii', 'Aves', 'Mammalia', 'Reptilia', 'Amphibia'],
}
ecocode_include_other = {
    'class': True
}

def encode_taxonomy(df, include_other=True):
    taxonomy_dummy_cols = []
    for col, options in taxonomy_class_options.items():
        if ecocode_include_other[col]:
            df[f"{col}_other"] = True
            taxonomy_dummy_cols.append(f"{col}_other")
        for taxo in options:
            dummy_col = f'{col}_{taxo}'
            taxonomy_dummy_cols.append(dummy_col)
            df[dummy_col] = False
            for species, species_taxo in df[col].items():
                if pd.isna(taxo):
                    continue
                if taxo == species_taxo:
                    df.loc[species, dummy_col] = True
                    df.loc[species, f"{col}_other"] = False
    return df, taxonomy_dummy_cols

In [158]:
eco_code_options = {
    'climate': ['A', 'B', 'C', 'D', 'E'],
    'habitat': ['T', 'F', 'S', 'M'],
    'migrate': ['T'],
    'food': ['P', 'O', 'H', 'C'],
}
eco_code_include_other = {
    'climate': False,
    'habitat': False,
    'migrate': False,
    'food': True,
}

def encode_eco_codes(df):
    eco_code_dummy_cols = []
    for col, options in eco_code_options.items():
        if eco_code_include_other[col]:
            df[f"{col}_other"] = True
            eco_code_dummy_cols.append(f"{col}_other")
        for code in options:
            dummy_col = f'{col}_{code}'
            eco_code_dummy_cols.append(dummy_col)
            df[dummy_col] = False
            for species, eco_codes in df[col].items():
                if pd.isna(eco_codes):
                    continue
                if code in eco_codes:
                    df.loc[species, dummy_col] = True
                    if eco_code_include_other[col]:
                        df.loc[species, f"{col}_other"] = False

    return df, eco_code_dummy_cols

## Bijection input datasets

In [180]:
bijection_input_df_cols = [
    'ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning', 'foetus', 
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb', 'E_Hp',
    ]

# Include metamorphosis datasets
include_metamorphosis = True
if include_metamorphosis:
    bijection_input_df_cols.extend(['Wwj', 'E_Hj'])
    
# Include weaning datasets
include_weaning = True
if include_weaning:
    bijection_input_df_cols.extend(['Wwx', 'E_Hx'])
    
# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    bijection_input_df_cols.extend(taxonomy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    bijection_input_df_cols.extend(ecocode_cols)

# Copy columns
bijection_input_df = df[bijection_input_df_cols].copy()
bijection_input_df.index.name = 'species'

# Drop species with 'stf' model (too few samples to train model)
bijection_input_df = bijection_input_df[df['model'] != 'stf']
bijection_input_df.drop(columns=['foetus'], inplace=True)

# Encode taxonomy
if include_taxonomy:
    bijection_input_df, taxonomy_dummy_cols = encode_taxonomy(bijection_input_df, include_other=True)
    bijection_input_df.drop(columns=taxonomy_cols, inplace=True)

# Encode eco-codes
if include_eco_codes:
    bijection_input_df, eco_code_dummy_cols = encode_eco_codes(bijection_input_df)
    bijection_input_df.drop(columns=ecocode_cols, inplace=True)

# Drop rows with missing data
bijection_input_df = drop_species_with_missing_values(bijection_input_df)

bijection_input_df

Unnamed: 0_level_0,ab,ap,am,d_V,Wwb,Wwp,Wwi,Ri,T_typical,metamorphosis,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Amphiura_filiformis,3.537313,3964.986695,33041.555790,0.09,5.236000e-07,4.775600,5.846500,502.254805,284.15,True,...,False,False,False,True,False,False,True,False,False,False
Argopecten_purpuratus,4.839344,92.893264,2678.306724,0.09,1.482222e-06,1.777778,213.777778,608488.755352,289.15,True,...,False,False,False,True,False,False,True,False,False,False
Biomphalaria_glabrata,13.000000,50.200000,360.000000,0.15,3.879400e-04,0.440000,2.350000,137.000000,293.15,True,...,False,True,False,False,False,False,False,False,True,False
Brachidontes_pharaonis,1.100000,183.100000,1460.000000,0.09,2.053401e-08,0.084000,1.545600,1208.408219,293.15,True,...,False,False,False,True,False,False,True,False,False,False
Branchiostoma_floridae,0.909731,15.283481,730.513987,0.06,7.221500e-05,0.026300,0.421800,687.016278,303.65,True,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Umbra_limi,11.548971,613.745335,3010.981816,0.20,1.800000e-03,0.519669,14.650000,2.490882,286.15,False,...,False,True,False,False,False,False,False,False,False,True
Varanus_bengalensis,44.293739,621.694263,6235.925664,0.30,7.800000e+01,300.000000,7200.000000,0.086595,298.15,False,...,True,False,False,False,False,False,False,False,False,True
Varanus_komodoensis,22.408378,1514.304606,18940.255192,0.30,9.000000e+01,18000.000000,87000.000000,0.151998,298.15,False,...,True,False,False,False,False,False,False,False,False,True
Xantusia_vigilis,56.949093,1334.449093,3978.500000,0.30,2.300000e-01,0.950000,1.300000,0.005479,293.15,False,...,True,False,False,False,False,False,False,False,False,True


In [182]:
bijection_input_df[['weaning', 'metamorphosis']].value_counts()/len(bijection_input_df)

weaning  metamorphosis
False    False            0.849398
         True             0.150602
Name: count, dtype: float64

In [161]:
dataset_name = 'bijection_input'
types_of_col = {
 'ab': ['input', 'log', 'scale'],
 'ap': ['input', 'log', 'scale'],
 'ap': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
# 'Wwbjx': ['input', 'log', 'scale'],
 'Wwp': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
# 'E_G': ['output', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb': ['output', 'log', 'scale'],
# 'E_Hbjx': ['output', 'log', 'scale'],
 'E_Hp': ['output', 'log', 'scale'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
else:
    bijection_input_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
else:
    bijection_input_df.drop(columns=eco_code_dummy_cols, inplace=True)

split_and_save_dataset(df=bijection_input_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['model'])


Train set size: 238
Validation set size: 51
Test set size: 52


## No weight at puberty 

In [195]:
nwp_df_cols = [
    'ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning', 'foetus', 
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb', 'E_Hp',
    ]

# Include metamorphosis datasets
include_metamorphosis = False
if include_metamorphosis:
    nwp_df_cols.extend(['aj', 'Wwj', 'E_Hj'])
    
# Include weaning datasets
include_weaning = False
if include_weaning:
    nwp_df_cols.extend(['ax', 'Wwx', 'E_Hx'])
    
# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nwp_df_cols.extend(taxonomy_cols)
# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nwp_df_cols.extend(ecocode_cols)

# Copy columns
nwp_df = df[nwp_df_cols].copy()
nwp_df.index.name = 'species'

# Drop species with 'stf' model (too few samples to train model)
nwp_df = nwp_df[df['model'] != 'stf']
nwp_df.drop(columns=['foetus'], inplace=True)

# Encode taxonomy
if include_taxonomy:
    nwp_df, taxonomy_dummy_cols = encode_taxonomy(nwp_df, include_other=True)
    nwp_df.drop(columns=taxonomy_cols, inplace=True)

# Encode eco-codes
if include_eco_codes:
    nwp_df, eco_code_dummy_cols = encode_eco_codes(nwp_df)
    nwp_df.drop(columns=ecocode_cols, inplace=True)

# Create a single intermediate maturity level (weaning or metamorphosis or birth if missing)
# nwp_df['ax'] = nwp_df['ax'].fillna(nwp_df['aj']).fillna(nwp_df['ab'])
# nwp_df['Wwx'] = nwp_df['Wwx'].fillna(nwp_df['Wwj']).fillna(nwp_df['Wwb'])
# nwp_df['E_Hx'] = nwp_df['E_Hx'].fillna(nwp_df['E_Hj']).fillna(nwp_df['E_Hb'])
# nwp_df.rename(columns={'Wwx':'Wwbjx', 'E_Hx':'E_Hbjx', 'ax': 'abjx'}, inplace=True)
# nwp_df.drop(columns=['Wwj', 'E_Hj', 'aj'], inplace=True)

# Drop rows with missing data
#nwp_df.dropna(how='any', axis=0, inplace=True)
nwp_df = drop_species_with_missing_values(nwp_df)

nwp_df

Unnamed: 0_level_0,ab,ap,am,d_V,Wwb,Wwi,Ri,T_typical,metamorphosis,weaning,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,15.680398,1336.452355,6603.859788,0.20,2.800000e-03,6050.000000,2271.398921,291.15,True,False,...,False,True,False,False,False,False,False,False,True,True
Achoerodus_viridis,2.195824,1095.000000,12775.000000,0.20,2.604167e-04,3600.000000,9589.041096,293.15,True,False,...,False,False,False,True,False,False,True,False,False,True
Actinonaias_ligamentina,22.479227,2201.146591,15679.269617,0.09,5.300000e-07,1017.000000,2663.703640,288.15,True,False,...,False,True,False,False,False,False,True,False,False,False
Aegopinella_epipedostoma,30.154611,343.762564,880.514638,0.15,5.128976e-04,0.166667,0.045428,294.65,True,False,...,True,False,False,False,False,False,False,False,False,True
Aequipecten_opercularis,3.211318,960.102626,9568.913082,0.09,1.640000e-07,107.000000,6270.304630,283.15,True,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_leucophrys,2.755957,7.134145,590.386494,0.28,2.700000e+00,25.800000,0.135166,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True
Zonotrichia_querula,2.653885,6.046981,519.362555,0.28,3.100000e+00,33.700000,0.090110,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True
Zootoca_vivipara,38.066843,957.078491,4600.151458,0.30,1.900000e-01,5.000000,0.018635,286.85,False,False,...,True,False,False,False,False,False,False,False,False,True
Zosterops_lateralis,2.449740,6.827928,532.679544,0.28,1.100000e+00,12.500000,0.067583,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True


In [196]:
# nwp_df[['weaning', 'metamorphosis', 'foetus']].value_counts()
nwp_df[['weaning', 'metamorphosis']].value_counts()

weaning  metamorphosis
False    False            1355
         True              194
Name: count, dtype: int64

In [197]:
dataset_name = 'no_pub_weight'
types_of_col = {
 'ab': ['input', 'log', 'scale'],
# 'ax': ['input', 'log', 'scale'],
# 'aj': ['input', 'log', 'scale'],
 'ap': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
# 'Wwx': ['input', 'log', 'scale'],
# 'Wwj': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input', 'boolean'],
 'weaning': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
# 'E_G': ['output', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb': ['output', 'log', 'scale'],
# 'E_Hx': ['output', 'log', 'scale'],
# 'E_Hj': ['output', 'log', 'scale'],
 'E_Hp': ['output', 'log', 'scale'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nwp_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nwp_df.drop(columns=eco_code_dummy_cols, inplace=True)

features_targets = {}
for par in parameter_cols:
    if par in types_of_col:
        feature_list = [f for f in types_of_col if f not in parameter_cols]
        if par == 'E_Hj':
            feature_list.remove('ax')
            feature_list.remove('Wwx')
        elif par == 'E_Hx':
            feature_list.remove('aj')
            feature_list.remove('Wwj')
        features_targets[par] = feature_list

    
split_and_save_dataset(df=nwp_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['model'])


Train set size: 1084
Validation set size: 232
Test set size: 233


In [198]:
df.loc[nwp_df.index, :].to_csv(f'{processed_dataset_save_folder}/no_pub_weight/no_dummies.csv', index=True, float_format='%.6e')

## Ratios

In [98]:
bi_ratio_df_cols = [
    'ab_m', 'ap_m', 'am', 'd_V', 'Wwb_i', 'Wwbjx_i', 'Wwp_i', 'Wwi', 'Ri',
    'T_typical', 'acceleration', 'weaning', 'foetus',
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_bjx', 'E_Hbjx_p', 'E_Hp',
    ]
bi_ratio_df = pd.DataFrame(index=bijection_input_df.index, columns=bi_ratio_df_cols)

In [99]:
for col in bi_ratio_df.columns:
    if col in bijection_input_df.columns:
        bi_ratio_df[col] = bijection_input_df[col].copy()
bi_ratio_df['ab_m'] = bijection_input_df['ab'] / bijection_input_df['am'] 
bi_ratio_df['ap_m'] = bijection_input_df['ap'] / bijection_input_df['am'] 
bi_ratio_df['Wwb_i'] = bijection_input_df['Wwb'] / bijection_input_df['Wwi'] 
bi_ratio_df['Wwbjx_i'] = bijection_input_df['Wwbjx'] / bijection_input_df['Wwi'] 
bi_ratio_df['Wwp_i'] = bijection_input_df['Wwp'] / bijection_input_df['Wwi'] 
bi_ratio_df['E_Hb_bjx'] = bijection_input_df['E_Hb'] / bijection_input_df['E_Hbjx'] 
bi_ratio_df['E_Hbjx_p'] = bijection_input_df['E_Hbjx'] / bijection_input_df['E_Hp'] 
bi_ratio_df

Unnamed: 0_level_0,ab_m,ap_m,am,d_V,Wwb_i,Wwbjx_i,Wwp_i,Wwi,Ri,T_typical,...,weaning,foetus,p_Am,kap,v,p_M,h_a,E_Hb_bjx,E_Hbjx_p,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,0.002374,0.202374,6603.859788,0.20,4.628099e-07,4.628099e-07,0.052727,6050.0,2271.398921,291.15,...,False,,401.092069,0.66367,0.016416,23.3484,4.116000e-08,0.999818,2.318259e-06,236600.00
Achoerodus_viridis,0.000172,0.085714,12775.000000,0.20,7.233796e-08,7.233796e-08,0.019444,3600.0,9589.041096,293.15,...,False,,148.179353,0.52154,0.099329,5.6975,2.801000e-10,0.999674,9.874396e-07,310500.00
Acipenser_persicus,0.000960,0.255744,14235.000000,0.20,1.395134e-06,1.395134e-06,0.400000,70000.0,301.300000,293.15,...,False,,562.047321,0.56571,0.043759,12.3264,8.726000e-10,1.000000,1.448381e-06,36130000.00
Acipenser_ruthenus,0.000570,0.181586,22548.134817,0.20,2.187500e-07,2.187500e-07,0.045625,6400.0,184.279721,288.15,...,False,,221.446235,0.78040,0.051789,11.8758,1.871000e-08,1.000000,3.601460e-06,274000.00
Actinonaias_ligamentina,0.001434,0.140386,15679.269617,0.09,5.211406e-10,5.211406e-10,0.024582,1017.0,2663.703640,288.15,...,False,,35.950640,0.98335,0.003881,14.8638,9.080000e-10,0.418423,1.580427e-07,57.63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Varanus_komodoensis,0.001183,0.079952,18940.255192,0.30,1.034483e-03,1.034483e-03,0.206897,87000.0,0.151998,298.15,...,False,,634.343602,0.97120,0.127960,17.0958,6.071000e-11,1.000000,5.130996e-03,2481000.00
Wallabia_bicolor,0.027905,0.141408,1094.805523,0.30,4.170940e-05,2.632479e-02,0.478632,14625.0,0.021129,309.65,...,True,,543.141214,0.70265,0.028538,23.8679,1.235000e-11,0.001428,4.795357e-02,8185000.00
Xantusia_vigilis,0.014314,0.335415,3978.500000,0.30,1.769231e-01,1.769231e-01,0.730769,1.3,0.005479,293.15,...,False,,64.250393,0.41186,0.017897,26.8706,4.052000e-09,1.000000,1.194463e-01,15170.00
Xiphias_gladius,0.000473,0.409091,4015.000000,0.20,2.555576e-09,2.555576e-09,0.113846,650000.0,41095.890411,293.15,...,False,,1651.206087,0.89926,0.038368,50.1257,5.065000e-08,0.434876,1.639478e-08,12260000.00


In [100]:
split_and_save_dataset(df=bi_ratio_df, dataset_name='ratio_bijection_input')

Train set size: 200
Validation set size: 43
Test set size: 44


## Ratios (No weight at puberty)

In [101]:
nwp_ratio_df_cols = [
    'ab_m', 'abjx_m', 'ap_m', 'am', 'd_V', 'Wwb_i', 'Wwbjx_i', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning',
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_bjx', 'E_Hbjx_p', 'E_Hp',
    ]
nwp_ratio_df = pd.DataFrame(index=nwp_df.index, columns=nwp_ratio_df_cols)

In [102]:
for col in nwp_ratio_df.columns:
    if col in nwp_df.columns:
        nwp_ratio_df[col] = nwp_df[col].copy()
nwp_ratio_df['ab_m'] = nwp_df['ab'] / nwp_df['am'] 
nwp_ratio_df['abjx_m'] = nwp_df['abjx'] / nwp_df['am'] 
nwp_ratio_df['ap_m'] = nwp_df['ap'] / nwp_df['am'] 
nwp_ratio_df['Wwb_i'] = nwp_df['Wwb'] / nwp_df['Wwi'] 
nwp_ratio_df['Wwbjx_i'] = nwp_df['Wwbjx'] / nwp_df['Wwi'] 
# Guarantees maturity increases are respected
nwp_ratio_df['E_Hb_bjx'] = nwp_df['E_Hb'] / nwp_df['E_Hbjx'] 
nwp_ratio_df['E_Hbjx_p'] = nwp_df['E_Hbjx'] / nwp_df['E_Hp'] 
nwp_ratio_df

Unnamed: 0_level_0,ab_m,abjx_m,ap_m,am,d_V,Wwb_i,Wwbjx_i,Wwi,Ri,T_typical,metamorphosis,weaning,p_Am,kap,v,p_M,h_a,E_Hb_bjx,E_Hbjx_p,E_Hp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abramis_brama,0.002374,0.002374,0.202374,6603.859788,0.20,4.628099e-07,4.628099e-07,6050.0,2271.398921,291.15,True,False,401.092069,0.66367,0.016416,23.3484,4.116000e-08,0.999818,0.000002,236600.0
Abroscopus_superciliaris,0.005059,0.008894,0.016566,443.899620,0.28,1.200000e-01,1.200000e-01,6.5,0.072088,314.75,False,False,663.006069,0.95822,0.034790,533.0640,4.147000e-12,0.152091,0.428940,1472.0
Acanthis_flammea,0.003770,0.006240,0.011180,541.557536,0.28,9.154930e-02,9.154930e-02,14.2,0.225276,314.75,False,False,1294.709603,0.92908,0.037653,907.6351,2.215000e-12,0.157196,0.407867,3305.0
Acanthis_hornemanni,0.007439,0.012677,0.023153,301.851741,0.28,1.023622e-01,1.023622e-01,12.7,0.101374,314.75,False,False,709.593021,0.95793,0.047468,432.1025,8.946000e-13,0.160032,0.385888,3203.0
Acanthisitta_chloris,0.017095,0.017095,0.023671,468.816211,0.28,1.428571e-01,1.428571e-01,7.0,0.127982,314.15,False,False,3037.650231,0.98132,0.026069,4238.5777,1.013000e-14,1.000000,0.025285,2759.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_querula,0.005110,0.007288,0.011643,519.362555,0.28,9.198813e-02,9.198813e-02,33.7,0.090110,314.75,False,False,1476.816557,0.97417,0.041453,834.8830,3.237000e-12,0.230976,0.380009,2151.0
Zootoca_vivipara,0.008275,0.008275,0.208054,4600.151458,0.30,3.800000e-02,3.800000e-02,5.0,0.018635,286.85,False,False,516.242968,0.72515,0.023765,344.4866,1.533000e-07,1.000000,0.035514,6375.0
Zosterops_lateralis,0.004599,0.007339,0.012818,532.679544,0.28,8.800000e-02,8.800000e-02,12.5,0.067583,314.75,False,False,872.827510,0.97558,0.035919,617.3516,2.674000e-12,0.168104,0.380984,1199.0
Zosterops_virens,0.004599,0.006654,0.010763,532.679544,0.28,7.627119e-02,7.627119e-02,11.8,0.045055,314.75,False,False,940.612495,0.99060,0.031028,734.2559,3.005000e-12,0.207692,0.343962,294.8


In [103]:
dataset_name = 'ratio_no_pub_weight'
split_and_save_dataset(df=nwp_ratio_df, dataset_name=dataset_name, stratify=df['model'])
types_of_col = {
 'ab_m': ['input', 'log', 'scale', 'bounded01'],
 'abjx_m': ['input', 'log', 'scale', 'bounded01'],
 'ap_m': ['input', 'log', 'scale', 'bounded01'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb_i': ['input', 'log', 'scale', 'bounded01'],
 'Wwbjx_i': ['input', 'log', 'scale', 'bounded01'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
 #'E_G': ['output', 'log', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb_bjx': ['output', 'bounded01'],
 'E_Hbjx_p': ['output', 'bounded01'],
 'E_Hp': ['output', 'log', 'scale'],
}
save_types_of_col(types_of_col, dataset_name=dataset_name)


Train set size: 1555
Validation set size: 333
Test set size: 334


In [104]:
nwp_ratio_df.describe()

Unnamed: 0,ab_m,abjx_m,ap_m,am,d_V,Wwb_i,Wwbjx_i,Wwi,Ri,T_typical,p_Am,kap,v,p_M,h_a,E_Hb_bjx,E_Hbjx_p,E_Hp
count,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0,2222.0
mean,0.02581661,0.03423379,0.110268,3466.170741,0.273334,0.05793432,0.1015287,265048.9,1701.589,307.204991,4362.270242,0.895016,0.054585,2050.683139,3.858895e-06,0.4897781,0.1686322,37349430.0
std,0.04062718,0.05135564,0.129145,8542.026467,0.043508,0.05766202,0.1366605,4403030.0,33557.55,9.133495,8497.377954,0.140749,0.056503,5368.758548,8.266893e-05,0.4382295,0.1997941,428885200.0
min,7.800613e-07,7.800613e-07,0.000567,3.796606,0.01,8.846154e-11,8.846154e-11,8.05e-08,0.0008753557,272.15,0.120759,0.17929,0.000401,1.2973,8.939999999999999e-44,7.637854e-07,8.479263e-09,3.177e-06
25%,0.004548708,0.005609052,0.014855,551.298354,0.28,0.01185033,0.01923871,27.525,0.02773981,302.15,562.60437,0.86005,0.024727,49.218925,5.3195e-14,0.07263278,0.01090602,5058.75
50%,0.009033719,0.01195357,0.069343,1178.320974,0.28,0.04744041,0.0625,180.5,0.07101325,311.15,1454.483986,0.95636,0.036735,602.67595,2.862e-12,0.2442104,0.07341721,32635.0
75%,0.03113163,0.04714396,0.159115,2559.673231,0.3,0.08655669,0.1165049,2324.0,0.1802209,313.85,4845.988223,0.988565,0.062159,2084.603425,3.685e-10,1.0,0.3204559,600050.0
max,0.6978431,0.8,0.88,126351.863286,0.3,0.54,0.9809524,160000000.0,1300000.0,315.15,216086.54405,0.99998,0.60285,160859.8216,0.00327,1.0,0.9998959,12820000000.0


## Ratios (Only output) (No weight at puberty)

In [200]:
nwp_ratio_output_df_cols = [
    'ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning',
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_p', 'E_Hp',
    ]

# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nwp_ratio_output_df_cols.extend(taxonomy_dummy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nwp_ratio_output_df_cols.extend(eco_code_dummy_cols)

nwp_ratio_output_df = pd.DataFrame(index=nwp_df.index, columns=nwp_ratio_output_df_cols)

# Copy columns
for col in nwp_ratio_output_df.columns:
    if col in nwp_df.columns:
        nwp_ratio_output_df[col] = nwp_df[col].copy()

# Compute ratio columns
nwp_ratio_output_df['E_Hb_p'] = nwp_df['E_Hb'] / nwp_df['E_Hp'] 
nwp_ratio_output_df

Unnamed: 0_level_0,ab,ap,am,d_V,Wwb,Wwi,Ri,T_typical,metamorphosis,weaning,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abramis_brama,15.680398,1336.452355,6603.859788,0.20,2.800000e-03,6050.000000,2271.398921,291.15,True,False,...,False,True,False,False,False,False,False,False,True,True
Achoerodus_viridis,2.195824,1095.000000,12775.000000,0.20,2.604167e-04,3600.000000,9589.041096,293.15,True,False,...,False,False,False,True,False,False,True,False,False,True
Actinonaias_ligamentina,22.479227,2201.146591,15679.269617,0.09,5.300000e-07,1017.000000,2663.703640,288.15,True,False,...,False,True,False,False,False,False,True,False,False,False
Aegopinella_epipedostoma,30.154611,343.762564,880.514638,0.15,5.128976e-04,0.166667,0.045428,294.65,True,False,...,True,False,False,False,False,False,False,False,False,True
Aequipecten_opercularis,3.211318,960.102626,9568.913082,0.09,1.640000e-07,107.000000,6270.304630,283.15,True,False,...,False,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zonotrichia_leucophrys,2.755957,7.134145,590.386494,0.28,2.700000e+00,25.800000,0.135166,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True
Zonotrichia_querula,2.653885,6.046981,519.362555,0.28,3.100000e+00,33.700000,0.090110,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True
Zootoca_vivipara,38.066843,957.078491,4600.151458,0.30,1.900000e-01,5.000000,0.018635,286.85,False,False,...,True,False,False,False,False,False,False,False,False,True
Zosterops_lateralis,2.449740,6.827928,532.679544,0.28,1.100000e+00,12.500000,0.067583,314.75,False,False,...,True,False,False,False,False,False,False,False,True,True


In [201]:
nwp_ratio_output_df.columns

Index(['ab', 'ap', 'am', 'd_V', 'Wwb', 'Wwi', 'Ri', 'T_typical',
       'metamorphosis', 'weaning', 'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_p',
       'E_Hp', 'class_other', 'class_Actinopterygii', 'class_Aves',
       'class_Mammalia', 'class_Reptilia', 'class_Amphibia', 'climate_A',
       'climate_B', 'climate_C', 'climate_D', 'climate_E', 'habitat_T',
       'habitat_F', 'habitat_S', 'habitat_M', 'migrate_T', 'food_other',
       'food_P', 'food_O', 'food_H', 'food_C'],
      dtype='object')

In [202]:
dataset_name = 'ratio_output_no_pub_weight'
types_of_col = {
 'ab': ['input', 'log', 'scale'],
 'ap': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb_p': ['output', 'log', 'bounded01'],
 'E_Hp': ['output', 'log', 'scale'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']


split_and_save_dataset(df=nwp_ratio_output_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['model'])


Train set size: 1084
Validation set size: 232
Test set size: 233


## No age at puberty

In [214]:
nap_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning', 'foetus', 
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb', 'E_Hp',
    ]

# Include metamorphosis datasets
include_metamorphosis = False
if include_metamorphosis:
    nap_df_cols.extend(['aj', 'Wwj', 'E_Hj'])
    
# Include weaning datasets
include_weaning = False
if include_weaning:
    nap_df_cols.extend(['ax', 'Wwx', 'E_Hx'])
    
# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nap_df_cols.extend(taxonomy_cols)
# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nap_df_cols.extend(ecocode_cols)

# Copy columns
nap_df = df[nap_df_cols].copy()
nap_df.index.name = 'species'

# Drop species with 'stf' model (too few samples to train model)
nap_df = nap_df[df['model'] != 'stf']
nap_df.drop(columns=['foetus'], inplace=True)

# Encode taxonomy
if include_taxonomy:
    nap_df, taxonomy_dummy_cols = encode_taxonomy(nap_df, include_other=True)
    nap_df.drop(columns=taxonomy_cols, inplace=True)

# Encode eco-codes
if include_eco_codes:
    nap_df, eco_code_dummy_cols = encode_eco_codes(nap_df)
    nap_df.drop(columns=ecocode_cols, inplace=True)

# Create a single intermediate maturity level (weaning or metamorphosis or birth if missing)
# nap_df['ax'] = nap_df['ax'].fillna(nap_df['aj']).fillna(nap_df['ab'])
# nap_df['Wwx'] = nap_df['Wwx'].fillna(nap_df['Wwj']).fillna(nap_df['Wwb'])
# nap_df['E_Hx'] = nap_df['E_Hx'].fillna(nap_df['E_Hj']).fillna(nap_df['E_Hb'])
# nap_df.rename(columns={'Wwx':'Wwbjx', 'E_Hx':'E_Hbjx', 'ax': 'abjx'}, inplace=True)
# nap_df.drop(columns=['Wwj', 'E_Hj', 'aj'], inplace=True)

# Drop rows with missing data
#nap_df.dropna(how='any', axis=0, inplace=True)
nap_df = drop_species_with_missing_values(nap_df)

nap_df

Unnamed: 0_level_0,ab,am,d_V,Wwb,Wwp,Wwi,Ri,T_typical,metamorphosis,weaning,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbottina_rivularis,18.092767,1761.029277,0.2,0.000520,8.700000e-01,79.0,7.772727,291.15,True,False,...,False,True,False,False,False,False,False,False,False,True
Ablennes_hians,1.954569,2675.316655,0.2,0.000520,1.440000e+02,4200.0,73.373736,301.05,True,False,...,False,False,False,True,False,False,False,False,False,True
Abramis_brama,15.680398,6603.859788,0.2,0.002800,3.190000e+02,6050.0,2271.398921,291.15,True,False,...,False,True,False,False,False,False,False,False,True,True
Acanthoclinus_littoreus,20.552903,4233.589854,0.2,0.000520,1.010000e+01,49.0,66.761244,290.45,True,False,...,False,False,False,True,False,False,True,False,False,True
Acanthocybium_solandri,2.176640,2269.924535,0.2,0.000600,6.000000e+03,112600.0,26432.596797,297.85,True,False,...,False,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sotalia_fluviatilis,71.629172,2815.260801,0.3,3978.587963,4.309023e+04,55000.0,0.006216,310.35,False,True,...,False,False,False,True,False,False,False,False,False,True
Theropithecus_gelada,34.667783,2710.002225,0.3,464.000000,1.343863e+04,11000.0,0.006642,311.15,False,True,...,True,False,False,False,False,False,False,False,True,False
Ursus_maritimus,217.400207,3575.744543,0.3,665.000000,1.540000e+05,185000.0,0.011705,309.95,False,True,...,False,False,False,True,True,False,False,False,False,True
Wallabia_bicolor,30.551071,1094.805523,0.3,0.610000,7.000000e+03,14625.0,0.021129,309.65,False,True,...,True,False,False,False,False,False,False,False,True,False


In [216]:
nap_df.isna().sum()

ab                      0
am                      0
d_V                     0
Wwb                     0
Wwp                     0
Wwi                     0
Ri                      0
T_typical               0
metamorphosis           0
weaning                 0
p_Am                    0
kap                     0
v                       0
p_M                     0
h_a                     0
E_Hb                    0
E_Hp                    0
class_other             0
class_Actinopterygii    0
class_Aves              0
class_Mammalia          0
class_Reptilia          0
class_Amphibia          0
climate_A               0
climate_B               0
climate_C               0
climate_D               0
climate_E               0
habitat_T               0
habitat_F               0
habitat_S               0
habitat_M               0
migrate_T               0
food_other              0
food_P                  0
food_O                  0
food_H                  0
food_C                  0
dtype: int64

In [215]:
dataset_name = 'no_pub_age'
types_of_col = {
 'ab': ['input', 'log', 'scale'],
# 'ax': ['input', 'log', 'scale'],
# 'aj': ['input', 'log', 'scale'],
# 'ap': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
# 'Wwx': ['input', 'log', 'scale'],
# 'Wwj': ['input', 'log', 'scale'],
 'Wwp': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input', 'boolean'],
 'weaning': ['input', 'boolean'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
# 'E_G': ['output', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb': ['output', 'log', 'scale'],
# 'E_Hx': ['output', 'log', 'scale'],
# 'E_Hj': ['output', 'log', 'scale'],
 'E_Hp': ['output', 'log', 'scale'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=taxonomy_dummy_cols, inplace=True)

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']
#else:
#    nap_df.drop(columns=eco_code_dummy_cols, inplace=True)

features_targets = {}
for par in parameter_cols:
    if par in types_of_col:
        feature_list = [f for f in types_of_col if f not in parameter_cols]
        if par == 'E_Hj':
            feature_list.remove('ax')
            feature_list.remove('Wwx')
        elif par == 'E_Hx':
            feature_list.remove('aj')
            feature_list.remove('Wwj')
        features_targets[par] = feature_list

    
split_and_save_dataset(df=nap_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['model'])


Train set size: 745
Validation set size: 160
Test set size: 160


In [208]:
df.loc[nap_df.index, :].to_csv(f'{processed_dataset_save_folder}/no_pub_age/no_dummies.csv', index=True, float_format='%.6e')

## Ratios (Only output) (No age at puberty)

In [218]:
nap_ratio_output_df_cols = [
    'ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri',
    'T_typical', 'metamorphosis', 'weaning',
    'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_p', 'E_Hp',
    ]

# Include taxonomy columns
include_taxonomy = True
if include_taxonomy:
    nap_ratio_output_df_cols.extend(taxonomy_dummy_cols)

# Include eco-code columns
include_eco_codes = True
if include_eco_codes:
    nap_ratio_output_df_cols.extend(eco_code_dummy_cols)

nap_ratio_output_df = pd.DataFrame(index=nap_df.index, columns=nap_ratio_output_df_cols)

# Copy columns
for col in nap_ratio_output_df.columns:
    if col in nap_df.columns:
        nap_ratio_output_df[col] = nap_df[col].copy()

# Compute ratio columns
nap_ratio_output_df['E_Hb_p'] = nap_df['E_Hb'] / nap_df['E_Hp'] 
nap_ratio_output_df

Unnamed: 0_level_0,ab,am,d_V,Wwb,Wwp,Wwi,Ri,T_typical,metamorphosis,weaning,...,habitat_T,habitat_F,habitat_S,habitat_M,migrate_T,food_other,food_P,food_O,food_H,food_C
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbottina_rivularis,18.092767,1761.029277,0.2,0.000520,8.700000e-01,79.0,7.772727,291.15,True,False,...,False,True,False,False,False,False,False,False,False,True
Ablennes_hians,1.954569,2675.316655,0.2,0.000520,1.440000e+02,4200.0,73.373736,301.05,True,False,...,False,False,False,True,False,False,False,False,False,True
Abramis_brama,15.680398,6603.859788,0.2,0.002800,3.190000e+02,6050.0,2271.398921,291.15,True,False,...,False,True,False,False,False,False,False,False,True,True
Acanthoclinus_littoreus,20.552903,4233.589854,0.2,0.000520,1.010000e+01,49.0,66.761244,290.45,True,False,...,False,False,False,True,False,False,True,False,False,True
Acanthocybium_solandri,2.176640,2269.924535,0.2,0.000600,6.000000e+03,112600.0,26432.596797,297.85,True,False,...,False,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sotalia_fluviatilis,71.629172,2815.260801,0.3,3978.587963,4.309023e+04,55000.0,0.006216,310.35,False,True,...,False,False,False,True,False,False,False,False,False,True
Theropithecus_gelada,34.667783,2710.002225,0.3,464.000000,1.343863e+04,11000.0,0.006642,311.15,False,True,...,True,False,False,False,False,False,False,False,True,False
Ursus_maritimus,217.400207,3575.744543,0.3,665.000000,1.540000e+05,185000.0,0.011705,309.95,False,True,...,False,False,False,True,True,False,False,False,False,True
Wallabia_bicolor,30.551071,1094.805523,0.3,0.610000,7.000000e+03,14625.0,0.021129,309.65,False,True,...,True,False,False,False,False,False,False,False,True,False


In [219]:
nap_ratio_output_df.columns

Index(['ab', 'am', 'd_V', 'Wwb', 'Wwp', 'Wwi', 'Ri', 'T_typical',
       'metamorphosis', 'weaning', 'p_Am', 'kap', 'v', 'p_M', 'h_a', 'E_Hb_p',
       'E_Hp', 'class_other', 'class_Actinopterygii', 'class_Aves',
       'class_Mammalia', 'class_Reptilia', 'class_Amphibia', 'climate_A',
       'climate_B', 'climate_C', 'climate_D', 'climate_E', 'habitat_T',
       'habitat_F', 'habitat_S', 'habitat_M', 'migrate_T', 'food_other',
       'food_P', 'food_O', 'food_H', 'food_C'],
      dtype='object')

In [220]:
dataset_name = 'ratio_output_no_pub_age'
types_of_col = {
 'ab': ['input', 'log', 'scale'],
 'am': ['input', 'log', 'scale'],
 'd_V': ['input'],
 'Wwb': ['input', 'log', 'scale'],
 'Wwp': ['input', 'log', 'scale'],
 'Wwi': ['input', 'log', 'scale'],
 'Ri': ['input', 'log', 'scale'],
 'T_typical': ['input', 'scale'],
 'metamorphosis': ['input'],
 'weaning': ['input'],
 'p_Am': ['output', 'log', 'scale'],
 'kap': ['output', 'bounded01'],
 'v': ['output', 'log', 'scale'],
 'p_M': ['output', 'log', 'scale'],
 'h_a': ['output', 'log', 'scale'],
 'E_Hb_p': ['output', 'log', 'bounded01'],
 'E_Hp': ['output', 'log', 'scale'],
}

if include_taxonomy:
    dataset_name += '_taxonomy'
    for col in taxonomy_dummy_cols:
        types_of_col[col] = ['input', 'boolean']

if include_eco_codes:
    dataset_name += '_ecocodes'
    for col in eco_code_dummy_cols:
        types_of_col[col] = ['input', 'boolean']


split_and_save_dataset(df=nap_ratio_output_df, dataset_name=dataset_name, types_of_col=types_of_col, stratify=df['model'])


Train set size: 745
Validation set size: 160
Test set size: 160
