In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
import pickle
import torch

In [2]:
import sys
import os

# Add the 'project' directory to the path
sys.path.append(os.path.abspath('..'))

from project_code.data.load_data import load_col_types, load_data
from project_code.data.prepare_data_sklearn import get_features_targets
from project_code.utils.results import get_best_model_file
from project_code.inference.parameters import get_core_parameter_predictions, convert_output_to_parameter_predictions, PARAMETER_COLS

# Loading data and models

In [3]:
# Folder handed to load_data / load_col_types as `datasets_folder`.
datasets_folder = '../data/processed/'

# Dataset associated with each model type. It selects both the results folder that is
# searched for the best saved model and the data the model is evaluated on.
dataset_of_model = {
    **dict.fromkeys(('SRTaxo1NN', 'Taxo1NN'), 'biologist_no_pub_age'),
    **dict.fromkeys(
        ('RandomForestRegressor', 'MultiTaskElasticNet', 'MLP', 'DEBNetHC', 'DEBNetSC'),
        'final_taxonomy_ecocodes',
    ),
}


## Loading best models

In [4]:
def load_model(model_file, results_folder):
    """Load a saved model from ``{results_folder}/models/{model_file}``.

    Args:
        model_file: File name of the saved model. Its extension selects the loader:
            '.pkl' files are read with ``pickle.load``, '.pth' files with ``torch.load``.
        results_folder: Folder that contains the ``models`` sub-folder.

    Returns:
        The deserialized model. Models read from '.pth' files are switched to
        evaluation mode with ``model.eval()`` before being returned.

    Raises:
        ValueError: If ``model_file`` ends with neither '.pkl' nor '.pth'.
    """
    model_path = f"{results_folder}/models/{model_file}"

    # NOTE: both loaders unpickle arbitrary Python objects (torch.load is called with
    # weights_only=False), so only files from a trusted source should be loaded here.
    if model_file.endswith('.pkl'):
        with open(model_path, 'rb') as f:
            return pickle.load(f)

    if model_file.endswith('.pth'):
        model = torch.load(model_path, weights_only=False)
        model.eval()
        return model

    # Previously an unknown extension fell through to `return model` with `model` unbound.
    raise ValueError(
        f"Unsupported model file extension for '{model_file}': expected '.pkl' or '.pth'"
    )

In [5]:
# For each model type, ask get_best_model_file for the best saved model under the
# 'GEF' metric and load it; model types without a model file are left out.
best_models = {}
metric = 'GEF'
for mt, dataset_name in dataset_of_model.items():
    results_folder = '../results/' + dataset_name
    model_file = get_best_model_file(results_folder=results_folder, model_type=mt, metric=metric)
    if model_file is None:
        continue
    best_models[mt] = load_model(model_file, results_folder)
print(best_models.keys())

dict_keys(['SRTaxo1NN', 'Taxo1NN', 'RandomForestRegressor', 'MultiTaskElasticNet', 'MLP'])


## Loading data

In [6]:
# Per-dataset containers, keyed by dataset name:
#   all_dfs       -> value returned by load_data (indexed by split below)
#   all_col_types -> value returned by load_col_types
#   all_data      -> value returned by get_features_targets
all_dfs = {}
all_col_types = {}
all_data = {}
# sorted() keeps the iteration (and dict insertion) order reproducible: the order of a
# set of strings changes between kernel sessions because of hash randomisation.
for dataset_name in sorted(set(dataset_of_model.values())):
    all_dfs[dataset_name] = load_data(dataset_name=dataset_name, data_split='train_test', datasets_folder=datasets_folder)
    all_col_types[dataset_name] = load_col_types(dataset_name=dataset_name, datasets_folder=datasets_folder)
    if 'biologist' not in dataset_name:
        all_data[dataset_name] = get_features_targets(all_dfs[dataset_name], all_col_types[dataset_name])
    else:
        # The 'biologist' datasets are encoded with a loaded Taxo1NN-type model first.
        encoder_model = best_models.get('Taxo1NN', best_models.get('SRTaxo1NN'))
        if encoder_model is None:
            raise KeyError(
                f"Dataset '{dataset_name}' needs a loaded 'Taxo1NN' or 'SRTaxo1NN' model to encode its data"
            )
        # NOTE(review): other cells access the fitted estimator as `regressor_`, while this
        # one uses `regressor` - confirm that `regressor` carries the trained encoders.
        encoded_dfs = {
            split: encoder_model.regressor.encode_data(all_dfs[dataset_name][split])
            for split in ('train', 'test')
        }
        all_data[dataset_name] = get_features_targets(data=encoded_dfs, col_types=all_col_types[dataset_name])

# Visualize predictions

In [7]:
# Input columns of the taxonomy dataset whose name contains 'class'.
ecocodes_input_cols = all_col_types['final_taxonomy_ecocodes']['input']['all']
taxonomy_cols = [name for name in ecocodes_input_cols if 'class' in name]

# Grouping options that currently have no hue data attached.
groupings_without_hue = ('climate', 'habitat', 'migrate', 'food')

# 'metamorphosis' column of the biologist dataset, train and test stacked.
metamorphosis_hue = pd.concat(
    [all_dfs['biologist_no_pub_age'][split]['metamorphosis'] for split in ('train', 'test')]
)

# Dummy-encoded class columns collapsed back into a single 'class' column.
class_hue = pd.concat(
    [
        pd.from_dummies(all_dfs['final_taxonomy_ecocodes'][split][taxonomy_cols], sep='_')
        for split in ('train', 'test')
    ]
)['class']

# Series passed as `hue` to seaborn for each grouping (None -> no colouring).
hue_series = {
    'metamorphosis': metamorphosis_hue,
    'class': class_hue,
    **dict.fromkeys(groupings_without_hue),
}

# Matching `hue_order` for each grouping (None -> seaborn default order).
hue_orders = {
    'metamorphosis': [False, True],
    'class': None,
    **dict.fromkeys(groupings_without_hue),
}


In [None]:
def plot_residuals_df(model_type, plot_kind, data_split, groupby, scale):
    """Scatter-plot the predictions of one model, one panel per output column.

    Args:
        model_type: Key of ``best_models`` / ``dataset_of_model``.
        plot_kind: 'actual_vs_predicted' (targets against predictions) or
            'residual_vs_predicted' (residuals against predictions).
        data_split: Key of the data split to plot (e.g. 'train' or 'test').
        groupby: Key of ``hue_series`` / ``hue_orders`` used to colour the points.
        scale: 'model' plots the model outputs as they are; 'parameter' plots the
            outputs converted with ``convert_output_to_parameter_predictions``.
            On the parameter scale residuals are relative: (actual - predicted) / actual.

    Raises:
        ValueError: If ``plot_kind`` or ``scale`` is not one of the supported values.
    """
    if plot_kind not in ('actual_vs_predicted', 'residual_vs_predicted'):
        raise ValueError(f"Unknown plot_kind '{plot_kind}'")
    if scale not in ('model', 'parameter'):
        raise ValueError(f"Unknown scale '{scale}'")
    is_residual_plot = plot_kind == 'residual_vs_predicted'

    # Get data for the model
    dataset_name = dataset_of_model[model_type]
    data = all_data[dataset_name]
    col_types = all_col_types[dataset_name]
    model = best_models[model_type]

    # Get predictions
    X = data[data_split]['input']
    y_true_ps = data[data_split]['output']
    if 'DEBNet' in model_type:
        y_pred_ps = model.predict(torch.tensor(X, dtype=torch.float32))
    elif 'Taxo1NN' in model_type and data_split in ['train', 'val']:
        # Predictions are rebuilt from the train-vs-train distance matrix instead of
        # model.predict (presumably so a training sample is not matched to itself - TODO confirm).
        regressor = model.regressor_
        distance_matrix = regressor._compute_distance_matrix(regressor.train_data, regressor.train_data, data_split='train')
        y_hat, indices = regressor.get_predictions_from_distance_matrix(distance_matrix)
        y_pred_ps = regressor.apply_scaling_relationships(regressor.train_data, y_hat, indices)
    else:
        y_pred_ps = model.predict(X)

    # Build target / prediction frames on the requested scale
    if scale == 'model':
        cols_to_plot = col_types['output']['all']
        target_df = pd.DataFrame(y_true_ps, columns=cols_to_plot)
        pred_df = pd.DataFrame(y_pred_ps, columns=cols_to_plot)
    else:
        cols_to_plot = PARAMETER_COLS
        target_df = convert_output_to_parameter_predictions(y_true_ps, X, col_types)
        pred_df = convert_output_to_parameter_predictions(y_pred_ps, X, col_types)

    # Share the index of the source dataframe so seaborn can align the hue series
    split_index = all_dfs[dataset_name][data_split].index
    target_df = target_df.set_index(split_index)
    pred_df = pred_df.set_index(split_index)

    if is_residual_plot:
        residuals_df = target_df - pred_df
        if scale == 'parameter':
            residuals_df = residuals_df / target_df
            residual_label = 'Relative residuals ((actual - predicted) / actual)'
        else:
            residual_label = 'Residuals (actual - predicted)'

    # One panel per column, three panels per row
    n_cols = 3
    n_rows = max(1, int(np.ceil(len(cols_to_plot) / n_cols)))
    # squeeze=False keeps `axes` two-dimensional even when there is a single row,
    # so the [row, col] indexing below is always valid.
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(16, 5*n_rows), tight_layout=True, squeeze=False)
    fig.suptitle(model_type, fontsize=16)
    margin_factor = 0.05
    for i, col in enumerate(cols_to_plot):
        ax = axes[i // n_cols, i % n_cols]

        # Log axes for every parameter except 'kap', and for the model outputs listed as
        # 'log' columns. Residuals are signed and centred on zero, so their y-axis stays linear.
        if scale == 'parameter':
            plot_log_scale_x = col != 'kap'
        else:
            plot_log_scale_x = col in col_types['output']['log']
        plot_log_scale_y = plot_log_scale_x and not is_residual_plot

        min_v = (1-margin_factor)*min(target_df[col].min(), pred_df[col].min())
        max_v = (1+margin_factor)*max(target_df[col].max(), pred_df[col].max())

        if is_residual_plot:
            # TODO: the original line carried an unexplained '# Fix' marker.
            sns.scatterplot(x=pred_df[col], y=residuals_df[col], ax=ax, hue=hue_series[groupby], hue_order=hue_orders[groupby])
            ax.set_xlim([min_v, max_v])
            ax.plot([min_v, max_v], [0, 0], 'k--')  # zero-residual reference line
            ax.set_ylabel(residual_label)
        else:
            sns.scatterplot(x=pred_df[col], y=target_df[col], ax=ax, hue=hue_series[groupby], hue_order=hue_orders[groupby])
            ax.set_xlim([min_v, max_v])
            ax.set_ylim([min_v, max_v])
            ax.plot([min_v, max_v], [min_v, max_v], 'k--')  # perfect-prediction diagonal
            ax.set_ylabel('Actual values')

        ax.set_xscale('log' if plot_log_scale_x else 'linear')
        ax.set_yscale('log' if plot_log_scale_y else 'linear')

        ax.set_xlabel('Predicted values')
        ax.set_title(f"{col}")

    # Hide the panels of the last row that have no column to show
    for unused_ax in axes.flat[len(cols_to_plot):]:
        unused_ax.set_visible(False)

# Interactive controls for plot_residuals_df.
available_models = list(best_models.keys())
# 'MLP' is the preferred default, but a Dropdown rejects a value that is not one of its
# options, so fall back to the first loaded model when 'MLP' was not loaded.
if 'MLP' in available_models:
    default_model = 'MLP'
else:
    default_model = available_models[0] if available_models else None

model_selector = widgets.Dropdown(options=available_models, value=default_model, description='Model:')
plot_selector = widgets.Dropdown(options=['actual_vs_predicted', 'residual_vs_predicted'], value='actual_vs_predicted', description='Plot Type:')
data_split_selector = widgets.Dropdown(options=['train', 'test'], value='test', description='Data Split: ')
groupby_selector = widgets.Dropdown(options=['metamorphosis', 'class', 'climate', 'habitat', 'migrate', 'food'], description='Group by:')
scale_selector = widgets.Dropdown(options=['model', 'parameter'], value='parameter', description='Scale:')
widgets.interactive(plot_residuals_df, model_type=model_selector, plot_kind=plot_selector, data_split=data_split_selector, groupby=groupby_selector, scale=scale_selector)

interactive(children=(Dropdown(description='Model:', options=('SRTaxo1NN', 'Taxo1NN', 'RandomForestRegressor',…

# Save parameter predictions

### AmP values

In [9]:
# Ground-truth (AmP) values: stack the output columns of the train and test splits,
# tag each row with its split, and convert them with get_core_parameter_predictions.
dataset_name = 'biologist_no_pub_age'
dfs, col_types = all_dfs[dataset_name], all_col_types[dataset_name]
output_cols = col_types['output']['all']
targets_by_split = {split: dfs[split][output_cols] for split in ('train', 'test')}
gt_df = pd.concat(targets_by_split).reset_index(level=0, names='data_split')
gt_pars_df = get_core_parameter_predictions(dfs, pred_df=gt_df, col_types=col_types)
# CSV export is currently disabled:
#gt_pars_df.to_csv(f'../results/parameter_predictions/AmP_predictions.csv', float_format='%.6e')
gt_pars_df

Unnamed: 0_level_0,data_split,z,p_M,kap,v,E_G,E_Hb,E_Hx,E_Hj,E_Hp,k_J
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Thalassarche_chrysostoma,train,5.440655,572.95810,0.932257,0.017264,7321.422,8849.133000,120359.800000,8849.133000,257288.10,0.023497
Hynobius_quelpaertensis,train,1.183220,51.09823,0.490739,0.037124,7306.191,54.904860,54.904860,54.904860,11324.85,0.002000
Astyanax_mexicanus,train,1.718575,49.12913,0.993391,0.022779,5218.117,0.007697,0.007697,0.014183,117.09,0.002000
Butastur_rufipennis,train,2.564512,1280.08200,0.929643,0.025569,7320.219,2698.872000,11379.320000,2698.872000,25360.23,0.052769
Chioglossa_lusitanica,train,1.506129,57.15123,0.613918,0.246785,7305.311,479.768400,479.768400,479.768400,10476.07,0.002000
...,...,...,...,...,...,...,...,...,...,...,...
Pelophylax_saharicus,test,4.659638,11.61989,0.555177,0.089015,7336.612,30.489460,30.489460,30.489460,154238.20,0.000471
Rhinobatos_productus,test,16.052420,15.93584,0.862744,0.220510,5233.587,16687.710000,16687.710000,16687.710000,2038832.00,0.002007
Zapteryx_brevirostris,test,8.008853,13.02278,0.643357,0.016723,5232.746,31142.570000,31142.570000,31142.570000,723480.10,0.002000
Grus_americana,test,7.132738,642.80230,0.960532,0.040814,7325.096,5020.499000,140164.700000,5020.499000,333388.60,0.026184


### ML models

In [15]:
def save_parameter_predictions(model_type):
    """Predict the train and test splits with one model and save the parameter predictions as CSV.

    The model outputs of both splits are stacked (with a 'data_split' column), converted
    with ``get_core_parameter_predictions`` and written to
    ``../results/parameter_predictions/{model_type}_predictions.csv``.

    Args:
        model_type: Key of ``best_models`` / ``dataset_of_model``.

    Returns:
        The dataframe returned by ``get_core_parameter_predictions`` that was written to disk.
    """
    model = best_models[model_type]
    dataset_name = dataset_of_model[model_type]
    dfs = all_dfs[dataset_name]
    data = all_data[dataset_name]
    col_types = all_col_types[dataset_name]

    split_pred_dfs = []
    for split in ('train', 'test'):
        X = data[split]['input']
        if 'DEBNet' in model_type:
            # Same tensor conversion the plotting and MAPE cells use for DEBNet models
            y_pred = model.predict(torch.tensor(X, dtype=torch.float32))
        elif 'Taxo1NN' in model_type and split == 'train':
            # Train predictions are rebuilt from the train-vs-train distance matrix
            # instead of model.predict.
            regressor = model.regressor_
            train_distance_matrix = regressor._compute_distance_matrix(regressor.train_data, regressor.train_data, data_split='train')
            y_hat, indices = regressor.get_predictions_from_distance_matrix(train_distance_matrix)
            y_pred = regressor.apply_scaling_relationships(regressor.train_data, y_hat, indices)
        else:
            y_pred = model.predict(X)
        split_pred_df = pd.DataFrame(data=y_pred, index=dfs[split].index, columns=col_types['output']['all'])
        split_pred_df['data_split'] = split
        split_pred_dfs.append(split_pred_df)

    # Concatenate once instead of growing a dataframe inside the loop
    pred_df = pd.concat(split_pred_dfs)
    pars_df = get_core_parameter_predictions(dfs, pred_df=pred_df, col_types=col_types)

    predictions_file_name = f'../results/parameter_predictions/{model_type}_predictions.csv'
    # to_csv does not create missing folders
    os.makedirs(os.path.dirname(predictions_file_name), exist_ok=True)
    pars_df.to_csv(predictions_file_name, float_format='%.10e')
    print(f'Saved predictions for model {model_type} in {predictions_file_name}')

    return pars_df

In [16]:
# Write one predictions CSV per loaded model.
for model_type in best_models.keys():
    save_parameter_predictions(model_type)

Saved predictions for model SRTaxo1NN in ../results/parameter_predictions/SRTaxo1NN_predictions.csv
Saved predictions for model Taxo1NN in ../results/parameter_predictions/Taxo1NN_predictions.csv
Saved predictions for model RandomForestRegressor in ../results/parameter_predictions/RandomForestRegressor_predictions.csv
Saved predictions for model MultiTaskElasticNet in ../results/parameter_predictions/MultiTaskElasticNet_predictions.csv
Saved predictions for model MLP in ../results/parameter_predictions/MLP_predictions.csv


## Finding the bug in feasibility for TaxonomicKNNRegressor

# Compare parameter predictions

Table with the mean absolute percentage error (MAPE) of each parameter (columns) for each model (rows).

In [None]:
def get_mape(model_type, data_split):
    """Compute the mean absolute percentage error of each parameter for one model and split.

    Targets and predictions are converted with ``convert_output_to_parameter_predictions``
    and compared column by column over ``PARAMETER_COLS``.

    Args:
        model_type: Key of ``best_models`` / ``dataset_of_model``.
        data_split: Key of the data split to evaluate (e.g. 'train' or 'test').

    Returns:
        dict with a single key 'mean_absolute_percentage_error' holding a pandas Series
        indexed by the parameter names. Values are fractions (0.1 means 10 %); a target
        equal to zero yields an infinite error for that parameter.
    """
    model = best_models[model_type]
    dataset_name = dataset_of_model[model_type]
    data = all_data[dataset_name]
    col_types = all_col_types[dataset_name]

    X = data[data_split]['input']
    y_true_ps = data[data_split]['output']
    if 'DEBNet' in model_type:
        y_pred_ps = model.predict(torch.tensor(X, dtype=torch.float32))
    elif 'Taxo1NN' in model_type and data_split == 'train':
        # Same train-vs-train prediction path as the plotting and saving cells
        regressor = model.regressor_
        train_distance_matrix = regressor._compute_distance_matrix(regressor.train_data, regressor.train_data, data_split='train')
        y_hat, indices = regressor.get_predictions_from_distance_matrix(train_distance_matrix)
        y_pred_ps = regressor.apply_scaling_relationships(regressor.train_data, y_hat, indices)
    else:
        y_pred_ps = model.predict(X)

    parameter_cols = list(PARAMETER_COLS)
    target_df = convert_output_to_parameter_predictions(y_true_ps, X, col_types)[parameter_cols]
    pred_df = convert_output_to_parameter_predictions(y_pred_ps, X, col_types)[parameter_cols]
    absolute_percentage_error = ((target_df - pred_df) / target_df).abs()

    # Previously this function returned None, which broke the caller's
    # model_mape['mean_absolute_percentage_error'] lookup.
    return {'mean_absolute_percentage_error': absolute_percentage_error.mean()}


In [None]:
# Test-split MAPE table: one row per model, one column per parameter.
model_list = ['DEBNetHCSoftplus', 'DEBNet', 'DEBNetSC', 'TaxonomicKNNRegressor']
mape_df = pd.DataFrame(index=model_list, columns=PARAMETER_COLS)

# get_mape looks the model up in best_models, so a model that was not loaded would
# raise a KeyError; report those models and leave their rows empty instead.
missing_models = [model_type for model_type in model_list if model_type not in best_models]
if missing_models:
    print(f'No loaded model for {missing_models}; their MAPE rows are left empty')

for model_type in model_list:
    if model_type in missing_models:
        continue
    model_mape = get_mape(model_type=model_type, data_split='test')
    mape_df.loc[model_type] = model_mape['mean_absolute_percentage_error']
mape_df

In [None]:
# Row label printed for each model type in the LaTeX MAPE table.
model_labels = dict(
    DEBNet='DEBNet',
    DEBNetHCSoftplus='DEBNet + HC',
    DEBNetSC='DEBNet + SC',
    TaxonomicKNNRegressor='Taxonomic 1-NN',
)

In [None]:
# Print the MAPE table as LaTeX rows, with the best (lowest) value of each parameter in bold.
# Column minima are computed once and skip NaN; the built-in min() is order-dependent
# when NaN values are present and was re-evaluated for every cell.
best_mape = mape_df.apply(pd.to_numeric, errors='coerce').min()
for mt, label in model_labels.items():
    if mt not in mape_df.index:
        continue  # no row for this label in the table
    row = mape_df.loc[mt]
    line = f"{label} "
    for par in PARAMETER_COLS:
        mape = row[par]
        if mape == best_mape[par]:
            line += f' & \\textbf{{ {mape:.4f} }}'
        else:
            line += f' & {mape:.4f}'
    line += ' \\\\'
    print(line)