In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
import pickle
import torch
from scipy.io import savemat

In [2]:
import sys
import os

# Add the 'project' directory to the path
sys.path.append(os.path.abspath('..'))

from project_code.data.load_data import load_dataframes
from project_code.data.prepare_data_sklearn import get_features_targets
from project_code.data.prepare_data_pytorch import prepare_data_tensors
from project_code.utils.results import get_best_model_file
from project_code.inference.parameters import get_core_parameter_predictions, convert_output_to_parameter_predictions, PARAMETER_COLS
from project_code.evaluate.prediction_error import evaluate_parameter_predictions_on_data, compute_metrics, METRIC_LABEL_TO_NAME
from project_code.evaluate.metrics import log_accuracy_ratio, symmetric_mean_absolute_percentage_error, mean_deb_loss
from project_code.plotters.infeasibility import method_labels, par_to_latex

# Loading data and models

In [35]:
# Folder holding the processed datasets (relative to the notebook location).
datasets_folder = '../data/processed/'

# Models grouped by the dataset whose results folder they are looked up in.
# Insertion order is kept so that `dataset_of_model` iterates in the same
# model order as before.
_models_by_dataset = {
    'biologist_no_pub_age': ('SRTaxo1NN', 'Taxo1NN'),
    'final_taxonomy_ecocodes': (
        'RandomForestRegressor',
        'MultiTaskElasticNet',
        'MLP',
        'MLPSC',
        'DEBNetHC',
        'DEBNetSC',
    ),
}

# Flat lookup: model type -> dataset name.
dataset_of_model = {
    model_type: dataset_name
    for dataset_name, model_types in _models_by_dataset.items()
    for model_type in model_types
}

# Output locations for figures and exported models.
save_figures_folder = '../paper/figures'
save_models_folder = '../models'

## Loading best models

In [4]:
def load_model(model_file, results_folder):
    """Load a saved model from ``{results_folder}/models/{model_file}``.

    Parameters
    ----------
    model_file : str
        File name of the saved model. The extension selects the loader:
        ``.pkl`` files are read with ``pickle``; ``.pth`` files are read with
        ``torch.load`` and switched to evaluation mode.
    results_folder : str
        Folder that contains the ``models`` sub-folder.

    Returns
    -------
    The deserialized model object.

    Raises
    ------
    ValueError
        If ``model_file`` does not end in ``.pkl`` or ``.pth``.
    """
    model_path = f"{results_folder}/models/{model_file}"

    if model_file.endswith('.pkl'):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(model_path, 'rb') as f:
            return pickle.load(f)

    if model_file.endswith('.pth'):
        # weights_only=False unpickles the whole model object, so the same
        # trust caveat as for .pkl files applies.
        model = torch.load(model_path, weights_only=False)
        model.eval()
        return model

    # Previously an unknown extension fell through to `return model` with
    # `model` unbound, surfacing as a confusing UnboundLocalError.
    raise ValueError(
        f"Unsupported model file extension for '{model_file}': expected '.pkl' or '.pth'"
    )

In [5]:
# For every model type, find its best run (selected on the 'logQ' metric) in the
# results folder of its dataset, load it, and remember where the matching
# test-performance CSV is stored. Model types without a saved run are skipped.
best_models = {}
best_models_test_performance_files = {}
metric = 'logQ'
for mt, dataset_name in dataset_of_model.items():
    results_folder = f'../results/{dataset_name}'
    model_file = get_best_model_file(results_folder=results_folder, model_type=mt, metric=metric)
    if model_file is None:
        continue

    best_models[mt] = load_model(model_file, results_folder)
    # Swap the 4-character model extension ('.pkl' / '.pth') for '.csv'.
    test_performance_filename = model_file[:-4] + '.csv'
    best_models_test_performance_files[mt] = os.path.join(
        results_folder, 'test_performance', test_performance_filename
    )

print(best_models.keys())

dict_keys(['SRTaxo1NN', 'Taxo1NN', 'MLP', 'MLPSC'])


# Save taxonomic models

## Training data

In [54]:
# `taxo1nn` is first assigned in a later cell of this notebook, so on a fresh
# kernel (Restart & Run All) this inspection cell would raise NameError.
# Bind it here from the loaded best models before inspecting it.
taxo1nn = best_models['Taxo1NN'].regressor_

# Input column names of the Taxo1NN regressor, grouped by column type.
taxo1nn.col_types['input']

{'all': ('Wwi',
  'd_V',
  'genus',
  'family',
  'order',
  'class',
  'phylum',
  'p_M',
  'kap',
  'v',
  's_p_M',
  'E_Hb',
  'E_Hj',
  'E_Hp',
  'k_J',
  's_M',
  'metamorphosis',
  'estim_p_M',
  'estim_v',
  'estim_kap',
  'estim_k_J',
  'estim_E_Hb',
  'estim_E_Hj',
  'estim_E_Hp',
  'estim_s_M',
  'estim_s_p_M'),
 'boolean': ('metamorphosis',
  'estim_p_M',
  'estim_v',
  'estim_kap',
  'estim_k_J',
  'estim_E_Hb',
  'estim_E_Hj',
  'estim_E_Hp',
  'estim_s_M',
  'estim_s_p_M'),
 'scale': (),
 'log': ('Wwi',),
 'bounded01': (),
 'quantile': (),
 'category': ('genus', 'family', 'order', 'class', 'phylum')}

In [67]:
# Column index into `taxo1nn.train_data` that is exported as the first column of
# the training matrix below; presumably the 'Wwi' column (position 0 in
# col_types['input']['all']) — TODO confirm.
taxo1nn.wi_col

0

In [76]:
# Assemble the Taxo1NN training matrix for export.
# Column layout of `taxo1nn_train_data`:
#   [ wi_col | taxonomy_cols | parameters in `par_order` | 'estim_<par>' flags in `par_order` ]
taxo1nn = best_models['Taxo1NN'].regressor_
input_cols = taxo1nn.col_types['input']['all']  # column names, looked up by position below
d_V_col = input_cols.index('d_V')
abj_col = taxo1nn.abj_col

# Export order of the parameters. Note it differs from the order in
# col_types['input']['all'] (e.g. 'E_Hp' comes before 'E_Hj' here).
par_order = ['s_p_M', 'p_M', 'kap', 'v', 'k_J', 'E_Hb', 'E_Hp', 'E_Hj', 's_M']

data_to_concat = [
    taxo1nn.train_data[:, taxo1nn.wi_col].reshape(-1, 1),
    taxo1nn.train_data[:, taxo1nn.taxonomy_cols],
]
# Parameter value columns, one per parameter.
for p in par_order:
    data_to_concat.append(taxo1nn.train_data[:, input_cols.index(p)].reshape(-1, 1))
# Matching 'estim_<par>' columns, in the same parameter order.
for p in par_order:
    data_to_concat.append(taxo1nn.train_data[:, input_cols.index(f'estim_{p}')].reshape(-1, 1))

# np.concatenate instead of np.concat: the latter only exists in NumPy >= 2.0,
# while np.concatenate gives the same result on every NumPy version.
taxo1nn_train_data = np.concatenate(data_to_concat, axis=1)

In [77]:
# Write the assembled training matrix to a MATLAB .mat file under the variable
# name 'trainData'.
model_name = 'Taxo1NN'
# savemat does not create missing folders; make sure the model's export folder
# exists so the cell also works on a fresh checkout.
os.makedirs(f"{save_models_folder}/{model_name}", exist_ok=True)
savemat(f"{save_models_folder}/{model_name}/train_data.mat", {'trainData': taxo1nn_train_data})

## Taxonomy Encoder

In [None]:
# Inspect the taxonomy encoder's label encoders; the export cell below pairs
# them, in order, with the 'category' input columns.
taxo1nn.taxonomy_encoder.label_encoders

In [65]:
# Export, for every categorical (taxonomic) input column, the class labels known
# to its label encoder, so the same encoding can be reproduced from MATLAB.
taxa = taxo1nn.col_types['input']['category']
label_encoders = taxo1nn.taxonomy_encoder.label_encoders

encoding_matlab_dict = {}
for taxon, encoder in zip(taxa, label_encoders):
    # Strip surrounding whitespace from each label before export.
    stripped_classes = [c.strip() for c in encoder.classes_]
    encoding_matlab_dict[taxon] = np.array(stripped_classes, dtype=object)  # cell array in MATLAB

taxonomy_encoding_path = f"{save_models_folder}/{model_name}/taxonomy_encoding.mat"
savemat(taxonomy_encoding_path, encoding_matlab_dict)
print(encoding_matlab_dict)

{'genus': array(['Abbottina', 'Abramis', 'Abroscopus', ..., 'Zosterisessor',
       'Zosterops', 'other'], shape=(1267,), dtype=object), 'family': array(['Acanthisittidae', 'Acanthizidae', 'Accipitridae', 'Acipenseridae',
       'Acrocephalidae', 'Adrianichthyidae', 'Aegothelidae', 'Agamidae',
       'Agonidae', 'Alaudidae', 'Alcedinidae', 'Alcidae', 'Alestidae',
       'Alligatoridae', 'Alopiidae', 'Alosidae', 'Amblyopsidae',
       'Ambystomatidae', 'Ameronothridae', 'Amiidae', 'Ammodytidae',
       'Amphiumidae', 'Amphiuridae', 'Ampullariidae', 'Anabantidae',
       'Anarhichadidae', 'Anatidae', 'Anoplopomatidae', 'Aplodontiidae',
       'Apodidae', 'Apogonidae', 'Apterygidae', 'Arapaimidae', 'Ardeidae',
       'Argulidae', 'Arhynchobatidae', 'Ariommatidae', 'Arripidae',
       'Artedidraconidae', 'Ascidiidae', 'Asellidae', 'Astacidae',
       'Atherinidae', 'Atherinopsidae', 'Bagridae', 'Balaenicipitidae',
       'Balaenidae', 'Balaenopteridae', 'Bathydraconidae', 'Belonidae',
    

## Hyperparameters

In [38]:
# Inspect the `ultimate_weight_factor` attribute of the loaded Taxo1NN regressor
# (listed here as one of its hyperparameters).
taxo1nn.ultimate_weight_factor

0.551