iterate over weight matrix and exract class_of interest label and then save in csv weights - associated best threshold according to iucn data

In [49]:
import pandas as pd
import numpy as np
import json
import os
import sys
from sklearn.metrics import f1_score
import torch

from sklearn.metrics import precision_recall_curve


#sys.path.append('../')
import datasets
import models
import utils
import setup

In [50]:
train_params = {}

train_params['experiment_name'] = 'demo' # This will be the name of the directory where results for this run are saved.
train_params['species_set'] = 'all'
train_params['hard_cap_num_per_class'] = 1000
train_params['num_aux_species'] = 0
train_params['input_enc'] = 'sin_cos'
train_params['loss'] = 'an_full'

In [51]:
params = setup.get_default_params_train(train_params)

In [52]:
# load model
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_params = torch.load('./pretrained_models/model_an_full_input_enc_sin_cos_hard_cap_num_per_class_1000.pt', map_location='cpu')
model = models.get_model(train_params['params'])
model.load_state_dict(train_params['state_dict'], strict=True)
model = model.to(DEVICE)
model.eval()

ResidualFCNet(
  (class_emb): Linear(in_features=256, out_features=47375, bias=False)
  (feats): Sequential(
    (0): Linear(in_features=4, out_features=256, bias=True)
    (1): ReLU(inplace=True)
    (2): ResLayer(
      (nonlin1): ReLU(inplace=True)
      (nonlin2): ReLU(inplace=True)
      (dropout1): Dropout(p=0.5, inplace=False)
      (w1): Linear(in_features=256, out_features=256, bias=True)
      (w2): Linear(in_features=256, out_features=256, bias=True)
    )
    (3): ResLayer(
      (nonlin1): ReLU(inplace=True)
      (nonlin2): ReLU(inplace=True)
      (dropout1): Dropout(p=0.5, inplace=False)
      (w1): Linear(in_features=256, out_features=256, bias=True)
      (w2): Linear(in_features=256, out_features=256, bias=True)
    )
    (4): ResLayer(
      (nonlin1): ReLU(inplace=True)
      (nonlin2): ReLU(inplace=True)
      (dropout1): Dropout(p=0.5, inplace=False)
      (w1): Linear(in_features=256, out_features=256, bias=True)
      (w2): Linear(in_features=256, out_features=

In [53]:
if train_params['params']['input_enc'] in ['env', 'sin_cos_env']:
    raster = datasets.load_env()
else:
    raster = None
enc = utils.CoordEncoder(train_params['params']['input_enc'], raster=raster)

In [54]:
#load reference from iucn
with open(os.path.join('./data/eval/iucn/', 'iucn_res_5.json'), 'r') as f:
            data = json.load(f)
species_ids = list((data['taxa_presence'].keys()))

In [55]:
classes_of_interest = torch.zeros(len(species_ids), dtype=torch.int64)
taxa_ids = torch.zeros(len(species_ids), dtype=torch.int64)
for tt_id, tt in enumerate(species_ids):
    class_of_interest = np.array([train_params['params']['class_to_taxa'].index(int(tt))])
    classes_of_interest[tt_id] = torch.from_numpy(class_of_interest)
    taxa_ids[tt_id] = int(tt)

In [56]:
obs_locs = np.array(data['locs'], dtype=np.float32)
obs_locs = torch.from_numpy(obs_locs).to('cpu')
loc_feat = enc.encode(obs_locs)

In [57]:
with torch.no_grad():
    loc_emb = model(loc_feat, return_feats=True)
    wt = model.class_emb.weight[classes_of_interest, :]

In [58]:
X = wt.numpy()

In [59]:
y = pd.read_csv('./my_experiments/f1_score_baseline/results/theoretical_optimal_iucn_f1score.csv', index_col=0).thres.values

change so that train test split 50-50 then evaluate and obtain f1 score

In [70]:
np.random.seed(42)
num_samples = len(X)
random_indices = np.random.choice(num_samples, size=int(num_samples * 0.75), replace=False)

# Split the dataset into training and testing sets based on the random indices
X_train_thres, X_test_thres = X[random_indices], X[~random_indices]
y_train_thres, y_test_thres = y[random_indices], y[~random_indices]

In [71]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Regressor object
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the Random Forest Regressor on the training data
rf_regressor.fit(X_train_thres, y_train_thres)

# Once trained, you can use this regressor to make predictions on new data
predictions = rf_regressor.predict(X_test_thres)

In [72]:
from sklearn.metrics import mean_squared_error

# Compute Mean Squared Error (MSE) loss
mse_loss = mean_squared_error(y_test_thres, predictions)

print("Mean Squared Error (MSE) loss:", mse_loss)

Mean Squared Error (MSE) loss: 0.012717987477068928


### use these thresholds for iucn evaluation

In [73]:
def f1_at_thresh(y_true, y_pred, thresh, type = 'binary'):
    y_thresh = y_pred > thresh
    return f1_score(y_true, y_thresh, average=type)

In [74]:
wt_subset = wt[~random_indices]
taxa_ids_subset = taxa_ids[~random_indices]

In [75]:
output = list()
for tt_id, taxa in enumerate(taxa_ids_subset):
    wt_1 = wt_subset[tt_id,:]
    preds = torch.sigmoid(torch.matmul(loc_emb, wt_1)).cpu().numpy()
    taxa = taxa.item()
    species_locs = data['taxa_presence'].get(str(taxa))
    y_test = np.zeros(preds.shape, int)
    y_test[species_locs] = 1

    f1 = f1_at_thresh(y_test, preds, predictions[tt_id])
    
    row = {
        "taxon_id": taxa,
        "thres": predictions[tt_id],
        "fscore": f1
    }
    row_dict = dict(row)
    output.append(row_dict)

    if(tt_id%100==0):
            print(tt_id)

output_pd = pd.DataFrame(output)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800


In [76]:
output_pd

Unnamed: 0,taxon_id,thres,fscore
0,134934,0.777727,0.555309
1,508979,0.438205,0.745727
2,8424,0.828921,0.529557
3,29315,0.387599,0.612996
4,14064,0.182371,0.458973
...,...,...,...
1808,42722,0.186712,0.542331
1809,12916,0.851147,0.017699
1810,105867,0.297980,0.529594
1811,5478,0.579775,0.546926


In [77]:
output_pd.fscore.mean()

0.6206675918047778

In [78]:
output_pd.to_csv('./results/f1_scores/cross_species_calibration_75_split_results.csv')
#50: 60.4%