# 02 - Adjusting Predictions Using Metadata Priors for All Networks (DF20M)

In [1]:
import os

# Switch the working directory to the parent folder; the relative paths used
# below ('predictions/', 'data/danish_fungi_dataset/') are resolved from there.
# NOTE(review): not idempotent - re-running this cell without restarting the
# kernel moves one more level up each time.
os.chdir('..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.core import training, metrics
from src.special import calibration, proba_model
from src.utils import io

# directories (relative to the working directory set in the first cell)
PREDICTIONS_DIR = 'predictions/'
DATA_DIR = 'data/danish_fungi_dataset/'

# display name of each network -> file with its saved predictions;
# file names are joined onto PREDICTIONS_DIR when loaded
PRED_FILES = {
    'EfficientNet-B0': 'fungi_mini_efficientnet_b0_pred.npy',
    'ViT-Base-224': 'fungi_mini_vit_base_224_pred.npy',

    'EfficientNet-B4': 'fungi_mini_efficientnet_b4_pred.npy',
    'NoisyStudent-B4': 'fungi_mini_efficientnet_b4_ns_pred.npy',
    'EfficientNetV2-S': 'fungi_mini_efficientnetv2_s_pred.npy',

    'ViT-Base-384': 'fungi_mini_vit_base_384_pred.npy',
    'DeiT-Base-384': 'fungi_mini_deit_base_384_pred.npy',
    'BEiT-Base-384': 'fungi_mini_beit_base_384_pred.npy',

    'ViT-Large-384': 'fungi_mini_vit_large_384_pred.npy',
}

# file with the targets that belong to the prediction files above
TARG_FILE = 'fungi_mini_targ.npy'

# m-estimates parameter, passed as `m` to the histogram-based metadata models
M = 0.1

## Load the Data

In [3]:
# read the metadata tables of the training and the public test (validation) split
train_csv = DATA_DIR + 'DF20M-train_metadata_PROD.csv'
valid_csv = DATA_DIR + 'DF20M-public_test_metadata_PROD.csv'
train_df = pd.read_csv(train_csv)
valid_df = pd.read_csv(valid_csv)

# both splits must contain the same number of distinct species
classes = np.unique(train_df['scientificName'])
no_classes = len(classes)
valid_no_classes = len(np.unique(valid_df['scientificName']))
assert no_classes == valid_no_classes

print(
    f'No classes: {no_classes}',
    f'Train set length: {len(train_df):,d}',
    f'Validation set length: {len(valid_df):,d}',
    sep='\n')

No classes: 182
Train set length: 32,753
Validation set length: 3,640


In [4]:
# the observation id is the part of ImageUniqueID in front of the first '-'
train_df['observation_id'] = train_df['ImageUniqueID'].str.split('-').str.get(0)
valid_df['observation_id'] = valid_df['ImageUniqueID'].str.split('-').str.get(0)

# boolean mask over the ORIGINAL validation rows that keeps the first image of
# every observation; it is reused to subset the saved prediction/target arrays.
# NOTE(review): valid_df is overwritten below, so re-running this cell yields a
# shorter, all-True mask that no longer lines up with the saved arrays.
cond = (~valid_df.duplicated('observation_id')).to_numpy()
# alternative mask that additionally drops observations also present in train:
# cond = ~valid_df.duplicated('observation_id').values & ~valid_df['observation_id'].isin(train_df['observation_id'])

# one row per observation in train, excluding observations that occur in validation
train_df = (
    train_df
    .drop_duplicates('observation_id')
    .loc[lambda frame: ~frame['observation_id'].isin(valid_df['observation_id'])])
# one row per observation in validation
valid_df = valid_df.drop_duplicates('observation_id')
# alternative: remove the overlapping observations from validation instead
# valid_df = valid_df[~valid_df['observation_id'].isin(train_df['observation_id'])]

print(
    f'Train set length: {len(train_df):,d}',
    f'Validation set length: {len(valid_df):,d}',
    sep='\n')

Train set length: 17,221
Validation set length: 3,397


## Compute Predictions

In [5]:
if not os.path.isdir(PREDICTIONS_DIR):
    os.mkdir(PREDICTIONS_DIR)

# compute predictions
if not all([os.path.isfile(PREDICTIONS_DIR + x) for x in PRED_FILES.values()]):
    !sh test_fungi.sh

## Load Predictions

In [6]:
from tqdm import tqdm

def softmax(x, temperature=None):
    """Compute a softmax over the last axis of `x`, optionally temperature-scaled.

    Parameters
    ----------
    x : array-like
        Scores (logits); the softmax is taken along the last axis, i.e. per
        row for a 2-D array.
    temperature : float, optional
        If given, `x` is divided by this value before the softmax.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as `x` whose entries sum to 1 along the last axis.
    """
    x = np.asarray(x, dtype=float)
    if temperature is not None:
        x = x / temperature
    # Shift by the maximum of EACH row: softmax is invariant to a per-row shift,
    # and it guarantees that the largest exponent in every row is exp(0) = 1.
    # Shifting by the global maximum instead lets rows far below that maximum
    # underflow to all zeros, which ends in 0 / 0 = NaN.
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)


# targets, restricted to the first image of every validation observation
targ = np.load(PREDICTIONS_DIR + TARG_FILE)[cond]

# per network: raw logits and temperature-calibrated probabilities
logits_dict, preds_dict = {}, {}
for model_name, pred_file in tqdm(PRED_FILES.items()):
    # saved outputs of the network, subset with the same mask as the targets
    logits = np.load(PREDICTIONS_DIR + pred_file)[cond]
    logits_dict[model_name] = logits

    # calibrate: tune the softmax temperature of this network against the targets
    temperature = calibration.tune_temperature(
        logits=logits, targs=targ, verbose=False)

    # probabilities = temperature-scaled softmax of the logits
    pred = softmax(logits, temperature=temperature)
    preds_dict[model_name] = pred

100%|██████████| 9/9 [00:23<00:00,  2.60s/it]


## Estimate Metadata-Target Likelihood

In [7]:
# Estimate the class priors from the species labels of the (deduplicated)
# training set; they are the denominator when the metadata predictions are
# turned into ratios in the "Combine Predictions with Metadata" section.
# NOTE(review): the return type/shape of estimate_relative_freq is not visible
# here - presumably one relative frequency per class, in the same class order
# as the model outputs; confirm.
class_priors = proba_model.estimate_relative_freq(train_df['scientificName'])

### Habitat, Substrate, Month

In [8]:
def predict_metadata_probability(train_df, valid_df, col, m=1):
    """Predict class probabilities from a single metadata column.

    Fits a `proba_model.HistogramClassifier` on `train_df[col]` against
    `train_df['scientificName']` and evaluates it on `valid_df[col]`.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Training / validation metadata; both need the column `col`, the
        training frame also needs 'scientificName'.
    col : str
        Name of the metadata column to use as the only feature.
    m : float, default 1
        Forwarded to `HistogramClassifier(m=...)` (the notebook config calls
        it the m-estimates parameter).

    Returns
    -------
    Whatever `HistogramClassifier.predict_proba` returns - presumably one row
    of class probabilities per validation record (confirm against proba_model).
    """
    classifier = proba_model.HistogramClassifier(m=m)
    classifier.fit(train_df[col], train_df['scientificName'])
    return classifier.predict_proba(valid_df[col])

# metadata posteriors: one prediction per categorical metadata column
metadata_cols = ['Habitat', 'Substrate', 'month']
metadata_preds_dict = {}
for col in metadata_cols:
    metadata_preds_dict[col] = predict_metadata_probability(
        train_df, valid_df, col, m=M)

### GPS

In [9]:
def predict_gps_probability(train_df, valid_df, bandwidth=0.2, kernel='exponential', metric='haversine'):
    """Predict class probabilities from the GPS position of an observation.

    Fits a `proba_model.KDEClassifier` on the 'Latitude'/'Longitude' columns of
    `train_df` against `train_df['scientificName']` and evaluates it on the
    same two columns of `valid_df`.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Metadata frames with 'Latitude' and 'Longitude' columns; the training
        frame also needs 'scientificName'.
    bandwidth, kernel, metric
        Forwarded unchanged to `KDEClassifier`.

    Returns
    -------
    Whatever `KDEClassifier.predict_proba` returns - presumably one row of
    class probabilities per validation record (confirm against proba_model).
    """
    # NOTE(review): haversine distances are normally defined on (lat, lon) in
    # radians; whether KDEClassifier converts the coordinates is not visible
    # here - confirm.
    coord_cols = ['Latitude', 'Longitude']
    classifier = proba_model.KDEClassifier(bandwidth=bandwidth, kernel=kernel, metric=metric)
    classifier.fit(train_df[coord_cols], train_df['scientificName'])
    return classifier.predict_proba(valid_df[coord_cols])


# GPS-based prediction, stored next to the histogram-based metadata predictions
gps_pred = predict_gps_probability(
    train_df,
    valid_df,
    bandwidth=0.2,
    kernel='exponential',
    metric='haversine',
)
metadata_preds_dict['GPS'] = gps_pred

## Combine Predictions with Metadata

In [10]:
# adjust predictions with metadata and compute scores
scores_dict = {}
adj_scores_dict = {}

# Product over all metadata sources of (metadata prediction / class prior).
# It does not depend on the network, so it is built once (on the first
# iteration, where the prediction shape is known) and reused afterwards.
combined_preds = None

for model_name, pred in tqdm(preds_dict.items()):
    logits = logits_dict[model_name]

    if combined_preds is None:
        combined_preds = np.ones(pred.shape)
        for metadata_pred in metadata_preds_dict.values():
            combined_preds *= metadata_pred / class_priors

    # tune temperature for image and metadata predictions
    # NOTE(review): the temperature is tuned on the same targets that the
    # scores below are computed on.
    temperature = calibration.tune_temperature(
        logits=logits, targs=targ, other_preds=combined_preds, verbose=False)

    # adjust predictions using metadata (the result is not re-normalised)
    pred_adj = softmax(logits, temperature) * combined_preds

    # compute scores of the original and of the metadata-adjusted predictions
    scores_dict[model_name] = training.classification_scores(pred, targ)
    adj_scores_dict[model_name] = training.classification_scores(pred_adj, targ)

100%|██████████| 9/9 [00:26<00:00,  2.92s/it]


## Evaluate Scores

In [11]:
# one row per network, one column per metric
scores_df = pd.DataFrame.from_dict(scores_dict, orient='index')
adj_scores_df = pd.DataFrame.from_dict(adj_scores_dict, orient='index')

# add a second column level that tells the two variants apart
for frame, variant in ((scores_df, 'Original'), (adj_scores_df, 'Adjusted')):
    frame.columns = pd.MultiIndex.from_product([frame.columns, [variant]])

# side by side: all 'Original' columns followed by all 'Adjusted' columns
eval_df = pd.concat([scores_df, adj_scores_df], axis=1)

# gain of the metadata adjustment for the reported metrics
for met in ['accuracy', 'top_3', 'f1_score']:
    eval_df[(met, 'Diff')] = eval_df[(met, 'Adjusted')].sub(eval_df[(met, 'Original')])

In [12]:
# report the selected metrics in percent
_df = eval_df[['accuracy', 'top_3', 'f1_score']].round(3) * 100
for met in ['accuracy', 'top_3', 'f1_score']:
    # Render the difference with an explicit sign ('+4.8' / '-0.3').
    # Missing values stay NaN instead of turning into a bare '+', and negative
    # differences are no longer rendered as '+-0.3'.
    _df[met, 'Diff'] = _df[met, 'Diff'].round(1).map(
        lambda value: f'{value:+.1f}', na_action='ignore')
_df

Unnamed: 0_level_0,accuracy,accuracy,accuracy,top_3,top_3,top_3,f1_score,f1_score,f1_score
Unnamed: 0_level_1,Original,Adjusted,Diff,Original,Adjusted,Diff,Original,Adjusted,Diff
EfficientNet-B0,63.0,67.7,4.8,81.4,85.3,3.9,52.2,59.2,7.1
ViT-Base-224,68.7,71.5,2.8,85.6,87.7,2.1,58.4,61.7,3.3
EfficientNet-B4,68.0,71.8,3.8,85.2,88.3,3.1,58.0,62.2,4.1
NoisyStudent-B4,70.0,73.6,3.6,86.9,88.6,1.7,61.3,64.9,3.7
EfficientNetV2-S,69.8,73.0,3.2,86.4,88.7,2.3,60.7,65.4,4.7
ViT-Base-384,75.2,76.9,1.7,88.8,90.0,1.2,65.8,67.9,2.1
DeiT-Base-384,73.6,75.8,2.2,87.9,89.8,1.9,63.4,66.6,3.3
BEiT-Base-384,73.6,76.9,3.3,88.4,90.7,2.3,64.2,69.0,4.8
ViT-Large-384,76.0,78.3,2.4,89.7,90.6,0.9,66.0,69.0,3.0
