# 01 - Danish Fungi - Adjusting Predictions Using Metadata Priors (DF20M)

In [1]:
import os

os.chdir('..')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch

from src.core import models, metrics, training, data
from src.special import calibration, proba_model
from src.utils import nb_setup, visualization as viz

PREDICTIONS_DIR = 'predictions/'
DATA_DIR = 'data/danish_fungi_dataset/'
TRAIN_SET_DIR = 'train_resized'

MODEL_ARCH = 'efficientnet_b0'
# MODEL_NAME = 'baselines_mini/df2020_efficientnet_b0_ce_11-04-2021_19-52-55'
MODEL_NAME = 'baselines_with_loss_mini/df2020_efficientnet_b0_ce_11-23-2021_13-25-42'

PRED_FILE = PREDICTIONS_DIR + 'fungi_mini_efficientnet_b0_pred.npy'
TARG_FILE = PREDICTIONS_DIR + 'fungi_mini_targ.npy'

M = 0.1  # m-estimates parameter

nb_setup.init()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Device: {device}')

Device: cpu


## Load the Data

In [3]:
# load metadata
train_df = pd.read_csv(DATA_DIR + 'DF20M-train_metadata_PROD.csv')
valid_df = pd.read_csv(DATA_DIR + 'DF20M-public_test_metadata_PROD.csv')

classes = np.unique(train_df['scientificName'])
no_classes = len(classes)
assert no_classes == len(np.unique(valid_df['scientificName']))
print(f'No classes: {no_classes}')
print(f'Train set length: {len(train_df):,d}')
print(f'Validation set length: {len(valid_df):,d}')

No classes: 182
Train set length: 32,753
Validation set length: 3,640


In [4]:
train_df['observation_id'] = train_df['ImageUniqueID'].str.split('-').str[0]
valid_df['observation_id'] = valid_df['ImageUniqueID'].str.split('-').str[0]

valid_df_orig = valid_df.copy()

cond = ~valid_df.duplicated('observation_id').values
# cond = ~valid_df.duplicated('observation_id').values & ~valid_df['observation_id'].isin(train_df['observation_id'])

train_df = train_df.drop_duplicates('observation_id')
train_df = train_df[~train_df['observation_id'].isin(valid_df['observation_id'])]
valid_df = valid_df.drop_duplicates('observation_id')
# valid_df = valid_df[~valid_df['observation_id'].isin(train_df['observation_id'])]
print(f'Train set length: {len(train_df):,d}')
print(f'Validation set length: {len(valid_df):,d}')

Train set length: 17,221
Validation set length: 3,397


## Load Fine-tuned Network and Create DataLoader

In [5]:
# create model
model = models.get_model(MODEL_ARCH, no_classes, pretrained=True)
training.load_model(model, MODEL_NAME, path=DATA_DIR + 'models')
assert np.all([param.requires_grad for param in model.parameters()])

model_config = model.pretrained_config
batch_size = 128

In [6]:
# create transforms
_, valid_tfms = data.get_transforms(
    size=model_config['input_size'], mean=model_config['image_mean'],
    std=model_config['image_std'])

# create data loaders
validloader = data.get_dataloader(
    valid_df, img_path_col='image_path', label_col='scientificName',
    path=DATA_DIR + TRAIN_SET_DIR, transforms=valid_tfms, labels=classes,
    batch_size=batch_size, shuffle=False, num_workers=4)

## Create Predictions

In [7]:
def predict_cached(model, validloader, *, pred_filename, targ_filename):
    import os

    if os.path.isfile(pred_filename) and os.path.isfile(targ_filename):
        pred = np.load(pred_filename)
        targ = np.load(targ_filename)
    else:
        pred, targ, _ = training.predict(model, validloader)
        np.save(pred_filename, pred)
        np.save(targ_filename, targ)
    return pred, targ

In [8]:
# create predictions
logits, targ = predict_cached(
    model, validloader, pred_filename=PRED_FILE, targ_filename=TARG_FILE)

# filter records from the same observations
logits = logits[cond]
targ = targ[cond]

In [9]:
def softmax(x, temperature=None):
    if temperature is not None:
        x = x / temperature
    e = np.exp(x - x.max())  # X.max() makes function exp more stable
    return e / e.sum(axis=1, keepdims=True)


# calibrate predictions
temperature = calibration.tune_temperature(logits=logits, targs=targ)

# apply softmax function with temperature scaling
pred = softmax(logits, temperature=temperature)

Before temperature - NLL: -14.866
Optimal temperature: 0.978
After temperature - NLL: -15.196


## Estimate Metadata-Target Likelihood

In [10]:
# compute class distributions
class_priors = proba_model.estimate_relative_freq(train_df['scientificName'])

### Habitat, Substrate, Month

In [11]:
def predict_metadata_probability(train_df, valid_df, col, m=1):
    hist = proba_model.HistogramClassifier(m=m)
    hist.fit(train_df[col], train_df['scientificName'])
    metadata_pred = hist.predict_proba(valid_df[col])
    return metadata_pred

# create metadata posteriors
metadata_cols = ['Habitat', 'Substrate', 'month']
metadata_preds_dict = {
    col: predict_metadata_probability(train_df, valid_df, col, m=M)
    for col in metadata_cols}

### Month

In [12]:
from scipy.ndimage import convolve
from scipy import signal


def smooth_circular_distribution(dist, ksize=3, kernel='expenontial'):
    def _smooth_distribution(dist):
        if kernel == 'gaussian':
            sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8
            kernel_np = signal.gaussian(ksize, sigma)
        elif kernel == 'parzen':
            kernel_np = signal.parzen(ksize)
        elif kernel == 'cosine':
            kernel_np = signal.cosine(ksize)
        else:
            kernel_np = signal.exponential(ksize)
        dist = convolve(dist, kernel_np, mode='wrap') / kernel_np.sum()
        return dist

    return dist.apply(_smooth_distribution, axis=0)


def predict_month_probability(train_df, valid_df, m=1):
    hist = proba_model.HistogramClassifier(m=m)
    hist.fit(train_df['month'], train_df['scientificName'])
    hist.model_ = smooth_circular_distribution(hist.model_, ksize=3, kernel='exponential')
    metadata_pred = hist.predict_proba(valid_df['month'])
    return metadata_pred


metadata_preds_dict['month_2'] = predict_month_probability(train_df, valid_df, m=M)

### GPS

In [13]:
def predict_gps_probability(train_df, valid_df, bandwidth=0.2, kernel='exponential', metric='haversine'):
    kde = proba_model.KDEClassifier(bandwidth=bandwidth, kernel=kernel, metric=metric)
    kde.fit(train_df[['Latitude', 'Longitude']], train_df['scientificName'])
    gps_pred = kde.predict_proba(valid_df[['Latitude', 'Longitude']])

    return gps_pred


gps_pred = predict_gps_probability(
    train_df, valid_df, bandwidth=0.2, kernel='exponential', metric='haversine')
metadata_preds_dict['GPS'] = gps_pred

## Evaluate Results

In [14]:
{k: round(v * 100, 1) for k, v in training.classification_scores(pred, targ).items()}

{'accuracy': 63.0, 'top_3': 81.4, 'f1_score': 52.2}

In [15]:
from itertools import combinations
from functools import reduce

# combine image and metadata posteriors
cols = ['Habitat', 'Substrate', 'GPS', 'month', 'month_2']
results = {'Original': training.classification_scores(pred, targ)}
for j in range(1, len(metadata_preds_dict) + 1):
    for combination in combinations(cols, j):
        combination_str = 'With ' + reduce('{}, {}'.format, combination)
        adj_pred = proba_model.combine_predictions(
            pred, [metadata_preds_dict[k] for k in combination], class_priors)

        # evaluate scores of combined predictions
        results[combination_str] = training.classification_scores(adj_pred, targ)

results_df = pd.DataFrame.from_dict(results, orient='index')
results_df -= results_df.loc['Original']

_results_df = results_df[~results_df.index.str.contains('month_2')]
('+' + (_results_df * 100).round(1).fillna('').astype(str)).apply(lambda c: c.str.replace('+-', '-', regex=False), axis=0)

Unnamed: 0,accuracy,top_3,f1_score
Original,0.0,0.0,0.0
With Habitat,2.0,2.0,2.3
With Substrate,0.9,1.0,1.0
With GPS,1.3,0.8,2.1
With month,1.2,1.2,1.3
"With Habitat, Substrate",2.8,2.4,3.3
"With Habitat, GPS",2.9,2.2,4.4
"With Habitat, month",3.3,2.8,3.7
"With Substrate, GPS",2.4,1.8,3.2
"With Substrate, month",2.0,2.1,2.1


In [16]:
_results_df = results_df[results_df.index.str.contains('month') & 
                         (results_df.index.str.count('month') < 2)].copy()
# _results_df['month_type'] = 'month'
# _results_df.loc[_results_df.index.str.contains('month_2'), 'month_type'] = 'month_2'
_results_df.index = pd.MultiIndex.from_tuples(
    [(x.replace('month_2', 'month'), 'month_2' if 'month_2' in x else 'month')
     for x in _results_df.index])
index = _results_df.index.get_level_values(0).drop_duplicates()
_results_df = _results_df.unstack(-1)
_results_df = _results_df.loc[index]

('+' + (_results_df * 100).round(1).fillna('').astype(str)).apply(lambda c: c.str.replace('+-', '-', regex=False), axis=0)

Unnamed: 0_level_0,accuracy,accuracy,top_3,top_3,f1_score,f1_score
Unnamed: 0_level_1,month,month_2,month,month_2,month,month_2
With month,1.2,0.9,1.2,0.9,1.3,1.1
"With Habitat, month",3.3,3.2,2.8,2.5,3.7,3.4
"With Substrate, month",2.0,1.7,2.1,1.9,2.1,2.3
"With GPS, month",2.4,2.1,1.7,1.4,3.8,2.7
"With Habitat, Substrate, month",4.2,4.0,3.3,3.1,5.2,4.8
"With Habitat, GPS, month",4.3,3.9,3.1,2.9,6.2,5.5
"With Substrate, GPS, month",3.4,3.0,2.6,2.5,4.7,4.0
"With Habitat, Substrate, GPS, month",5.2,4.9,3.8,3.7,7.6,6.7


## Optimize Tepmerature with Metadata Predictions

In [17]:
from itertools import combinations
from functools import reduce

# combine image and metadata posteriors
cols = ['Habitat', 'Substrate', 'GPS', 'month']
pred_dict = {}
for j in range(1, len(metadata_preds_dict) + 1):
# for j in range(1, 2):
    for combination in combinations(cols, j):
        combination_str = 'With ' + reduce('{}, {}'.format, combination)

        combined_preds = np.ones(pred.shape)
        for k in combination:
            metadata_pred = metadata_preds_dict[k]
            combined_preds *= metadata_pred / class_priors
        temperature = calibration.tune_temperature(
            logits=logits, targs=targ, other_preds=combined_preds, verbose=False)
        adj_pred = softmax(logits, temperature) * combined_preds

        # evaluate scores of combined predictions
        results[combination_str] = training.classification_scores(adj_pred, targ)

results_df = pd.DataFrame.from_dict(results, orient='index')
results_df -= results_df.loc['Original']

_results_df = results_df[~results_df.index.str.contains('month_2')]
'+' + (_results_df * 100).round(1).fillna('').astype(str)

Unnamed: 0,accuracy,top_3,f1_score
Original,0.0,0.0,0.0
With Habitat,2.0,2.0,2.3
With Substrate,1.0,0.9,1.1
With GPS,1.3,0.9,2.1
With month,1.1,1.2,1.3
"With Habitat, Substrate",2.7,2.4,3.1
"With Habitat, GPS",2.9,2.3,4.2
"With Habitat, month",3.3,2.8,3.7
"With Substrate, GPS",2.6,1.9,3.8
"With Substrate, month",2.0,2.1,2.3


In [18]:
{k: round(v * 100, 1) for k, v in results['With Habitat, Substrate, GPS, month'].items()}

{'accuracy': 67.7, 'top_3': 85.3, 'f1_score': 59.2}

In [19]:
_results_df = results_df[results_df.index.str.contains('month') & 
                         (results_df.index.str.count('month') < 2)].copy()
# _results_df['month_type'] = 'month'
# _results_df.loc[_results_df.index.str.contains('month_2'), 'month_type'] = 'month_2'
_results_df.index = pd.MultiIndex.from_tuples(
    [(x.replace('month_2', 'month'), 'month_2' if 'month_2' in x else 'month')
     for x in _results_df.index])
index = _results_df.index.get_level_values(0).drop_duplicates()
_results_df = _results_df.unstack(-1)
_results_df = _results_df.loc[index]
'+' + (_results_df * 100).round(1).fillna('').astype(str)

Unnamed: 0_level_0,accuracy,accuracy,top_3,top_3,f1_score,f1_score
Unnamed: 0_level_1,month,month_2,month,month_2,month,month_2
With month,1.1,0.9,1.2,0.9,1.3,1.1
"With Habitat, month",3.3,3.2,2.8,2.5,3.7,3.4
"With Substrate, month",2.0,1.7,2.1,1.9,2.3,2.3
"With GPS, month",2.4,2.1,1.7,1.4,3.7,2.7
"With Habitat, Substrate, month",4.4,4.0,3.2,3.1,5.9,4.8
"With Habitat, GPS, month",4.6,3.9,3.1,2.9,6.6,5.5
"With Substrate, GPS, month",3.5,3.0,2.9,2.5,5.2,4.0
"With Habitat, Substrate, GPS, month",4.8,4.9,3.9,3.7,7.1,6.7


## Evaluate Diffetent m Parameters

In [20]:
m_results = {'Original': training.classification_scores(pred, targ)}
for m in [0, 0.01, 0.05, 0.1, 0.5, 1, 2]:
    # create metadata posteriors
    metadata_cols = ['Habitat', 'Substrate', 'month']
    metadata_preds_dict = {
        col: predict_metadata_probability(train_df, valid_df, col, m=m)
        for col in metadata_cols}
    metadata_preds_dict['GPS'] = gps_pred

    combined_preds = np.ones(pred.shape)
    for metadata_pred in metadata_preds_dict.values():
        combined_preds *= metadata_pred / class_priors
    combined_preds[np.isnan(combined_preds)] = 0.  # adjustment for m=0
    temperature = calibration.tune_temperature(
            logits=logits, targs=targ, other_preds=combined_preds, verbose=False)
    adj_pred = softmax(logits, temperature) * combined_preds

    m_results[m] = training.classification_scores(adj_pred, targ)

In [21]:
m_results_df = pd.DataFrame.from_dict(m_results, orient='index')
m_results_df -= m_results_df.loc['Original']

('+' + (m_results_df * 100).round(1).fillna('').astype(str)).apply(lambda c: c.str.replace('+-', '-', regex=False), axis=0)

Unnamed: 0,accuracy,top_3,f1_score
Original,0.0,0.0,0.0
0,2.8,0.9,4.0
0.01,4.1,3.5,6.2
0.05,4.7,3.7,7.1
0.1,4.8,3.9,7.1
0.5,4.4,4.0,6.6
1,4.0,3.6,6.0
2,3.4,3.1,5.3
