# 03b - Create Ensemble Predictions by Applying Majority Voting Strategy

In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import numpy as np
import pandas as pd

OUTPUT_DIR = 'output/'

## Approach 1: Using CSV Files

In [3]:
df1 = pd.read_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_1.csv')
df2 = pd.read_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_2.csv')
df3 = pd.read_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_3.csv')
df4 = pd.read_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_4.csv')
dfs = [df1, df2, df3, df4]

Compute ratio of same predictions for each pair of submission runs.

In [4]:
from itertools import product

n = len(dfs)
pred_aln = np.zeros((n, n))
for i, j in product(range(n), range(n)):
    alignment = (dfs[i]['prediction'] == dfs[j]['prediction']).sum() / len(dfs[i])
    pred_aln[i, j] = pred_aln[j, i] = alignment
pred_aln

array([[1.        , 0.93372196, 0.92404849, 0.796477  ],
       [0.93372196, 1.        , 0.91158704, 0.78659232],
       [0.92404849, 0.91158704, 1.        , 0.79073206],
       [0.796477  , 0.78659232, 0.79073206, 1.        ]])

Merge submission runs into one dataframe.

In [5]:
def max_count(row):
    cols = ['prediction_1', 'prediction_2', 'prediction_3', 'prediction_4']
    res = row[cols].value_counts()
    idx, count = res.index[0], res.iloc[0]
    row['voted'] = idx
    row['voted_count'] = count
    return row


# merge together
_dfs = [_df.rename(columns={'prediction': f'prediction_{i}'})
        for i, _df in enumerate(dfs, 1)]
merged = _dfs[0]
for _df in _dfs[1:]:
    merged = merged.merge(_df, 'left', on='UUID', validate='one_to_one')
merged = merged.apply(max_count, axis=1)
merged

Unnamed: 0,UUID,prediction_1,prediction_2,prediction_3,prediction_4,voted,voted_count
0,f8e1d62d-8c74-47ae-8db9-c4fb0d7a5672,338,338,338,338,338,4
1,3068da5b-a72c-4398-8f70-afb2134cfa8d,83,83,83,83,83,4
2,fcaece2a-90c7-4b07-80cf-f901e188dea8,635,635,635,635,635,4
3,7aa20f9f-2f55-4f8f-93a2-a5a229df4232,509,509,509,509,509,4
4,a2743578-8204-4f4d-b8e9-1ee65bad4925,637,637,637,637,637,4
...,...,...,...,...,...,...,...
23668,5bd061ad-00fe-4c54-ad56-317db73410ba,51,51,51,51,51,4
23669,8d7a534a-a5fe-40da-aafa-a06805e267c2,184,184,184,184,184,4
23670,6dc610be-05f9-461c-bc59-9ff8ce174423,699,699,699,699,699,4
23671,975abfa9-f351-46f7-b1c9-825b819e533a,152,152,152,152,152,4


In [6]:
merged['voted_count'].value_counts() / len(merged)

4    0.746547
3    0.193850
2    0.054957
1    0.004647
Name: voted_count, dtype: float64

In [7]:
# # save to csv
# out = merged[['UUID', 'voted']].rename(columns={'voted': 'prediction'})
# out.to_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_5.csv', sep=',', index=False)

## Approach 2: Using Prediction Tensors

In [8]:
import torch
import torch.nn.functional as F


def load_pred_files(filenames, path='./'):
    preds_list, targs_list = [], []
    for file in filenames:
        o = torch.load(path + file)
        if isinstance(o, tuple):
            preds_list.append(o[0])
            targs_list.append(o[1])
        else:
            preds_list.append(o)
    if len(targs_list) > 0:
        out = preds_list, targs_list
    else:
        out = preds_list
    return out


def save_pred_file(voted, preds_list, targs_list=None, *, filename, path='./'):
    voted_onehot = F.one_hot(voted, num_classes=preds_list[0].shape[1])
    assert voted_onehot.shape == preds_list[0].shape

    if targs_list is not None:
        for i in range(len(targs_list)-1):
            assert torch.all(targs_list[i] == targs_list[i+1])

        torch.save((voted_onehot, targs_list[0]), path + filename)
    else:
        torch.save(voted_onehot, path + filename)


def ensemble_argmax(preds_list):
    pred_mat = torch.cat([item.argmax(1).reshape(-1, 1) for item in preds_list], dim=1)
    voted = torch.Tensor([torch.unique(x)[0] for x in pred_mat]).to(torch.int64)
    return voted


def ensemble_probs(preds_list):
    sum_preds = 0
    for preds in preds_list:
        # preds = preds / preds.sum(1).reshape(-1, 1)
        sum_preds += preds
    voted = sum_preds.argmax(1)
    return voted


VAL_PRED_FILES = ['val_preds_1.pt', 'val_preds_2.pt', 'val_preds_3.pt', 'val_preds_4.pt']
TEST_PRED_FILES = ['test_preds_1.pt', 'test_preds_2.pt', 'test_preds_3.pt', 'test_preds_4.pt']


# load predictions and targets
val_preds_list, val_targs_list = load_pred_files(VAL_PRED_FILES, path=OUTPUT_DIR)
test_preds_list = load_pred_files(TEST_PRED_FILES, path=OUTPUT_DIR)

### ArgMax Strategy

Ensemble using ArgMax vectors.

In [9]:
# apply ensembling
val_voted_1 = ensemble_argmax(val_preds_list)

# save to file
save_pred_file(
    val_voted_1, val_preds_list, val_targs_list,
    filename='val_preds_5.pt', path=OUTPUT_DIR)

val_voted_1

tensor([340, 594, 537,  ..., 702, 694, 512])

In [10]:
# apply ensembling
test_voted_1 = ensemble_argmax(test_preds_list)

# save to file
save_pred_file(
    test_voted_1, test_preds_list,
    filename='test_preds_5.pt', path=OUTPUT_DIR)

test_voted_1

tensor([336,  82, 631,  ..., 695, 151, 558])

### Probability Averaging Strategy

Ensemble by averaging Probability Tensors.

In [11]:
# apply ensembling
val_voted_2 = ensemble_probs(val_preds_list)

# save to file
save_pred_file(
    val_voted_2, val_preds_list, val_targs_list,
    filename='val_preds_6.pt', path=OUTPUT_DIR)

val_voted_2

tensor([340, 594, 537,  ..., 702, 702, 516])

In [12]:
# apply ensembling
test_voted_2 = ensemble_probs(test_preds_list)

# save to file
save_pred_file(
    test_voted_2, test_preds_list,
    filename='test_preds_6.pt', path=OUTPUT_DIR)

test_voted_2

tensor([336,  82, 631,  ..., 695, 151, 718])

### Compare ArgMax and Probability Averaging Strategies

In [13]:
((val_voted_1 == val_voted_2).sum() / len(val_voted_1)).item()

0.7314935922622681

In [14]:
((test_voted_1 == test_voted_2).sum() / len(test_voted_1)).item()

0.8510116934776306

### Save to csv

In [15]:
def voted_to_df(voted, train_df, test_df):
    test_preds_np = voted.numpy()

    # map test preds on class ids
    vocab = np.unique(train_df.loc[train_df['source'] != 'flickr', 'binomial'])
    test_preds_names = pd.Series(vocab[test_preds_np])

    class_id_map = (train_df[['binomial', 'class_id']].drop_duplicates()
                    .set_index('binomial')['class_id'].to_dict())
    test_preds_id = test_preds_names.replace(class_id_map)

    # create prediction dataframe
    out_df = pd.DataFrame(test_df['UUID'])
    assert len(out_df) == len(test_preds_id)
    out_df['prediction'] = test_preds_id

    return out_df


DATA_DIR = 'data/snake_clef2021_dataset/'


train_df = pd.read_csv(DATA_DIR + 'SnakeCLEF2021_train_metadata_PROD.csv')
test_df = pd.read_csv(DATA_DIR + 'SnakeCLEF2021_TEST_METADATA.csv')

In [16]:
# create submission dataframe
out_df = voted_to_df(test_voted_1, train_df, test_df)

# save to csv
out_df.to_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_5.csv', sep=',', index=False)

out_df

Unnamed: 0,UUID,prediction
0,f8e1d62d-8c74-47ae-8db9-c4fb0d7a5672,338
1,3068da5b-a72c-4398-8f70-afb2134cfa8d,83
2,fcaece2a-90c7-4b07-80cf-f901e188dea8,635
3,7aa20f9f-2f55-4f8f-93a2-a5a229df4232,509
4,a2743578-8204-4f4d-b8e9-1ee65bad4925,637
...,...,...
23668,5bd061ad-00fe-4c54-ad56-317db73410ba,51
23669,8d7a534a-a5fe-40da-aafa-a06805e267c2,184
23670,6dc610be-05f9-461c-bc59-9ff8ce174423,699
23671,975abfa9-f351-46f7-b1c9-825b819e533a,152


In [17]:
# create submission dataframe
out_df = voted_to_df(test_voted_2, train_df, test_df)

# save to csv
out_df.to_csv(OUTPUT_DIR + 'SnakeCLEF2021_submission_6.csv', sep=',', index=False)

out_df

Unnamed: 0,UUID,prediction
0,f8e1d62d-8c74-47ae-8db9-c4fb0d7a5672,338
1,3068da5b-a72c-4398-8f70-afb2134cfa8d,83
2,fcaece2a-90c7-4b07-80cf-f901e188dea8,635
3,7aa20f9f-2f55-4f8f-93a2-a5a229df4232,509
4,a2743578-8204-4f4d-b8e9-1ee65bad4925,637
...,...,...
23668,5bd061ad-00fe-4c54-ad56-317db73410ba,51
23669,8d7a534a-a5fe-40da-aafa-a06805e267c2,184
23670,6dc610be-05f9-461c-bc59-9ff8ce174423,699
23671,975abfa9-f351-46f7-b1c9-825b819e533a,152
