In [None]:
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
import os

In [None]:
# Load every CSV result file; sorting the paths keeps the frame order
# deterministic across runs and platforms.
fnames = sorted(glob('path/to/csvs/*.csv'))
dfs = [pd.read_csv(f, sep=',') for f in fnames]
print(f'Loaded {len(dfs)} number of dataframes!')
# Label each frame with its file name stripped of directory and extension.
# (Splitting on '/' and taking element 1 returned a directory component for
# any path nested more than one level deep, giving every frame the same name.)
df_names = [os.path.splitext(os.path.basename(f))[0] for f in fnames]

In [None]:
# Quick sanity check: rank the first frame by mean precision (best first,
# with a fresh 0..n-1 index) and print it.
test = dfs[0].sort_values(
    'mean_precision',
    ascending=False,
    ignore_index=True,
)
print(test)

# Collect all placements for every submission

In [None]:
# submissions maps a submission id to the list of its 0-based placements,
# one entry per (dataframe, sort criterion) pair, in the same order as
# metric_names.
submissions = {}
sort_criteria = ['mean_accuracy', 'mean_precision', 'mean_f1', 'mean_digits_score']
metric_names = []
for frame_name, df in zip(df_names, dfs):
    for criterion in sort_criteria:
        metric_names.append(frame_name + '_' + criterion)
        # Stable sort, best score first; ignore_index makes the new index equal
        # to the placement.
        # NOTE(review): df is rebound to the sorted frame, so ties under this
        # criterion are broken by the ranking of the previous criterion rather
        # than by the original file order - confirm this is intended.
        df = df.sort_values(by=criterion, ascending=False, ignore_index=True, kind='mergesort')
        ranked_ids = df['submission']
        for placement in range(len(df)):
            submissions.setdefault(ranked_ids[placement], []).append(placement)

# Calculate standard deviation of placements

In [None]:
# Submission ids in dict insertion order, and the standard deviation of each
# submission's placements across all metrics (same order as submission_ids).
submission_ids = list(submissions)
placement_stds = [np.std(placements) for placements in submissions.values()]

# Crossplots between 2 metrics

In [None]:
# Stack the per-submission placement lists into one array:
# one row per submission, one column per metric.
m = np.array(list(submissions.values()))


In [None]:

import matplotlib
from matplotlib.colors import LinearSegmentedColormap

# Segment data for the pink -> white -> green map 'RedGr'.
# Each row is (x, y0, y1): y0 is the channel value approaching x from below and
# y1 the value leaving x upwards. Both must be equal for a continuous ramp;
# with y1 = 0 the map started at black and dropped back to black at x = 0.5.
cdict = {
    'red': ((0, 240/255, 240/255),
            (0.5, 1, 1),
            (1, 149/255, 149/255)),
    'green': ((0, 134/255, 134/255),
              (0.5, 1, 1),
              (1, 190/255, 190/255)),
    'blue': ((0, 153/255, 153/255),
             (0.5, 1, 1),
             (1, 69/255, 69/255))
}

# Anchor colours for 'RedGr2': green -> white -> pink.
list_colors = [(149/255, 190/255, 69/255),
               (1, 1, 1),
               (240/255, 134/255, 153/255)]
rg2 = LinearSegmentedColormap.from_list('RedGr2', list_colors, N=1000)
rg = LinearSegmentedColormap('RedGr', cdict)

# Register both maps so they can be referenced by name.
# plt.register_cmap is deprecated and absent from recent matplotlib releases;
# the colormap registry is the replacement. force=True keeps the cell
# re-runnable (re-registering an existing name would otherwise raise).
for _cmap in (rg, rg2):
    try:
        matplotlib.colormaps.register(_cmap, force=True)
    except AttributeError:
        # Older matplotlib without the `matplotlib.colormaps` registry.
        plt.register_cmap(cmap=_cmap)

In [None]:
def scatterplot(metrics, m1_index, m2_index):
    """Plot the metric correlation heat map and a scatter plot of two metrics.

    Parameters
    ----------
    metrics : 2-D numpy array
        Columns are the variables that get correlated (the array is
        transposed before being passed to ``np.corrcoef``).
    m1_index, m2_index : int
        Column indices of the two metrics to plot against each other.

    Side effects
    ------------
    Saves ``corrcoef.eps`` and ``scatter_<m1>_<m2>_<r>.eps`` to the current
    working directory, shows the heat map, and prints the 2x2 correlation
    matrix of the two selected columns. Returns ``None``.

    The 'RedGr2' colormap must already be registered.
    """
    corrcoef = np.corrcoef(metrics.T)

    # Heat map of the full correlation matrix. The colormap is passed to
    # imshow directly: plt.set_cmap would also change the global default
    # colormap for every later image in the notebook.
    heat_fig, heat_ax = plt.subplots(figsize=(15, 10))
    heat_img = heat_ax.imshow(corrcoef, vmin=0.2, vmax=1, cmap='RedGr2')
    heat_ax.set_xticks([])
    heat_ax.set_yticks([])
    heat_fig.colorbar(heat_img, ax=heat_ax)
    heat_fig.tight_layout()
    heat_fig.savefig('corrcoef.eps', dpi=300)
    plt.show()

    # Scatter plot of the two selected metrics; the correlation coefficient is
    # embedded in the output file name.
    scatter_fig, scatter_ax = plt.subplots(figsize=(10, 10))
    scatter_ax.scatter(metrics[:, m1_index], metrics[:, m2_index],
                       color=(240/255, 134/255, 153/255))
    scatter_fig.tight_layout()
    scatter_fig.savefig(
        f'scatter_{m1_index}_{m2_index}_{corrcoef[m1_index, m2_index]}.eps', dpi=300)

    corrmtx = np.stack((metrics[:, m1_index], metrics[:, m2_index]))
    print(f'correlation matrix: {np.corrcoef(corrmtx)}')

In [None]:
# List every metric with its column index so the pairs below can be looked up.
print('Metrics: ')
for metric_idx, metric_name in enumerate(metric_names):
    print(f'{metric_idx}: {metric_name}')

# Cross-plot two hand-picked pairs of metric columns.
for first_idx, second_idx in ((20, 22), (4, 36)):
    scatterplot(m, first_idx, second_idx)

# Calculate placement matrix (shape: $(\#metrics, \#submissions)$ )
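For every cell $(i,j)$, calculate how the placement of submission $j$ changes when using metric $i$ compared to metric $0$: the entry is the rank under metric $0$ minus the rank under metric $i$, so a positive value means the submission places higher (closer to rank $0$) under metric $i$.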

In [None]:
# One row per metric, one column per submission.
n_metrics = len(submissions[submission_ids[0]])
placement_mtx = np.zeros((n_metrics, len(submission_ids)))
print(placement_mtx.shape)

# Entry (i, j): placement of submission j under metric 0 minus its placement
# under metric i (row 0 is therefore all zeros).
for col, placements in enumerate(submissions.values()):
    reference = placements[0]
    for row in range(n_metrics):
        placement_mtx[row, col] = reference - placements[row]

print(np.min(placement_mtx))

Cluster placement matrix by categories:
- $change \lt -100: -3$
- $-100 \leq change \lt -20: -2$
- $-20 \leq change \lt -10: -1$
- $-10 \leq change \lt 10: 0$
- $10 \leq change \lt 20: 1$
- $20 \leq change \lt 100: 2$
- $100 \leq change: 3$

In [None]:
# Bin the raw rank changes into seven ordered categories, -3 ... 3, using
# half-open intervals:
#   (-inf, -100) -> -3, [-100, -20) -> -2, [-20, -10) -> -1, [-10, 10) -> 0,
#   [10, 20) -> 1, [20, 100) -> 2, [100, inf) -> 3
# The previous masks labelled the non-negative bins 1..4 and overwrote the
# explicit "no change" category, making the scale asymmetric.
bin_edges = [-100, -20, -10, 10, 20, 100]
# np.digitize returns 0..6 (index of the bin each value falls into); shifting
# by 3 centres the categories on 0. Cast to float to keep the original dtype.
clustered_placement = (np.digitize(placement_mtx, bin_edges) - 3).astype(float)

In [None]:
# Heat map of the binned rank changes for the first 100 submissions
# (transposed so that rows are submissions and columns are metrics).
ax = plt.gca()
im = ax.imshow(clustered_placement[:, :100].T, cmap='hot', aspect='auto')
ax.set_title('rank change of submissions compared to the official DSB score')
ax.set_xlabel('metric variant')
ax.set_ylabel('submission ID')
plt.colorbar(im, ax=ax)

In [None]:
# Heat map of the raw (unbinned) rank changes for the first 10 submissions
# (transposed so that rows are submissions and columns are metrics).
f = plt.figure(figsize=(15, 10))
ax = f.add_subplot()
im = ax.imshow(placement_mtx[:, :10].T, cmap='hot', aspect='auto')
ax.set_title('rank change of submissions compared to the official DSB score')
ax.set_xlabel('metric variant')
ax.set_ylabel('submission ID')
plt.colorbar(im, ax=ax)