In [None]:
import csv
from etils import epath
import numpy as np
from sklearn import metrics
import tqdm

# Folder that contains the solution CSV and the submission CSVs. Fill this in.
base_path = ''

# File name of the solution CSV inside `base_path`.
solution_csv = 'birdclef2024_solution.csv'

# File names of the submission CSVs inside `base_path`.
subs = ['submission-1.csv', 'submission-2.csv']

def load_solution(solution_path: str, bool_sol: bool):
  """Loads a solution or submission CSV as row-sorted, column-sorted arrays.

  The file is read from `base_path / solution_path`. Every column other than
  'row_id' and the optional 'Usage' column is treated as a species column.
  Species columns are returned in sorted name order and rows are returned
  sorted by row id, so two files with the same rows and species line up
  element-wise.

  Args:
    solution_path: CSV file name, joined onto the module-level `base_path`.
    bool_sol: If True, a cell is a positive label only when it is exactly the
      string 'True'; any other value (including '1' or 'true') is negative.
      If False, every cell is parsed with `float`.

  Returns:
    A `(row_ids, sp_list, scores)` tuple where `row_ids` is a 1-D array of
    sorted row ids, `sp_list` is the sorted list of species column names, and
    `scores` is a 2-D array with one row per entry of `row_ids` and one
    column per entry of `sp_list` (bool if `bool_sol`, else float).

  Raises:
    ValueError: If the file has no 'row_id' column, has no data rows, or (when
      `bool_sol` is False) contains a cell that cannot be parsed as a float.
  """
  csv_path = epath.Path(base_path) / solution_path
  # The csv module expects its file to be opened with newline='' so that it
  # can handle line endings (and newlines inside quoted fields) itself.
  with open(csv_path, 'r', newline='') as f:
    dr = csv.DictReader(f)
    # `fieldnames` is None for an empty file.
    fieldnames = list(dr.fieldnames or [])
    if 'row_id' not in fieldnames:
      raise ValueError(f"{csv_path} has no 'row_id' column.")
    sp_list = fieldnames
    sp_list.remove('row_id')
    if 'Usage' in sp_list:
      sp_list.remove('Usage')
    sp_list = sorted(sp_list)
    row_ids = []
    scores = []
    for r in dr:
      row_ids.append(r['row_id'])
      if bool_sol:
        scores.append(np.array([r[k] == 'True' for k in sp_list]))
      else:
        scores.append(np.array([float(r[k]) for k in sp_list]))
  if not scores:
    raise ValueError(f'{csv_path} contains no data rows.')
  # A stable sort keeps rows with equal ids in file order, so the result does
  # not depend on the sorting algorithm's tie-breaking.
  perm = np.argsort(row_ids, kind='stable')
  row_ids = np.array(row_ids)[perm]
  scores = np.stack(scores, axis=0)[perm]
  return row_ids, sp_list, scores

# Ground truth, parsed as booleans: one row per row id, one column per species.
sol_rows, sp_list, sol_array = load_solution(solution_csv, True)
# Only species with at least one positive example in the solution are scored.
active_cols = sol_array.sum(axis=0) > 0

# all_rocs[i] holds the per-species ROC-AUC values of subs[i], restricted to
# the active species, in `sp_list` order.
all_rocs = []
for s in tqdm.tqdm(subs):
  sub_rows, sub_sp_list, sub_scores = load_solution(s, False)
  # np.array_equal also copes with a differing number of rows, where an
  # element-wise `==` cannot be broadcast.
  if not np.array_equal(sol_rows, sub_rows):
    print('submission %s has strange rows.' % s)
  # Columns are matched by position, so a differing species list would score
  # predictions against the wrong labels.
  if sub_sp_list != sp_list:
    print('submission %s has strange columns.' % s)
  sub_rocs = metrics.roc_auc_score(
      sol_array[:, active_cols], sub_scores[:, active_cols], average=None)
  all_rocs.append(sub_rocs)


In [None]:
from matplotlib import pyplot as plt

# Upper bound on the number of submissions drawn in one figure.
MAX_PLOTTED_SUBS = 5
# Prefix stripped from a submission file name to form its legend label.
SUB_PREFIX = 'submission-'

# Positive-example count per active species (used by the alternative view).
sp_counts = sol_array[:, active_cols].sum(axis=0)

plt.figure(figsize=(10, 10))
for i, rocs in enumerate(all_rocs[:MAX_PLOTTED_SUBS]):
  if i == 0:
    # Every submission is drawn in the species order that sorts the first
    # submission's ROC-AUC ascending, so the point clouds are comparable.
    order = np.argsort(rocs)
  rocs = rocs[order]
  # Legend label: the file name up to the first '.', minus the prefix if
  # present. This does not depend on a fixed prefix length.
  lbl = subs[i].split('.')[0]
  if lbl.startswith(SUB_PREFIX):
    lbl = lbl[len(SUB_PREFIX):]
  # Alternative view: ROC-AUC against positive-example count. `rocs` has been
  # reordered above, so the counts must be reordered the same way.
  # plt.scatter(sp_counts[order], rocs, alpha=0.25, label=lbl)
  plt.scatter(tuple(range(rocs.shape[0])), rocs, alpha=0.25, label=lbl)
plt.xlabel('Species rank (sorted by ROC-AUC of first submission)')
plt.ylabel('ROC-AUC')
# Axis settings for the alternative view:
# plt.xscale('log')
# plt.xlabel('Positive Example Count')
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation between the per-species ROC-AUC vectors of the
# submissions; entry [i, j] compares submission i with submission j.
num_subs = len(all_rocs)
corrs = np.zeros(shape=[num_subs, num_subs])
for i, row_rocs in enumerate(all_rocs):
  for j, col_rocs in enumerate(all_rocs):
    # np.corrcoef returns a 2x2 matrix for two inputs; the off-diagonal entry
    # is the correlation between them.
    pair_matrix = np.corrcoef(row_rocs, col_rocs)
    corrs[i, j] = pair_matrix[0, 1]
print(corrs)

In [None]:
# Upper bound on the number of submissions included in the report.
MAX_REPORTED_SUBS = 5
# Species whose ROC-AUC falls below this value are listed.
LOW_ROC_THRESHOLD = 0.4

# Names of the scored species, in the same order as the entries of all_rocs[i].
active_sp = [sp for sp, t in zip(sp_list, active_cols) if t]
# Bounded by len(all_rocs) so that fewer than MAX_REPORTED_SUBS submissions do
# not index past the end of the list.
for i in range(min(MAX_REPORTED_SUBS, len(all_rocs))):
  for loc in np.argwhere(all_rocs[i] < LOW_ROC_THRESHOLD)[:, 0]:
    s = all_rocs[i][loc]
    print(f'{subs[i]:24s},  {active_sp[loc]:8s},  {s:.3f}')
  # Blank line between submissions.
  print()