### This document analyzes the Rate My Hydrograph ratings of the second round

In [9]:
%load_ext autoreload
%autoreload 2
import math
from collections import defaultdict
from datetime import datetime
import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import geopandas
import numpy as np
import itertools
import xarray
from pathlib import Path
from neuralhydrology.evaluation.metrics import kge, nse, fdc_fhv, fdc_flv
from sklearn import metrics
import tqdm

df = pd.read_csv('data/rmh-stage2.csv', index_col=0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
df.shape

(589, 29)

### Looking at chains of ratings

In [11]:
import networkx as nx
from tqdm import tqdm
cycles = []
id_df = df.set_index('id')
groups = df.groupby(['basin', 'start_date', 'objective', 'task'])
for k in tqdm(groups.groups):
    group = groups.get_group(k)
    graph = nx.from_pandas_edgelist(group, source='model_a', target='model_b', edge_key='id', edge_attr=True, create_using=nx.MultiGraph)
    cliques = nx.enumerate_all_cliques(graph)
    triad_cliques = [x for x in cliques if len(x) == 3]
    candidates = []

    # Bring the entries of each triangle into a consistent order
    for cycle in triad_cliques:
        for edge_key, edge_attrs in graph.get_edge_data(cycle[0], cycle[1]).items():
            model_a = id_df.loc[edge_key, 'model_a']
            model_b = id_df.loc[edge_key, 'model_b']
            for edge_key_2, edge_attrs_2 in graph.get_edge_data(cycle[1], cycle[2]).items():
                model_c = id_df.loc[edge_key_2, 'model_a']
                if model_c in [model_a, model_b]:
                    model_c = id_df.loc[edge_key_2, 'model_b']

                for edge_key_3, edge_attrs_3 in graph.get_edge_data(cycle[2], cycle[0]).items():
                    candidate = [edge_attrs['id'], edge_attrs_2['id'], edge_attrs_3['id']]

                    if id_df.loc[edge_key_2, 'model_a'] == model_a or id_df.loc[edge_key_2, 'model_b'] == model_a:
                        # edge 2 is an ac edge, not a bc edge
                        candidate[1], candidate[2] = candidate[2], candidate[1]
                
                    need_switch_2 = id_df.loc[candidate[1], 'model_a'] != model_b
                    need_switch_3 = id_df.loc[candidate[2], 'model_a'] != model_c

                    candidates.append((candidate[0], (candidate[1], need_switch_2), (candidate[2], need_switch_3)))
    cycles += candidates
print(f'Number of triangles: {len(cycles)}')

100%|██████████| 192/192 [00:01<00:00, 177.98it/s]

Number of triangles: 2507





Count consistent/inconsistent triangles

In [12]:
consistent = 0
eq_consistent = 0
triple_eq_consistent = 0
unclear = 0
eq_conflict = 0
double_eq_conflict = 0
conflict = 0

for a, (b, need_switch_bc), (c, need_switch_ca) in cycles:
    ab = id_df.loc[a]
    bc = id_df.loc[b]
    ca = id_df.loc[c]
    b_b_name = 'b' if need_switch_bc else 'a'
    b_c_name = 'a' if need_switch_bc else 'b'
    c_c_name = 'b' if need_switch_ca else 'a'
    c_a_name = 'a' if need_switch_ca else 'b'
    if ab['num_a_wins']:
        if bc[f'num_{b_b_name}_wins']:
            if ca[f'num_{c_c_name}_wins']:
                conflict += 1
            elif ca[f'num_{c_a_name}_wins']:
                consistent += 1
            else:
                eq_conflict += 1
        elif bc[f'num_{b_c_name}_wins']:
            if ca[f'num_{c_c_name}_wins']:
                consistent += 1
            elif ca[f'num_{c_a_name}_wins']:
                consistent += 1
            else:
                eq_consistent += 1
        else:
            if ca[f'num_{c_c_name}_wins']:
                eq_conflict += 1
            elif ca[f'num_{c_a_name}_wins']:
                eq_consistent += 1
            else:
                double_eq_conflict += 1
    elif ab['num_b_wins']:
        if bc[f'num_{b_b_name}_wins']:
            if ca[f'num_{c_a_name}_wins']:
                consistent += 1
            elif ca[f'num_{c_c_name}_wins']:
                consistent += 1
            else:
                eq_consistent += 1
        elif bc[f'num_{b_c_name}_wins']:
            if ca[f'num_{c_a_name}_wins']:
                conflict += 1
            elif ca[f'num_{c_c_name}_wins']:
                consistent += 1
            else:
                eq_conflict += 1
        else:
            if ca[f'num_{c_a_name}_wins']:
                eq_conflict += 1
            elif ca[f'num_{c_c_name}_wins']:
                eq_consistent += 1
            else:
                double_eq_conflict += 1
    else:
        if bc[f'num_{b_b_name}_wins']:
            if ca[f'num_{c_a_name}_wins']:
                eq_consistent += 1
            elif ca[f'num_{c_c_name}_wins']:
                eq_conflict += 1
            else:
                double_eq_conflict += 1
        elif bc[f'num_{b_c_name}_wins']:
            if ca[f'num_{c_a_name}_wins']:
                eq_consistent += 1
            elif ca[f'num_{c_c_name}_wins']:
                eq_conflict += 1
            else:
                double_eq_conflict += 1
        else:
            triple_eq_consistent += 1
print(f'Consistent: {consistent}, conflict: {conflict}, unclear: {unclear}, eq consistent: {eq_consistent}, eq conflict: {eq_conflict}, 3-eq consistent: {triple_eq_consistent}, 2-eq conflict: {double_eq_conflict}. Total: {consistent+conflict+unclear+eq_consistent+eq_conflict+double_eq_conflict+triple_eq_consistent} (should be {len(cycles)})')
print(f'Consistent total: {consistent+eq_consistent+triple_eq_consistent}, conflict total: {conflict+eq_conflict}, unclear: {unclear+double_eq_conflict}.')

Consistent: 786, conflict: 109, unclear: 0, eq consistent: 680, eq conflict: 405, 3-eq consistent: 192, 2-eq conflict: 335. Total: 2507 (should be 2507)
Consistent total: 1658, conflict total: 514, unclear: 335.


We'd expect at least 13/27 ~= 48.15% ratings to be consistent.

In [15]:
(consistent+eq_consistent+triple_eq_consistent)/len(cycles)

0.6613482249700837

Only looking at >/< ratings, we'd expect at least 6/8 = 0.75 ratings to be consistent

In [16]:
consistent/(consistent+conflict)

0.8782122905027933

In [19]:
results_df = {}
for task in ['all', 'overall', 'high-flow', 'low-flow']:
    if task != 'all':
        task_df = df[df['task'] == task]
    else:
        task_df = df
    print(f'\n--------------------------------------\n{task}: {task_df.shape[0]} ratings.')

    # Compare individual against other raters
    true_and_pred_individual = {}
    for rater in task_df['user_id'].unique():
        true_and_pred_individual[rater] = []
        rater_df = task_df[task_df['user_id'] == rater]
        other_df = task_df[task_df['user_id'] != rater]
        other_groups = other_df.groupby(['model_a', 'model_b', 'start_date', 'objective', 'basin', 'task'])
        for idx, rating in rater_df.iterrows():
            rater_class = {'a_wins': rating['num_a_wins'], 'b_wins': rating['num_b_wins'], 'equal_good': rating['num_equal_good'], 'equal_bad': rating['num_equal_bad']}
            rater_class = max(rater_class, key=rater_class.get)

            setting = (rating['model_a'], rating['model_b'], rating['start_date'], rating['objective'], rating['basin'], rating['task'])
            if setting not in other_groups.groups:
                continue

            other_group = other_groups.get_group(setting)
            for key, other_rating in other_group.iterrows():
                other_class = {'a_wins': other_rating['num_a_wins'], 'b_wins': other_rating['num_b_wins'], 'equal_good': other_rating['num_equal_good'], 'equal_bad': other_rating['num_equal_bad']}
                other_class = max(other_class, key=other_class.get)  # Only one other rating, so there can't be a tie
                true_and_pred_individual[rater].append((other_class, rater_class))

        if len(true_and_pred_individual[rater]) > 0:
            true_and_pred_individual[rater] = np.array(true_and_pred_individual[rater])
        else:
            true_and_pred_individual.pop(rater)

    # Compare max rater against individual raters
    true_and_pred_max = []
    np.random.seed(0)
    groupby = task_df.groupby(['model_a', 'model_b', 'start_date', 'objective', 'basin', 'task'])
    for idx, rating in task_df.iterrows():
        setting = (rating['model_a'], rating['model_b'], rating['start_date'], rating['objective'], rating['basin'], rating['task'])

        rater_class = {'a_wins': rating['num_a_wins'], 'b_wins': rating['num_b_wins'], 'equal_good': rating['num_equal_good'], 'equal_bad': rating['num_equal_bad']}
        rater_class = max(rater_class, key=rater_class.get)
        
        group_df = groupby.get_group(setting)
        group_df = group_df[group_df['user_id'] != rating['user_id']]  # Exclude rater themself
        group_size = group_df.shape[0]
        if group_size < 1:
            continue
        a_wins = group_df['num_a_wins'].sum() / group_size
        b_wins = group_df['num_b_wins'].sum() / group_size
        equal = (group_df['num_equal_good'].sum() + group_df['num_equal_bad'].sum()) / group_size
        equal_good = group_df['num_equal_good'].sum() / group_size
        equal_bad =  group_df['num_equal_bad'].sum() / group_size
        counts = {'a_wins': a_wins, 'b_wins': b_wins, 'equal_good': equal_good, 'equal_bad': equal_bad}
        max_class = max(counts, key=counts.get)
        tied_classes = [k for k, v in counts.items() if v == counts[max_class]]
        # In the case of a tie between rater_class and another one, decide by coin toss:
        if len(tied_classes) > 1:
            coin_toss = np.random.randint(len(tied_classes))
            max_class = tied_classes[coin_toss]
        
        true_and_pred_max.append((rater_class, max_class))
    true_and_pred_max = np.array(true_and_pred_max)

    individual_scores = []
    for rater, results in true_and_pred_individual.items():
        individual_scores.append(metrics.classification_report(results[:, 0], results[:, 1], zero_division=0, output_dict=True))

    # Filter to only include reports where all 4 classes are present (7 = 4 + accuracy + macro avg + weighted avg)
    individual_scores = [pd.DataFrame(score) for score in individual_scores if len(score) == 7]
    print(f'Average metrics for a human rater when comparing them to other human raters. Average across {len(individual_scores)} raters.')
    results_df[(task, 'Individual')] = sum(individual_scores) / len(individual_scores)
    display(results_df[(task, 'Individual')])

    print('Metrics for maximum agreement rater')
    results_df[(task, 'Majority vote')] = pd.DataFrame(metrics.classification_report(true_and_pred_max[:, 0], true_and_pred_max[:, 1], output_dict=True))
    display(results_df[(task, 'Majority vote')])


--------------------------------------
all: 589 ratings.
Average metrics for a human rater when comparing them to other human raters. Average across 26 raters.


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.514857,0.588939,0.270182,0.133577,0.432587,0.376889,0.472159
recall,0.491258,0.5501,0.28289,0.146526,0.432587,0.367694,0.432587
f1-score,0.461912,0.522147,0.206201,0.11772,0.432587,0.326995,0.406433
support,20.346154,20.230769,10.423077,5.692308,0.432587,56.692308,56.692308


Metrics for maximum agreement rater


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.575758,0.630252,0.307692,0.172414,0.510145,0.421529,0.498997
recall,0.622951,0.625,0.31746,0.125,0.510145,0.422603,0.510145
f1-score,0.598425,0.627615,0.3125,0.144928,0.510145,0.420867,0.503786
support,122.0,120.0,63.0,40.0,0.510145,345.0,345.0



--------------------------------------
overall: 235 ratings.
Average metrics for a human rater when comparing them to other human raters. Average across 12 raters.


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.496285,0.158159,0.168442,0.0,0.366448,0.205722,0.322681
recall,0.473039,0.306187,0.362037,0.0,0.366448,0.285316,0.366448
f1-score,0.462463,0.182543,0.226487,0.0,0.366448,0.217873,0.319177
support,13.916667,7.083333,7.25,1.416667,0.366448,29.666667,29.666667


Metrics for maximum agreement rater


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.568966,0.518519,0.4,0.0,0.477273,0.371871,0.474555
recall,0.6,0.4,0.484848,0.0,0.477273,0.371212,0.477273
f1-score,0.584071,0.451613,0.438356,0.0,0.477273,0.36851,0.472698
support,55.0,35.0,33.0,9.0,0.477273,132.0,132.0



--------------------------------------
high-flow: 185 ratings.
Average metrics for a human rater when comparing them to other human raters. Average across 16 raters.


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.30524,0.589673,0.011409,0.225054,0.432655,0.282844,0.454855
recall,0.332242,0.583316,0.125,0.348958,0.432655,0.347379,0.432655
f1-score,0.266226,0.545997,0.020833,0.206289,0.432655,0.259836,0.402297
support,5.5625,11.1875,1.6875,3.625,0.432655,22.0625,22.0625


Metrics for maximum agreement rater


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.571429,0.701754,0.0,0.411765,0.59292,0.421237,0.562428
recall,0.645161,0.754717,0.0,0.333333,0.59292,0.433303,0.59292
f1-score,0.606061,0.727273,0.0,0.368421,0.59292,0.425439,0.575842
support,31.0,53.0,8.0,21.0,0.59292,113.0,113.0



--------------------------------------
low-flow: 169 ratings.
Average metrics for a human rater when comparing them to other human raters. Average across 15 raters.


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.400889,0.463759,0.20582,0.064591,0.41505,0.283765,0.365771
recall,0.589899,0.491111,0.224074,0.15,0.41505,0.363771,0.41505
f1-score,0.446521,0.430272,0.138105,0.088681,0.41505,0.275895,0.351428
support,6.4,6.066667,3.066667,2.466667,0.41505,18.0,18.0


Metrics for maximum agreement rater


Unnamed: 0,a_wins,b_wins,equal_bad,equal_good,accuracy,macro avg,weighted avg
precision,0.615385,0.648649,0.277778,0.0,0.53,0.385453,0.490217
recall,0.666667,0.75,0.227273,0.0,0.53,0.410985,0.53
f1-score,0.64,0.695652,0.25,0.0,0.53,0.396413,0.508009
support,36.0,32.0,22.0,10.0,0.53,100.0,100.0


In [32]:
paper_df = pd.concat({key: results_df[('all', key)] for key in ['Individual', 'Majority vote']}, axis=0)
paper_df.index = paper_df.index.reorder_levels([1,0])
latex = paper_df.sort_index()[['a_wins', 'b_wins', 'equal_good', 'equal_bad']].style.format(precision=2).to_latex(convert_css=True, siunitx=True, hrules=True)
print(latex)

\begin{tabular}{llSSSS}
\toprule
{} & {} & {a_wins} & {b_wins} & {equal_good} & {equal_bad} \\
\midrule
\multirow[c]{2}{*}{f1-score} & Individual & 0.46 & 0.52 & 0.12 & 0.21 \\
 & Majority vote & 0.60 & 0.63 & 0.14 & 0.31 \\
\multirow[c]{2}{*}{precision} & Individual & 0.51 & 0.59 & 0.13 & 0.27 \\
 & Majority vote & 0.58 & 0.63 & 0.17 & 0.31 \\
\multirow[c]{2}{*}{recall} & Individual & 0.49 & 0.55 & 0.15 & 0.28 \\
 & Majority vote & 0.62 & 0.62 & 0.12 & 0.32 \\
\multirow[c]{2}{*}{support} & Individual & 20.35 & 20.23 & 5.69 & 10.42 \\
 & Majority vote & 122.00 & 120.00 & 40.00 & 63.00 \\
\bottomrule
\end{tabular}

