/
metrics.py
95 lines (78 loc) · 3.26 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# coding=utf-8
# Copyright 2021 The Rliable Authors.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Aggregate Performance Estimators."""
import numpy as np
import scipy.stats
def aggregate_mean(scores: np.ndarray):
  """Computes mean of sample mean scores per task.

  Args:
    scores: A matrix of size (`num_runs` x `num_tasks`) where scores[n][m]
      represent the score on run `n` of task `m`.

  Returns:
    Mean of sample means.
  """
  # First collapse the run axis to get one mean score per task, then
  # average those per-task means into a single scalar.
  per_task_means = scores.mean(axis=0)
  return per_task_means.mean(axis=0)
def aggregate_median(scores: np.ndarray):
  """Computes median of sample mean scores per task.

  Args:
    scores: A matrix of size (`num_runs` x `num_tasks`) where scores[n][m]
      represent the score on run `n` of task `m`.

  Returns:
    Median of sample means.
  """
  # Reduce runs to a per-task mean, then take the median across tasks.
  per_task_means = scores.mean(axis=0)
  return np.median(per_task_means, axis=0)
def aggregate_optimality_gap(scores: np.ndarray, gamma=1):
  """Computes optimality gap across all runs and tasks.

  Args:
    scores: A matrix of size (`num_runs` x `num_tasks`) where scores[n][m]
      represent the score on run `n` of task `m`.
    gamma: Threshold for optimality gap. All scores above `gamma` are clipped
      to `gamma`.

  Returns:
    Optimality gap at threshold `gamma`.
  """
  # Cap every score at `gamma` from above, then measure the average
  # shortfall relative to the threshold.
  capped_scores = np.clip(scores, a_min=None, a_max=gamma)
  return gamma - capped_scores.mean()
def aggregate_iqm(scores: np.ndarray):
  """Computes the interquartile mean (IQM) across runs and tasks.

  The IQM pools all run/task scores (`axis=None` flattens the matrix),
  discards the lowest and highest 25%, and averages the middle 50%.

  Args:
    scores: A matrix of size (`num_runs` x `num_tasks`) where scores[n][m]
      represent the score on run `n` of task `m`.

  Returns:
    IQM (25% trimmed mean) of scores.
  """
  return scipy.stats.trim_mean(scores, proportiontocut=0.25, axis=None)
def probability_of_improvement(scores_x: np.ndarray, scores_y: np.ndarray):
  """Overall probability of improvement of algorithm `X` over `Y`.

  Args:
    scores_x: A matrix of size (`num_runs_x` x `num_tasks`) where scores_x[n][m]
      represent the score on run `n` of task `m` for algorithm `X`.
    scores_y: A matrix of size (`num_runs_y` x `num_tasks`) where scores_y[n][m]
      represent the score on run `n` of task `m` for algorithm `Y`.

  Returns:
    P(X_m > Y_m) averaged across tasks.
  """
  num_runs_x, num_runs_y = scores_x.shape[0], scores_y.shape[0]
  num_tasks = scores_x.shape[1]
  task_improvement_probabilities = []
  for task in range(num_tasks):
    if np.array_equal(scores_x[:, task], scores_y[:, task]):
      # Identical score samples: the U test is degenerate, so define the
      # improvement probability as chance level.
      task_improvement_prob = 0.5
    else:
      # The Mann-Whitney U statistic counts pairs (x, y) with x > y (ties
      # contribute 0.5), so dividing by the total number of pairs gives an
      # estimate of P(X > Y) for this task.
      u_statistic, _ = scipy.stats.mannwhitneyu(
          scores_x[:, task], scores_y[:, task], alternative='greater')
      task_improvement_prob = u_statistic / (num_runs_x * num_runs_y)
    task_improvement_probabilities.append(task_improvement_prob)
  return np.mean(task_improvement_probabilities)