##### Copyright 2020 Google LLC.
Licensed under the Apache License, Version 2.0 (the "License")

In [None]:
# Copyright 2022 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Human evaluation of visual metrics

This colab explores correlations between the mucped22 dataset and various objective visual metrics.

Unlike many other datasets, these evaluations:
 *   Are made using only compression distortions (since the distortions were created using AVIF, JPEG, and JXL encoders), which will benefit metrics designed for compression artifacts.
 *   Are made by human evaluators experienced with image quality comparisons, which will benefit smaller distortions, relatively speaking, since inexperienced evaluators often don't notice them.
 *   Are made using two-alternative-forced-choice with a reference image instead of MOS, which will reduce the noise, since evaluators don't need to calibrate their MOS scores.

For each image, each distortion (method and quality setting) has then been ranked using ELO, to provide an expected human-rated ranking for each distortion.

This ranking will allow a comparison of the various metrics across different levels of distortion, e.g. near just-noticeable-differences vs far from just-noticeable-differences.

 

In [None]:
import pandas as pd
import functools
import bokeh.io
!pip install pandas_bokeh
import pandas_bokeh
import requests
import json
import numpy as np
bokeh.io.output_notebook()

Collecting pandas_bokeh
  Downloading pandas_bokeh-0.5.5-py2.py3-none-any.whl (29 kB)
Installing collected packages: pandas-bokeh
Successfully installed pandas-bokeh-0.5.5


First download the dataset containing all evaluations.

In [None]:
!wget --quiet --no-check-certificate https://storage.googleapis.com/gresearch/mucped22/evaluations.json

In [None]:
# Read the downloaded JSON file and build the evaluations DataFrame from it.
with open('evaluations.json') as evaluations_file:
  parsed_evaluations = json.load(evaluations_file)
data = pd.DataFrame(parsed_evaluations)

In [None]:
# Display the loaded evaluations table.
data

Unnamed: 0,crop,greater,image,lesser,random_choice,rater_time_ms,image_dims,greater_butteraugli_max,greater_butteraugli_6,lesser_butteraugli_max,...,lesser_fsim-rgb,greater_msssim-y,lesser_msssim-y,greater_nlpd-y,lesser_nlpd-y,greater_ssimulacra,lesser_ssimulacra,greater_elo,lesser_elo,rater_flips
0,"[176, 279, 768, 768]",dist_6,edgar-castrejon-J_NbCcGPBOw-unsplash.png,dist_17,False,75125,"[1280, 1920]",6.792748,3.291877,2.287096,...,0.998617,0.975003,0.993973,0.161716,0.078657,0.096484,0.045761,1537.137226,2172.213746,9
1,"[149, 32, 768, 768]",dist_6,4KK2_20150823_130650_314.png,dist_11,False,41883,"[1920, 1424]",6.830727,3.382573,1.990106,...,0.998584,0.985169,0.997969,0.145104,0.050339,0.072111,0.019148,1972.598657,2291.390798,13
2,"[1089, 489, 768, 768]",dist_11,6G7M_20150404_121844_208.png,dist_17,False,48385,"[1920, 1424]",1.760315,0.828016,1.656251,...,0.999163,0.999303,0.999264,0.030033,0.030637,0.010733,0.011180,2268.583209,2309.359159,4
3,"[433, 286, 768, 768]",dist_14,michael-niessl-KraoHdRYrRE-unsplash.png,dist_11,True,36774,"[1920, 1080]",4.110695,1.995982,2.432354,...,0.998708,0.993285,0.998220,0.071748,0.037420,0.043709,0.019095,1991.945557,2241.368691,10
4,"[4, 99, 768, 768]",dist_6,0127_20161022_144117_906.png,dist_14,True,22468,"[1920, 1440]",5.866006,2.645947,3.550395,...,0.994832,0.988954,0.994369,0.095547,0.069588,0.049872,0.035815,1851.898409,1945.269660,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12978,"[887, 59, 768, 768]",dist_23,6G7M_20150404_121844_208.png,dist_40,False,7983,"[1920, 1424]",1.877585,0.923585,2.143382,...,0.999173,0.998298,0.998512,0.051131,0.048955,0.020006,0.017322,2356.875932,2388.690716,6
12979,"[1121, 175, 768, 768]",dist_32,47L8_20150518_144020_604.png,dist_30,True,1702517,"[1920, 1424]",4.943964,2.460808,8.950997,...,0.969582,0.983741,0.955344,0.127071,0.201221,0.071460,0.155907,1937.790267,1402.672706,5
12980,"[1121, 175, 768, 768]",dist_21,0006_20160721_220618_556.png,dist_32,True,1430,"[1920, 1440]",2.339564,1.123564,4.024082,...,0.997534,0.999049,0.998433,0.029906,0.036302,0.011507,0.012152,2202.608359,1870.660112,0
12981,"[84, 358, 768, 768]",dist_10,ricardo-gomez-angel-2GglDxer-Ew-unsplash.png,dist_32,True,5731,"[1920, 1319]",3.816417,1.697080,5.742501,...,0.985294,0.991662,0.980832,0.105602,0.189351,0.048339,0.077644,2174.169512,2083.620449,1


Then decorate it with whether the crop settings were actually compatible with the image size (a few, ~15, evaluations have this bug), and the worst ELO of both distortions.
Finally filter out all evaluations where the evaluator didn't seem to do a good job (didn't flip between distortions more than 2 times, didn't spend more than 3 seconds on the evaluation).

In [None]:
def _crop_fits_image(row):
  # crop[0] + crop[2] must not exceed image_dims[0]; likewise crop[1] + crop[3]
  # must not exceed image_dims[1].
  if not row.crop[0] + row.crop[2] <= row.image_dims[0]:
    return False
  return row.crop[1] + row.crop[3] <= row.image_dims[1]


def _larger_elo(row):
  # NOTE(review): this picks the larger of the two ELO values; confirm that
  # this is the intended meaning of "worst".
  if row.greater_elo > row.lesser_elo:
    return row.greater_elo
  return row.lesser_elo


data['complete_crop'] = data.apply(_crop_fits_image, axis=1)
data['worst_elo'] = data.apply(_larger_elo, axis=1)

# Keep evaluations with more than 2 flips, more than 3000 ms spent, and a crop
# that fits inside the image.
enough_flips = data.rater_flips > 2
enough_time = data.rater_time_ms > 3000
crop_ok = data.complete_crop == True
data = data[enough_flips & enough_time & crop_ok]
data

Unnamed: 0,crop,greater,image,lesser,random_choice,rater_time_ms,image_dims,greater_butteraugli_max,greater_butteraugli_6,lesser_butteraugli_max,...,lesser_msssim-y,greater_nlpd-y,lesser_nlpd-y,greater_ssimulacra,lesser_ssimulacra,greater_elo,lesser_elo,rater_flips,complete_crop,worst_elo
0,"[176, 279, 768, 768]",dist_6,edgar-castrejon-J_NbCcGPBOw-unsplash.png,dist_17,False,75125,"[1280, 1920]",6.792748,3.291877,2.287096,...,0.993973,0.161716,0.078657,0.096484,0.045761,1537.137226,2172.213746,9,True,2172.213746
1,"[149, 32, 768, 768]",dist_6,4KK2_20150823_130650_314.png,dist_11,False,41883,"[1920, 1424]",6.830727,3.382573,1.990106,...,0.997969,0.145104,0.050339,0.072111,0.019148,1972.598657,2291.390798,13,True,2291.390798
2,"[1089, 489, 768, 768]",dist_11,6G7M_20150404_121844_208.png,dist_17,False,48385,"[1920, 1424]",1.760315,0.828016,1.656251,...,0.999264,0.030033,0.030637,0.010733,0.011180,2268.583209,2309.359159,4,True,2309.359159
3,"[433, 286, 768, 768]",dist_14,michael-niessl-KraoHdRYrRE-unsplash.png,dist_11,True,36774,"[1920, 1080]",4.110695,1.995982,2.432354,...,0.998220,0.071748,0.037420,0.043709,0.019095,1991.945557,2241.368691,10,True,2241.368691
4,"[4, 99, 768, 768]",dist_6,0127_20161022_144117_906.png,dist_14,True,22468,"[1920, 1440]",5.866006,2.645947,3.550395,...,0.994369,0.095547,0.069588,0.049872,0.035815,1851.898409,1945.269660,9,True,1945.269660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12975,"[361, 268, 768, 768]",dist_23,4KK2_20150912_125525_384.png,dist_21,False,15533,"[1920, 1424]",2.678031,1.201943,3.711323,...,0.988685,0.076957,0.128533,0.025589,0.051236,2334.257408,2105.427273,7,True,2334.257408
12976,"[374, 425, 768, 768]",dist_23,justin-lim-OiFRMM3o7l0-unsplash.png,dist_22,False,7306,"[1920, 1440]",2.273484,1.044854,2.454072,...,0.995344,0.060886,0.078061,0.024588,0.033907,2347.775076,2209.434355,4,True,2347.775076
12977,"[1093, 224, 768, 768]",dist_9,0127_20161022_144117_906.png,dist_23,True,12069,"[1920, 1440]",2.618022,1.303337,1.548064,...,0.998452,0.050813,0.037381,0.023579,0.016908,2187.406607,2373.387899,12,True,2373.387899
12978,"[887, 59, 768, 768]",dist_23,6G7M_20150404_121844_208.png,dist_40,False,7983,"[1920, 1424]",1.877585,0.923585,2.143382,...,0.998512,0.051131,0.048955,0.020006,0.017322,2356.875932,2388.690716,6,True,2388.690716


In [None]:
def strip(ary, n):
  """Maps each string in `ary` to itself with its first `n` characters removed.

  Args:
    ary: Iterable of strings (the notebook passes prefixed column names).
    n: Number of leading characters to drop from each string.

  Returns:
    A dict from each original string to its shortened form, in the shape
    accepted by `DataFrame.rename(columns=...)`.
  """
  # A dict comprehension replaces the reduce() over a mutated accumulator,
  # which also shadowed the builtin `sum`.
  return {el: el[n:] for el in ary}

In [None]:
# Every 'greater_*' column except those ending in '_file'.
greater_metric_cols = [col for col in data.columns if col.startswith('greater_') and not col.endswith('_file')]

In [None]:
# Every 'lesser_*' column except those ending in '_file'.
lesser_metric_cols = [col for col in data.columns if col.startswith('lesser_') and not col.endswith('_file')]

In [None]:
# Select the 'greater_*' columns and drop the 8-character 'greater_' prefix
# from their names.
greater_metrics = data[greater_metric_cols].rename(columns=strip(greater_metric_cols, 8))

In [None]:
# Select the 'lesser_*' columns and drop the 7-character 'lesser_' prefix
# from their names.
lesser_metrics = data[lesser_metric_cols].rename(columns=strip(lesser_metric_cols, 7))

To allow a rank correlation, like Spearman, combine the metrics of the worse distortion (lesser), and the better distortion (greater), into one dataframe. To also allow comparing correlation in different regions of quality, sort by ELO score.

In [None]:
# Stack both sets of rows, order them by ascending ELO, and renumber the index.
metrics = (
    pd.concat([greater_metrics, lesser_metrics])
    .sort_values('elo')
    .reset_index(drop=True)
)
metrics

Unnamed: 0,butteraugli_max,butteraugli_6,fsim-y,fsim-rgb,msssim-y,nlpd-y,ssimulacra,elo
0,8.867950,4.764297,0.978342,0.974386,0.965125,0.242242,0.144204,824.179269
1,8.879775,4.580521,0.979431,0.976825,0.960813,0.228356,0.125745,824.179269
2,8.815258,4.479465,0.980187,0.977970,0.961923,0.215990,0.119023,824.179269
3,8.913489,4.727989,0.976972,0.973410,0.959204,0.240950,0.142649,824.179269
4,9.732282,4.798093,0.977469,0.973566,0.958279,0.250352,0.148506,824.179269
...,...,...,...,...,...,...,...,...
24799,1.762231,0.863336,0.998921,0.998753,0.998144,0.053619,0.014470,2699.407630
24800,1.857362,0.921340,0.998864,0.998748,0.998029,0.051740,0.016668,2699.407630
24801,2.087677,0.921140,0.999216,0.999160,0.998273,0.042458,0.013809,2699.407630
24802,1.668185,0.847164,0.998240,0.998118,0.996968,0.056177,0.016632,2699.407630


Then compute the correlation matrix for these, using Spearman's rank correlation coefficient.

In [None]:
# Spearman rank correlation between every pair of columns in `metrics`
# (each metric and 'elo').
corrs = metrics.corr(method='spearman')
corrs

Unnamed: 0,butteraugli_max,butteraugli_6,fsim-y,fsim-rgb,msssim-y,nlpd-y,ssimulacra,elo
butteraugli_max,1.0,0.986256,-0.886318,-0.892921,-0.823469,0.792767,0.84215,-0.874967
butteraugli_6,0.986256,1.0,-0.877144,-0.883633,-0.833595,0.807155,0.859254,-0.863846
fsim-y,-0.886318,-0.877144,1.0,0.998044,0.900061,-0.845079,-0.840729,0.859066
fsim-rgb,-0.892921,-0.883633,0.998044,1.0,0.898808,-0.850361,-0.848226,0.862459
msssim-y,-0.823469,-0.833595,0.900061,0.898808,1.0,-0.958732,-0.942831,0.766435
nlpd-y,0.792767,0.807155,-0.845079,-0.850361,-0.958732,1.0,0.933383,-0.699681
ssimulacra,0.84215,0.859254,-0.840729,-0.848226,-0.942831,0.933383,1.0,-0.766894
elo,-0.874967,-0.863846,0.859066,0.862459,0.766435,-0.699681,-0.766894,1.0


In [None]:
# Metric names with the 7-character 'lesser_' prefix removed; 'elo' is dropped
# because it is the reference the metrics are compared against.
metric_cols = [name[7:] for name in lesser_metric_cols]
metric_cols.remove('elo')

In [None]:
def rollingcorr(df, method, window_size, step_size, metric_names=None):
  """Computes the absolute metric-vs-ELO correlation over sliding row windows.

  Windows are taken over the rows of `df` in their current order (the notebook
  passes a frame already sorted by 'elo').

  Args:
    df: DataFrame with an 'elo' column and one column per metric.
    method: Correlation method forwarded to `Series.corr` (e.g. 'spearman').
    window_size: Number of consecutive rows in each window.
    step_size: Number of rows between the starts of consecutive windows.
    metric_names: Names of the columns to correlate against 'elo'. Defaults to
      the notebook-level `metric_cols`.

  Returns:
    A float DataFrame with one row per window. 'elo' holds the ELO of the
    window's last row; each metric column holds the absolute value of that
    metric's correlation with 'elo' inside the window.
  """
  if metric_names is None:
    metric_names = metric_cols
  metric_names = list(metric_names)
  res = []
  # The `+ 1` includes the final full window, which starts at
  # len(df) - window_size; without it a frame of exactly `window_size` rows
  # would produce no windows at all.
  for start in range(0, df.shape[0] - window_size + 1, step_size):
    window = df.iloc[start:start + window_size]
    row = [window['elo'].iloc[-1]]
    row.extend(
        np.abs(window[name].corr(window['elo'], method=method))
        for name in metric_names)
    res.append(row)
  # Builtin `float` replaces the `np.float` alias, which was deprecated in
  # NumPy 1.20 and removed in NumPy 1.24.
  return pd.DataFrame(res, dtype=float, columns=['elo'] + metric_names)

Plot the correlation in a rolling window of 5000 evaluations with a step of 1000 evaluations for each metric, to see how they behave across a range of ELO scores.

In [None]:
# Spearman correlation per metric over 5000-row windows that advance 1000 rows
# at a time, plotted against ELO.
rolling_corrs = rollingcorr(metrics, 'spearman', 5000, 1000)
rolling_corrs.plot_bokeh(x='elo', figsize=(1400, 400))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if __name__ == '__main__':
