# Task 1 Evaluation

This notebook contains the evaluation for Task 1 of the TREC Fair Ranking track.

In [None]:
# Data mode for this evaluation; it is handed to `wptrec` and interpolated into
# the target-table file names read below.
DATA_MODE = 'eval'

In [None]:
import wptrec
# Propagate the notebook-level setting to the package-level DATA_MODE attribute.
wptrec.DATA_MODE = DATA_MODE

## Setup

We begin by loading necessary libraries:

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import xarray as xr
from scipy.stats import bootstrap
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
import gzip
import binpickle

In [None]:
# Directory from which the target tables are read below.
tbl_dir = Path('data/metric-tables')

Set up progress bar and logging support:

In [None]:
from tqdm.auto import tqdm
# Enable tqdm's pandas integration, which provides the `progress_apply` calls
# used throughout this notebook; leave=False is passed as the progress-bar option.
tqdm.pandas(leave=False)

In [None]:
import sys, logging
# Send log records at INFO level and above to stderr.
logging.basicConfig(level=logging.INFO, stream=sys.stderr)
# Logger used for this notebook's own messages.
log = logging.getLogger('task1-eval')

Set up the RNG:

In [None]:
import seedbank
# Initialize seedbank with a fixed seed value.
seedbank.initialize(20220101)
# NumPy generator obtained from seedbank; it is handed to the bootstrap in `boot_ci`.
rng = seedbank.numpy_rng()

Import metric code:

In [None]:
import wptrec.metrics as metrics
from wptrec.trecdata import scan_runs, scan_teams

And finally load the metric inputs and construct the metric itself.  For Task 1, this uses:

* evaluation qrels
* evaluation intersectional targets
* all dimensions (with their page alignments)

In [None]:
from MetricInputs import qrels, dimensions

In [None]:
# Intersectional target array for the selected data mode, read from the `.nc` file.
target = xr.open_dataarray(tbl_dir / f'task1-{DATA_MODE}-int-targets.nc')

In [None]:
# Build the metric from the qrels (indexed by topic), all dimensions, and the target.
metric = metrics.AWRFMetric(qrels.set_index('topic_id'), dimensions, target)

## Importing Data

Let's load the runs now:

In [None]:
# scan_runs yields groups of rows; flatten them into a single frame of run records.
# NOTE(review): the first argument (1) is presumably the task number — confirm
# against wptrec.trecdata.scan_runs.
runs = pd.DataFrame.from_records(row for rows in scan_runs(1, 'runs/2022') for row in rows)
runs

And the teams:

In [None]:
# Team table scanned from the same runs directory as the runs themselves.
team_runs = scan_teams('runs/2022')
team_runs

In [None]:
# Index by run so it can be joined onto the per-run score table later.
run_team = team_runs.set_index('run')

## Computing Metrics

We are now ready to compute the metric for each (system,topic) pair.  Let's go!

In [None]:
# Apply the metric to each (run, topic) list of pages, then unstack so the
# metric's outputs become columns.
ranking_pages = runs.groupby(['run_name', 'topic_id'])['page_id']
rank_awrf = ranking_pages.progress_apply(metric).unstack()
rank_awrf

Make sure we aren't missing anything:

In [None]:
# Rows whose Score is missing; an empty result means every ranking was scored.
missing_score = rank_awrf['Score'].isna()
rank_awrf.loc[missing_score]

Now let's average by runs:

In [None]:
# Mean of every metric column per run, best Score first, with team info attached.
run_scores = (
    rank_awrf
    .groupby('run_name')
    .mean()
    .sort_values('Score', ascending=False)
    .join(run_team)
)
run_scores

And bootstrap some confidence intervals:

In [None]:
def boot_ci(col):
    """
    Bootstrap the mean of ``col`` and summarize the result.

    Returns a series holding the standard error (``Score.SE``), the confidence
    interval bounds (``Score.Lo`` and ``Score.Hi``), and the interval width
    (``Score.W``).  Resampling draws from the notebook-level ``rng``.
    """
    result = bootstrap([col], statistic=np.mean, random_state=rng)
    lower = result.confidence_interval.low
    upper = result.confidence_interval.high
    summary = {
        'Score.SE': result.standard_error,
        'Score.Lo': lower,
        'Score.Hi': upper,
        'Score.W': upper - lower,
    }
    return pd.Series(summary)

In [None]:
# One bootstrap summary per run; unstack turns the summary fields into columns.
score_by_run = rank_awrf.groupby('run_name')['Score']
run_score_ci = score_by_run.apply(boot_ci).unstack()
run_score_ci

In [None]:
# Attach the bootstrap columns (Score.SE / Score.Lo / Score.Hi / Score.W) to the
# per-run means; both frames are indexed by run name.
run_score_full = run_scores.join(run_score_ci)
run_score_full

In [None]:
# Table for the report: the three metric columns plus the interval rendered as text.
ci_bounds = run_score_full[['Score.Lo', 'Score.Hi']]
run_tbl_df = run_score_full[['nDCG', 'AWRF', 'Score']].copy()
run_tbl_df['95% CI'] = ci_bounds.apply(
    lambda b: "(%.3f, %.3f)" % (b['Score.Lo'], b['Score.Hi']),
    axis=1,
)
run_tbl_df

Write the combined run table out as LaTeX:

In [None]:
run_tbl_fn = Path('figures/task1-runs.tex')
# Floats are rendered with 4 decimals; the '95% CI' column is already a formatted string.
run_tbl = run_tbl_df.to_latex(float_format="%.4f", bold_rows=True, index_names=False)
run_tbl_fn.write_text(run_tbl)
# Printed (not displayed) so the raw LaTeX source is readable in the cell output.
print(run_tbl)

## Analyzing Scores

What is the distribution of scores?

In [None]:
# Summary statistics of the per-run mean metrics.
run_scores.describe()

In [None]:
# Distribution of the per-run mean Score; saved to PDF before being shown.
sns.displot(x='Score', data=run_scores)
plt.savefig('figures/task1-score-dist.pdf')
plt.show()

In [None]:
# nDCG against AWRF, one point per run, colored by the `team` column.
sns.relplot(x='nDCG', y='AWRF', hue='team', data=run_scores)
# NOTE(review): rugplot is called without an explicit axes, so it presumably draws
# onto the axes relplot just created — confirm in the rendered figure.
sns.rugplot(x='nDCG', y='AWRF', data=run_scores)
plt.savefig('figures/task1-ndcg-awrf.pdf')
plt.show()

## Per-Topic Stats

We need to return per-topic stats to each participant, at least for the score.

In [None]:
# Across-run summary of every metric column for each topic; the result has
# two-level columns of (metric, statistic).
topic_stats = rank_awrf.groupby('topic_id').agg(['mean', 'median', 'min', 'max'])
topic_stats

Extract the per-topic range of the final score (median, min, and max):

In [None]:
# Keep only the Score statistics and leave out the mean, so median/min/max remain.
topic_range = topic_stats['Score'].drop(columns=['mean'])
topic_range

And now we combine scores with these results to return to participants.

In [None]:
# Per-run result files are written here.  Missing parent directories are created
# too, so the cell also works when `results/` does not exist yet.
ret_dir = Path('results') / 'coordinators'
ret_dir.mkdir(parents=True, exist_ok=True)
for system, s_runs in rank_awrf.groupby('run_name'):
    # Attach the per-topic median/min/max to this run's rows.  run_name is constant
    # within the group and already encoded in the file name, so it is dropped.
    aug = s_runs.join(topic_range).reset_index().drop(columns=['run_name'])
    fn = ret_dir / f'{system}.tsv'
    log.info('writing %s', fn)
    aug.to_csv(fn, sep='\t', index=False)

## Individual Dimensions

We're now going to process the results on an individual dimension basis.

In [None]:
# Per-dimension results, keyed by dimension name.
res1d_d = {}
dim_loop = tqdm(dimensions, desc='dims', leave=False)
for dim in dim_loop:
    # Show which dimension is currently being processed on the outer progress bar.
    dim_loop.set_postfix_str(dim.name)
    # Load this dimension's target table and wrap it as a 2-D array whose axes
    # are named for the topic and the dimension.
    t1d = pd.read_parquet(tbl_dir / f'task1-{DATA_MODE}-{dim.name}-target.parquet')
    t1d = xr.DataArray(t1d, dims=['topic_id', dim.name])
    # Same metric class as the overall evaluation, restricted to this one dimension.
    m1d = metrics.AWRFMetric(qrels.set_index('topic_id'), [dim], t1d)
    res1d_d[dim.name] = runs.groupby(['run_name', 'topic_id'])['page_id'].progress_apply(m1d)

In [None]:
# Stack the per-dimension results under a 'dim' index level, unstack the
# innermost level into columns, and flatten the index into ordinary columns.
res1d = (
    pd.concat(res1d_d, names=['dim'])
    .unstack()
    .reset_index()
)
res1d

Now let's group things to get per-dimension metrics!

In [None]:
# Mean Score per (dimension, run), laid out with one column per dimension.
score_by_dim_run = res1d.groupby(['dim', 'run_name'])['Score']
rr_1d = score_by_dim_run.mean().unstack('dim')
rr_1d

In [None]:
# Overall score next to the single-dimension scores, best overall run first.
overall_col = run_scores[['Score']].rename(columns={'Score': 'Overall'})
df_1d = overall_col.join(rr_1d).sort_values('Overall', ascending=False)
# Bold the best value in each column for on-screen display.
df_fmt = df_1d.style.highlight_max(props='font-weight: bold')
df_fmt

In [None]:
df_1d_fn = Path('figures/task1-single.tex')
# Bold the best value in each column.  Styler.to_latex() treats each CSS
# property as a literal LaTeX command unless convert_css=True is given, so the
# former 'font: bold;' came out as the undefined macro `\fontbold`.  With
# 'font-weight: bold;' and convert_css=True the cell is emitted with `\bfseries`.
style = df_1d.style.highlight_max(props='font-weight: bold;').format(lambda x: '%.4f' % (x,))
# Escape LaTeX-special characters in the run names used as row labels.
style = style.format_index(axis=0, escape='latex')
# Hide the index names so they are not rendered in the table.
style = style.hide(names=True)
df_tex = style.to_latex(convert_css=True)
df_1d_fn.write_text(df_tex)
# Printed (not displayed) so the raw LaTeX source is readable in the cell output.
print(df_tex)

## Attribute Subset Performance

We also want to look at the performance over *subsets* of the original attributes.  For this, we need two pieces:

- The dimensions
- The reduced target

We'll get the reduced target by marginalizing.  Let's make a function to get dimensions and reduced targets:

In [None]:
def subset_dims(dims):
    """Select the entries of ``dimensions`` whose name is in ``dims``, keeping their original order."""
    selected = []
    for candidate in dimensions:
        if candidate.name in dims:
            selected.append(candidate)
    return selected

In [None]:
def subset_tgt(dims):
    """Marginalize ``target`` by summing over every dimension whose name is not in ``dims``."""
    summed_out = []
    for candidate in dimensions:
        if candidate.name not in dims:
            summed_out.append(candidate.name)
    return target.sum(summed_out)

### Gender and Geography

Last year, we used subject geography and gender.  Let's generate metric results from those.

In [None]:
# Metric restricted to subject geography and gender, with the target
# marginalized down to the same two dimensions.
geo_gender_names = ['sub-geo', 'gender']
geo_gender_metric = metrics.AWRFMetric(
    qrels.set_index('topic_id'),
    subset_dims(geo_gender_names),
    subset_tgt(geo_gender_names),
)

In [None]:
# Apply the geography/gender metric to each (run, topic) list of pages.
geo_gender_res = runs.groupby(['run_name', 'topic_id'])['page_id'].progress_apply(geo_gender_metric)

Now show the results per system:

In [None]:
# Per-run means of the geography/gender results, ordered by ascending Score.
geo_gender_rr = (
    geo_gender_res
    .unstack()
    .groupby('run_name')
    .mean()
    .sort_values('Score')
)
geo_gender_rr

### Internal Properties

This year, several of our properties are ‘internal’: that is, they primarily refer to things that matter within the Wikipedia platform, not broader social concerns.

Let's see how the systems perform on those.

In [None]:
# Names of the dimensions treated as 'internal' properties.
internal_names = ['alpha', 'age', 'pop', 'langs']
# Matching dimension objects and the target marginalized down to those dimensions.
internal_dims = subset_dims(internal_names)
internal_tgt = subset_tgt(internal_names)

In [None]:
# Same metric class as the overall evaluation, restricted to the internal dimensions.
internal_metric = metrics.AWRFMetric(qrels.set_index('topic_id'), internal_dims, internal_tgt)

In [None]:
# Apply the internal-properties metric to each (run, topic) list of pages.
internal_res = runs.groupby(['run_name', 'topic_id'])['page_id'].progress_apply(internal_metric)

Now show the results per system:

In [None]:
# Per-run means of the internal-properties results, ordered by ascending Score.
internal_rr = (
    internal_res
    .unstack()
    .groupby('run_name')
    .mean()
    .sort_values('Score')
)
internal_rr

### Demographic Properties

Let's see performance on the other ones (demographic properties):

In [None]:
# Every dimension that is not in the internal set counts as demographic.
demo_names = [d.name for d in dimensions if d.name not in internal_names]
# Matching dimension objects and the target marginalized down to those dimensions.
demo_dims = subset_dims(demo_names)
demo_tgt = subset_tgt(demo_names)

In [None]:
# Same metric class as the overall evaluation, restricted to the demographic dimensions.
demo_metric = metrics.AWRFMetric(qrels.set_index('topic_id'), demo_dims, demo_tgt)

In [None]:
# Apply the demographic-properties metric to each (run, topic) list of pages.
demo_res = runs.groupby(['run_name', 'topic_id'])['page_id'].progress_apply(demo_metric)

Now show the results per system:

In [None]:
# Per-run means of the demographic-properties results, ordered by ascending Score.
demo_rr = (
    demo_res
    .unstack()
    .groupby('run_name')
    .mean()
    .sort_values('Score')
)
demo_rr

### Subset Scores

In [None]:
# Column label -> per-run result frame; each frame's 'Score' column is pulled
# out under that label in the next cell.
subsets = {
    'Overall': run_scores,
    '2021': geo_gender_rr,
    'Internal': internal_rr,
    'Demographic': demo_rr,
}

In [None]:
# One single-column frame per subset, with the column named after the subset.
ss_cols = [
    scores[['Score']].rename(columns={'Score': label})
    for label, scores in subsets.items()
]
# Join the columns left to right on the run index, then put the best overall run first.
ss_df = reduce(pd.DataFrame.join, ss_cols).sort_values('Overall', ascending=False)
# Bold the best value in each column for on-screen display.
ss_fmt = ss_df.style.highlight_max(props='font-weight: bold;')
ss_fmt

In [None]:
ss_fn = Path('figures/task1-subsets.tex')
# Bold the best value in each column.  Styler.to_latex() treats each CSS
# property as a literal LaTeX command unless convert_css=True is given, so the
# former 'font: bold;' came out as the undefined macro `\fontbold`.  With
# 'font-weight: bold;' and convert_css=True the cell is emitted with `\bfseries`.
style = ss_df.style.highlight_max(props='font-weight: bold;').format(lambda x: '%.4f' % (x,))
# Escape LaTeX-special characters in the run names used as row labels.
style = style.format_index(axis=0, escape='latex')
# Hide the index names so they are not rendered in the table.
style = style.hide(names=True)
ss_tex = style.to_latex(convert_css=True)
ss_fn.write_text(ss_tex)
# Printed (not displayed) so the raw LaTeX source is readable in the cell output.
print(ss_tex)