In [1]:
import pandas as pd
import numpy as np
from utils import (
    load_labelers_predictions,
    load_models_predictions,
    Split,
    METRIC_2_MODEL_INCREMENTAL,
    METRIC_2_MODEL_INCREMENTAL_PARTIAL,
)
from evaluation import evaluate_labelers
from typing import Dict, Tuple, List, get_args
from pathlib import Path

# plotting
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

pio.templates.default = 'plotly_white'

## Define variables

In [10]:
METRICS = ("InformativenessRating",)  # "RelevanceRating")
HOME_DIR = "/mnt/ml-team/homes/grzegorz.jurdzinski"
NEWSROOM_DIR = f"{HOME_DIR}/datasets/newsroom"
PREDS_DIR = "/mnt/ml-team/homes/grzegorz.jurdzinski/runs/newsroom-incremental"
MODELS = (
    "deberta-small",
    "deberta-base",
    "deberta-large",
    "reward-deberta-base",
    "reward-deberta-large",
)

In [11]:
file_names = {
    "original": "newsroom-aggregated-original-order.csv",
    "random": "newsroom-aggregated-random-order.csv",
    "sorted": "newsroom-aggregated-sorted-order.csv",
}

If the models' predictions were not bucketised yet, run `bucketise_predictions.py` to do so.

## Load labelers predictions

In [12]:
def get_labelers_preds(dir: str, metric: str, split: str) -> pd.DataFrame:
    labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-original-order.csv",
        metric=metric,
        split=split
    )
    
    # Add one random labeler.
    temp_labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-random-order.csv",
        metric=metric,
        split=split
    )
    temp_labelers_preds.rename(columns={"labeler_1": "random_labeler"}, inplace=True)
    labelers_preds = labelers_preds.merge(
        temp_labelers_preds[["ArticleID", "System", "random_labeler"]],
        how="inner",
        on=["ArticleID", "System"])
    
    # Add median labeler.
    temp_labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-sorted-order.csv",
        metric=metric,
        split=split
    )
    temp_labelers_preds.rename(columns={"labeler_2": "median_labeler"}, inplace=True)
    labelers_preds = labelers_preds.merge(
        temp_labelers_preds[["ArticleID", "System", "median_labeler"]],
        how="inner",
        on=["ArticleID", "System"])
    
    return labelers_preds

In [13]:
labelers_preds_dict = {}

for metric in METRICS:
    labelers_preds_dict[metric] = {}
    for split in get_args(Split):
        labelers_preds_dict[metric][split] = get_labelers_preds(
            NEWSROOM_DIR, metric, split
        )

## Define evaluation helper functions

In [14]:
def evaluate_and_compare_models(
    models_preds: Dict[str, Dict[str, pd.DataFrame]],
    labelers_preds_dict: Dict[str, Dict[str, pd.DataFrame]],
    metric: str,
    split: Split,
    k: int = 1,
) -> Tuple[pd.Series, go.Figure]:
    """
    Merges labelers and models predictions and evaluates them.
    Args:
        models_preds: Dict of dicts of DataFrames with models predictions.
        labelers_preds_dict: Dict of dicts of DataFrames with labelers predictions.
        metric: Metric to evaluate.
        split: Split to evaluate.
        k: k from our evaluation formula.
    Returns:
        Tuple of:
            - Series with evaluation results.
            - Plotly figure with predictions.
    """
    labelers_preds = labelers_preds_dict[metric][split]
    labelers_preds = _merge_labelers_and_models_preds(labelers_preds, models_preds, metric)

    labelers_preds = _add_fixed_columns(
        labelers_preds, [3, 3.25, 3.5, 3.75, 4]
    )
    labelers_preds["mean_jittered"] = (
        labelers_preds["labeler_mean"]
        + np.random.normal(loc=0.0, scale=0.15, size=(len(labelers_preds)))
    ).clip(lower=0, upper=5)

    anot_cols = labelers_preds.columns.to_list()
    anot_cols.remove("ArticleID")
    anot_cols.remove("System")
    anot_cols.remove("summary_id")

    labelers_preds[anot_cols] = (labelers_preds[anot_cols] - 1) / 4

    annotator_scores = evaluate_labelers(
        labelers_preds[anot_cols],
        normalization=None,
        human_prefix="labeler_",
        k=k,
    ).sort_values(ascending=False)
    # display(evaluation_results)

    temp_melt_df = pd.melt(
        labelers_preds, id_vars=["summary_id"], value_vars=anot_cols
    )
    # print(temp_melt_df)
    fig = px.scatter(
        temp_melt_df,
        x="summary_id",
        y="value",
        color="variable",
    )
    return annotator_scores, fig
    # fig.show()


def _add_fixed_columns(df: pd.DataFrame, values: List[float]) -> pd.DataFrame:
    for v in values:
        df[f"fixed_{v}"] = v
    return df


def _merge_labelers_and_models_preds(
    labelers_preds: pd.DataFrame,
    models_preds: Dict[str, Dict[str, pd.DataFrame]],
    metric: str,
) -> pd.DataFrame:
    if metric in models_preds:  # METRIC_2_MODEL_INCREMENTAL:
        for model_name, model_predictions in models_preds[metric].items():
            labelers_preds = labelers_preds.merge(
                model_predictions[["ArticleID", "System", "labels"]],
                how="inner",
                on=["ArticleID", "System"],
            ).rename(columns={"labels": model_name})
    return labelers_preds

In [15]:
# models_preds = load_models_predictions(
#     metric2model=METRIC_2_MODEL_INCREMENTAL,
#     preds_path=Path(PREDS_DIR) / "deberta-small",
#     load_from_buckets_dir=True,
#     split_prefixes=["val_"],
# )
# evaluate_and_compare_models(
#     models_preds, labelers_preds_dict, "RelevanceRating", "valid"
# )

## Load and evaluate models predictions

In [17]:
scores_dict: Dict[str, Dict[str, Dict[str, pd.Series]]] = {}
figures_dict: Dict[str, Dict[str, Dict[str, go.Figure]]] = {}

for metric in METRICS:
    print(f"Evaluating metric {metric}...")
    scores_dict[metric] = {}
    figures_dict[metric] = {}
    for split in get_args(Split):
        print(f"  Evaluating split {split}...")
        scores_dict[metric][split] = {}
        figures_dict[metric][split] = {}
        for model in MODELS:
            print(f"    Evaluating model {model}...")
            models_preds = load_models_predictions(
                metric2model=METRIC_2_MODEL_INCREMENTAL
                if model == "deberta-small"
                else METRIC_2_MODEL_INCREMENTAL_PARTIAL,
                preds_path=Path(PREDS_DIR) / model,
                load_from_buckets_dir=True,
                split_prefixes=["val_" if split == "valid" else "tr_"],
            )
            score, fig = evaluate_and_compare_models(
                models_preds,
                labelers_preds_dict,
                metric,
                split,
            )
            scores_dict[metric][split][model] = score
            figures_dict[metric][split][model] = fig
print("DONE")

Evaluating metric InformativenessRating...
  Evaluating split train...
    Evaluating model deberta-small...
    Evaluating model deberta-base...
    Evaluating model deberta-large...
    Evaluating model reward-deberta-base...
    Evaluating model reward-deberta-large...
  Evaluating split valid...
    Evaluating model deberta-small...
    Evaluating model deberta-base...
    Evaluating model deberta-large...
    Evaluating model reward-deberta-base...
    Evaluating model reward-deberta-large...
DONE


## Visualise

In [9]:
# for metric in METRICS:
#     for split in get_args(Split):
#         for model in MODELS:
#             print(f"Metric: {metric}, Split: {split}, Model: {model}")
#             display(scores_dict[metric][split][model])
#             print(f"{30 * '#'}\n")

In [18]:
def bar_plot_rewards(split: Split, metric: str, model: str):
    split_prefix = "tr" if split == "train" else "val"
    temp = scores_dict[metric][split][model][
        [f"{split_prefix}_rewards_{i}" for i in range(10)]
        + ["median_labeler", "random_labeler", "fixed_3.5"]
    ]
    fig = px.bar(
        temp, x=temp.index, y=temp.values, title=f"{split}, {metric}, {model}"
    )
    fig.update_layout(yaxis_range=[temp.min() * 0.99, temp.max() * 1.01])
    return fig

In [11]:
# figs = {
#     model: {
#         split: bar_plot_rewards(split, "RelevanceRating", model)
#         for split in ("train", "valid")
#     }
#     for model in MODELS
# }

# fig = make_subplots(
#     rows=3,
#     cols=2,
#     subplot_titles=[
#         f"{model} {split}" for model in MODELS for split in ("train", "valid")
#     ],
# )

# for row, model in enumerate(MODELS):
#     for col, split in enumerate(("train", "valid")):
#         fig.add_trace(figs[model][split]["data"][0], row=row + 1, col=col + 1)
#         fig.update_yaxes(range=[0.76, 0.91], row=row + 1, col=col + 1)


# fig.update_layout(height=1500, width=800)
# fig.show()

In [12]:
# figs = {
#     model: {
#         split: bar_plot_rewards(split, "InformativenessRating", model)
#         for split in ("train", "valid")
#     }
#     for model in MODELS
# }

# fig = make_subplots(
#     rows=3,
#     cols=2,
#     subplot_titles=[
#         f"{model} {split}" for model in MODELS for split in ("train", "valid")
#     ],
# )

# for row, model in enumerate(MODELS):
#     for col, split in enumerate(("train", "valid")):
#         fig.add_trace(figs[model][split]["data"][0], row=row + 1, col=col + 1)
#         fig.update_yaxes(range=[0.8, 0.9], row=row + 1, col=col + 1)


# fig.update_layout(height=1500, width=800)
# fig.show()

In [20]:
scores_dict["InformativenessRating"]["valid"]["deberta-small"]

labeler_mean      1.000000
mean_jittered     0.968103
median_labeler    0.921759
val_rewards_2     0.870833
labeler_2         0.869444
val_rewards_8     0.866667
val_rewards_9     0.864815
val_rewards_1     0.862500
labeler_3         0.860648
val_rewards_3     0.860648
val_rewards_4     0.858796
val_rewards_6     0.858796
val_rewards_10    0.856944
val_rewards_5     0.856481
fixed_3.5         0.856019
val_rewards_7     0.855556
random_labeler    0.855093
labeler_1         0.854167
fixed_3.75        0.852778
fixed_3.25        0.843981
val_rewards_0     0.837037
fixed_4           0.831944
fixed_3           0.818981
dtype: float64

In [21]:
scores_dict["InformativenessRating"]["valid"]["deberta-small"]["val_rewards_7"]

0.8555555555555556

In [22]:
pd.Series(range(11)) * 10

0       0
1      10
2      20
3      30
4      40
5      50
6      60
7      70
8      80
9      90
10    100
dtype: int64

In [23]:
def generate_dfs_for_plot():
    dfs = {}
    for metric in METRICS:
        dfs[metric] = {}
        for split in get_args(Split):
            dfs[metric][split] = pd.DataFrame(
                {"percentage": pd.Series(range(11)) * 10}
            )
            for model in MODELS:
                dfs[metric][split][model] = get_model_column(
                    scores_dict,
                    metric,
                    model,
                    split,
                ).values
            dfs[metric][split]["median_labeler"] = scores_dict[metric][split][
                model
            ]["median_labeler"]
            dfs[metric][split]["random_labeler"] = scores_dict[metric][split][
                model
            ]["random_labeler"]
            dfs[metric][split]["fixed_4"] = scores_dict[metric][split][model][
                "fixed_4"
            ]
            dfs[metric][split]["fixed_3.5"] = scores_dict[metric][split][
                model
            ]["fixed_3.5"]
    return dfs


def get_model_column(
    scores_dict: Dict[str, Dict[str, Dict[str, pd.Series]]],
    metric: str,
    model_name: str,
    split: Split,
) -> pd.DataFrame:
    split_prefix = "tr_" if split == "train" else "val_"
    temp = scores_dict[metric][split][model_name]
    # Adding 0 to the beginning of the series because we didn't
    # evaluate untrained model.
    new_col = pd.concat(
        [
            pd.Series([0.0]),
            temp[temp.index.str.startswith(split_prefix)].sort_index(),
        ]
    )
    return new_col

In [24]:
dfs = generate_dfs_for_plot()

ValueError: Length of values (12) does not match length of index (11)

In [48]:
anot_cols = dfs["RelevanceRating"]["train"].columns.to_list()
anot_cols = anot_cols.remove("percentage")
temp_melt_df = pd.melt(
    dfs["RelevanceRating"]["train"], id_vars=["percentage"], value_vars=anot_cols
)
print(temp_melt_df)
fig = px.scatter(
    temp_melt_df,
    x="percentage",
    y="value",
    color="variable",
    symbol="variable",
)
fig.update_layout(yaxis_range=[0.75, 0.95])
fig.show()

    percentage       variable     value
0            0  deberta-small  0.000000
1           10  deberta-small  0.826852
2           20  deberta-small  0.850463
3           30  deberta-small  0.839352
4           40  deberta-small  0.856944
..         ...            ...       ...
72          60      fixed_3.5  0.845370
73          70      fixed_3.5  0.845370
74          80      fixed_3.5  0.845370
75          90      fixed_3.5  0.845370
76         100      fixed_3.5  0.845370

[77 rows x 3 columns]


In [25]:
deb_small = scores_dict["RelevanceRating"]["valid"]["deberta-small"]
deb_small[deb_small.index.str.startswith("val_")].sort_index()

val_rewards_0    0.842130
val_rewards_1    0.847685
val_rewards_2    0.860648
val_rewards_3    0.875463
val_rewards_4    0.858333
val_rewards_5    0.849074
val_rewards_6    0.887500
val_rewards_7    0.879167
val_rewards_8    0.876852
val_rewards_9    0.843056
dtype: float64

In [None]:

df = pd.DataFrame({"deberta-small-valid": scores_dict["RelevanceRating"]["valid"]["deberta-small"]})