In [1]:
import pandas as pd
import numpy as np
from utils import (
    load_labelers_predictions,
    load_models_predictions,
    Split,
    METRIC_2_MODEL_INCREMENTAL,
    METRIC_2_MODEL_INCREMENTAL_PARTIAL,
)
from evaluation import evaluate_labelers
from typing import Dict, Tuple, List, get_args
from pathlib import Path
import pickle as pkl

# plotting
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots

pio.templates.default = 'plotly_white'

## Define variables

In [2]:
METRICS = ("InformativenessRating",)  # "RelevanceRating")
HOME_DIR = "/mnt/ml-team/homes/grzegorz.jurdzinski"
# HOME_DIR = "~"
NEWSROOM_DIR = f"{HOME_DIR}/datasets/newsroom"
# NEWSROOM_DIR = f"{HOME_DIR}/Repositories/newsroom"
PREDS_DIR = "/mnt/ml-team/homes/grzegorz.jurdzinski/runs/newsroom-incremental"
# PREDS_DIR = f"{HOME_DIR}/Repositories/rewards/incremental"
MODELS = (
    "deberta-small",
    "deberta-base",
    "deberta-large",
    "reward-deberta-base",
    "reward-deberta-large",
)

In [3]:
file_names = {
    "original": "newsroom-aggregated-original-order.csv",
    "random": "newsroom-aggregated-random-order.csv",
    "sorted": "newsroom-aggregated-sorted-order.csv",
}

If the models' predictions were not bucketised yet, run `bucketise_predictions.py` to do so.

## Load labelers predictions

In [4]:
def get_labelers_preds(dir: str, metric: str, split: str) -> pd.DataFrame:
    labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-original-order.csv",
        metric=metric,
        split=split
    )
    
    # Add one random labeler.
    temp_labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-random-order.csv",
        metric=metric,
        split=split
    )
    temp_labelers_preds.rename(columns={"labeler_1": "random_labeler"}, inplace=True)
    labelers_preds = labelers_preds.merge(
        temp_labelers_preds[["ArticleID", "System", "random_labeler"]],
        how="inner",
        on=["ArticleID", "System"])
    
    # Add median labeler.
    temp_labelers_preds = load_labelers_predictions(
        NEWSROOM_DIR,
        "newsroom-aggregated-sorted-order.csv",
        metric=metric,
        split=split
    )
    temp_labelers_preds.rename(columns={"labeler_2": "median_labeler"}, inplace=True)
    labelers_preds = labelers_preds.merge(
        temp_labelers_preds[["ArticleID", "System", "median_labeler"]],
        how="inner",
        on=["ArticleID", "System"])
    
    return labelers_preds

In [5]:
# labelers_preds_dict = {}

# for metric in METRICS:
#     print(f"Processing metric {metric}...")
#     labelers_preds_dict[metric] = {}
#     for split in get_args(Split):
#         print(f"\tProcessing split {split}...")
#         labelers_preds_dict[metric][split] = get_labelers_preds(
#             NEWSROOM_DIR, metric, split
#         )
# print("Done.")

Processing metric InformativenessRating...
	Processing split train...
	Processing split valid...
Done.


## Define evaluation helper functions

In [6]:
def evaluate_and_compare_models(
    models_preds: Dict[str, Dict[str, pd.DataFrame]],
    labelers_preds_dict: Dict[str, Dict[str, pd.DataFrame]],
    metric: str,
    split: Split,
    k: int = 1,
) -> Tuple[pd.Series, go.Figure]:
    """
    Merges labelers and models predictions and evaluates them.
    Args:
        models_preds: Dict of dicts of DataFrames with models predictions.
        labelers_preds_dict: Dict of dicts of DataFrames with labelers predictions.
        metric: Metric to evaluate.
        split: Split to evaluate.
        k: k from our evaluation formula.
    Returns:
        Tuple of:
            - Series with evaluation results.
            - Plotly figure with predictions.
    """
    labelers_preds = labelers_preds_dict[metric][split]
    labelers_preds = _merge_labelers_and_models_preds(labelers_preds, models_preds, metric)

    labelers_preds = _add_fixed_columns(
        labelers_preds, [3, 3.25, 3.5, 3.75, 4]
    )
    labelers_preds["mean_jittered"] = (
        labelers_preds["labeler_mean"]
        + np.random.normal(loc=0.0, scale=0.15, size=(len(labelers_preds)))
    ).clip(lower=0, upper=5)

    anot_cols = labelers_preds.columns.to_list()
    anot_cols.remove("ArticleID")
    anot_cols.remove("System")
    anot_cols.remove("summary_id")

    labelers_preds[anot_cols] = (labelers_preds[anot_cols] - 1) / 4

    annotator_scores = evaluate_labelers(
        labelers_preds[anot_cols],
        normalization=None,
        human_prefix="labeler_",
        k=k,
    ).sort_values(ascending=False)
    # display(evaluation_results)

    temp_melt_df = pd.melt(
        labelers_preds, id_vars=["summary_id"], value_vars=anot_cols
    )
    fig = px.scatter(
        temp_melt_df,
        x="summary_id",
        y="value",
        color="variable",
    )
    return annotator_scores, fig
    # fig.show()


def _add_fixed_columns(df: pd.DataFrame, values: List[float]) -> pd.DataFrame:
    for v in values:
        df[f"fixed_{v}"] = v
    return df


def _merge_labelers_and_models_preds(
    labelers_preds: pd.DataFrame,
    models_preds: Dict[str, Dict[str, pd.DataFrame]],
    metric: str,
) -> pd.DataFrame:
    if metric in models_preds:  # METRIC_2_MODEL_INCREMENTAL:
        for model_name, model_predictions in models_preds[metric].items():
            labelers_preds = labelers_preds.merge(
                model_predictions[["ArticleID", "System", "labels"]],
                how="inner",
                on=["ArticleID", "System"],
            ).rename(columns={"labels": model_name})
    return labelers_preds

In [7]:
# models_preds = load_models_predictions(
#     metric2model=METRIC_2_MODEL_INCREMENTAL,
#     preds_path=Path(PREDS_DIR) / "deberta-small",
#     load_from_buckets_dir=True,
#     split_prefixes=["val_"],
# )
# evaluate_and_compare_models(
#     models_preds, labelers_preds_dict, "RelevanceRating", "valid"
# )

## Load and evaluate models predictions

In [8]:
# scores_dict: Dict[str, Dict[str, Dict[str, pd.Series]]] = {}
# figures_dict: Dict[str, Dict[str, Dict[str, go.Figure]]] = {}

# for metric in METRICS:
#     print(f"Evaluating metric {metric}...")
#     scores_dict[metric] = {}
#     figures_dict[metric] = {}
#     for split in get_args(Split):
#         print(f"  Evaluating split {split}...")
#         scores_dict[metric][split] = {}
#         figures_dict[metric][split] = {}
#         for model in MODELS:
#             print(f"    Evaluating model {model}...")
#             models_preds = load_models_predictions(
#                 metric2model=METRIC_2_MODEL_INCREMENTAL
#                 if model == "deberta-small"
#                 else METRIC_2_MODEL_INCREMENTAL_PARTIAL,
#                 preds_path=Path(PREDS_DIR) / model,
#                 load_from_buckets_dir=True,
#                 split_prefixes=["val_" if split == "valid" else "tr_"],
#             )
#             score, fig = evaluate_and_compare_models(
#                 models_preds,
#                 labelers_preds_dict,
#                 metric,
#                 split,
#             )
#             scores_dict[metric][split][model] = score
#             figures_dict[metric][split][model] = fig
# print("Done.")

Evaluating metric InformativenessRating...
  Evaluating split train...
    Evaluating model deberta-small...
    Evaluating model deberta-base...
    Evaluating model deberta-large...
    Evaluating model reward-deberta-base...
    Evaluating model reward-deberta-large...
  Evaluating split valid...
    Evaluating model deberta-small...
    Evaluating model deberta-base...
    Evaluating model deberta-large...
    Evaluating model reward-deberta-base...
    Evaluating model reward-deberta-large...
Done.


In [18]:
# with open("scores_dict.pkl", "wb") as f:
#     pkl.dump(scores_dict, f)
# with open("figures_dict.pkl", "wb") as f:
#     pkl.dump(figures_dict, f)

In [None]:
scores_dict = pkl.load(open("scores_dict.pkl", "rb"))
figures_dict = pkl.load(open("figures_dict.pkl", "rb"))

## Visualise

### Display scores

In [10]:
# for metric in METRICS:
#     for split in get_args(Split):
#         for model in MODELS:
#             print(f"Metric: {metric}, Split: {split}, Model: {model}")
#             display(scores_dict[metric][split][model])
#             print(f"{30 * '#'}\n")

### Bar plots

In [11]:
def bar_plot_rewards(split: Split, metric: str, model: str):
    split_prefix = "tr" if split == "train" else "val"
    temp = scores_dict[metric][split][model][
        [f"{split_prefix}_rewards_{i}" for i in range(10)]
        + ["median_labeler", "random_labeler", "fixed_3.5"]
    ]
    fig = px.bar(
        temp, x=temp.index, y=temp.values, title=f"{split}, {metric}, {model}"
    )
    fig.update_layout(yaxis_range=[temp.min() * 0.99, temp.max() * 1.01])
    return fig

In [12]:
# figs = {
#     model: {
#         split: bar_plot_rewards(split, "RelevanceRating", model)
#         for split in ("train", "valid")
#     }
#     for model in MODELS
# }

# fig = make_subplots(
#     rows=3,
#     cols=2,
#     subplot_titles=[
#         f"{model} {split}" for model in MODELS for split in ("train", "valid")
#     ],
# )

# for row, model in enumerate(MODELS):
#     for col, split in enumerate(("train", "valid")):
#         fig.add_trace(figs[model][split]["data"][0], row=row + 1, col=col + 1)
#         fig.update_yaxes(range=[0.76, 0.91], row=row + 1, col=col + 1)


# fig.update_layout(height=1500, width=800)
# fig.show()

In [13]:
# figs = {
#     model: {
#         split: bar_plot_rewards(split, "InformativenessRating", model)
#         for split in ("train", "valid")
#     }
#     for model in MODELS
# }

# fig = make_subplots(
#     rows=3,
#     cols=2,
#     subplot_titles=[
#         f"{model} {split}" for model in MODELS for split in ("train", "valid")
#     ],
# )

# for row, model in enumerate(MODELS):
#     for col, split in enumerate(("train", "valid")):
#         fig.add_trace(figs[model][split]["data"][0], row=row + 1, col=col + 1)
#         fig.update_yaxes(range=[0.8, 0.9], row=row + 1, col=col + 1)


# fig.update_layout(height=1500, width=800)
# fig.show()

### Scatter plots

#### Define helper functions

In [14]:
def generate_dfs_for_plot():
    dfs = {}
    for metric in METRICS:
        dfs[metric] = {}
        for split in get_args(Split):
            dfs[metric][split] = pd.DataFrame(
                {"percentage": pd.Series(range(11)) * 10}
            )
            for model in MODELS:
                dfs[metric][split][model] = get_model_column(
                    scores_dict,
                    metric,
                    model,
                    split,
                ).values
            dfs[metric][split]["median_labeler"] = scores_dict[metric][split][
                model
            ]["median_labeler"]
            dfs[metric][split]["random_labeler"] = scores_dict[metric][split][
                model
            ]["random_labeler"]
            dfs[metric][split]["fixed_4"] = scores_dict[metric][split][model][
                "fixed_4"
            ]
            dfs[metric][split]["fixed_3.5"] = scores_dict[metric][split][
                model
            ]["fixed_3.5"]
    return dfs


def get_model_column(
    scores_dict: Dict[str, Dict[str, Dict[str, pd.Series]]],
    metric: str,
    model_name: str,
    split: Split,
) -> pd.DataFrame:
    split_prefix = "tr_" if split == "train" else "val_"
    temp = scores_dict[metric][split][model_name]
    temp = temp[temp.index.str.startswith(split_prefix)].sort_index()
    # Adding 0 to the beginning or middle of the series because we didn't
    # evaluate all cases.
    if len(temp) == 10:
        new_col = pd.concat(
            [
                pd.Series([0.0] * (11 - len(temp))),
                temp,
            ]
        )
    elif len(temp) == 1:
        new_col = pd.concat(
            [
                pd.Series([0.0] * 10),
                pd.Series([temp.iloc[1]]),
            ]
        )
    elif len(temp) == 2:
        new_col = pd.concat(
            [
                pd.Series([temp.iloc[0]]),
                pd.Series([0.0] * 9),
                pd.Series([temp.iloc[1]]),
            ]
        )
    else:
        new_col = temp
    return new_col

In [15]:
# dfs = generate_dfs_for_plot()

In [19]:
# with open("dfs.pkl", "wb") as f:
#     pkl.dump(dfs, f)

In [20]:
# dfs = pkl.load(open("dfs.pkl", "rb"))

In [37]:
# df_total = pd.DataFrame()
# for metric, d in dfs.items():
#     print(metric)
#     for split, df in d.items():
#         # print("\t", split)
#         df_temp = df.copy()
#         df_temp["split"] = split
#         df_temp["metric"] = metric
#         # print(df_temp)
#         if df_total.empty:
#             df_total = df_temp.copy()
#         else:
#             df_total = pd.concat([df_total, df_temp])
# df_total
# # df_temp

InformativenessRating


Unnamed: 0,percentage,deberta-small,deberta-base,deberta-large,reward-deberta-base,reward-deberta-large,median_labeler,random_labeler,fixed_4,fixed_3.5,split,metric
0,0,0.831944,0.817593,0.804167,0.835185,0.806481,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
1,10,0.873148,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
2,20,0.873148,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
3,30,0.885648,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
4,40,0.851389,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
5,50,0.881944,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
6,60,0.884259,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
7,70,0.885185,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
8,80,0.89537,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating
9,90,0.874074,0.0,0.0,0.0,0.0,0.933333,0.857407,0.849074,0.853704,train,InformativenessRating


In [42]:
# df_total_melt = pd.melt(df_total, id_vars=["percentage", "split", "metric"], value_vars=MODELS)
# df_total_melt.rename(columns={"variable": "model", "value": "eval_metric_value"}, inplace=True)
# # df_total_melt.loc[df_total_melt["eval_metric_value"] > 0.0]
# df_total_melt

Unnamed: 0,percentage,split,metric,model,eval_metric_value
0,0,train,InformativenessRating,deberta-small,0.831944
1,10,train,InformativenessRating,deberta-small,0.873148
2,20,train,InformativenessRating,deberta-small,0.873148
3,30,train,InformativenessRating,deberta-small,0.885648
4,40,train,InformativenessRating,deberta-small,0.851389
...,...,...,...,...,...
105,60,valid,InformativenessRating,reward-deberta-large,0.000000
106,70,valid,InformativenessRating,reward-deberta-large,0.000000
107,80,valid,InformativenessRating,reward-deberta-large,0.000000
108,90,valid,InformativenessRating,reward-deberta-large,0.000000


In [47]:
# with open("df_total_melt.pkl", "wb") as f:
#     pkl.dump(df_total_melt, f)

In [48]:
df_total_melt = pkl.load(open("df_total_melt.pkl", "rb"))

In [73]:
list(df_total_melt["model"])

['reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-base',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large',
 'reward-deberta-large']

In [None]:
# def plot_scores(metric: str, split: str, width: int = 900):
#     df = dfs[metric][split]

#     anot_cols = df.columns.to_list()
#     anot_cols = anot_cols.remove("percentage")

#     temp_melt_df_models = pd.melt(
#         df[df.columns.intersection(MODELS + ("percentage",))],
#         id_vars=["percentage"],
#         value_vars=anot_cols,
#     )

#     fig = go.Figure()

#     fig_scatter = px.scatter(
#         temp_melt_df_models,
#         x="percentage",
#         y="value",
#         color="variable",
#         symbol="variable",
#         width=width,
#     )
#     for d in fig_scatter.data:
#         fig.add_trace(d)

#     temp_melt_df_not_models = pd.melt(
#         df[df.columns.difference(MODELS)],
#         id_vars=["percentage"],
#         value_vars=anot_cols,
#     )
#     fig_line = px.line(
#         temp_melt_df_not_models,
#         x="percentage",
#         y="value",
#         color="variable",
#         color_discrete_sequence=px.colors.qualitative.Plotly[len(MODELS) :],
#         width=width,
#     )
#     fig_line.update_traces(opacity=0.25)
#     for d in fig_line.data:
#         fig.add_trace(d)

#     fig.update_layout(title=f"{metric} {split}", yaxis_range=[0.7, 1])
#     fig.show()

In [61]:
def plot_scores(df_total_melt, metric: str, split: str, width: int = 900):
    # df = dfs[metric][split]

    # anot_cols = df.columns.to_list()
    # anot_cols = anot_cols.remove("percentage")

    # temp_melt_df_models = pd.melt(
    #     df[df.columns.intersection(MODELS + ("percentage",))],
    #     id_vars=["percentage"],
    #     value_vars=anot_cols,
    # )

    fig = go.Figure()

    fig_scatter = px.scatter(
        df_total_melt[df_total_melt["metric"] == metric][
            df_total_melt["split"] == split
        ][df_total_melt["model"].isin(MODELS)],
        x="percentage",
        y="eval_metric_value",
        color="model",
        symbol="model",
        width=width,
    )
    for d in fig_scatter.data:
        fig.add_trace(d)

    # temp_melt_df_not_models = pd.melt(
    #     df[df.columns.difference(MODELS)],
    #     id_vars=["percentage"],
    #     value_vars=anot_cols,
    # )
    print(MODELS)
    print(df_total_melt["model"])
    print(df_total_melt[df_total_melt["metric"] == metric][
            df_total_melt["split"] == split
        ][~df_total_melt["model"].isin(MODELS)])
    fig_line = px.line(
        df_total_melt[df_total_melt["metric"] == metric][
            df_total_melt["split"] == split
        ][~df_total_melt["model"].isin(MODELS)],
        x="percentage",
        y="eval_metric_value",
        color="model",
        color_discrete_sequence=px.colors.qualitative.Plotly[len(MODELS):],
        width=width,
    )
    fig_line.update_traces(opacity=0.25)
    for d in fig_line.data:
        fig.add_trace(d)

    fig.update_layout(title=f"{metric} {split}", yaxis_range=[0.7, 1])
    fig.show()

#### Plots

In [62]:
for metric in METRICS:
    for split in get_args(Split):
        plot_scores(df_total_melt, metric, split)

('deberta-small', 'deberta-base', 'deberta-large', 'reward-deberta-base', 'reward-deberta-large')
0             deberta-small
1             deberta-small
2             deberta-small
3             deberta-small
4             deberta-small
               ...         
105    reward-deberta-large
106    reward-deberta-large
107    reward-deberta-large
108    reward-deberta-large
109    reward-deberta-large
Name: model, Length: 110, dtype: object
Empty DataFrame
Columns: [percentage, split, metric, model, eval_metric_value]
Index: []



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



('deberta-small', 'deberta-base', 'deberta-large', 'reward-deberta-base', 'reward-deberta-large')
0             deberta-small
1             deberta-small
2             deberta-small
3             deberta-small
4             deberta-small
               ...         
105    reward-deberta-large
106    reward-deberta-large
107    reward-deberta-large
108    reward-deberta-large
109    reward-deberta-large
Name: model, Length: 110, dtype: object
Empty DataFrame
Columns: [percentage, split, metric, model, eval_metric_value]
Index: []



Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.

