In [None]:
import sys

sys.path.append("../")

import numpy as np
import pandas as pd
from dynalign.experiments.paths import LP_EVALUATION_RESULTS, PREV_EXPERIMENTS_PATH
from pathlib import Path
from typing import List, Dict, Any, Union, Tuple
from collections import defaultdict
from itertools import chain


def get_dirs_from_path(path: Path, only_files_with_extension: str = "") -> List[Path]:
    """List the entries found directly inside ``path``.

    Args:
        path: Directory to inspect (not searched recursively).
        only_files_with_extension: When non-empty, only entries whose name
            ends with this suffix are returned (matched with the glob
            pattern ``*<suffix>``).

    Returns:
        The matching entries as ``Path`` objects. Without a suffix filter,
        every entry is returned except those whose full path string contains
        ``.gitignore``.
    """
    if not only_files_with_extension:
        entries = []
        for entry in path.iterdir():
            if ".gitignore" in str(entry):
                continue
            entries.append(entry)
        return entries

    pattern = "*" + only_files_with_extension
    return [*path.glob(pattern)]


# Identifier columns that are kept, next to the metric column, when the
# per-snapshot results are aggregated.
DF_COLUMNS_TO_AGGREGATION = ["run", "embeddings_aggregation"]


def aggregate_aligner_results_(
    df: pd.DataFrame, metric_name: str, precision: int = 3
) -> Dict[Any, Dict[Any, Tuple[float, float]]]:
    """Summarise a metric per prediction snapshot and embeddings aggregation.

    For every distinct ``prediction_snapshot`` the rows whose ``snapshot``
    differs from their ``prediction_snapshot`` are grouped by
    ``embeddings_aggregation`` and the metric is reduced to its mean and
    standard deviation.

    Args:
        df: Results table with the columns ``run``, ``embeddings_aggregation``,
            ``snapshot``, ``prediction_snapshot`` and ``metric_name``.
        metric_name: Name of the metric column to aggregate.
        precision: Number of decimal digits kept in the rounded statistics.

    Returns:
        ``{prediction_snapshot: {embeddings_aggregation: (mean, std)}}``.
        A snapshot without any matching row maps to an empty dict.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
    """
    needed_columns = [*DF_COLUMNS_TO_AGGREGATION, metric_name]
    # Rows predicting a snapshot other than the one they come from.
    cross_snapshot = df.snapshot != df.prediction_snapshot

    snapshots_results = {}
    for snapshot in df.prediction_snapshot.unique():
        rows = df.loc[
            (df.prediction_snapshot == snapshot) & cross_snapshot, needed_columns
        ]
        # Aggregate the metric column directly instead of aggregating every
        # column and dropping the unwanted ones afterwards.
        stats = rows.groupby("embeddings_aggregation")[metric_name].agg(
            ["mean", "std"]
        )
        snapshots_results[snapshot] = {
            aggregation: (round(mean, precision), round(std, precision))
            for aggregation, mean, std in stats.itertuples(name=None)
        }
    return snapshots_results


def aggreagte_all_results_last_snapshot(
    paths: List[Path], metric_name: str, precision: float = 3
) -> List[Dict[str, Any]]:
    """Collect per-run metric values from every pickled results table.

    Each entry of ``paths`` is treated as one method directory: its name is
    the method name and every ``*.pkl`` file directly inside it is one dataset
    (the file name without ``.pkl``). Rows flagged as same-snapshot
    predictions are skipped.

    Args:
        paths: Method result directories.
        metric_name: Name of the metric column to extract.
        precision: Unused; kept so existing callers keep working.

    Returns:
        One dict per remaining row with the keys ``ds``, ``method_name``,
        ``aggregation_method``, ``alignment_snapshot``, ``metric_name``,
        ``snapshot`` and ``run`` (in that order, which downstream cells rely
        on when turning the records into a DataFrame).
    """
    results = []
    for method_results_path in paths:
        method_name = method_results_path.name
        method_ds_results_paths = get_dirs_from_path(
            method_results_path, only_files_with_extension=".pkl"
        )

        for method_ds_results_path in method_ds_results_paths:
            ds_name = method_ds_results_path.name.replace(".pkl", "")

            # The alignment variant is inferred from the file path itself.
            path_str = str(method_ds_results_path)
            if "prev" in path_str:
                alignment_snapshot = "prev"
            elif "full" in path_str:
                alignment_snapshot = "none"
            else:
                alignment_snapshot = "zero"

            # NOTE(security): unpickling executes arbitrary code; only load
            # result files produced by trusted experiment runs.
            method_df = pd.read_pickle(method_ds_results_path)
            cross_snapshot = method_df.same_snapshot_prediction.eq(False)

            for snapshot in method_df.prediction_snapshot.unique():
                rows = method_df.loc[
                    (method_df.prediction_snapshot == snapshot) & cross_snapshot,
                    ["run", "embeddings_aggregation", metric_name],
                ]
                # itertuples keeps each column's own dtype, whereas iterrows
                # coerces every row to a single common dtype.
                for run, aggregation, metric_value in rows.itertuples(
                    index=False, name=None
                ):
                    results.append(
                        {
                            "ds": ds_name,
                            "method_name": method_name,
                            "aggregation_method": aggregation,
                            "alignment_snapshot": alignment_snapshot,
                            metric_name: metric_value,
                            "snapshot": snapshot,
                            "run": run,
                        }
                    )
    return results


def convert_float_to_str(x: float) -> str:
    """Format ``x`` in fixed-point notation with two decimal places."""
    return "{:.2f}".format(x)


def percentage_style(
    x: Union[float, Tuple[float, float]]
) -> Union[float, Tuple[float, float]]:
    """Convert a fraction, or a pair of fractions, to percent.

    Args:
        x: Either a single real number or a tuple whose first two items are
            real numbers (e.g. ``(mean, std)``).

    Returns:
        ``x`` multiplied by 100 and rounded to two decimals; for a tuple, a
        2-tuple with the first two items converted the same way.

    Raises:
        ValueError: If ``x`` is neither a real number nor a tuple.
    """
    if isinstance(x, tuple):
        return round(x[0] * 100, 2), round(x[1] * 100, 2)
    # Accept ints and NumPy scalars too (an exact score such as 1 is not a
    # ``float`` instance); bools are excluded because they are not scores.
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(
        x, bool
    ):
        return round(x * 100, 2)
    raise ValueError(
        f"X parsing error: expected a number or a tuple, got {type(x).__name__}"
    )


def highlight_max(x: pd.Series) -> List[str]:
    """Build per-cell CSS that colours the best cell of ``x`` red.

    Each truthy cell is ranked by its first item; falsy cells (e.g. ``None``)
    count as ``0``. The first cell with the largest value gets
    ``"color:red"``, every other cell gets an empty style string.
    """
    leading_values = [cell[0] if cell else 0 for cell in x.values]
    best_position = np.argmax(leading_values)
    return [
        "color:red" if position == best_position else ""
        for position in range(len(x))
    ]

In [None]:
# Results stored under LP_EVALUATION_RESULTS: every record is relabelled with
# the single method name "Node2Vec (snapshot)".
paths = get_dirs_from_path(LP_EVALUATION_RESULTS)
df = pd.DataFrame(
    aggreagte_all_results_last_snapshot(paths, metric_name="auc", precision=3)
).assign(method_name="Node2Vec (snapshot)")
results = df.to_dict(orient="records")

# Results of the previous experiments keep their own method names.
prev_paths = get_dirs_from_path(PREV_EXPERIMENTS_PATH / "evaluation" / "lp")
results += aggreagte_all_results_last_snapshot(
    prev_paths, metric_name="auc", precision=3
)

# Results stored under ../data/evaluation_full/lp/.
full_paths = get_dirs_from_path(Path("../data/evaluation_full/lp/"))
results += aggreagte_all_results_last_snapshot(
    full_paths, metric_name="auc", precision=3
)

# One long-format table holding all three result sets.
results_df = pd.DataFrame(results)

In [None]:
from ipywidgets import interact
import plotly.express as px


@interact
def plot_results(
    dataset=results_df.ds.unique(),
    aggregation_method=results_df.aggregation_method.unique(),
) -> None:
    """Box-plot the AUC per snapshot for one dataset, coloured by method.

    Two row sets of ``results_df`` are combined, both restricted to
    ``dataset`` and to snapshots greater than 1:

    * rows with ``alignment_snapshot == "prev"`` and the selected
      ``aggregation_method`` (sorted by method name);
    * rows whose method name contains ``"Node2Vec"`` and whose aggregation
      method is ``"last"``, whatever ``aggregation_method`` is selected.

    NOTE(review): the second filter also matches every other method whose
    name contains "Node2Vec"; if such rows also have
    ``alignment_snapshot == "prev"`` they are plotted twice when
    ``aggregation_method == "last"`` - confirm this is intended.

    Args:
        dataset: Dataset name; ``interact`` offers the values found in
            ``results_df.ds``.
        aggregation_method: Embeddings aggregation method; ``interact`` offers
            the values found in ``results_df.aggregation_method``.
    """
    # Conditions shared by both selections.
    in_scope = (results_df.ds == dataset) & (results_df.snapshot > 1)

    alignment_df = results_df[
        in_scope
        & (results_df.aggregation_method == aggregation_method)
        & (results_df.alignment_snapshot == "prev")
    ].sort_values(by="method_name")

    n2v_df = results_df[
        in_scope
        & (results_df.aggregation_method == "last")
        & (results_df.method_name.str.contains("Node2Vec"))
    ]

    df = pd.concat([alignment_df, n2v_df], ignore_index=True)
    fig = px.box(
        data_frame=df,
        x="snapshot",
        y="auc",
        color="method_name",
        # Make the figure self-describing for the current widget selection.
        title=f"AUC per snapshot: {dataset} (aggregation: {aggregation_method})",
    )
    fig.show()
#     display(df)
    
    
#     sub_df = df[(df.alignment_snapshot == "prev") & (df.aggregation_method == "last")].copy()
#     display(sub_df.groupby(["ds", "method_name", "snapshot"]).agg(["mean", "std"]))


## Tables

In [None]:
# Display the combined long-format results table (one row per dataset,
# method, aggregation method, snapshot and run).
results_df

In [None]:
# Method names in the row order used for the final tables; the cells below
# select rows with `.loc[order]`, so every name must be present in the results.
order = [
    "Node2Vec",
    "Node2Vec (snapshot)",
    "PosthocALL",
    "PosthocEJ",
    "PosthocTB",
    "Node2VecAligned_L2_ALL",
    "Node2VecAligned_L2_EJ",
    "Node2VecAligned_L2_EJ_Weighted",
    "Node2VecAligned_L2_TB",
    "Node2VecAligned_L2_TB_Weighted",
]

In [None]:
# Per-dataset summary of the last snapshot: mean/std of the AUC over runs for
# every (method, aggregation method) pair. Also records the best aggregation
# method per dataset and accumulates a markdown rendering of each table.
markdown_str = ''
best_aggregation_method = {}
for idx, ds_results in results_df.groupby(by="ds"):
    ds_df = ds_results.copy()
    # Show the dataset name above its table.
    display(idx)
    ds_df.drop(["ds", "alignment_snapshot"], axis=1, inplace=True)
    # Keep only the rows of the largest snapshot present for this dataset.
    ds_df = ds_df[ds_df.snapshot == ds_df.snapshot.max()].drop(["snapshot"], axis=1)
    # Positional rename: relies on the four remaining columns being in exactly
    # this order (the order in which the result records were built).
    ds_df.columns = ["method_name", "aggregation_method", "auc", "run"]
    #     display(ds_df["method_name"])
    # Mean and std over runs; the statistics of the "run" column itself are
    # meaningless and are dropped.
    ds_df = (
        ds_df.groupby(by=["method_name", "aggregation_method"])
        .aggregate(["mean", "std"])
        .drop("run", axis=1)
        #
    )
    #     display(ds_df.fillna(0))
    #     ds_df.drop(["Node2Vec", "Node2Vec (snapshot)"], inplace=True)
    # Collapse each row into a rounded (mean, std) tuple, then pivot so that
    # rows are methods and columns are aggregation methods. Cells left as a
    # bare float (presumably the NaN that pivot inserts for a missing
    # combination) are replaced by (0.0, 0.0) so every cell is a tuple.
    ds_df = (
        ds_df.apply(
            lambda x: (round(x["auc"]["mean"], 3), round(x["auc"]["std"], 3)),
            axis=1,
        )
        .to_frame()
        .reset_index()
        .pivot(index="method_name", columns="aggregation_method", values=0)
        .applymap(lambda x: (0.0, 0.0) if isinstance(x, float) else x)
    )
    # Best aggregation method = the column whose highest mean AUC (over all
    # methods) is the largest.
    best_aggregation_method[idx] = ds_df.applymap(lambda x: x[0]).max().idxmax()
    # `Styler.highlight_max` is the pandas built-in, not the `highlight_max`
    # helper defined earlier in this notebook.
    display(ds_df.style.highlight_max())
    markdown_str += (idx + " \n \n ")
    # Append the table with rows sorted by the first item of each row's
    # maximum cell, best first.
    markdown_str +=(
        ds_df.loc[
            ds_df.max(axis=1).apply(lambda x: x[0]).sort_values(ascending=False).index
        ].to_markdown() + " \n \n" 
    )


In [None]:
# Aggregation method forced for these baseline methods when `results_data` is
# built; every other method uses the dataset's entry of
# `best_aggregation_method`.
node2vec_aggregation_method = {
    "Node2Vec": "last",
    "Node2Vec (snapshot)": "last"
}

In [None]:
# Show the aggregation method selected as best for each dataset.
best_aggregation_method

In [None]:
# results_data[dataset][method] = (mean AUC, std of AUC) over the runs of the
# last snapshot, using one aggregation method per method (see below).
results_data = defaultdict(dict)
for idx, ds_results in results_df.groupby(by="ds"):
    ds_df = ds_results.drop(["ds", "alignment_snapshot"], axis=1)
    # Keep only the rows of the largest snapshot present for this dataset.
    ds_df = ds_df[ds_df.snapshot == ds_df.snapshot.max()].drop(["snapshot"], axis=1)

    for method_name, method_df in ds_df.groupby(by="method_name"):
        # Baselines use their fixed aggregation method; all other methods use
        # the best aggregation method found for this dataset.
        aggregation_method = (
            node2vec_aggregation_method[method_name]
            if method_name in node2vec_aggregation_method
            else best_aggregation_method[idx]
        )
        method_df = method_df[method_df.aggregation_method == aggregation_method]
        # Sanity check: exactly one row per run id 0..max(run).
        assert len(method_df) == method_df.run.max() + 1, (
            f"unexpected number of runs for dataset {idx!r}, method "
            f"{method_name!r}, aggregation {aggregation_method!r}: "
            f"{len(method_df)} rows"
        )
        # Built-in round() works for both NumPy and plain Python floats.
        mean_auc = round(method_df["auc"].mean(), 3)
        # NOTE: despite its name, this is the standard deviation of the AUC.
        mean_std = round(method_df["auc"].std(), 3)
        results_data[idx][method_name] = (mean_auc, mean_std)
        
        


In [None]:
def get_top_score_bold(x: pd.Series) -> List[str]:
    """Render a column of ``(mean, std)`` cells as LaTeX math strings.

    The largest cell (by comparing the cells themselves, first occurrence on
    ties) is wrapped in ``\\mathbf{...}``. Any other cell is rendered as
    ``$mean ± std$`` when its mean lies strictly between -1,000,000 and
    1,000,000, and as ``$\\times$`` otherwise.

    Args:
        x: Series whose cells are indexable ``(mean, std)`` pairs. Its index
            labels are ignored; cells are processed by position.

    Returns:
        One formatted string per cell, in the order of ``x``.
    """
    # Read the cells positionally: ``x[i]`` with an integer on a label-indexed
    # Series relies on a positional fallback that pandas has deprecated.
    cells = list(x.values)
    max_id = np.argmax(x.values)

    def _mean_pm_std(cell) -> str:
        # "mean ± std", each number with two decimals.
        return convert_float_to_str(cell[0]) + " ± " + convert_float_to_str(cell[1])

    output = []
    for i, cell in enumerate(cells):
        if i == max_id:
            output.append(f"$\\mathbf{{{_mean_pm_std(cell)}}}$")
        elif -1_000_000 < cell[0] < 1_000_000:
            output.append(f"${_mean_pm_std(cell)}$")
        else:
            output.append("$\\times$")
    return output


In [None]:
# Preview table: rows follow `order`, one column per dataset.
results = pd.DataFrame(results_data).loc[order]
# Mean rank of each method across datasets (rank 1 = highest mean AUC).
mean_rank = (
    results.applymap(lambda cell: cell[0])
    .rank(ascending=False)
    .mean(axis=1)
    .round(2)
)
# Values stay as fractions here (no percentage conversion); the best cell of
# each dataset column is set in bold.
results = results.apply(get_top_score_bold)
results

In [None]:
# LaTeX display names used as row labels of the final table. Backslashes are
# escaped explicitly: "\m" is not a valid Python escape sequence and triggers
# a warning on recent Python versions. The runtime values are unchanged.
MODEL_NAMES = {
    "PosthocTB": "Posthoc-TB",
    "PosthocEJ": "Posthoc-EJ",
    "PosthocALL": "Posthoc-PA",
    "Node2VecAligned_L2_ALL": "\\makecell[l]{Node2Vec \\\\ (Regularized, All)}",
    "Node2VecAligned_L2_EJ": "\\makecell[l]{Node2Vec \\\\ (Regularized, EJ)}",
    "Node2VecAligned_L2_EJ_Weighted": "\\makecell[l]{Node2Vec \\\\ (Regularized, \\\\ Weighted, EJ)}",
    "Node2VecAligned_L2_TB": "\\makecell[l]{Node2Vec \\\\ (Regularized, TB)}",
    "Node2VecAligned_L2_TB_Weighted": "\\makecell[l]{Node2Vec \\\\ (Regularized, \\\\ Weighted, TB)}",
}

In [None]:
# Final table: rows follow `order`, one column per dataset.
results = pd.DataFrame(results_data).loc[order]
# Mean rank of each method across datasets (rank 1 = highest mean AUC),
# computed on the raw means before any formatting.
mean_rank = (
    results.applymap(lambda cell: cell[0])
    .rank(ascending=False)
    .mean(axis=1)
    .round(2)
)
# Convert to percent, then render every cell as a LaTeX string with the best
# score of each dataset in bold.
results = results.applymap(percentage_style).apply(get_top_score_bold)
results["Mean rank"] = [convert_float_to_str(rank) for rank in mean_rank]
# Rows stay in `order`; sorting by "Mean rank" would be the alternative.
results = results.loc[order].rename(MODEL_NAMES, axis=0)
results

In [None]:
# Print the LaTeX source of the final table. The Styler output is patched with
# plain string replacements. Patterns containing a backslash are written as
# raw or explicitly escaped strings so that no invalid escape sequence is used.
print((
    results.style.to_latex()
    # Right-align the five value columns and open the table with \toprule.
    .replace("{llllll}", "{lrrrrr}\n\\toprule")
    # Drop the empty row that carries no cell content.
    .replace("&  &  &  &  &  \\\\", "")
    .replace("NaN", "---")
    .replace("±", r"\pm")
    # Undo escaping of characters that belong to the LaTeX markup itself.
    .replace(r"\$", "$")
    .replace("textbackslash ", "")
    .replace(r"\{", "{")
    .replace(r"\}", "}")
    # Escape underscores so they are valid in LaTeX text.
    .replace("_", r"\_")
    .replace("}<KLEJ>", "\\textcolor{red}{*}}")
    # Close the table with \bottomrule.
    .replace("\\end{tabular}", "\\bottomrule \n\\end{tabular}")
))