In [None]:
import logging
import os
import sys
from collections import defaultdict
from itertools import product
from pathlib import Path
from typing import Dict, List, Optional

import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
import textalloc as ta
from seaborn._statistics import LetterValues
from tqdm import tqdm

# Locate this notebook on disk via the "__vsc_ipynb_file__" entry of the calling module's locals.
# NOTE(review): that key is presumably injected by the VS Code Jupyter extension; other front ends
# may not provide it, in which case this line raises KeyError - confirm before running elsewhere.
NOTEBOOK_PATH: Path = Path(IPython.extract_module_locals()[1]["__vsc_ipynb_file__"])
# The project root is taken to be two directory levels above the notebook file.
PROJECT_DIR: Path = NOTEBOOK_PATH.parent.parent
# Make the project root importable so that the ``src.*`` imports that follow can resolve.
sys.path.append(str(PROJECT_DIR))
import src.utils.custom_log as custom_log
import src.utils.json_util as json_util
from src._StandardNames import StandardNames
from src.load.LoadForClassification import RENAMER, LoadForClassification
from src.mcdm.GetRankingFromExperiments import GetRankingFromExperiments
from src.mcdm.ReadAlternatives import ReadAlternative
from src.utils.PathChecker import PathChecker
from src.utils.set_rcparams import set_rcparams

# Work relative to the project root from here on and apply the project's matplotlib settings.
os.chdir(PROJECT_DIR)
set_rcparams()

# Module logger; the project's logging helper is initialised at INFO level.
LOG: logging.Logger = logging.getLogger(__name__)
custom_log.init_logger(log_lvl=logging.INFO)
LOG.info("Log start, project directory is %s (exist: %s)", PROJECT_DIR, PROJECT_DIR.is_dir())

# Shared helper instances used by the cells below.
CHECK: PathChecker = PathChecker()
STR: StandardNames = StandardNames()

# Figures go to reports/figures/<notebook name>/; the sub-directory is created on demand.
# NOTE(review): ``exit=False`` presumably makes the checker report instead of abort when the
# directory is missing - confirm against PathChecker.
FIG_DIR: Path = CHECK.check_directory(PROJECT_DIR / "reports" / "figures", exit=False)
FIG_DIR /= NOTEBOOK_PATH.stem
FIG_DIR.mkdir(parents=True, exist_ok=True)
LOG.info("Figure directory is %s (exist: %s)", FIG_DIR, FIG_DIR.is_dir())

# Root directory holding one sub-directory per experiment.
EXP_DIR:Path = CHECK.check_directory(PROJECT_DIR / "experiments", exit=False)

In [2]:
# Figure width handed to matplotlib (which expects inches).
# NOTE(review): 448.13095 looks like a document text width in points and 72 like a
# points-per-inch factor, with 0.2 subtracted as a margin - confirm against the target document.
WIDTH: float = (448.13095 / 72) - 0.2

In [None]:
# Load the alternatives table for every experiment under EXP_DIR matching "*_lstm_treesearch_*".
# Later cells use the columns as alternative (experiment directory) names and the rows as criteria.
ALTERNATIVES: pd.DataFrame = ReadAlternative(b_path=EXP_DIR, search_pattern="*_lstm_treesearch_*").get_data()
# Trailing expression: render the frame as the cell output.
ALTERNATIVES

In [29]:
# Drop every alternative whose second "-"-separated name token, read as an integer, is 11 or more.
# NOTE(review): the meaning of that token and of the threshold 11 is not visible here - confirm.
# The result is re-bound instead of using ``inplace=True`` so the cell yields a fresh frame and
# does not mutate an object that an earlier cell has already displayed.
ALTERNATIVES = ALTERNATIVES.drop(columns=[c for c in ALTERNATIVES.columns if int(c.split("-")[1]) >= 11])

In [None]:
def get_factors() -> pd.DataFrame:
    """Collect the parameter settings ("factors") of every alternative.

    For each column name of ``ALTERNATIVES`` the parameter file
    ``EXP_DIR / <alternative> / STR.fname_para`` is loaded. The entries of its
    ``STR.pipeline`` section become the factors, minus a fixed set of excluded
    keys, plus two values taken from other sections: the first target
    percentile (stored under ``STR.perc``) and the input features (stored
    under ``"ai_in"``).

    Returns:
        pd.DataFrame: One row per alternative (index named ``STR.alternatives``)
        and one column per factor (column axis named ``"Factors"``). List
        values are converted to tuples.
    """
    # Pipeline entries that are deliberately left out of the factor table.
    excluded_keys = {
        "feature_extractor_path",
        "plot_model",
        "start_early_stopping_from_n_epochs",
        "max_epochs",
        "patience_factor",
    }

    factors = {}
    for alternative in ALTERNATIVES.columns:
        f_path = EXP_DIR / alternative / STR.fname_para
        LOG.info("Reading factors from %s", f_path)
        paras = json_util.load(f_path)

        # Build a filtered copy rather than deleting keys from the loaded dictionary.
        entry = {key: value for key, value in paras[STR.pipeline].items() if key not in excluded_keys}
        entry[STR.perc] = paras[STR.perc][STR.target][0]
        entry["ai_in"] = paras[STR.data][STR.input][STR.feature]
        factors[alternative] = entry

    factors = pd.DataFrame(factors).T
    factors.index.name = STR.alternatives
    factors.columns.name = "Factors"

    # Lists are unhashable; turn them into tuples so factor values can serve as index keys.
    # Every row is inspected (not only the first) and non-list values are left untouched.
    for col in factors.columns:
        values = list(factors[col])
        if any(isinstance(value, list) for value in values):
            factors[col] = [tuple(value) if isinstance(value, list) else value for value in values]

    return factors


# Build the factor table, keep only the alternatives whose STR.perc factor equals 95,
# and restrict ALTERNATIVES to exactly those columns.
FACTORS: pd.DataFrame = get_factors()
FACTORS = FACTORS.loc[FACTORS[STR.perc] == 95].copy()
ALTERNATIVES = ALTERNATIVES.loc[:, FACTORS.index].copy()
# Trailing expression: render the filtered factor table as the cell output.
FACTORS

In [None]:
def get_rankings() -> pd.DataFrame:
    """Return the ranking of the alternatives currently listed in ``FACTORS``.

    A ``GetRankingFromExperiments`` object is created for ``EXP_DIR`` and the
    alternative names in ``FACTORS.index``. ``get_data()`` is called first (its
    return value is not used here), then the result of ``get_ranking()`` is
    handed back to the caller.
    """
    selected_alternatives = FACTORS.index.tolist()
    ranker = GetRankingFromExperiments(b_path=EXP_DIR, files=selected_alternatives)
    ranker.get_data()
    ranking = ranker.get_ranking()
    return ranking


# Compute the ranking for the filtered set of alternatives.
RANKINGS: pd.DataFrame = get_rankings()
# Trailing expression: render the ranking as the cell output.
RANKINGS

In [None]:
def make_a_plot() -> Dict[str, pd.DataFrame]:
    """Plot two criteria of all alternatives over their sorted order and save the figure.

    The figure has two legend-only rows on top and one plot row whose left axis
    shows ``"us_MLmetric"`` (circles) and whose twin right axis shows
    ``"setup_training_comp_time_metamodel"`` (crosses). Alternatives are
    coloured by group, where the group label is the alternative name from
    character 36 onwards. For the first criterion every point is additionally
    annotated with the value of the factor named like its group, when
    ``FACTORS`` has such an entry. The figure is written to
    ``FIG_DIR / "lstm_tournament.pdf"``.

    Returns:
        Dict[str, pd.DataFrame]: Per criterion, the plotted frame indexed by
        alternative name with the transformed criterion values and an ``"IDX"``
        column holding the x position of each alternative.
    """
    # Group labels: name suffix (from character 36 on) of every alternative that does not
    # end in "false" and whose suffix does not contain "perc".
    groups = {
        name[36:] for name in ALTERNATIVES.columns if not name.endswith("false") and "perc" not in name[36:]
    }
    fig, (axl1, axl2, ax1) = plt.subplots(
        nrows=3,
        height_ratios=[0.1, 0.1, 1],
        layout="constrained",
    )
    ax2 = ax1.twinx()
    criteria = ("us_MLmetric", "setup_training_comp_time_metamodel")
    naming = {
        "us_MLmetric": "R² on Test Set",
        # Raw string: "\O" is not a valid Python escape sequence and warns in a normal literal.
        "setup_training_comp_time_metamodel": r"$\O$ Computation Time Training per Fit [min]",
    }

    stored = {}
    for criterion, ax in zip(criteria, (ax1, ax2)):
        data = ALTERNATIVES.loc[[criterion]].T.sort_index()
        data["IDX"] = range(data.shape[0])
        # The same object is transformed in place below, so the stored frame holds plotted values.
        stored[criterion] = data

        if criterion == criteria[0]:
            # Plot 1 - value. NOTE(review): presumably the stored metric is 1 - R² - confirm.
            data[criterion] -= 1
            data[criterion] *= -1
        else:
            # NOTE(review): presumably seconds -> minutes, then divided by 6 fits - confirm.
            data[criterion] /= 60
            data[criterion] /= 6

        # Faint line connecting all alternatives in x order.
        ax.plot(data["IDX"], data[criterion], c="black", alpha=0.1)

        for group in sorted(groups):
            # NOTE(review): this is a substring test against the group label, not an equality test.
            idx = [x for x in data.index if x[36:] in group]

            ax.scatter(
                data.loc[idx, "IDX"],
                data.loc[idx, criterion],
                label=group,
                s=60,
                marker="o" if criterion == criteria[0] else "x",
            )

            if criterion == criteria[0]:
                for i in idx:
                    try:
                        label = FACTORS.loc[i, group]
                    except KeyError:
                        # The group is not a factor column (or the alternative is not in FACTORS):
                        # there is nothing to annotate for this point.
                        continue
                    ax.annotate(
                        text=label,
                        xy=(data.loc[i, "IDX"], data.loc[i, criterion]),
                        xytext=(data.loc[i, "IDX"], 0.003 + data.loc[i, criterion]),
                        arrowprops=dict(ec="black", lw=1, arrowstyle="-", alpha=0.6),
                        ha="center",
                        bbox=dict(boxstyle="round", fc="grey", ec="grey", alpha=0.2),
                        rotation=90,
                    )

        ax.set_ylabel(naming[criterion])

    # The two top rows only carry the legends of the left and right axis.
    axl1.legend(*ax1.get_legend_handles_labels(), loc="center", ncol=4, title=naming[criteria[0]])
    axl1.axis("off")
    # Legend title of the second criterion: its label without the trailing unit token.
    axl2.legend(*ax2.get_legend_handles_labels(), loc="center", ncol=4, title=" ".join(naming[criteria[1]].split()[:-1]))
    axl2.axis("off")

    ax1.grid()
    # NOTE(review): tick count and axis limits are hard-coded for the current experiment set.
    ax1.set_xticks(range(data.shape[0] - 2))
    ax1.set_axisbelow(True)
    ax1.set_xlim([-0.5, 18.5])
    ax1.set_xlabel("Iteration")
    ax1.set_ylim([0.9, 0.93])

    fig.set_figwidth(WIDTH)
    fig.set_figheight(WIDTH * 0.72)
    fig.savefig(FIG_DIR / "lstm_tournament.pdf")

    return stored


# Draw and save the overview figure; keep the per-criterion frames (including the "IDX"
# x positions) for the duplicate analysis in the following cells.
STORED = make_a_plot()
# Trailing expression: render the stored frames as the cell output.
STORED

In [None]:
def get_duplicates(criterion: str = "us_MLmetric") -> None:
    """Compare alternatives that share an identical factor combination.

    ``FACTORS`` is re-indexed by all of its factor columns. Every factor
    combination occurring more than once forms a group; for each member the
    value of ``criterion`` is read from ``ALTERNATIVES`` and its x position from
    ``STORED[criterion]``. The groups are displayed as a table and drawn as a
    grouped bar plot (one bar per repetition, annotated with the x position),
    which is saved to ``FIG_DIR / f"lstm_tournament_duplicates_{criterion}.pdf"``.

    The value transform, y label and y limits depend on the criterion:
    ``"us_MLmetric"`` is shown as ``1 - value`` within ``[0.9, 0.93]``;
    ``"setup_training_comp_time_metamodel"`` is divided by 60 and by 6 and
    auto-scaled; any other criterion is shown untransformed.

    Args:
        criterion: Row label of ``ALTERNATIVES`` and key of ``STORED`` to compare.
    """
    factor_columns = list(FACTORS.columns)
    by_factors = FACTORS.reset_index().set_index(factor_columns).sort_index()
    records = []

    grouper = {}
    used = set()
    for qq, idx in enumerate(by_factors.index):
        # Each distinct factor combination is handled once, at its first occurrence.
        if idx in used:
            continue
        used.add(idx)
        n = by_factors.loc[idx]
        if n.shape[0] > 1:
            for i in range(n.shape[0]):
                records.append(
                    (
                        ALTERNATIVES.loc[criterion, n.iloc[i]].to_list()[0],
                        str(i),
                        str(qq),
                        STORED[criterion].loc[n.iloc[i]["Alternatives"], "IDX"],
                    )
                )
            grouper[qq] = idx

    w = pd.DataFrame(records, columns=[criterion, "Repetition", "Group", "IDX"])
    grouper = pd.DataFrame(grouper, index=by_factors.index.names).T

    display(grouper)

    if w.empty:
        LOG.warning("No duplicated factor combinations found for %s, nothing to plot", criterion)
        return

    # Criterion-specific presentation, so the time criterion is not plotted as "1 - value"
    # with the R² label and R² axis limits.
    if criterion == "us_MLmetric":
        w[criterion] -= 1
        w[criterion] *= -1
        y_label = "R² on Test Set"
        y_limits = [0.9, 0.93]
    elif criterion == "setup_training_comp_time_metamodel":
        # NOTE(review): presumably seconds -> minutes, then divided by 6 fits - confirm.
        w[criterion] /= 60
        w[criterion] /= 6
        y_label = r"$\O$ Computation Time Training per Fit [min]"
        y_limits = None
    else:
        y_label = criterion
        y_limits = None

    fig, ax = plt.subplots(figsize=(WIDTH, 0.3 * WIDTH))
    plots = sns.barplot(
        w,
        y=criterion,
        x="Group",
        hue="Repetition",
        legend=False,
        ax=ax,
        gap=0.1,
    )
    if y_limits is not None:
        ax.set_ylim(y_limits)
    ax.set_ylabel(y_label)
    ax.set_xlabel("")
    ax.set_xticklabels([])
    ax.grid()
    ax.set_axisbelow(True)

    # Bars are matched back to their x position through their height; patches whose height
    # is not one of the plotted values are skipped instead of raising.
    idx_by_height = dict(zip(w[criterion], w["IDX"]))
    for bar in plots.patches:
        height = bar.get_height()
        if height not in idx_by_height:
            continue
        plots.annotate(
            idx_by_height[height],
            (bar.get_x() + bar.get_width() / 2, height),
            ha="center",
            va="center",
            xytext=(0, 8),
            textcoords="offset points",
        )

    fig.savefig(FIG_DIR / f"lstm_tournament_duplicates_{criterion}.pdf")


# Duplicate analysis for the default criterion "us_MLmetric".
get_duplicates()

In [None]:
# Same duplicate analysis, this time for the "setup_training_comp_time_metamodel" criterion.
get_duplicates("setup_training_comp_time_metamodel")