In [None]:
%load_ext autoreload
%autoreload 2

import hydra
from hydra import initialize, compose

# Initialize Hydra for this notebook session, using the `../conf` config directory.
initialize(
    version_base=None,
    config_path="../conf",
    job_name="matrioska_learning",
)

# Instantiate configuration

In [None]:
from nn_core.common import PROJECT_ROOT

# Compose the Hydra configuration named "matrioska_learning" (no overrides applied)
cfg = compose(config_name="matrioska_learning", overrides=[])

# Instantiate dataset

In [None]:
from la.utils.io_utils import add_ids_to_dataset, load_data
from la.utils.io_utils import preprocess_dataset


# Keep a handle on the loaded (sharded) dataset: its label feature is read by a later cell.
# TODO remove sharding when done developing
original_dataset = load_data(cfg).shard(num_shards=10, index=0)

# Project-specific preprocessing first, then sample ids on top of it.
dataset = add_ids_to_dataset(preprocess_dataset(original_dataset, cfg))

# presumably "x" is a PIL image whose `.size` is (width, height) — TODO confirm
img_size = dataset["train"][0]["x"].size[1]
dataset

# Define matrioska datasets

In [None]:
# Hugging Face-specific lookups
# (when switching to another dataset, redefining these two variables should be enough)
class_names = original_dataset["train"].features["label"].names
class_idxs = list(map(original_dataset["train"].features["label"].str2int, class_names))

class_names, class_idxs

In [None]:
# Matrioska parameters: the smallest class set holds just the first two classes
MATRIOSKA_START_NCLASSES = [0, 1]
# Every other class index, in ascending order; these are added one at a time later on
remanining_classes = sorted(set(class_idxs).difference(MATRIOSKA_START_NCLASSES))
MATRIOSKA_START_NCLASSES, remanining_classes

In [None]:
# Build the nested class sets: entry k is the start classes plus the first k remaining classes
matrioskaclasses = [
    set(MATRIOSKA_START_NCLASSES).union(remanining_classes[:n_extra])
    for n_extra in range(len(remanining_classes) + 1)
]
matrioskaclasses

In [None]:
# One filtered dataset per matrioska class set: keep only the rows whose label is in the set.
# TODO: do we want the same number of samples in all the datasets?
# Probably not: leaving them unbalanced is the fairer, worst-case scenario.
matrioskaidx2dataset = {
    idx: dataset.filter(lambda row: row["y"] in allowed_classes)
    for idx, allowed_classes in enumerate(matrioskaclasses)
}

# Classes follow the prefix convention, so local and global class ids stay consistent.
# Keeping it that way is the simplest option.
matrioskaidx2dataset

# Train matrioska models

In [None]:
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader


# Maps matrioska index -> trained model (switched to eval mode, moved to CPU, gradients disabled).
matrioskaidx2model = {}


for i in range(len(matrioskaclasses)):
    print(f"Training model {i}...")

    # `num_classes` follows the size of the i-th class set, so each model is sized for its own set.
    model: pl.LightningModule = hydra.utils.instantiate(
        cfg.nn.model,
        _recursive_=False,
        num_classes=len(matrioskaclasses[i]),
        model=cfg.nn.model.model,
        input_dim=img_size,
    )

    # Apply the model's own input transform to every sample, then expose "x"/"y" as torch tensors.
    processed_dataset = matrioskaidx2dataset[i].map(
        desc="Preprocessing samples",
        function=lambda x: {"x": model.transform_func(x["x"])},
    )
    processed_dataset.set_format(type="torch", columns=["x", "y"])

    train_loader = DataLoader(
        processed_dataset["train"],
        batch_size=64,
        pin_memory=True,
        shuffle=True,
        num_workers=8,
    )
    # Validation data is not shuffled: order does not matter for the metrics, and Lightning
    # warns against shuffled val/test dataloaders.
    val_loader = DataLoader(
        processed_dataset["test"],
        batch_size=64,
        pin_memory=True,
        shuffle=False,
        num_workers=0,
    )

    trainer = Trainer(
        accelerator="auto",
        devices=1,
        max_epochs=3,
        logger=None,
        # callbacks=[RichProgressBar()],
        enable_progress_bar=True,
    )
    trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

    # Store the trained model ready for inference-only use in the evaluation cells.
    matrioskaidx2model[i] = trainer.model.eval().cpu().requires_grad_(False)

# Evaluate matrioska models

In [None]:
# Decide which classes to evaluate on -- it may be interesting to change this
# NOTE: this binds the very same list object as MATRIOSKA_START_NCLASSES (no copy is made).
EVALUATION_CLASSES = MATRIOSKA_START_NCLASSES
EVALUATION_CLASSES

In [None]:
# Define the evaluation dataset according to the chosen classes
eval_dataset = dataset.filter(lambda row: row["y"] in set(EVALUATION_CLASSES))

# NOTE(review): `model` is whatever the notebook-level name is bound to when this cell runs
# (on a top-to-bottom run: the last model created by the training loop).
# Presumably `transform_func` is identical for every matrioska model — TODO confirm.
eval_dataset = eval_dataset.map(
    desc="Preprocessing samples",
    function=lambda x: {"x": model.transform_func(x["x"])},
)
eval_dataset.set_format(type="torch", columns=["x", "y"])

# Evaluation data is iterated in a fixed order: shuffling adds nothing to the metrics and
# Lightning warns against shuffled test dataloaders.
eval_loader = DataLoader(
    eval_dataset["test"],
    batch_size=64,
    pin_memory=True,
    shuffle=False,
    num_workers=0,
)

# Fresh trainer for evaluation; it rebinds `trainer`, replacing the one left by the training loop.
trainer = Trainer(
    accelerator="auto",
    devices=1,
    max_epochs=3,
    logger=None,
    # callbacks=[RichProgressBar()],
    enable_progress_bar=True,
)

eval_dataset

In [None]:
import dataclasses


@dataclasses.dataclass
class Result:
    """One evaluation record: a matrioska model paired with one clustering algorithm.

    The clustering fields are named after the `sklearn.metrics` functions that
    produce them.
    """

    # Index of the evaluated model in `matrioskaidx2model`
    matrioska_idx: int
    # Classification metrics reported by the trainer's test run
    test_acc: float
    test_loss: float
    # Name of the clustering algorithm (key of the `clusterizer` dict)
    clusterer: str
    v_measure_score: float
    adjusted_mutual_info_score: float
    adjusted_rand_score: float
    completeness_score: float
    fowlkes_mallows_score: float
    # sklearn returns a (homogeneity, completeness, v_measure) triple here, not a scalar
    homogeneity_completeness_v_measure: tuple
    homogeneity_score: float
    mutual_info_score: float
    normalized_mutual_info_score: float
    rand_score: float

In [None]:
from sklearn.cluster import KMeans, BisectingKMeans
import sklearn
import torch


# NOTE(review): rebinds the notebook-level name `model` to the matrioska model stored under
# key 0; any later cell that reads the bare name `model` gets this one.
model = matrioskaidx2model[0]


def compute_eval_embedings(model, eval_loader):
    """Run `model` over every batch of `eval_loader` and collect embeddings and labels.

    Args:
        model: callable taking `batch["x"]` and returning a mapping with an "embeds" tensor.
        eval_loader: iterable of batches; each batch is a mapping with "x" and "y" tensors.

    Returns:
        A pair `(embeddings, labels)` of numpy arrays, each the concatenation along
        dimension 0 of the per-batch tensors, in iteration order.

    Note:
        An empty loader produces no chunks, so `torch.cat` fails on the empty list.
    """
    embedding_chunks = []
    label_chunks = []

    # Inference only: skip autograd bookkeeping and keep just detached CPU copies, so no
    # computation graph is held alive while the whole loader is being consumed.
    with torch.no_grad():
        for batch in eval_loader:
            out = model(batch["x"])
            embedding_chunks.append(out["embeds"].detach().cpu())
            label_chunks.append(batch["y"].detach().cpu())

    eval_embeddings = torch.cat(embedding_chunks, dim=0)
    eval_labels = torch.cat(label_chunks, dim=0)
    return eval_embeddings.numpy(), eval_labels.numpy()


# Clustering algorithms keyed by display name. Each callable fits a fresh estimator on the
# embeddings, with as many clusters as there are evaluation classes, and returns the labels
# assigned to the samples.
clusterizer = {
    "kmeans": lambda embeds: KMeans(n_clusters=len(EVALUATION_CLASSES)).fit_predict(embeds),
    "bisect-kmeans": lambda embeds: BisectingKMeans(n_clusters=len(EVALUATION_CLASSES)).fit_predict(embeds),
}

# A bare `import sklearn` does not by itself guarantee that the `metrics` submodule is loaded;
# import it explicitly rather than relying on another import having pulled it in.
import sklearn.metrics

# Clustering quality metrics keyed by the name of the `sklearn.metrics` function they wrap.
# All callables share the keyword signature (x, y_pred, y_true); `x` is accepted but unused.
# Every entry yields a scalar except "homogeneity_completeness_v_measure", which yields a
# (homogeneity, completeness, v_measure) triple.
clustering_metric = {
    "v_measure_score": lambda x, y_pred, y_true: sklearn.metrics.v_measure_score(y_true, y_pred),
    "adjusted_mutual_info_score": lambda x, y_pred, y_true: sklearn.metrics.adjusted_mutual_info_score(y_true, y_pred),
    "adjusted_rand_score": lambda x, y_pred, y_true: sklearn.metrics.adjusted_rand_score(y_true, y_pred),
    "completeness_score": lambda x, y_pred, y_true: sklearn.metrics.completeness_score(y_true, y_pred),
    "fowlkes_mallows_score": lambda x, y_pred, y_true: sklearn.metrics.fowlkes_mallows_score(y_true, y_pred),
    "homogeneity_completeness_v_measure": lambda x, y_pred, y_true: sklearn.metrics.homogeneity_completeness_v_measure(
        y_true, y_pred
    ),
    "homogeneity_score": lambda x, y_pred, y_true: sklearn.metrics.homogeneity_score(y_true, y_pred),
    "mutual_info_score": lambda x, y_pred, y_true: sklearn.metrics.mutual_info_score(y_true, y_pred),
    "normalized_mutual_info_score": lambda x, y_pred, y_true: sklearn.metrics.normalized_mutual_info_score(
        y_true, y_pred
    ),
    "rand_score": lambda x, y_pred, y_true: sklearn.metrics.rand_score(y_true, y_pred),
}

In [None]:
# One Result per (matrioska model, clustering algorithm) pair.
performance = []
for i in range(len(matrioskaidx2model)):
    # Supervised metrics of the i-th model on the evaluation classes.
    result = trainer.test(model=matrioskaidx2model[i], dataloaders=eval_loader)[0]

    # Embeddings must come from the model under evaluation, not from the notebook-level
    # `model`; otherwise every row would repeat the clustering scores of a single model.
    eval_embeddings, eval_labels = compute_eval_embedings(matrioskaidx2model[i], eval_loader)

    for clusterizer_name, clusterizer_func in clusterizer.items():
        clustering_labels = clusterizer_func(eval_embeddings)

        # Score the cluster assignment against the ground-truth labels with every metric.
        metrics = {
            metric_name: metric_func(x=eval_embeddings, y_pred=clustering_labels, y_true=eval_labels)
            for metric_name, metric_func in clustering_metric.items()
        }

        performance.append(
            Result(
                matrioska_idx=i,
                test_acc=result["acc/test"],
                test_loss=result["loss/test"],
                clusterer=clusterizer_name,
                **metrics,
            )
        )

In [None]:
import pandas as pd

# Tabulate the collected results: one row per Result in `performance`
perf = pd.DataFrame(performance)
perf

In [None]:
# Persist the results table as perf.csv under PROJECT_ROOT (row index not written)
perf.to_csv(PROJECT_ROOT / "perf.csv", index=False)

In [None]:
# Reload the persisted results so the plots below can be redrawn without re-running training.
perf = pd.read_csv(PROJECT_ROOT / "perf.csv")
# Model i is trained on the start classes plus i additional ones, so derive the class count
# from the start set instead of hard-coding its size.
perf["ntrain_classes"] = perf["matrioska_idx"] + len(MATRIOSKA_START_NCLASSES)

In [None]:
import plotly.express as px

# `labels` keys must name the plotted columns; the x axis shows "ntrain_classes".
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="test_acc",
    labels={"ntrain_classes": "Number of classes trained on", "test_acc": "Test accuracy"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="v_measure_score",
    labels={"ntrain_classes": "Number of classes trained on", "v_measure_score": "V-measure score"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="adjusted_mutual_info_score",
    labels={
        "ntrain_classes": "Number of classes trained on",
        "adjusted_mutual_info_score": "Adjusted mutual information",
    },
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="adjusted_rand_score",
    labels={"ntrain_classes": "Number of classes trained on", "adjusted_rand_score": "Adjusted Rand index"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="completeness_score",
    labels={"ntrain_classes": "Number of classes trained on", "completeness_score": "Completeness"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="fowlkes_mallows_score",
    labels={"ntrain_classes": "Number of classes trained on", "fowlkes_mallows_score": "Fowlkes-Mallows score"},
)

In [None]:
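# Skipped: `homogeneity_completeness_v_measure` is not a number but a
# (homogeneity, completeness, v_measure) triple, so it cannot be used as the y axis.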
# px.scatter(
#     perf,
#     facet_col="clusterer",
#     x="ntrain_classes",
#     y="homogeneity_completeness_v_measure",
#     labels={"matrioska_idx": "Number of classes trained on", "test_acc": "Test accuracy"},
# )

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="homogeneity_score",
    labels={"ntrain_classes": "Number of classes trained on", "homogeneity_score": "Homogeneity"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="mutual_info_score",
    labels={"ntrain_classes": "Number of classes trained on", "mutual_info_score": "Mutual information"},
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="normalized_mutual_info_score",
    labels={
        "ntrain_classes": "Number of classes trained on",
        "normalized_mutual_info_score": "Normalized mutual information",
    },
)

In [None]:
# `labels` keys must name the plotted columns, otherwise the axis titles are not applied.
px.scatter(
    perf,
    facet_col="clusterer",
    x="ntrain_classes",
    y="rand_score",
    labels={"ntrain_classes": "Number of classes trained on", "rand_score": "Rand index"},
)