## Notebook hypotheses
#### 1. Contrastive learning embeddings have significantly better separability than PCA or t-SNE, even when overfitted

### Motivation
Compare the contrastive learning model against benchmarks that should perform worse.
* PCA is a simple and fast linear method; it should not beat the contrastive model's non-linear mapping on a complex problem.
* t-SNE does not learn a mapping from feature space, so it cannot be used as an embedding for new data. If we cannot beat t-SNE, then there is a better non-linear mapping that we have not achieved.

### Actions:
* Hypothesis No. 1 is `True`.
    * Make a dataset of embeddings useful for training a classifier.
* Hypothesis No. 1 is `False`.
    * Review the contrastive learning model.

### Results:
* 1. `True`: the contrastive embedding has orders of magnitude more separability. [Link](#techniques-separability).

In [None]:
# Notebook metadata: the folder this notebook lives in and its file name.
NB_PATH = "notebooks"
NB_NAME = "dimentionality reduction.ipynb"

## Contrastive embeddings dimensionality reduction

In [None]:
import os

from omegaconf import DictConfig
from test_nn_template.run import WandbHandler

# SECURITY: the W&B API key must never be committed with the notebook.
# Export WANDB_API_KEY in the shell that launches Jupyter (or run
# `wandb login` once) instead of hard-coding it here. The key that used to be
# embedded in this cell should be treated as leaked and revoked.
if not os.environ.get("WANDB_API_KEY"):
    print("WANDB_API_KEY is not set; relying on cached `wandb login` credentials.")

# Sanity check that the project is reachable before fetching a specific run.
print(f"Runs: {len(WandbHandler.get_runs(entity='fernandoezequiel512', project='test_nn_template'))}")

# Run whose checkpoint provides the contrastive model used in this notebook.
run_id = "fernandoezequiel512/test_nn_template/runs/28vaq6y9"
run = WandbHandler.get_run(run_id)
print(f"Got {run.name}")
model = WandbHandler.load_run_model_checkpoint(run_id)
print(model)


def replace_dataset(cfg: DictConfig):
    """Point the train and test dataset targets at ``MyDataset``.

    Every occurrence of ``"MyContrastativeDataset"`` in the ``_target_`` of
    ``cfg.data.datamodule.datasets.train`` and ``.test`` is replaced with
    ``"MyDataset"``. The config is modified in place and also returned.
    """
    datasets = cfg.data.datamodule.datasets
    for split in ("train", "test"):
        dataset_cfg = getattr(datasets, split)
        dataset_cfg._target_ = dataset_cfg._target_.replace("MyContrastativeDataset", "MyDataset")
    return cfg


# Load the run's datamodule, passing replace_dataset as cfg_func so the
# train/test dataset targets are rewritten from MyContrastativeDataset to
# MyDataset (presumably applied to the run config before instantiation —
# confirm in WandbHandler.load_run_datamodule).
datamodule = WandbHandler.load_run_datamodule(run_id, cfg_func=replace_dataset)
# print(datamodule)

In [None]:
# Prepare the datamodule for the "test" stage, then grab the loaders used by
# the cells below.
datamodule.setup(stage="test")
# test_dataloader() is indexed here, so only its first loader is used.
test_dataloader = datamodule.test_dataloader()[0]
# NOTE(review): setup() was called with stage="test" only — confirm the train
# dataset is also available at this point.
train_dataloader = datamodule.train_dataloader()

In [None]:
import random
import matplotlib.pyplot as plt
import numpy as np
from test_nn_template.data.datamodule import MyDataModule

# Seed the RNG so the same plotting indices are drawn on every run.
random.seed(42)

# Embed every test batch with the contrastive model, keeping the labels in
# the same order as the embeddings.
emb_batches, label_batches = [], []
for x, y in test_dataloader:
    emb_batches.append(model.model.forward_once(x).detach().numpy())
    label_batches.append(y)
x_contrast = np.concatenate(emb_batches)
labels = np.concatenate(label_batches)

# 500 row indices drawn with replacement; the later plotting cells reuse them.
n_rows = x_contrast.shape[0]
ixs = random.choices(range(n_rows), k=500)

plt.scatter(x_contrast[ixs, 0], x_contrast[ixs, 1], c=labels[ixs])

## PCA dimensionality reduction

In [None]:
import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=2)

# Gather the raw inputs and labels of the test set, batch by batch.
X = []
labels = []
for x, y in test_dataloader:
    X.append(x.detach().numpy())
    labels.append(y)
X = np.concatenate(X)
labels = np.concatenate(labels)

# Flatten every sample to a 1-D feature vector. The reshape alone is enough:
# the previous squeeze() produced the same result for N > 1 samples but
# dropped the sample axis itself when N == 1, corrupting the matrix shape.
X = X.reshape((X.shape[0], -1))
print(X.shape)

pca.fit(X)
X_pca = pca.transform(X)

print(f"explained variance: {pca.explained_variance_ratio_}")
print(f"singular values: {pca.singular_values_}")

# `ixs` comes from the contrastive-embedding cell so all plots show the same points.
plt.scatter(X_pca[ixs, 0], X_pca[ixs, 1], c=labels[ixs])

## t-SNE dimensionality reduction

In [None]:
from sklearn.manifold import TSNE

# Gather the raw inputs and labels of the test set, batch by batch.
X = []
labels = []
for x, y in test_dataloader:
    X.append(x.detach().numpy())
    labels.append(y)
X = np.concatenate(X)
labels = np.concatenate(labels)

# Flatten every sample to a 1-D feature vector. The reshape alone is enough:
# the previous squeeze() produced the same result for N > 1 samples but
# dropped the sample axis itself when N == 1, corrupting the matrix shape.
X = X.reshape((X.shape[0], -1))

# random_state pins the random initialisation so the projection (and the
# separability comparison built on it) is reproducible, in line with the
# seeds used elsewhere in this notebook.
X_tsne = TSNE(
    n_components=2,
    learning_rate="auto",
    init="random",
    perplexity=3,
    random_state=42,
).fit_transform(X)

# `ixs` comes from the contrastive-embedding cell so all plots show the same points.
plt.scatter(X_tsne[ixs, 0], X_tsne[ixs, 1], c=labels[ixs])

## Techniques separability

In [None]:
import time
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import pandas as pd

# Stack the three 2-D representations side by side and name each column after
# the technique that produced it. Names are derived from the actual column
# counts, so a contrastive embedding wider than two dimensions no longer
# causes a length mismatch when the importances are labelled below. The list
# is called `feature_names` rather than `vars` to avoid shadowing the builtin.
feature_blocks = {"contrast": x_contrast, "pca": X_pca, "tsne": X_tsne}
feature_names = []
for prefix, block in feature_blocks.items():
    feature_names.extend(f"{prefix}_{i}" for i in range(1, block.shape[1] + 1))
X = np.concatenate(list(feature_blocks.values()), axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, labels, stratify=labels, random_state=42)

# A single forest sees all techniques at once; permutation importance then
# measures how much held-out accuracy drops when each column is shuffled.
forest = RandomForestClassifier(random_state=0)
forest.fit(X_train, y_train)

# perf_counter is a monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
result = permutation_importance(forest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
forest_importances = pd.Series(result.importances_mean, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.tight_layout()
plt.show()

## Save embeddings dataset

In [None]:
import os
import json
from typing import Callable, Optional
from torch.utils.data.dataloader import DataLoader
from wandb.apis.public import Run
import torch
from tqdm import tqdm


# Relative path of this notebook, built from the notebook metadata constants.
source = os.path.join(NB_PATH, NB_NAME)


def transform(x):
    """Embed a batch by running it through the model's ``forward_once``."""
    return model.model.forward_once(x)

class EmbeddingsSaver:
    """Write a run's embeddings, labels and metadata to disk.

    Files are laid out under ``<root>/<run_id>/`` as::

        metadata/metadata.json
        train/embeddings/<batch index>.pt
        train/labels/<batch index>.pt
        test/embeddings/<batch index>.pt
        test/labels/<batch index>.pt
    """

    def __init__(
        self,
        run: Run,
        train_dataloader: DataLoader,
        test_dataloader: DataLoader,
        source: str,
        transform: Optional[Callable] = None,
    ):
        """Store the loaders and build the metadata describing the run.

        Args:
            run: W&B run the embeddings originate from.
            train_dataloader: Loader yielding ``(x, y)`` training batches.
            test_dataloader: Loader yielding ``(x, y)`` test batches.
            source: Free-form description of what produced the embeddings
                (stored in the metadata file).
            transform: Callable mapping an input batch to its embedding.
                ``None`` stores the input batches unchanged.
        """
        self.transform = transform
        self.train = train_dataloader
        self.test = test_dataloader
        self.run = run
        # NOTE(review): `run.name` is used both as the folder name and as the
        # "wdb_run_id" metadata value — confirm that the display name, and not
        # the run's unique id, is what is wanted here.
        self.run_id = run.name
        self.metadata = self.build_metadata(source)

    def build_metadata(self, source):
        """Return the metadata dictionary saved next to the embeddings."""
        return {
            "wdb_entity": self.run.entity,
            "wdb_project": self.run.project,
            "wdb_run_id": self.run.name,
            "source": source,
        }

    def save_embeddings(self, dataloader: DataLoader, paths: dict) -> tuple:
        """Embed every batch of ``dataloader`` and save it with its labels.

        Batch ``ix`` is written to ``paths["embeddings"]/<ix>.pt`` and its
        labels to ``paths["labels"]/<ix>.pt``.

        Args:
            dataloader: Loader yielding ``(x, y)`` batches.
            paths: Mapping with the ``"embeddings"`` and ``"labels"``
                directories, as produced by ``build_folders_struct``.

        Returns:
            ``(embeddings, labels)`` concatenated over all batches; two empty
            tensors when the loader yields nothing.
        """
        embeddings, labels = [], []
        # Gradients are not needed for export; skipping them keeps the
        # accumulated tensors free of autograd graphs.
        with torch.no_grad():
            for ix, (x, y) in enumerate(tqdm(dataloader, desc="Embedding")):
                emb = x if self.transform is None else self.transform(x)
                emb = emb.detach()
                y = torch.as_tensor(y)
                self.save_tensor(emb, os.path.join(paths["embeddings"], f"{ix}.pt"))
                self.save_tensor(y, os.path.join(paths["labels"], f"{ix}.pt"))
                embeddings.append(emb)
                labels.append(y)
        if not embeddings:
            return torch.empty(0), torch.empty(0)
        return torch.cat(embeddings), torch.cat(labels)

    def save_tensor(self, tensor: torch.Tensor, path: str) -> None:
        """Serialize ``tensor`` to ``path`` with ``torch.save``."""
        torch.save(tensor, path)

    def save_dict(self, dict: dict, path: str) -> None:
        """Write ``dict`` to ``path`` as JSON."""
        with open(path, "w", encoding="utf-8") as f:
            # json.dump streams to the file object; json.dumps only returns a string.
            json.dump(dict, f)

    def save(self, root: str) -> None:
        """Create the folder layout under ``root`` and write everything."""
        struct = self.build_folders_struct(root, self.run_id)
        self.save_dict(self.metadata, struct["metadata"])
        self.save_embeddings(self.train, struct["train"])
        self.save_embeddings(self.test, struct["test"])

    @classmethod
    def build_folders_struct(cls, root: str, run_id: str) -> dict:
        """Create the output directories and return their paths.

        Args:
            root: Base output directory.
            run_id: Name of the sub-folder holding this run's files.

        Returns:
            ``{"metadata": <json path>, "train": {...}, "test": {...}}`` where
            each split maps ``"embeddings"`` and ``"labels"`` to a directory.
        """
        run_root = os.path.join(root, run_id)

        def split_dirs(split):
            return {
                "embeddings": os.path.join(run_root, split, "embeddings"),
                "labels": os.path.join(run_root, split, "labels"),
            }

        struct = {
            "metadata": os.path.join(run_root, "metadata", "metadata.json"),
            "train": split_dirs("train"),
            "test": split_dirs("test"),
        }
        # The metadata entry is a file path, so create its parent; the split
        # entries are directories and must exist themselves. exist_ok lets the
        # cell be re-run without failing on already-created folders.
        os.makedirs(os.path.dirname(struct["metadata"]), exist_ok=True)
        for split in ("train", "test"):
            for directory in struct[split].values():
                os.makedirs(directory, exist_ok=True)
        return struct


# Pass the embedding function explicitly so the saver does not depend on a
# global name.
embeddings_saver = EmbeddingsSaver(run, train_dataloader, test_dataloader, source, transform=transform)
embeddings_saver.save("data")

In [None]:
# torch.save(torch.cat(embeddings), 'test.pt')
# Scratch check of what os.path.dirname returns for a nested file path.
os.path.dirname("a/b/c.pt")