In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules (e.g. `src.*`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from src.utils import *

from src.autoencoder import AutoEncoder, NSAAutoEncoder, NSAAutoEncoder_2
from src.loss import RTDLoss, NSALoss, NSALoss3, LID_NSALoss_v1, LID_NSALoss_v2,LID_NSALoss_v3 
from src.top_ae import TopologicallyRegularizedAutoencoder

from sklearn.decomposition import PCA

from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from collections import defaultdict

from tqdm.notebook import tqdm

In [None]:
# Experiment configuration. The whole dict is splatted into `train_models(**config)`;
# keys that match its named parameters are consumed there, the remainder is
# forwarded to `get_list_of_models` and on to the model / loss constructors.
config = {
    "dataset_name":"LinkPrediction/Amazon",  # sub-directory of data/ that is loaded below
    "version":"d64_2",  # tag embedded in logger version and output file names
    "model_name":"default",
    "max_epochs":250,
    "gpus":[0],  # handed to pl.Trainer(gpus=...)
    "rtd_every_n_batches":1,
    "rtd_start_epoch":60,
    "rtd_l":1.0, # rtd loss
    "nsa_every_n_batches":1,
    "nsa_start_epoch":60,
    "nsa_l":1.0, # presumably the NSA loss weight (counterpart of rtd_l) -- TODO confirm in src.autoencoder
    "n_runs":1, # number of runs for each model (NOTE: train_models accepts it but always trains run 0)
    "lid_nsa_every_n_batches":1,
    "lid_nsa_l":10.0,  # presumably the LID-NSA loss weight -- TODO confirm
    "card":50, # number of points on the persistence diagram
    "n_threads":1, # number of threads for parallel ripser computation of pers homology
    "latent_dim":64, # latent dimension (2 or 3 for visualization purposes)
    "input_dim":256,
    "n_hidden_layers":3,
    "hidden_dim":512,
    "batch_size":256,
    "engine":"ripser",
    "is_sym":True,
    "lr":1e-4,
    "sampler":"base", # "base" (shuffled DataLoader) or "neighbor" (nearest-neighbor batch sampler); version A or B
    "validate":True,  # only consulted by the "neighbor" sampler branch
}
config['knn'] = 32 # k passed to LID_NSALoss_v1 (LID NSA KNN size)
config['sampler_knn'] = 32 # num_neighbors passed to the nearest-neighbor batch sampler
# True only when a batch holds exactly one point plus its knn neighbours; passed as `full=` to LID_NSALoss_v1.
config['full'] = (config['batch_size'] == (config['knn']+1))

In [None]:
def get_model(input_dim, latent_dim=2, n_hidden_layers=2, m_type='encoder', **kwargs):
    """Build a fully connected encoder or decoder as an ``nn.Sequential``.

    The number of hidden ``Linear + ReLU`` stages is
    ``min(int(log2(input_dim)) - 1, n_hidden_layers)``.

    * encoder: starts at ``input_dim`` and halves the width at every stage for
      as long as the halved width is still ``>= latent_dim`` (otherwise the
      width is kept), then projects to ``latent_dim``.
    * decoder: starts at ``latent_dim`` and doubles the width at every stage,
      then projects to ``input_dim``.

    Args:
        input_dim: dimensionality of the data space.
        latent_dim: dimensionality of the latent space.
        n_hidden_layers: upper bound on the number of hidden stages.
        m_type: ``'encoder'`` or ``'decoder'``.
        **kwargs: ignored; lets callers splat a larger config dict.

    Returns:
        ``nn.Sequential`` ending in a ``Linear`` layer (no final activation).

    Raises:
        ValueError: if ``m_type`` is neither ``'encoder'`` nor ``'decoder'``.
            (Previously this silently produced an empty ``nn.Sequential``.)
    """
    if m_type not in ('encoder', 'decoder'):
        raise ValueError(
            f"m_type must be 'encoder' or 'decoder', got {m_type!r}"
        )

    n_stages = min(int(np.log2(input_dim)) - 1, n_hidden_layers)
    layers = []

    if m_type == 'encoder':
        in_dim = input_dim
        for _ in range(n_stages):
            # Halve the width only while that does not undershoot the latent size.
            half = in_dim // 2
            out_dim = half if half >= latent_dim else in_dim
            layers.extend([nn.Linear(in_dim, out_dim), nn.ReLU()])
            in_dim = out_dim
        layers.append(nn.Linear(in_dim, latent_dim))
    else:  # decoder
        in_dim = latent_dim
        for _ in range(n_stages):
            out_dim = in_dim * 2
            layers.extend([nn.Linear(in_dim, out_dim), nn.ReLU()])
            in_dim = out_dim
        layers.append(nn.Linear(in_dim, input_dim))

    return nn.Sequential(*layers)

def get_list_of_models(**config):
    """Instantiate every autoencoder variant that is compared in this notebook.

    A single template encoder/decoder pair is built with ``get_linear_model``
    and each model receives its OWN deep copy of that pair. Previously all
    models were handed the very same module objects, so training them one
    after another kept updating one shared set of weights.
    Copying from a common template also gives every model the same initial
    weights.

    Args:
        **config: experiment configuration; forwarded to ``get_linear_model``
            and to every model / loss constructor. Must contain ``'knn'`` and
            ``'full'`` (used for ``LID_NSALoss_v1``).

    Returns:
        ``(models, encoder, decoder)`` where ``models`` maps a display name to
        a model instance, and ``encoder`` / ``decoder`` are the template
        modules the per-model copies were cloned from (they are not trained).
    """
    import copy  # local import: only needed to clone the template modules

    encoder = get_linear_model(
        m_type='encoder',
        **config
    )
    decoder = get_linear_model(
        m_type='decoder',
        **config
    )

    def fresh_modules():
        # Independent parameter copies so no two models share weights.
        return {
            'encoder': copy.deepcopy(encoder),
            'decoder': copy.deepcopy(decoder),
        }

    models = {
        'Basic AutoEncoder':AutoEncoder(
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
        'Topological AutoEncoder':TopologicallyRegularizedAutoencoder(
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
        'RTD AutoEncoder H1':AutoEncoder(
            RTDLoss = RTDLoss(dim=1, lp=1.0,  **config), # only H1
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
        'GNSA AutoEncoder':NSAAutoEncoder(
            NSALoss = NSALoss(),
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
        'NSA AutoEncoder':NSAAutoEncoder_2(
            NSALoss = NSALoss(),
            LIDNSALoss = LID_NSALoss_v1(k=config['knn'],full=config['full']),
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
        'LNSA AutoEncoder':NSAAutoEncoder(
            LIDNSALoss = LID_NSALoss_v1(k=config['knn'],full=config['full']),
            MSELoss = nn.MSELoss(),
            **fresh_modules(),
            **config
        ),
    }
    return models, encoder, decoder

In [None]:
def collate_with_matrix(samples):
    """Collate ``(index, data, label)`` samples and attach a pairwise distance matrix.

    The per-sample indices are discarded. Samples with more than one feature
    axis are flattened before distances are computed; the returned ``data``
    keeps its original shape.

    Returns:
        ``(data, x_dist, labels)`` where ``x_dist`` is the Euclidean distance
        matrix of the batch divided by ``sqrt(n_features)``.
    """
    _, batch_data, batch_labels = zip(*samples)
    data = torch.tensor(np.asarray(batch_data))
    labels = torch.tensor(np.asarray(batch_labels))

    # Distances are taken between flat feature vectors.
    flat = torch.flatten(data, start_dim=1) if data.dim() > 2 else data
    n_features = flat.shape[1]
    x_dist = torch.cdist(flat, flat, p=2) / np.sqrt(n_features)
    # Symmetrisation ((x_dist + x_dist.T) / 2) is intentionally left disabled,
    # as in the original implementation.
    return data, x_dist, labels

def collate_with_matrix_geodesic(samples):
    """Collate ``(index, data, label, distance_row)`` samples.

    Each sample carries a row of precomputed distances; the batch distance
    matrix is obtained by stacking those rows and keeping only the columns
    whose positions are the indices of the samples in this batch.

    Returns:
        ``(data, x_dist, labels)`` as tensors.
    """
    indices, batch_data, batch_labels, dist_rows = zip(*samples)
    stacked_rows = np.asarray(dist_rows)
    # Column selection by the batch's own indices gives a batch-by-batch matrix.
    x_dist = torch.tensor(stacked_rows[:, indices])
    data = torch.tensor(np.asarray(batch_data))
    labels = torch.tensor(np.asarray(batch_labels))
    return data, x_dist, labels

In [None]:
dataset_name = config['dataset_name']

# --- training features -------------------------------------------------------
if dataset_name in ['COIL-20','COIL-100']:
    train_data = np.load(f'data/{dataset_name}/prepared/data.npy').astype(np.float32)
elif dataset_name.startswith('LinkPrediction'):
    # The .npz archive may hold several arrays; the LAST key is the one used.
    train_data = np.load(f'data/{dataset_name}/LP_3_200.npz')
    train_data = dict(train_data)
    print(train_data.keys())
    key = list(train_data.keys())[-1]
    print(key)
    train_data = train_data[key]
else:
    train_data = np.load(f'data/{dataset_name}/prepared/train_data.npy').astype(np.float32)


# --- test features -----------------------------------------------------------
# `ids` holds the rows sampled from the training set when no prepared test
# split exists. It stays None when the test split is loaded from disk, so the
# label fallback further down can tell the two situations apart (it used to
# raise NameError in that case).
ids = None
try:        
    test_data = np.load(f'data/{dataset_name}/prepared/test_data.npy').astype(np.float32)
except FileNotFoundError:
    # Fallback: a random 20% subset of the training rows. The rows are NOT
    # removed from train_data, and the draw is unseeded.
    ids = np.random.choice(np.arange(len(train_data)), size=int(0.2*len(train_data)), replace=False)
    test_data = train_data[ids]

# --- training labels ---------------------------------------------------------
try:
    if dataset_name in ['COIL-20','COIL-100']:
        print("Inside here")
        train_labels = np.load(f'data/{dataset_name}/prepared/labels.npy')
    elif dataset_name.startswith('LinkPrediction'):
        # No class labels here: use 1-based row ids as labels.
        train_labels = np.arange(1,len(train_data)+1)
    else:
        train_labels = np.load(f'data/{dataset_name}/prepared/train_labels.npy')
except FileNotFoundError:
    train_labels = None

# --- test labels -------------------------------------------------------------
try:
    test_labels = np.load(f'data/{dataset_name}/prepared/test_labels.npy')
except FileNotFoundError:
    if train_labels is None or ids is None:
        # Either there are no labels at all, or the test split came from disk
        # and there is no index mapping back into train_labels.
        test_labels = None
    else:
        test_labels = train_labels[ids]

In [None]:
# Sanity check of the loaded splits: array shapes and the first ten labels.
# NOTE: the loading cell can leave train_labels / test_labels as None, in
# which case the label prints below raise.
print(train_data.shape)
print(train_labels[:10])
print(train_labels.shape)
print(test_data.shape)
print(test_labels[:10])
print(test_labels.shape)

In [None]:
import numpy as np

class CustomMinMaxScaler:
    """Minimal min-max scaler mapping data to ``[0, 1]``.

    A freshly constructed instance is already "fitted" with the GLOBAL scalar
    minimum / maximum of the notebook-level ``train_data`` array; calling
    ``fit`` replaces these with per-feature (axis 0) statistics.
    """

    def __init__(self):
        # NOTE(review): depends on the notebook global `train_data` being
        # defined before construction (hidden state). Kept for backward
        # compatibility with existing `CustomMinMaxScaler()` call sites.
        self.min_vals = train_data.min()
        self.max_vals = train_data.max()
        self.is_fitted = True
        
    def fit(self, data):
        """Store per-feature minima and maxima computed along axis 0."""
        self.min_vals = np.min(data, axis=0)
        self.max_vals = np.max(data, axis=0)
        self.is_fitted = True
        
    def transform(self, data):
        """Scale ``data`` with the stored statistics.

        Features whose stored range is zero (constant features) are mapped to
        0 instead of producing NaN / inf from a division by zero.

        Raises:
            sklearn.exceptions.NotFittedError: if the scaler has no statistics.
        """
        if not getattr(self, "is_fitted", False):
            # Imported lazily; the name was previously never imported at all.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("CustomMinMaxScaler must be fitted before calling transform().")
        value_range = self.max_vals - self.min_vals
        # Replace zero ranges by one so constant features scale to 0.
        value_range = np.where(value_range == 0, np.ones_like(value_range), value_range)
        scaled_data = (data - self.min_vals) / value_range
        return scaled_data
    
    def fit_transform(self, data):
        """Fit on ``data`` and return its scaled version."""
        self.fit(data)
        return self.transform(data)

In [None]:
#Use minmaxscaler for image datasets
#Scaler is None for link prediction or graph datasets

#scaler = FurthestScaler()
#scaler = CustomMinMaxScaler()
scaler = None
flatten = True
geodesic = False

train = FromNumpyDataset(
    train_data, 
    train_labels, 
    geodesic=geodesic, 
    scaler=scaler, 
    flatten=flatten, 
    n_neighbors=2
)
print("Train done")
# The test set reuses the scaler held by the training dataset.
test = FromNumpyDataset(
    test_data, 
    test_labels, 
    geodesic=geodesic, 
    scaler = train.scaler,    
    flatten=flatten, 
    n_neighbors=2
)

# One collate function for every loader: precomputed geodesic distances when
# `geodesic` is set, otherwise Euclidean distances computed per batch.
collate_fn = collate_with_matrix_geodesic if geodesic else collate_with_matrix

if config['sampler']=='base':
    print("Base Sampler")
    train_loader = DataLoader(
        train, 
        batch_size=config["batch_size"], 
        num_workers=0, 
        collate_fn=collate_fn, 
        shuffle=True,
        drop_last=True
    )
    
    # NOTE: this branch always builds a validation loader; config['validate']
    # is only honoured by the 'neighbor' branch.
    val_loader = DataLoader(
        test,
        batch_size=config["batch_size"],
        num_workers=0,
        collate_fn=collate_fn,
        shuffle=False,
        drop_last=False
    )
elif config['sampler']=='neighbor':
    print("Neighbor Sampler")
    train_sampler = NearestNeighborBatchSamplerMulti(train, config['batch_size'], num_neighbors=config['sampler_knn'], num_threads=24)
    val_sampler = NearestNeighborBatchSamplerMulti(test, config['batch_size'], num_neighbors=config['sampler_knn'], num_threads=24)

    train_loader = DataLoader(
        train, 
        batch_sampler=train_sampler, 
        num_workers=24, 
        collate_fn=collate_fn, 
    )
    if config['validate']:
        val_loader = DataLoader(
            test,
            batch_sampler=val_sampler,
            num_workers=24,
            collate_fn=collate_fn,
        )
    else:
        val_loader=None
else:
    # Fail here instead of with a NameError on `train_loader` in a later cell.
    raise ValueError(
        f"Unknown config['sampler'] value {config['sampler']!r}; expected 'base' or 'neighbor'."
    )

In [None]:
def train_autoencoder(model, train_loader, val_loader=None, model_name='default', 
                      dataset_name='F-MNIST', gpus=[0], max_epochs=100, run=0, version="d1"):
    """Fit ``model`` with a PyTorch Lightning ``Trainer`` and return it.

    Metrics go to a TensorBoard logger rooted at the current working
    directory under ``lightning_logs``; the logger version string is
    ``{dataset_name}_{model_name}_{version}_{run}``.

    Args:
        model: the Lightning module to train.
        train_loader: loader used for training.
        val_loader: optional loader used for validation.
        model_name, dataset_name, version, run: only used to build the
            logger version string.
        gpus: forwarded to ``pl.Trainer(gpus=...)``.
        max_epochs: forwarded to ``pl.Trainer(max_epochs=...)``.

    Returns:
        The same ``model`` object after ``trainer.fit`` has run.
    """
    tb_logger = pl.loggers.TensorBoardLogger(
        save_dir=os.getcwd(),
        name='lightning_logs',
        version=f"{dataset_name}_{model_name}_{version}_{run}",
    )
    trainer_options = dict(
        logger=tb_logger,
        gpus=gpus,
        max_epochs=max_epochs,
        log_every_n_steps=1,      # log on every optimisation step
        num_sanity_val_steps=0,   # skip the pre-training validation sanity check
    )
    pl.Trainer(**trainer_options).fit(model, train_loader, val_loader)
    return model

def dump_figures(figures, dataset_name, version):
    """Save every figure to ``results/{dataset_name}/{model_name}_{version}.png``.

    The target directory is created first if it does not exist;
    ``dataset_name`` may contain path separators (e.g. ``"LinkPrediction/Amazon"``),
    in which case the nested directories are created as well.

    Args:
        figures: mapping from model name to an object exposing ``savefig(path)``.
        dataset_name: name (or relative path) of the dataset directory.
        version: tag appended to every file name.
    """
    if not figures:
        return
    out_dir = f'results/{dataset_name}'
    os.makedirs(out_dir, exist_ok=True)
    for model_name in figures:
        figures[model_name].savefig(f'{out_dir}/{model_name}_{version}.png')

def train_models(train_loader, val_loader, dataset_name="", max_epochs=1, gpus=[], n_neighbors=[1], n_runs=1, version='', **kwargs):
    """Build all models via ``get_list_of_models`` and train them one by one.

    Entries whose name contains ``'AutoEncoder'`` are trained with
    ``train_autoencoder`` (always as run ``0``); any other entry is treated as
    an sklearn-style estimator and fitted with ``fit_transform`` on
    ``train_loader.dataset.data``.

    ``n_neighbors`` and ``n_runs`` are accepted but not used here. Only the
    keyword arguments not captured by the named parameters (``**kwargs``) are
    forwarded to ``get_list_of_models``.

    Returns:
        ``(encoder, decoder, models)`` — note the order differs from
        ``get_list_of_models``.
    """
    models, encoder, decoder = get_list_of_models(**kwargs)

    for name in tqdm(models, desc="Training models"):
        candidate = models[name]

        if 'AutoEncoder' not in name:
            # umap / pca / t-sne (sklearn interface); the embedding is discarded.
            candidate.fit_transform(train_loader.dataset.data)
            continue

        models[name] = train_autoencoder(
            candidate,
            train_loader,
            val_loader,
            model_name=name,
            dataset_name=dataset_name,
            gpus=gpus,
            max_epochs=max_epochs,
            run=0,
            version=version,
        )

    return encoder, decoder, models

In [None]:
# Train every model. `config` is splatted: keys matching train_models' named
# parameters (dataset_name, max_epochs, gpus, n_runs, version) are consumed
# there, all remaining keys are forwarded to get_list_of_models via **kwargs.
encoder, decoder, trained_models = train_models(train_loader, val_loader, **config)

In [None]:
from src.utils import *

In [None]:
# Display the dict returned by train_models (model name -> model object).
trained_models

In [None]:
# Display the version tag that is embedded in the exported file names.
config['version']

In [None]:
# Export train-set representations of every trained model to data/{dataset_name}/.
version = config['version']

# Re-create the training loader WITHOUT shuffling (and without drop_last) so
# the exported rows follow dataset order. This rebinds `train_loader`.
train_loader = DataLoader(
    train,
    batch_size=config["batch_size"],
    num_workers=0,
    collate_fn=collate_with_matrix_geodesic if geodesic else collate_with_matrix,
    shuffle=False
)

# Pass 1: latent representations.
for model_name, model in trained_models.items():
    latent, labels = get_latent_representations(model, train_loader)
    print(latent.shape)
    np.save(f'data/{dataset_name}/{model_name}_latent_output_{version}.npy', latent)
    np.save(f'data/{dataset_name}/{model_name}_latent_labels_{version}.npy', labels)

# Pass 2: final model outputs.
for model_name, model in trained_models.items():
    latent, labels = get_output_representations(model, train_loader)
    print(latent.shape)
    np.save(f'data/{dataset_name}/{model_name}_final_output_{version}.npy', latent)
    np.save(f'data/{dataset_name}/{model_name}_final_labels_{version}.npy', labels)

In [None]:
# Export test-set representations of every trained model to data/{dataset_name}/
# (files carry a `_test` suffix). `val_loader` is None when the neighbor
# sampler is used with config['validate'] == False; in that case there is
# nothing to export and the cell is skipped instead of crashing.
if val_loader is None:
    print("val_loader is None; skipping test-set export")
else:
    # Pass 1: latent representations.
    for model_name in trained_models:
        latent, labels = get_latent_representations(trained_models[model_name], val_loader)
        print(latent.shape)
        np.save(f'data/{dataset_name}/{model_name}_latent_output_{version}_test.npy', latent)
        np.save(f'data/{dataset_name}/{model_name}_latent_labels_{version}_test.npy', labels)

    # Pass 2: final model outputs.
    for model_name in trained_models:
        latent, labels = get_output_representations(trained_models[model_name], val_loader)
        print(latent.shape)
        np.save(f'data/{dataset_name}/{model_name}_final_output_{version}_test.npy', latent)
        np.save(f'data/{dataset_name}/{model_name}_final_labels_{version}_test.npy', labels)