In [1]:
import sys
sys.path.insert(0, '../../src')
from utils import preprocession as prep
from utils.metrics import compute_metric, add_to_metrics, columns
import models.VAE as VAE

import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import decomposition as mf
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm
tqdm.pandas()

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
def trainSingleVAE(encoder, decoder, reactions, **kwargs):
    """Train a VAE with a 2-dimensional latent space on one reactions table.

    The table is imputed with ``prep.impute_dataframe`` and split row-wise into
    a training and an evaluation part (fixed ``random_state=0``).  For both
    parts a 0/1 mask is built from the *un-imputed* table (1 where the entry is
    not NaN) and handed to ``VAE.MaskedLoss``.  Every epoch is one full-batch
    Adam step on the training part followed by a gradient-free evaluation.

    Training stops as soon as the evaluation loss has failed to match or beat
    its best value for ``patience`` consecutive epochs, or immediately when the
    evaluation loss becomes NaN.

    Args:
        encoder: Callable invoked as ``encoder(reactions.columns, 2)``.
        decoder: Callable invoked as ``decoder(2, reactions.columns)``.
        reactions: pandas DataFrame; NaN entries are the ones masked with 0.
        **kwargs: Optional overrides of the defaults below.
            beta (1): passed as the last argument of ``VAE.MaskedLoss``.
            patience (300): early-stopping window in epochs, must be >= 1.
            lr (1e-3): learning rate of the Adam optimizer.
            silent (False): if false, print the rounded losses every epoch.
            savedir (None): despite the name, a *file* path.  Whenever the
                evaluation loss reaches a new best (ties included) the whole
                model is pickled to this path.
            test_size (0.1): forwarded to ``train_test_split``.

    Returns:
        The trained ``VAE.VAE`` instance with per-sample loss histories in
        ``train_losses`` and ``eval_losses``.  If ``savedir`` is given and a
        checkpoint was written during this call, the returned object is that
        checkpoint (best evaluation loss; its histories end at that epoch).
        Otherwise the model from the final epoch is returned.

    Raises:
        ValueError: If ``patience`` is smaller than 1.
    """
    params = {'beta':1, 'patience':300, 'lr':1e-3, 'silent':False, 'savedir':None, 'test_size':0.1}
    params.update(kwargs)

    patience = params['patience']
    if patience < 1:
        raise ValueError(f"patience must be at least 1, got {patience!r}")
    savedir = params['savedir']

    # Split the imputed values and the raw table together so that the masks
    # (derived from the raw NaN pattern) line up with the imputed rows.
    imputed_reactions = prep.impute_dataframe(reactions)
    train, evals, train_mask, eval_mask = train_test_split(
        imputed_reactions, reactions, test_size=params['test_size'], random_state=0)
    train = torch.tensor(train.values).float()
    evals = torch.tensor(evals.values).float()
    train_mask = torch.tensor(~train_mask.isna().values).float()
    eval_mask = torch.tensor(~eval_mask.isna().values).float()

    # Model
    encoder = encoder(reactions.columns, 2)
    decoder = decoder(2, reactions.columns)
    vae = VAE.VAE(encoder, decoder)
    optimizer = torch.optim.Adam(vae.parameters(), lr=params['lr'])

    # Training loop
    vae.train_losses = []
    vae.eval_losses = []

    best_eval_loss = float('inf')
    epochs_since_best = 0        # epochs elapsed since the best loss was last reached
    checkpoint_written = False   # guards against reloading a stale file from an earlier run
    diverged = False
    i = 0
    while epochs_since_best < patience and not diverged:
        optimizer.zero_grad()
        train_batch, mu, logvar = vae(train)
        train_loss = VAE.MaskedLoss(train_batch, train, mu, logvar, train_mask, params['beta'])
        vae.train_losses.append(train_loss.item()/train.shape[0])

        train_loss.backward()
        optimizer.step()

        # NOTE(review): the model is never switched to eval mode here; confirm
        # that VAE.VAE has no train/eval dependent layers.
        with torch.no_grad():
            eval_batch, mu, logvar = vae(evals)
            eval_loss = VAE.MaskedLoss(eval_batch, evals, mu, logvar, eval_mask, params['beta'])
        current_eval_loss = eval_loss.item()/evals.shape[0]
        vae.eval_losses.append(current_eval_loss)

        if np.isnan(current_eval_loss):
            # A NaN loss can never improve again; stop instead of idling
            # through the remaining patience window.
            diverged = True
        elif current_eval_loss <= best_eval_loss:
            # "<=" on purpose: matching the best loss also resets the window
            # and refreshes the checkpoint.
            best_eval_loss = current_eval_loss
            epochs_since_best = 0
            if savedir is not None:
                with open(savedir, 'wb') as file:
                    pickle.dump(vae, file)
                checkpoint_written = True
        else:
            epochs_since_best += 1

        i += 1
        if not params['silent']:
            print(f'Epoch {i}, Loss: {round(vae.train_losses[-1])}, Eval: {round(vae.eval_losses[-1])}')

    # Only reload a checkpoint produced by this call; the file is trusted
    # because it was written a few lines above.
    if savedir is not None and checkpoint_written:
        with open(savedir, 'rb') as file:
            vae = pickle.load(file)
    return vae
        
    

In [3]:
def evaluateVAE(folder_name, data_name, encoder, decoder, name, **kwargs):
    """Train one VAE per training key and score it on train and test tables.

    ``prep.load_data(folder_name, data_name)`` supplies the train and test
    reaction tables, each a mapping from a key (printed as the "sparsity") to
    a DataFrame.  For every training key a model is fitted with
    ``trainSingleVAE`` (silent, checkpointed to ``VAE_<key>.pkl``), then used
    to embed and reconstruct the imputed train table and the matching test
    table.  The model trained on key ``0`` is additionally evaluated on every
    test key.  Metrics are always computed against the tables stored under
    key ``0``.

    All embeddings, predictions and the final metrics table are written as
    CSV files below ``../../embeddings/VAE/<folder_name>/<data_name>/<name>``.

    Args:
        folder_name: First argument of ``prep.load_data``; also a path part.
        data_name: Second argument of ``prep.load_data``; also a path part.
        encoder: Encoder factory forwarded to ``trainSingleVAE``.
        decoder: Decoder factory forwarded to ``trainSingleVAE``.
        name: Label of this configuration (last path part and metrics field).
        **kwargs: Extra keyword arguments forwarded to ``trainSingleVAE``.

    Returns:
        DataFrame of all collected metric rows (also saved as ``metrics.csv``).
    """
    (_, train_reactions), (_, test_reactions) = prep.load_data(folder_name, data_name)

    output_path = f"../../embeddings/VAE/{folder_name}/{data_name}/{name}"
    os.makedirs(output_path, exist_ok=True)

    def _embed_predict_save(model, imputed, observed, embedding_csv, predictions_csv):
        # Embed the imputed table, store the embedding, reconstruct from it and
        # store the reconstruction; returns the reconstruction.
        embedding = pd.DataFrame(model.embed(imputed.values),
                                 index=observed.index, columns=['x','y'])
        embedding.to_csv(embedding_csv)
        predictions = pd.DataFrame(model.predict(embedding.values),
                                   index=observed.index, columns=observed.columns)
        predictions.to_csv(predictions_csv)
        return predictions

    rows = []
    for p in tqdm(list(train_reactions.keys()), desc='Sparsity', leave=False):
        print(f"--------{p}-----------")
        observed_train = train_reactions[p]

        model = trainSingleVAE(encoder, decoder, observed_train, silent=True,
                               savedir=f'{output_path}/VAE_{p}.pkl', **kwargs)

        imputed_train = prep.impute_dataframe(observed_train)
        train_predictions = _embed_predict_save(
            model, imputed_train, observed_train,
            f'{output_path}/train_embedding_{p}.csv',
            f'{output_path}/train_predictions_{p}.csv',
        )
        train_result = compute_metric(train_predictions, train_reactions[0],
                                      observed_train.isna(), silent=True)
        add_to_metrics(rows, train_result,
                       [folder_name, data_name, 'Train', p, 'Train', p, 'VAE', name])

        # The key-0 model is scored on every test key, the others only on
        # the test table with the same key.
        for q in (test_reactions.keys() if p == 0 else [p]):
            observed_test = test_reactions[q]
            # Missing test entries are imputed with the column means of the
            # imputed training table.
            imputed_test = prep.impute_dataframe(observed_test, mean=imputed_train.mean())
            test_predictions = _embed_predict_save(
                model, imputed_test, observed_test,
                f'{output_path}/test_embedding_{p}_{q}.csv',
                f'{output_path}/test_predictions_{p}_{q}.csv',
            )
            test_result = compute_metric(test_predictions, test_reactions[0],
                                         observed_test.isna(), silent=True)
            add_to_metrics(rows, test_result,
                           [folder_name, data_name, 'Train', p, 'Test', q, 'VAE', name])

    metrics = pd.DataFrame(rows, columns=columns)
    metrics.to_csv(f'{output_path}/metrics.csv')
    return metrics

In [4]:
# Run evaluateVAE with VAE.Encoder / VAE.Decoder on 'Synthetic_60_50' / 'Binary'
# and show five randomly sampled metric rows.
metrics = evaluateVAE(
    folder_name='Synthetic_60_50',
    data_name='Binary',
    encoder=VAE.Encoder,
    decoder=VAE.Decoder,
    name='2-Layer',
)
metrics.sample(5)

Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
24,Synthetic_60_50,Binary,Train,0,Test,30,VAE,2-Layer,Fit,0.956667,0.195725
32,Synthetic_60_50,Binary,Train,0,Test,80,VAE,2-Layer,Overall,0.745333,0.432153
11,Synthetic_60_50,Binary,Train,70,Test,70,VAE,2-Layer,Overall,0.933,0.222442
13,Synthetic_60_50,Binary,Train,10,Train,10,VAE,2-Layer,Impute,0.956667,0.194696
72,Synthetic_60_50,Binary,Train,80,Test,80,VAE,2-Layer,Fit,0.941667,0.214532


In [6]:
# Run evaluateVAE with VAE.Encoder / VAE.Decoder on 'Synthetic_60_50' / 'Original'
# and show five randomly sampled metric rows.
metrics = evaluateVAE(
    folder_name='Synthetic_60_50',
    data_name='Original',
    encoder=VAE.Encoder,
    decoder=VAE.Decoder,
    name='2-Layer',
)
metrics.sample(5)

Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
4,Synthetic_60_50,Original,Train,60,Test,60,VAE,2-Layer,Impute,0.940556,0.131735
86,Synthetic_60_50,Original,Train,50,Test,50,VAE,2-Layer,Overall,0.950667,0.121219
18,Synthetic_60_50,Original,Train,0,Train,0,VAE,2-Layer,Fit,0.969833,0.106511
71,Synthetic_60_50,Original,Train,80,Train,80,VAE,2-Layer,Overall,0.926333,0.144894
36,Synthetic_60_50,Original,Train,0,Test,50,VAE,2-Layer,Fit,0.888667,0.192954


In [7]:
# Run evaluateVAE with VAE.Encoder / VAE.Decoder on 'Smartvote' / 'Binary'
# and show five randomly sampled metric rows.
metrics = evaluateVAE(
    folder_name='Smartvote',
    data_name='Binary',
    encoder=VAE.Encoder,
    decoder=VAE.Decoder,
    name='2-Layer',
)
metrics.sample(5)

Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
39,Smartvote,Binary,Train,0,Test,60,VAE,2-Layer,Fit,0.768966,0.393474
51,Smartvote,Binary,Train,20,Train,20,VAE,2-Layer,Fit,0.832203,0.342349
85,Smartvote,Binary,Train,50,Test,50,VAE,2-Layer,Impute,0.813067,0.357408
13,Smartvote,Binary,Train,10,Train,10,VAE,2-Layer,Impute,0.822827,0.350377
14,Smartvote,Binary,Train,10,Train,10,VAE,2-Layer,Overall,0.836054,0.338967


In [8]:
# Run evaluateVAE with VAE.Encoder / VAE.Decoder on 'Smartvote' / 'Original'
# and show five randomly sampled metric rows.
metrics = evaluateVAE(
    folder_name='Smartvote',
    data_name='Original',
    encoder=VAE.Encoder,
    decoder=VAE.Decoder,
    name='2-Layer',
)
metrics.sample(5)

Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Unnamed: 0,Dataset,Datatype,Train Set,Train Sparsity,Evaluation Set,Evaluation Sparsity,Embedding Method,Prediction Method,Task,Accuracy,RMSE
63,Smartvote,Original,Train,40,Train,40,VAE,2-Layer,Fit,0.817427,0.256911
59,Smartvote,Original,Train,30,Train,30,VAE,2-Layer,Overall,0.815857,0.255454
6,Smartvote,Original,Train,70,Train,70,VAE,2-Layer,Fit,0.827159,0.246981
35,Smartvote,Original,Train,0,Test,90,VAE,2-Layer,Overall,0.683126,0.356025
75,Smartvote,Original,Train,90,Train,90,VAE,2-Layer,Fit,0.856526,0.228233


## Logistic Regression AE

In [7]:
# Same four dataset / datatype combinations as above, but with
# VAE.LogisticDecoder and beta=0.  Each call overwrites `metrics`, so only the
# last result stays in the variable; every run still writes its own CSV files.
metrics = evaluateVAE(
    folder_name='Synthetic_60_50',
    data_name='Binary',
    encoder=VAE.Encoder,
    decoder=VAE.LogisticDecoder,
    name='Logistic',
    beta=0.,
)
metrics = evaluateVAE(
    folder_name='Synthetic_60_50',
    data_name='Original',
    encoder=VAE.Encoder,
    decoder=VAE.LogisticDecoder,
    name='Logistic',
    beta=0.,
)
metrics = evaluateVAE(
    folder_name='Smartvote',
    data_name='Binary',
    encoder=VAE.Encoder,
    decoder=VAE.LogisticDecoder,
    name='Logistic',
    beta=0.,
)
metrics = evaluateVAE(
    folder_name='Smartvote',
    data_name='Original',
    encoder=VAE.Encoder,
    decoder=VAE.LogisticDecoder,
    name='Logistic',
    beta=0.,
)


Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------


Sparsity:   0%|          | 0/10 [00:00<?, ?it/s]

--------60-----------
--------70-----------
--------10-----------
--------0-----------
--------20-----------
--------30-----------
--------40-----------
--------80-----------
--------90-----------
--------50-----------
