# Deep Autoencoder applied to the stroke dataset
---

In this notebook, an *autoencoder* is implemented using `PyTorch` and `PyTorch Lightning`, and the hyperparameter optimization is done by means of `Optuna` for several latent space sizes ([0,100]) in order to compare it with some traditional feature extraction methods such as PCA and ICA.


Instead of using an undercomplete autoencoder, in which the latent space is smaller than the input space, we are going to make use of an overcomplete autoencoder. In order not to overfit our model, we need to make use of regularization techniques. In this notebook, we are going to run our model using a k-sparse strategy.

Importing the required modules:


In [None]:
# Change the working directory: one level up, then into "Notebook utilities"
# (presumably where the local helper modules imported below live).
%cd ..
%cd "Notebook utilities"

In [None]:
from pathlib import Path
import numpy as np
from sklearn.model_selection import train_test_split

import glob
import os
import sys
import inspect
import gc

from Datasets import dataset
from Model_utilities import *
from Data_Preprocessing import get_arrays, to_vector, get_HCP, NormalizeData
from ConvAutoencoder import ConvAutoEncoder
from callbacks import MetricsCallback, LitProgressBar, cross_validate
from tqdm import tqdm

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
import torch.optim as optim

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.early_stopping import EarlyStopping 
from pytorch_lightning.metrics.functional import accuracy

import optuna
from optuna.integration import PyTorchLightningPruningCallback

In [None]:
# Folder name reserved for saved models.
MODEL_SAVE_FOLDER = Path("SavedModels")

# Fix the NumPy and PyTorch seeds so that runs are reproducible.
np.random.seed(1234)
torch.manual_seed(1234)

# Use the GPU whenever CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training device: {device}")


## 1.1 Loading Dataset: FC matrices of *Stroke Patients*
---
The data is loaded from the .mat and .xlsx files and converted to a 3D array containing, for each patient, the corresponding FC matrix. Since not all patients have values, some of them are empty; those patients are simply removed and not considered. Furthermore, NA values are converted to 0. Notice that since the autoencoder is an unsupervised model, the labels are not needed in this part.

##### Vectorizing matrices
Following the paper, we will exploit the symmetry of each matrix and convert it into a vector.

In [None]:
# Resolve the directory of the running code and its parent, then make the
# parent directory importable.
this_file = inspect.getfile(inspect.currentframe())
currentdir = os.path.dirname(os.path.abspath(this_file))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# Input files: the FC matrices (.mat) and the language scores (.xlsx).
Normalize = True
mat_path = parentdir+'/DATA/FC_Stroke/FCMatrixImage_131subj.mat'
lang_path = parentdir+'/DATA/FC_Stroke/language_score.xlsx'

fc_3d, language_score, ID = get_arrays(mat_path, lang_path, Normalize)

# Turn every matrix into a vector.
vect_mat = to_vector(fc_3d)

# Standardize each vector with its own mean and standard deviation.
for idx, vec in enumerate(vect_mat):
    vect_mat[idx] = (vec - vec.mean()) / vec.std()

In [None]:
from torch.utils.data import Dataset

# Hold out 20% of the vectorized matrices (together with their scores) as a
# shuffled test set, using scikit-learn.
vect_train, vect_test, score_train, score_test = train_test_split(
    vect_mat,
    language_score,
    test_size=0.2,
    shuffle=True,
)

In [None]:
# Settings shared by both data loaders.
params = dict(batch_size=8, shuffle=True, num_workers=2)

# Wrap the train split, the test split and the complete data.
train_dataset, test_dataset, total_dataset = (
    dataset(vect_train),
    dataset(vect_test),
    dataset(vect_mat),
)

# One loader per split, built with the same settings.
train_dataloader, test_dataloader = (
    torch.utils.data.DataLoader(split, **params)
    for split in (train_dataset, test_dataset)
)

## Autoencoder 
---

#### Autoencoder 
A convolutional autoencoder consisting of 3 convolutional layers and 2 linear layers is implemented using the `PyTorch Lightning` module.
The model is found in `ConvAutoencoder.py`. Since in this case we are going to deal with an overcomplete autoencoder, we are going to make use of some regularization techniques to limit the direct propagation of the data.

In [None]:
import torch
import pytorch_lightning as pl  
from torch import nn
import torch.nn.functional as F
from callbacks import MetricsCallback, LitProgressBar
import torch.optim as optim

class KSparse(torch.nn.Module):
    """Keep the ``k`` largest activations along the last dimension, zero the rest.

    Args:
        k: number of entries to keep for every vector along the last dimension.
    """

    def __init__(self, k):
        super().__init__()
        self.k = k

    def forward(self, x):
        """Return ``x`` with everything but its top-``k`` entries set to zero.

        The mask is built on the device of ``x`` itself rather than on a
        module-level global, so the layer works wherever the surrounding
        model lives.

        Args:
            x: tensor whose last dimension holds the activations to sparsify.

        Returns:
            A ``float64`` tensor with the shape of ``x`` on the device of ``x``.

        Raises:
            RuntimeError: raised by ``torch.topk`` when ``k`` exceeds the size
                of the last dimension.
        """
        # Select and scatter along the SAME dimension; for 2-D inputs this is
        # identical to scattering on dim 1.
        _, indices = torch.topk(x, self.k, dim=-1)
        mask = torch.zeros_like(x)
        mask.scatter_(-1, indices, 1)
        # The decoder layers are created in double precision, hence the cast.
        return (x * mask).double()

In [None]:
class ConvAutoEncoder(pl.LightningModule):
    """1-D convolutional autoencoder trained with a k-sparse latent code.

    The encoder stacks three ``Conv1d`` layers and two linear layers; the
    decoder mirrors it with two linear layers and three ``ConvTranspose1d``
    layers. All layers are created in double precision. The first
    convolution takes a single input channel, and the linear layers are sized
    for a feature map of length 2905 after the third convolution.

    Args:
        encoded_space_dim: size of the latent code.
        k_sparcity: number of latent units kept by ``KSparse`` in the
            training/validation/test steps.
        hyper_parameters: optional dict with the keys ``optimizer``,
            ``learning_rate``, ``dropout``, ``conv1``, ``conv2``, ``conv3``,
            ``fc`` and ``weight``. Keys that are missing fall back to the
            defaults below.
        **kwargs: when ``hyper_parameters`` is ``None``, overrides applied on
            top of the defaults.
    """

    def __init__(self, encoded_space_dim, k_sparcity, hyper_parameters : dict = None, *args, **kwargs):
        super().__init__()

        defaults = {
            'optimizer' : 'Adam',
            'learning_rate' : 1e-3,
            'dropout' : 0,
            'conv1' : 8,
            'conv2' : 16,
            'conv3' : 32,
            'fc' : 16,
            'weight':1e-5
        }
        if hyper_parameters is None:
            defaults.update(**kwargs)
            self.hyper_parameters = defaults
        else:
            # Fill in any missing key with its default instead of failing
            # later with a KeyError; the caller's dict is left untouched.
            self.hyper_parameters = {**defaults, **hyper_parameters}

        # Must be called directly from __init__: it records the constructor
        # arguments so that they are stored in checkpoints.
        self.save_hyperparameters()

        self.dropout = self.hyper_parameters['dropout']
        self.encoded_space_dim = encoded_space_dim
        self.w_decay = self.hyper_parameters['weight']
        self.conv1 = self.hyper_parameters['conv1']
        self.conv2 = self.hyper_parameters['conv2']
        self.conv3 = self.hyper_parameters['conv3']
        self.fc = self.hyper_parameters['fc']
        self.k_sparcity = k_sparcity

        # NOTE: the order of the layers is kept exactly as it was, because the
        # position of a layer inside nn.Sequential is part of its state_dict
        # key and changing it would break previously saved checkpoints.
        self.encoder_cnn = nn.Sequential(
            # First convolutional layer
            nn.Conv1d(1, self.conv1, 5, stride=3).double(),
            nn.Dropout(p=self.dropout),

            # Second convolutional layer
            nn.Conv1d(self.conv1, self.conv2, 3, stride=3).double(),
            nn.LeakyReLU(),

            # Third convolutional layer
            nn.Conv1d(self.conv2, self.conv3, 4, stride=2).double(),
            nn.Dropout(p=self.dropout),
            nn.LeakyReLU(),

            # Flatten layer
            nn.Flatten(start_dim=1),

            # Linear section
            nn.Linear(2905*self.conv3, self.fc).double(),
            nn.Linear(self.fc, self.encoded_space_dim).double(),
        )

        self.decoder_cnn = nn.Sequential(
            # Linear section
            nn.Linear(self.encoded_space_dim, self.fc).double(),
            nn.Linear(self.fc, 2905*self.conv3).double(),

            # Unflatten
            nn.Unflatten(dim=1, unflattened_size=(self.conv3, 2905)),

            # First transposed convolution
            nn.ConvTranspose1d(self.conv3, self.conv2, 4, stride=2, output_padding=1).double(),
            nn.LeakyReLU(),
            nn.Dropout(p=self.dropout),

            # Second transposed convolution
            nn.ConvTranspose1d(self.conv2, self.conv1, 3, stride=3, padding=0, output_padding=2).double(),
            nn.LeakyReLU(),
            nn.Dropout(p=self.dropout),

            # Third transposed convolution
            nn.ConvTranspose1d(self.conv1, 1, 5, stride=3, padding=0, output_padding=1).double(),
        )

    def forward(self, x : "torch.Tensor"):
        """Return the dense encoder output for ``x`` (no sparsity is applied here)."""
        embedding = self.encoder_cnn(x)

        return embedding

    def _sparse_reconstruction_loss(self, x):
        """Encode ``x``, keep the top-k latent units, decode, and return the mean MSE."""
        internal_repr = self.encoder_cnn(x)
        # Built per call so that a changed ``self.k_sparcity`` is always honoured.
        internal_repr = KSparse(self.k_sparcity)(internal_repr)
        x_hat = self.decoder_cnn(internal_repr)
        return F.mse_loss(x_hat, x, reduction='mean')

    def training_step(self, batch, batch_idx):
        """Compute and log the reconstruction loss; the batch is the input itself (no labels)."""
        loss = self._sparse_reconstruction_loss(batch)
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Build the ``torch.optim`` optimizer named in the hyper-parameters."""
        optimizer_cls = getattr(optim, self.hyper_parameters['optimizer'])
        optimizer = optimizer_cls(
            self.parameters(),
            lr=self.hyper_parameters['learning_rate'],
            weight_decay=self.hyper_parameters['weight'],
        )
        return optimizer

    def validation_step(self, batch, batch_idx, log_name = 'val_loss'):
        """Compute the reconstruction loss and log it under ``log_name``."""
        loss = self._sparse_reconstruction_loss(batch)

        self.log(log_name, loss)

        return loss

    def test_step(self, batch, batch_idx):
        """Same as ``validation_step`` but logged as ``test_loss``."""
        return self.validation_step(batch, batch_idx, log_name='test_loss')

##### Hyperparameter search


Searching for the optimal values using the `Optuna` library. This is done by minimizing the mean of the validation loss obtained with a *5-fold* cross-validation.

In [None]:
#---Hyperparameter Optimization with Optuna---#
def objective(trial: optuna.trial.Trial) -> float:
    """Optuna objective: mean final validation loss over the cross-validation folds.

    Reads the module-level ``encoded_space_dim``, ``k`` and ``vect_mat``.
    Every fold trains a freshly initialised model on that fold's own
    train/validation loaders, so no weights leak from one fold to the next and
    the returned score is a genuine cross-validation estimate.

    Args:
        trial: the Optuna trial used to sample the hyper-parameters and to
            report intermediate values for pruning.

    Returns:
        The mean of the last recorded ``val_loss`` of every fold.
    """
    optimizer = trial.suggest_categorical("optimizer", ["Adam", "SGD"])

    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)

    weight = trial.suggest_float("weight", 1e-5, 1e-1, log=True)

    dropout = trial.suggest_float("dropout", 0.0, 1)

    conv1 = trial.suggest_categorical("conv1", [8, 16, 32, 64, 128])

    conv2 = trial.suggest_categorical("conv2", [8, 16, 32, 64, 128])

    conv3 = trial.suggest_categorical("conv3", [8, 16, 32, 64, 128])

    fc = trial.suggest_categorical("fc", [8, 16, 32, 64, 128])

    # Collect the sampled values in the dict expected by the model.
    hyper_parameters = {
        'optimizer' : optimizer,
        'learning_rate' : learning_rate,
        'dropout' : dropout,
        'conv1' : conv1,
        'conv2' : conv2,
        'conv3': conv3,
        'fc' : fc,
        'weight': weight
    }

    bar = LitProgressBar()

    fold_metrics = []
    # Presumably (data, batch_size=8, n_folds=5) -- confirm against
    # callbacks.cross_validate; it yields (train_loader, val_loader) pairs.
    data_kfold = cross_validate(vect_mat, 8, 5)

    for train_loader, val_loader in data_kfold:
        # A new model per fold: reusing one instance would continue training
        # the weights fitted on the previous folds.
        cnn_autoencoder = ConvAutoEncoder(encoded_space_dim, k, hyper_parameters=hyper_parameters)

        metrics_callback = MetricsCallback()
        trainer = pl.Trainer(
            logger=True,
            limit_val_batches=1.,  # fraction of validation batches to be used
            checkpoint_callback=False,  # do not save models during the search
            max_epochs=50,
            gpus=1 if torch.cuda.is_available() else None,
            callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_loss"), metrics_callback, bar]
        )
        trainer.logger.log_hyperparams(hyper_parameters)

        # Train on this fold's loaders (not on the global train/test split).
        trainer.fit(cnn_autoencoder, train_loader, val_loader)

        fold_metrics.append(metrics_callback.metrics)
        gc.collect()

    final_val_losses = [metric[-1]['val_loss'] for metric in fold_metrics]

    # The study minimises the mean over the folds.
    return np.mean(final_val_losses)

In [None]:
top_k = [10,30,60,90]
encoded_space_dim = 200
for k in top_k:
    study_name = "Study_200_k_"+str(k)
    storage_url = "sqlite:///Study_200_k_"+str(k)+".db"

    # Median pruner: a trial is terminated early when its best intermediate
    # result is worse than the median of the intermediate results of previous
    # trials at the same step, so no time is wasted on clearly bad choices.
    pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_url,
        direction="minimize",
        pruner=pruner,
        load_if_exists=True,
    )
    # The search itself is currently disabled; to run it (timeout = stop after
    # this many seconds, None = no time limit) use:
    #study.optimize(objective, n_trials=100, timeout=None)

    # Free RAM
    torch.cuda.empty_cache()
    gc.collect()
    del study

## 1.5 Training with optimal values

In [None]:
for k in top_k:
    # Load the stored study for this k. load_study needs no pruner (so this
    # cell no longer depends on a variable left over from another cell) and
    # raises a KeyError if the study does not exist, instead of silently
    # creating an empty one.
    study = optuna.load_study(study_name="Study_200_k_"+str(k), storage="sqlite:///Study_200_k_"+str(k)+".db")
    best_trial = study.best_trial
    best_hyperparameters = best_trial.params

    # Callbacks for the final training run
    bar = LitProgressBar()
    metrics_callback = MetricsCallback()
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, verbose=True)
    # NOTE(review): this checkpoint callback is created but not passed to the
    # Trainer below, and its directory name is the same for every k. Confirm
    # whether it should be registered or removed.
    checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath='Models/experiments_90', filename='cnn_autoencoder-{epoch:02d}-{val_loss:.2f}', save_top_k=1, mode='min')

    # Define training; use the GPU only when one is available, as in the
    # hyper-parameter search.
    trainer = pl.Trainer(
        gpus=1 if torch.cuda.is_available() else None,
        max_epochs=500,
        callbacks=[metrics_callback, bar, early_stopping_callback],
    )
    # Define model
    cnn_autoencoder = ConvAutoEncoder(encoded_space_dim, k, best_hyperparameters)
    # Train
    trainer.fit(cnn_autoencoder, train_dataloader, test_dataloader)

    # Save the trained model together with its recorded metrics
    save_state("best_strk_"+str(encoded_space_dim)+"k"+str(k), trainer, metrics_callback.metrics)

    # Release RAM
    del study
    torch.cuda.empty_cache()
    gc.collect()

## 1.6 Saving values
---


In [None]:
def save_features(cnn_autoencoder, train_dataset, k):
    """Encode a data set, keep the top-``k`` latent units and write them to disk.

    The features are written with ``np.savetxt`` to ``RESULTS/Features_k_<k>``.

    Args:
        cnn_autoencoder: model exposing an ``encoder_cnn`` module.
        train_dataset: object whose ``fc`` attribute is the tensor to encode.
        k: number of latent units kept per sample by ``KSparse``.
    """
    k_sparce = KSparse(k)

    # Feature extraction must be deterministic: switch dropout off while
    # encoding and restore the previous mode afterwards.
    was_training = cnn_autoencoder.training
    cnn_autoencoder.eval()
    try:
        with torch.no_grad():
            encoded_imgs = cnn_autoencoder.encoder_cnn(train_dataset.fc)
            # No hard-coded .cuda(): it fails on machines without a GPU.
            encoded_imgs = k_sparce(encoded_imgs).cpu().detach()
    finally:
        cnn_autoencoder.train(was_training)

    # Make sure the output folder exists before writing.
    os.makedirs("RESULTS", exist_ok=True)
    np.savetxt("RESULTS/Features_k_"+str(k), np.asarray(encoded_imgs))

In [None]:
encoded_space_dim = 200
top_k = [10,30,60,90]
for k in top_k:
    print(k)
    # Restore the model trained for this k and export its features for the
    # complete data set.
    state_name = "best_strk_"+str(encoded_space_dim)+"k"+str(k)
    cnn_autoencoder_strk, metrics = load_state(ConvAutoEncoder, state_name)
    save_features(cnn_autoencoder_strk, total_dataset, k)

In [None]:
top_k = [10,30,60,90]
encoded_space_dim = 200
MSE = []  # mean reconstruction error, one entry per k
SD = []   # sample standard deviation of the per-sample errors, one entry per k
for k in top_k:
    print(k)
    # Restore the model trained for this k
    cnn_autoencoder, metrics = load_state(ConvAutoEncoder, "best_strk_"+str(encoded_space_dim)+"k"+str(k))
    # Evaluation mode: with dropout active the reconstruction error would be
    # random and would change from run to run.
    cnn_autoencoder.eval()
    with torch.no_grad():
        encoded_imgs = cnn_autoencoder.encoder_cnn(total_dataset.fc)
        # NOTE(review): the latent code is decoded WITHOUT the k-sparse mask
        # that is applied during training -- confirm this is intended.
        rec_img = cnn_autoencoder.decoder_cnn(encoded_imgs)

    # Mean squared reconstruction error of every sample
    rer = [
        np.mean(((total_dataset.fc[i].squeeze(1) - rec_img[i].squeeze(1))**2).cpu().detach().numpy())
        for i in range(len(total_dataset))
    ]

    MSE.append(np.mean(rer))
    SD.append(np.std(rer, ddof=1))


In [None]:
# Write the per-k mean reconstruction errors and their standard deviations.
for filename, values in (('MSE_k.txt', MSE), ('SD_k.txt', SD)):
    np.savetxt(filename, values)