In [4]:
# for running this it should be on the root of the project
%load_ext autoreload
%autoreload 2
import torch
import torch.optim as optim
import argparse
import os
from tqdm import tqdm
import sys
sys.path.insert(0, '/Users/maruanottoni/home/master/research/graph-corr-embedd/src')

from src.models.SAE import StackedSparseAutoencoder
from src.utils.conn_data import save_pickle
from src.utils.parsers import str_2_bool
from src.data.Simulation1Loader import Simulation1Loader

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Inputs

In [5]:
dataset_name = 'simulation1'
sample = True
batch_size = 1

# define dataset
sim = Simulation1Loader(name=dataset_name, sample=sample)
loader = sim.create_graph_loader(batch_size=batch_size)

10it [00:01,  8.94it/s]


# Model training

In [6]:
model_name ="sae"
input_size = 100
hidden_sizes = [50, 25, 50]
dropout = 0.5
learning_rate = 0.001
epochs = 100
sparsity_penalty = 1e-4

# define model
model1    = StackedSparseAutoencoder(input_size=input_size,
                                    hidden_sizes=hidden_sizes,
                                    dropout=dropout,
                                    sparsity_penalty=sparsity_penalty)

model2    = StackedSparseAutoencoder(input_size=input_size,
                                    hidden_sizes=hidden_sizes,
                                    dropout=dropout,
                                    sparsity_penalty=sparsity_penalty)



# define optimizer
optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)
optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)

In [7]:
# initialize tqdm
pbar = tqdm(range(epochs))
train_pred, train_true = [], []
xs_train, zs_train = [], []
epoch_loss_train = []
for epoch in pbar:
    epoch_loss1, epoch_loss2 = 0, 0 
    for data in loader:
        # get inputs
        x1 = data.x[0, :, :]
        x2 = data.x[1, :, :]

        # forward pass
        x1_hat, z1, = model1.forward(x1)
        x2_hat, z2  = model2.forward(x1)

        # compute correlation between embeddings (true target)
        corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

        # compute loss
        loss1 = model1.loss_function(x1_hat, x1)
        loss2 = model2.loss_function(x2_hat, x2)

        # backward and optimize
        optimizer1.zero_grad()
        loss1.backward()
        optimizer1.step()

        optimizer2.zero_grad()
        loss2.backward()
        optimizer2.step()

        epoch_loss1 += loss1.item()
        epoch_loss2 += loss2.item()

    # update tqdm
    pbar.update(1)
    pbar.set_description("Train Epoch: %d, Train Loss I & II: %.4f & %.4f" % (epoch, epoch_loss1, epoch_loss2))

    # save loss
    epoch_loss_train.append([epoch_loss1, epoch_loss2])

Train Epoch: 99, Train Loss I & II: 2.2342 & 2.1970: 100%|██████████| 100/100 [00:08<00:00, 11.85it/s]


In [8]:
 # pred list to tensor
train_pred = torch.tensor(train_pred)
train_true = torch.tensor(train_true)

pbar = tqdm(enumerate(loader), total=len(loader))
test_pred = []
test_true = []
with torch.no_grad():
    for s, data in pbar:
        # get inputs
        x1 = data.x[0, :, :]
        x2 = data.x[1, :, :]

        # forward pass
        x1_hat, z1 = model1.forward(x1)
        x2_hat, z2 = model2.forward(x2)

        # compute correlation between embeddings (true target)
        corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

        # store pred and true values
        test_pred.append(corr)
        test_true.append(data.y)

        # update tqdm
        pbar.update(1)
        pbar.set_description(f"Test Sample: {s}")
    
# pred list to tensor
test_pred = torch.tensor(test_pred)
test_true = torch.tensor(test_true)

results = {
    "train_pred": train_pred,
    "train_true": train_true,
    "test_pred": test_pred,
    "test_true": test_true,
    "epoch_loss_train": epoch_loss_train,
}

Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 81.37it/s]


In [47]:
#model_name = f'{args.model_name}_{int(args.n_hidden)}_{int(args.n_layers_enc)}_{int(args.n_layers_dec)}'
# check if file exists
#output_path = f"{os.path.dirname(__file__)}/data/outputs/{args.dataset_name}/{model_name}"
#if not os.path.exists(output_path):
#    os.makedirs(output_path)

# save file
#if args.sample:
#    save_pickle(path=f"{output_path}/sample_results.pkl", obj=results)
#else:
#    save_pickle(path=f"{output_path}/results.pkl", obj=results)

## Checking results

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

mae = mean_absolute_error(test_true, test_pred)
rmse = sqrt(mean_squared_error(test_true, test_pred))

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")

MAE: 0.6021018845133181
RMSE: 0.6670507519348209


## multiple times same architecture:w

In [51]:
model_name ="sae"
input_size = 100
hidden_sizes = [50, 25, 50]
dropout = 0.5
learning_rate = 0.001
epochs = 100
sparsity_penalty = 1e-4
n_runs = 5

results_list = []
for i in range(n_runs):
    # define model
    model1    = StackedSparseAutoencoder(input_size=input_size,
                                        hidden_sizes=hidden_sizes,
                                        dropout=dropout,
                                        sparsity_penalty=sparsity_penalty)

    model2    = StackedSparseAutoencoder(input_size=input_size,
                                        hidden_sizes=hidden_sizes,
                                        dropout=dropout,
                                        sparsity_penalty=sparsity_penalty)



    # define optimizer
    optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)
    optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)

    # initialize tqdm
    pbar = tqdm(range(epochs))
    train_pred, train_true = [], []
    xs_train, zs_train = [], []
    epoch_loss_train = []
    for epoch in pbar:
        epoch_loss1, epoch_loss2 = 0, 0 
        for data in loader:
            # get inputs
            x1 = data.x[0, :, :]
            x2 = data.x[1, :, :]

            # forward pass
            x1_hat, z1, = model1.forward(x1)
            x2_hat, z2  = model2.forward(x1)

            # compute correlation between embeddings (true target)
            corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

            # compute loss
            loss1 = model1.loss_function(x1_hat, x1)
            loss2 = model2.loss_function(x2_hat, x2)

            # backward and optimize
            optimizer1.zero_grad()
            loss1.backward()
            optimizer1.step()

            optimizer2.zero_grad()
            loss2.backward()
            optimizer2.step()

            epoch_loss1 += loss1.item()
            epoch_loss2 += loss2.item()

        # update tqdm
        pbar.update(1)
        pbar.set_description("Train Epoch: %d, Train Loss I & II: %.4f & %.4f" % (epoch, epoch_loss1, epoch_loss2))


    # pred list to tensor
    train_pred = torch.tensor(train_pred)
    train_true = torch.tensor(train_true)

    pbar = tqdm(enumerate(loader), total=len(loader))
    test_pred = []
    test_true = []
    with torch.no_grad():
        for s, data in pbar:
            # get inputs
            x1 = data.x[0, :, :]
            x2 = data.x[1, :, :]

            # forward pass
            x1_hat, z1 = model1.forward(x1)
            x2_hat, z2 = model2.forward(x2)

            # compute correlation between embeddings (true target)
            corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

            # store pred and true values
            test_pred.append(corr)
            test_true.append(data.y)

            # update tqdm
            pbar.update(1)
            pbar.set_description(f"Test Sample: {s}")

        # save loss
        epoch_loss_train.append([epoch_loss1, epoch_loss2])
        

    # pred list to tensor
    test_pred = torch.tensor(test_pred)
    test_true = torch.tensor(test_true)

    results = {
        "train_pred": train_pred,
        "train_true": train_true,
        "test_pred": test_pred,
        "test_true": test_true,
        "epoch_loss_train": epoch_loss_train,
    }
    results_list.append(results)

Train Epoch: 99, Train Loss I & II: 2.2518 & 2.1861: 100%|██████████| 100/100 [00:09<00:00, 10.58it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 174.98it/s]
Train Epoch: 99, Train Loss I & II: 2.2535 & 2.1922: 100%|██████████| 100/100 [00:08<00:00, 12.23it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 189.23it/s]
Train Epoch: 99, Train Loss I & II: 2.2386 & 2.1919: 100%|██████████| 100/100 [00:06<00:00, 15.35it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 148.72it/s]
Train Epoch: 99, Train Loss I & II: 2.2525 & 2.1982: 100%|██████████| 100/100 [00:07<00:00, 13.56it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 241.08it/s]
Train Epoch: 99, Train Loss I & II: 2.2266 & 2.1689: 100%|██████████| 100/100 [00:06<00:00, 15.24it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 90.65it/s]


In [54]:
mae_list = []
rmse_list = []
for i in range(len(results_list)):
    test_true = results_list[i]['test_true']
    test_pred = results_list[i]['test_pred']

    mae = mean_absolute_error(test_true, test_pred)
    rmse = sqrt(mean_squared_error(test_true, test_pred))

    mae_list.append(mae)
    rmse_list.append(rmse)

In [58]:
np.mean(mae_list), np.std(mae_list)

(0.5695954334114186, 0.03552441607647186)

In [59]:
np.mean(rmse_list), np.std(rmse_list)

(0.6397551084884349, 0.03139197371433021)

## Multiple times different architectures

In [11]:
import torch.optim as optim
from tqdm import tqdm
n_runs = 5 

# Hyperparameter configurations to test
configurations = [
    {"hidden_sizes": [50, 25, 50], "dropout": 0.5},
    {"hidden_sizes": [100, 50, 100], "dropout": 0.4},
    {"hidden_sizes": [75, 35, 75], "dropout": 0.6},
]

results_list = []

for config in configurations:
    for run in range(n_runs):
        hidden_sizes = config["hidden_sizes"]
        dropout = config["dropout"]

        # Initialize models with current configuration
        model1 = StackedSparseAutoencoder(input_size=input_size,
                                        hidden_sizes=hidden_sizes,
                                        dropout=dropout,
                                        sparsity_penalty=sparsity_penalty)

        model2 = StackedSparseAutoencoder(input_size=input_size,
                                        hidden_sizes=hidden_sizes,
                                        dropout=dropout,
                                        sparsity_penalty=sparsity_penalty)

        # Define optimizers for each model
        optimizer1 = optim.Adam(model1.parameters(), lr=learning_rate)
        optimizer2 = optim.Adam(model2.parameters(), lr=learning_rate)

        # initialize tqdm
        pbar = tqdm(range(epochs))
        train_pred, train_true = [], []
        xs_train, zs_train = [], []
        epoch_loss_train = []
        for epoch in pbar:
            epoch_loss1, epoch_loss2 = 0, 0 
            for data in loader:
                # get inputs
                x1 = data.x[0, :, :]
                x2 = data.x[1, :, :]

                # forward pass
                x1_hat, z1, = model1.forward(x1)
                x2_hat, z2  = model2.forward(x1)

                # compute correlation between embeddings (true target)
                corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

                # compute loss
                loss1 = model1.loss_function(x1_hat, x1)
                loss2 = model2.loss_function(x2_hat, x2)

                # backward and optimize
                optimizer1.zero_grad()
                loss1.backward()
                optimizer1.step()

                optimizer2.zero_grad()
                loss2.backward()
                optimizer2.step()

                epoch_loss1 += loss1.item()
                epoch_loss2 += loss2.item()

            # update tqdm
            pbar.update(1)
            pbar.set_description("Train Epoch: %d, Train Loss I & II: %.4f & %.4f" % (epoch, epoch_loss1, epoch_loss2))


        # pred list to tensor
        train_pred = torch.tensor(train_pred)
        train_true = torch.tensor(train_true)

        pbar = tqdm(enumerate(loader), total=len(loader))
        test_pred = []
        test_true = []
        with torch.no_grad():
            for s, data in pbar:
                # get inputs
                x1 = data.x[0, :, :]
                x2 = data.x[1, :, :]

                # forward pass
                x1_hat, z1 = model1.forward(x1)
                x2_hat, z2 = model2.forward(x2)

                # compute correlation between embeddings (true target)
                corr = model1.compute_spearman_rank_correlation(x=z1.flatten().detach(), y=z2.flatten().detach())

                # store pred and true values
                test_pred.append(corr)
                test_true.append(data.y)

                # update tqdm
                pbar.update(1)
                pbar.set_description(f"Test Sample: {s}")

            # save loss
            epoch_loss_train.append([epoch_loss1, epoch_loss2])
            

        # pred list to tensor
        test_pred = torch.tensor(test_pred)
        test_true = torch.tensor(test_true)


        ############################
        # After training and evaluation, compile results for this configuration
        results = {
            "config": config,
            "run": run,
            "train_pred": train_pred,
            "train_true": train_true,
            "test_pred": test_pred,
            "test_true": test_true,
            "epoch_loss_train": epoch_loss_train,
        }
        results_list.append(results)

Train Epoch: 99, Train Loss I & II: 2.2426 & 2.1612: 100%|██████████| 100/100 [00:08<00:00, 11.44it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 315.65it/s]
Train Epoch: 99, Train Loss I & II: 2.2377 & 2.1891: 100%|██████████| 100/100 [00:08<00:00, 11.16it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 277.41it/s]
Train Epoch: 99, Train Loss I & II: 2.2403 & 2.1844: 100%|██████████| 100/100 [00:08<00:00, 12.47it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 250.35it/s]
Train Epoch: 99, Train Loss I & II: 2.2409 & 2.1967: 100%|██████████| 100/100 [00:07<00:00, 13.77it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 252.95it/s]
Train Epoch: 99, Train Loss I & II: 2.2361 & 2.1964: 100%|██████████| 100/100 [00:07<00:00, 14.16it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 295.97it/s]
Train Epoch: 99, Train Loss I & II: 2.1947 & 2.1464: 100%|██████████| 100/100 [00:29<00:00,  3.43it/s]
Test Sample: 9: 100%|██████████| 10/10 [00:00<00:00, 141.

In [12]:
mae_list = []
rmse_list = []
    for i in range(len(results_list)):
        test_true = results_list[i]['test_true']
        test_pred = results_list[i]['test_pred']

        mae = mean_absolute_error(test_true, test_pred)
        rmse = sqrt(mean_squared_error(test_true, test_pred))
        mae_list.append(mae)
        rmse_list.append(rmse)

IndentationError: unexpected indent (4257272446.py, line 3)

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

# Initialize lists for MAE and RMSE
mae_list = []
rmse_list = []
configs = [results_list[i]['config'] for i in range(len(results_list))]

# Loop over the results_list
for result in results_list:
    # Calculate MAE and RMSE
    mae = mean_absolute_error(result['test_true'], result['test_pred'])
    rmse = sqrt(mean_squared_error(result['test_true'], result['test_pred']))
    
    # Append MAE and RMSE to the respective lists
    mae_list.append(mae)
    rmse_list.append(rmse)

# Create a DataFrame
df = pd.DataFrame({
    'MAE': mae_list,
    'RMSE': rmse_list
}, index=configs)

df

Unnamed: 0,MAE,RMSE
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.584968,0.653801
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.449809,0.531727
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.623423,0.688414
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.568513,0.635418
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.592732,0.65414
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.530975,0.602576
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.512614,0.585895
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.600431,0.66329
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.535248,0.60741
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.543774,0.616702


In [None]:
df.index = df.index.map(str)
grouped_df = df.groupby(df.index).agg(['mean', 'std'])
grouped_df

Unnamed: 0_level_0,MAE,MAE,RMSE,RMSE
Unnamed: 0_level_1,mean,std,mean,std
"{'hidden_sizes': [100, 50, 100], 'dropout': 0.4}",0.544608,0.033219,0.615175,0.029128
"{'hidden_sizes': [50, 25, 50], 'dropout': 0.5}",0.563889,0.066814,0.6327,0.059607
"{'hidden_sizes': [75, 35, 75], 'dropout': 0.6}",0.549017,0.026525,0.619137,0.023649
