In [1]:
import os
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from utils.conn_data import load_pickle

plt.style.use('ggplot')

In [2]:
outputs_path = os.path.join(os.getcwd(), 'data', 'outputs')

## Simulation 1a

### Dataset

For each covariance in a list of different covariances between graph pairs, we simulate 30 times (n_simulations) a list of 50 (n_graphs) pairs of graphs from the erdos-renyi family of graphs. For each pair of graph, we sample a random variable p from a multivariate gaussian distribution with fixed mean and covariance.

### Model

1) $\textbf{Spectrum}$: For each pair of graphs, compute the spectral radii for each adjacency matrix individually, and compute the Spearman's rank correlation between the spectral radiis for each graph.

2) $\textbf{SDNE}$: For each epoch, for each pair of graphs, compute the embeddings using the SDNE (autoencoder), and compute the Spearman's rank correlation between the vectorial version of the embeddings of each graph.


In [3]:
# dataset = "simulation1a"
# models = os.listdir(os.path.join(outputs_path, dataset))
# all_test_data = []
# agg_metrics = []
# all_metrics_by_true_cov = []
# all_diff_by_true_cov = []
# for model in models:

#     if 'spectrum' in model:
#         data = load_pickle(os.path.join(outputs_path, dataset, model, 'results.pkl'))
#         test_data = data["train_test_results"]
#     else:
#         data = load_pickle(os.path.join(outputs_path, dataset, model, 'predictions.pkl'))
#         test_data = data["test_predictions"]

#     if len(test_data.shape) > 2:
#         test_data_df = []
#         for i in range(test_data.shape[0]):
#             simulation_test_data = test_data[i, :, :]
#             simulation_test_data_df = pd.DataFrame(simulation_test_data.numpy(), columns=["pred", "true"])
#             simulation_test_data_df.loc[:, "simulation"] = i

#             test_data_df.append(simulation_test_data_df)
#         test_data_df = pd.concat(test_data_df, axis=0)
#         test_data_df.loc[:, "true"] = [round(x, 2) for x in test_data_df["true"]]
#         test_data_df.loc[:, "model"] = model
#     else:
#         test_data_df = pd.DataFrame(test_data.numpy(), columns=["pred", "true"])
#         test_data_df.loc[:, "true"] = [round(x, 2) for x in test_data_df["true"]]
#         test_data_df.loc[:, "model"] = model
#     all_test_data.append(test_data_df)

#     # compute aggregated mse and mae
#     mse = mean_squared_error(test_data_df["true"], test_data_df["pred"])
#     mae = mean_absolute_error(test_data_df["true"], test_data_df["pred"])
#     agg_metrics.append({"model": model, "mse": mse, "mae": mae})

#     # compute mse and mae per true cov
#     metrics_by_true_cov = []
#     for cov in test_data_df["true"].unique():
#         selected_df = test_data_df.loc[test_data_df["true"] == cov]

#         pred = selected_df["pred"]
#         true = selected_df["true"]

#         mse = mean_squared_error(true, pred)
#         mae = mean_absolute_error(true, pred)
#         metrics_by_true_cov.append({"model": model, "dataset": dataset, "cov": cov, "mse": mse, "mae": mae})
#     metrics_by_true_cov_df = pd.DataFrame(metrics_by_true_cov)
#     all_metrics_by_true_cov.append(metrics_by_true_cov_df)

#     # compute diff by true cov
#     diff_by_true_cov = []
#     for cov in test_data_df["true"].unique():
#         selected_df = test_data_df.loc[test_data_df["true"] == cov]

#         pred = selected_df["pred"]
#         true = selected_df["true"]

#         selected_df.loc[:, "diff"] = true - pred
#         diff_by_true_cov.append(selected_df)
#     diff_by_true_cov_df = pd.concat(diff_by_true_cov, axis=0)
#     all_diff_by_true_cov.append(diff_by_true_cov_df)

# all_test_data_df = pd.concat(all_test_data, axis=0)
# agg_metrics_df = pd.DataFrame(agg_metrics)
# all_metrics_by_true_cov_df = pd.concat(all_metrics_by_true_cov, axis=0)
# all_diff_by_true_cov_df = pd.concat(all_diff_by_true_cov, axis=0)

In [4]:
# parse_agg_metrics = []
# for idx, row in agg_metrics_df.iterrows():
#    model = row["model"].split("_")[0]
   
#    if model.startswith("spectrum"):
#        parse_agg_metrics.append({"model": model,
#                                  "mse": row["mse"],
#                                  "mae": row["mae"]})
#    elif model.startswith('sae'):
#         #model_name = f'{args.model_name}_{hidden_sizes_str}_{sparsity_penalty_scaled}_{dropout_scaled}_{int(args.epochs)}'
#         epochs = row["model"].split("_")[-1]
#         dropout = row["model"].split("_")[-2]
#         sparsity = row["model"].split("_")[-3]
#         hidden_layers = row["model"].split("_")[1:-4]

#         parse_agg_metrics.append({"model": model,
#                                   "dropout": dropout,
#                                   "sparsity": sparsity,
#                                   "hidden_layers": str(hidden_layers),
#                                   "epochs": epochs,
#                                   "mse": row["mse"],
#                                   "mae": row["mae"]}
#                                 )

#    elif model.startswith("sdne") or model.startswith("sgnn0"):
#          n_hidden = row["model"].split("_")[1]
#          n_layers_enc = row["model"].split("_")[2]
#          n_layers_dec = row["model"].split("_")[3]
#          epochs = row["model"].split("_")[4]
#          parse_agg_metrics.append({"model": model,
#                                    "n_hidden": n_hidden,
#                                    "n_layers_enc": n_layers_enc,
#                                    "n_layers_dec": n_layers_dec,
#                                    "epochs": epochs,
#                                    "mse": row["mse"],
#                                    "mae": row["mae"]})
#    else:
#        raise ValueError(f"Model not recognized: {model}")
# parse_agg_metrics_df = pd.DataFrame(parse_agg_metrics)

In [5]:
# parse_agg_metrics_df.loc[parse_agg_metrics_df["model"] == "sgnn0"]

In [6]:
# parse_agg_metrics_df.sort_values(by="mse").head(10)

In [7]:
# parse_agg_metrics_df.sort_values(by="mae").head(20)

In [8]:
# parse_agg_metrics_df.loc[parse_agg_metrics_df["model"].isin(["sdne3", "sae"])&(parse_agg_metrics_df["n_layers_enc"] == "1")].sort_values(by="mse")

In [9]:
# set(parse_agg_metrics_df.model)

In [10]:
# # Define unique line styles and markers for each model, ensure there are enough styles for each model
# styles = ['-', '--', '-.', ':']
# markers = ['o', 's', 'X', 'D']

# plt.figure(figsize=(8, 6))  # Create a figure for the plots

# # Get the unique models
# unique_models = all_metrics_by_true_cov_df["model"].unique()

# # Iterate through each unique model and plot on the same graph with a unique style
# for idx, model in enumerate(unique_models):
#     # Filter the DataFrame for the current model
#     model_df = all_metrics_by_true_cov_df[all_metrics_by_true_cov_df["model"] == model]
    
#     # Plot the lineplot on the same figure with unique style and marker
#     sns.lineplot(
#         x='cov', y='mse', data=model_df,
#         label=model, linestyle=styles[idx % len(styles)], marker=markers[idx % len(markers)]
#     )

# plt.title('MSE by Correlation across Models')  # Set the title for the entire plot
# plt.xlabel('Correlation')  # Set the x-axis label
# plt.ylabel('MSE')  # Set the y-axis label
# plt.legend(title='Model')  # Enable the legend to distinguish models

# # Rotate x labels for better fit
# plt.xticks(rotation=45, ha='right')

# # Adjust the layout
# plt.tight_layout()

# # Show the plot
# plt.show()

In [11]:
# # Define unique line styles and markers for each model, ensure there are enough styles for each model
# styles = ['-', '--', '-.', ':']
# markers = ['o', 's', 'X', 'D']

# plt.figure(figsize=(8, 6))  # Create a figure for the plots

# # Get the unique models
# unique_models = all_metrics_by_true_cov_df["model"].unique()

# # Iterate through each unique model and plot on the same graph with a unique style
# for idx, model in enumerate(unique_models):
#     # Filter the DataFrame for the current model
#     model_df = all_metrics_by_true_cov_df[all_metrics_by_true_cov_df["model"] == model]
    
#     # Plot the lineplot on the same figure with unique style and marker
#     sns.lineplot(
#         x='cov', y='mae', data=model_df,
#         label=model, linestyle=styles[idx % len(styles)], marker=markers[idx % len(markers)]
#     )

# plt.title('MAE by Correlation across Models')  # Set the title for the entire plot
# plt.xlabel('Correlation')  # Set the x-axis label
# plt.ylabel('MAE')  # Set the y-axis label
# plt.legend(title='Model')  # Enable the legend to distinguish models

# # Rotate x labels for better fit
# plt.xticks(rotation=45, ha='right')

# # Adjust the layout
# plt.tight_layout()

# # Show the plot
# plt.show()

In [12]:
# for model in all_diff_by_true_cov_df["model"].unique():
#     model_df = all_diff_by_true_cov_df[all_diff_by_true_cov_df["model"] == model]
#     plt.figure(figsize=(8, 6))
#     sns.boxplot(x="true", y="pred", data=model_df)
#     plt.title(f'Predicted value for the model {model}')
#     plt.xlabel('True Correlation value')
#     plt.ylabel('Predicted')
#     plt.xticks(rotation=45, ha='right')
#     plt.tight_layout()
#     plt.show()

## Simulation 1b

### Dataset

For each covariance in a list of different covariances between graph pairs, for n in a list of the number of nodes (10 to 100, 10 by 10), we simulate 30 times (n_simulations) a list of 50 (n_graphs) pairs of graphs from the erdos-renyi family of graphs. For each pair of graph, we sample a random variable p from a multivariate gaussian distribution with fixed mean and covariance.

In [13]:
# dataset = "simulation1a"
# n_nodes = os.listdir(os.path.join(outputs_path, dataset))
# n_nodes_agg_metrics = []
# for n in n_nodes:
#     models = os.listdir(os.path.join(outputs_path, dataset, n))
#     for model in models:

#         # load data
#         data = load_pickle(os.path.join(outputs_path, dataset, n, model, 'results.pkl'))

#         try:
#             if model == 'spectrum':
#                 test_data = data["train_test_results"]
#             else:
#                 test_data = data["test_results"]
#         except:
#             print(model, n)

#         test_data_df = []
#         for i in range(test_data.shape[0]):
#             simulation_test_data = test_data[i, :, :]
#             simulation_test_data_df = pd.DataFrame(simulation_test_data.numpy(), columns=["pred", "true"])
#             simulation_test_data_df.loc[:, "simulation"] = i

#             test_data_df.append(simulation_test_data_df)
#         test_data_df = pd.concat(test_data_df, axis=0)
#         test_data_df.loc[:, "true"] = [round(x, 2) for x in test_data_df["true"]]
#         test_data_df.loc[:, "model"] = model
#         all_test_data.append(test_data_df)

#         # compute aggregated mse and mae
#         mse = mean_squared_error(test_data_df["true"], test_data_df["pred"])
#         mae = mean_absolute_error(test_data_df["true"], test_data_df["pred"])
#         n_nodes_agg_metrics.append({"model": model, "n_nodes": n, "mse": mse, "mae": mae})
# n_nodes_agg_metrics_df = pd.DataFrame(n_nodes_agg_metrics)

In [14]:
# n_nodes_agg_metrics_df

In [15]:
# colors = ['red', 'green', 'blue', 'purple']  # Define a color for each line

# plt.figure(figsize=(8, 6))  # Create a figure for the plots

# sns.lineplot(data=n_nodes_agg_metrics_df, x="n_nodes", y="mse", hue="model")

# plt.title('MSE as a Function of the Number of Nodes')  # Set the title for the entire plot
# plt.xlabel('Number of nodes in the graph')  # Set the x-axis label
# plt.ylabel('MSE')  # Set the y-axis label
# plt.legend(title='Model')  # Enable the legend to distinguish models

# # Rotate x labels for better fit
# plt.xticks(rotation=45, ha='right')

# # Adjust the layout
# plt.tight_layout()

# # Show the plot
# plt.show()

In [17]:
dataset_name = "simulation1c"
simulation_names = os.listdir(os.path.join(outputs_path, dataset_name))

all_metrics = []
for simulation_name in simulation_names:


    if ".DS_Store" == simulation_name:
        continue

    for model in os.listdir(os.path.join(outputs_path, dataset_name, simulation_name)):

        if ".DS_Store" == model:
            continue

        if 'spectrum' in model:
            data = load_pickle(os.path.join(outputs_path, dataset_name, simulation_name, model, 'results.pkl'))
            test_data = data["train_test_results"]
        else:
            data = load_pickle(os.path.join(outputs_path, dataset_name, simulation_name, model, 'predictions.pkl'))
            test_data = data["test_predictions"]

        if len(test_data.shape) > 2:
            test_data_df = []
            for i in range(test_data.shape[0]):
                simulation_test_data = test_data[i, :, :]
                simulation_test_data_df = pd.DataFrame(simulation_test_data.cpu().numpy(), columns=["pred", "true"])
                simulation_test_data_df.loc[:, "simulation"] = i

                test_data_df.append(simulation_test_data_df)
            test_data_df = pd.concat(test_data_df, axis=0)
            test_data_df.loc[:, "true"] = [round(x, 2) for x in test_data_df["true"]]
            test_data_df.loc[:, "model"] = model
        else:
            test_data_df = pd.DataFrame(test_data.numpy(), columns=["pred", "true"])
            test_data_df.loc[:, "true"] = [round(x, 2) for x in test_data_df["true"]]
            test_data_df.loc[:, "model"] = model


        # compute aggregated mse and mae
        mse = mean_squared_error(test_data_df["true"], test_data_df["pred"])
        mae = mean_absolute_error(test_data_df["true"], test_data_df["pred"])

        tmp_metrics = pd.DataFrame({"dataset": dataset_name,
                                    "simulation": simulation_name,
                                    "model": model,
                                    "mse": mse,
                                "mae": mae}, index=[0])
        all_metrics.append(tmp_metrics)
all_metrics_df = pd.concat(all_metrics, axis=0)

all_metrics_df

  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))
  return torch.load(io.BytesIO(b))


Unnamed: 0,dataset,simulation,model,mse,mae
0,simulation1c,watts_strogatz,sdne9_50_1_1_10,0.33521,0.500083
0,simulation1c,watts_strogatz,sdne8_100_1_1_200,0.328972,0.495722
0,simulation1c,watts_strogatz,sdne9_50_1_1_100,0.335268,0.500399
0,simulation1c,watts_strogatz,sdne8_50_1_1_100_alpha1_beta1_gamma1_nu0,0.514668,0.590721
0,simulation1c,watts_strogatz,sdne8_50_1_1_100,0.329044,0.491965
0,simulation1c,watts_strogatz,sdne8_100_1_1_100,0.332854,0.498456
0,simulation1c,watts_strogatz,sdne8_50_1_1_200_alpha1_beta1_gamma1_nu1,0.344008,0.505725
0,simulation1c,watts_strogatz,sdne8_50_1_1_200_alpha1_beta1_gamma1_nu2,0.350792,0.508529
0,simulation1c,watts_strogatz,sdne8_100_1_1_10,0.335484,0.500555
0,simulation1c,watts_strogatz,spectrum,0.114484,0.269591
