# Read the results of the training

In [None]:
import seml
import pandas as pd
import matplotlib as mpl
from matplotlib import pyplot as plt
import torch
import seaborn as sns
import json
import os
import sys
import time
from torch_geometric.loader import DataLoader
from statistics import mean, median
from tqdm import tqdm
import dgl
import networkx as nx
import torch_geometric
sys.path.append(os.path.dirname(os.getcwd()))

from src.baseline.model_gcn import GIN
from src.ppgn.ppgn import PPGN
from src.I2GNN.I2GNN import I2GNN
from src.baseline.dataset_gcn import GraphDataset
from src.I2GNN.I2GNN_dataset import I2GNNDataset, I2GNNDataLoader, I2GNNDatasetRobustness
from src.metrics.L1_based import L1LossCount, L1LossStd
from src.I2GNN.utils import create_subgraphs2

In [None]:
# Experiment selection. The second assignment overrides the first, so the
# 'er_10' experiment is the one that is actually analysed.
exp = 'dataset_2'
exp = 'er_10'
# Architecture name; it is part of the experiment name and is used further
# down to pick the model class.
arch = "I2GNN"
device = 'cuda'
# Name of the seml experiment whose runs are loaded below.
experiment_name = f'train_{arch}_{exp}'
results: pd.DataFrame = seml.get_results(experiment_name, to_data_frame=True)
# Index the runs by their `_id` so a run can be addressed as `results.loc[<id>]`.
results.index = results["_id"]

In [3]:
# For every substructure, keep the run with the lowest `result.l1_average`
# together with its other recorded metrics and label statistics.
substrucutres = results['config.subgraph'].unique()
summary_columns = ["Best_model", "L1", "L1 std", "L1 count", "Mean", "Varience", "Std"]
# Result fields that fill `summary_columns`, listed in the same order.
source_fields = [
    '_id',
    'result.l1_average',
    'result.l1_std_average',
    'result.l1_count_average',
    'result.count_mean',
    'result.count_variance',
    'result.count_std',
]
best_l1 = pd.DataFrame(columns=summary_columns, index=substrucutres)
for subgraph_name in substrucutres:
    candidate_runs = results[results['config.subgraph'] == subgraph_name]
    # Ascending sort: the first row is the run with the smallest average L1.
    ranked_runs = candidate_runs.sort_values(axis=0, by='result.l1_average')
    winner = ranked_runs.head(1).squeeze()
    best_l1.loc[subgraph_name] = [winner[field] for field in source_fields]

In [4]:
# Show the best run and its recorded metrics for every substructure.
# `display` is not imported here; it is expected from the notebook environment.
print("Best models with l1 loss:")
display(best_l1)


Best models with l1 loss:


Unnamed: 0,Best_model,L1,L1 std,L1 count,Mean,Varience,Std
Triangle,1,0.00223,0.000482,0.000261,7.632,21.449957,4.63141
2-Path,2,0.027203,0.00296,0.000786,34.616001,84.432716,9.188727
4-Clique,3,0.001608,0.001081,0.000327,0.824,2.212309,1.487383
Chordal cycle,4,0.01966,0.002779,0.002505,7.6836,50.046711,7.07437
Tailed triangle,5,0.061716,0.005093,0.002864,23.134399,146.820663,12.116958
3-Star,6,0.079077,0.013998,0.007414,11.692,31.911501,5.649026
4-Cycle,7,0.035215,0.008019,0.00607,5.8216,19.283087,4.391251
3-Path,8,0.062456,0.005997,0.002069,34.9716,108.471779,10.414979
3-Star not ind.,9,0.035508,0.00113,0.000864,53.489601,986.939087,31.415586


Now that we have the best models, we can evaluate them on the test set.

In [None]:
def evaluate_epoch(dataloader, gnn, loss_fn, device) -> torch.Tensor:
    """Average one or more losses of `gnn` over all batches of `dataloader`.

    Args:
        dataloader: Sized iterable of batches. Every batch must offer
            `.to(device)` and expose its target as `.y`.
        gnn: Model to evaluate. It is switched to eval mode and called once
            per batch.
        loss_fn: Iterable of loss callables `f(pred, y)`. A single callable is
            accepted as well and treated as a one-element list.
        device: Device the batches and the loss accumulator are placed on.

    Returns:
        1-D tensor with one entry per loss, each being the sum of the
        per-batch loss values divided by the number of batches.
    """
    # Use the losses that were passed in rather than a notebook-level global.
    try:
        loss_fns = list(loss_fn)
    except TypeError:
        # A single, non-iterable loss callable was given.
        loss_fns = [loss_fn]

    gnn.eval()
    with torch.no_grad():
        num_batches = len(dataloader)
        loss = torch.zeros(len(loss_fns), device=device)
        for data in dataloader:
            data = data.to(device)
            y = data.y
            pred = gnn(data)
            for i, single_loss in enumerate(loss_fns):
                loss[i] += single_loss(pred, y)

        loss = loss / num_batches
    return loss

def pre_transform(g, hops):
    """Forward `g` and `hops` to `create_subgraphs2` and return its result.

    Passed as the `pre_transform` argument when building an `I2GNNDataset`.
    """
    transformed = create_subgraphs2(g, hops)
    return transformed
    
# Value handed to I2GNNDataset as `hops` for every substructure
# (presumably the subgraph-extraction radius -- confirm against create_subgraphs2).
hops = {
    "Triangle": 1,
    "2-Path": 2,
    "4-Clique": 1,
    "Chordal cycle": 2,
    "Tailed triangle": 2,
    "3-Star": 2,
    "4-Cycle": 2,
    "3-Path": 3,
    "3-Star not ind.": 2,
}

# `results` is indexed by `_id`, so `[1]` reads the entry labelled 1.
models_folder = results['config.model_folder'][1]
model = results['config.model'][1]
n_seeds = len(results['config.seeds'][1])
test_set = results['config.test_dataset'][1]
# One row per substructure: averaged test errors followed by the per-model lists.
test = pd.DataFrame(columns=["Best_model", "L1 avg", "L1 std avg", "L1 count avg","MSE var avg", "L1", "L1 std", "L1 count", "MSE var"], index=substrucutres)

# Model class for every supported architecture name.
architectures = {"GIN": GIN, "PPGN": PPGN, "I2GNN": I2GNN}

gnns = []
for subgraph, line in tqdm(best_l1.iterrows(), total=len(best_l1)):
    # Checkpoint and hyper-parameter files of the best run for this substructure.
    dict_models = results.loc[line['Best_model'], 'result.model_paths']
    params_models = results.loc[line['Best_model'], 'result.h_param_paths']
    if arch in ['GIN', 'PPGN']:
        test_dataset = GraphDataset(test_set, subgraph, 1)
        test_loader = DataLoader(test_dataset, batch_size=1)
        std = torch.std(test_dataset.labels)
        var = torch.var(test_dataset.labels)
    elif arch == 'I2GNN':
        test_dataset = I2GNNDataset(root=os.path.dirname(test_set),dataset=os.path.basename(test_set),  subgraph_type=subgraph, pre_transform=pre_transform, hops=hops[subgraph])
        test_loader = I2GNNDataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
        std = torch.std(test_dataset.data.y)
        var = torch.var(test_dataset.data.y)
    else:
        # Fail early instead of running into undefined names further down.
        raise ValueError("The architecture is not supported!")
    # NOTE(review): `var` is computed but never used; the "MSE var" columns
    # hold the plain MSE. Confirm whether a variance-normalised MSE was intended.
    l1 = torch.nn.L1Loss()
    l1_std = L1LossStd(std)
    l1_count = L1LossCount()
    mse = torch.nn.MSELoss()

    start = time.time()
    # Rebuild every trained model of the best run from its saved files.
    gnns = []
    for dict_model, params_model in zip(dict_models, params_models):
        with open(params_model, 'r') as f:
            h_params = json.load(f)
        gnn = architectures[arch](**h_params).to(device)
        gnn.load_state_dict(torch.load(dict_model, map_location=torch.device(device)))
        gnns.append(gnn)
    # evaluate
    l = [l1, l1_std, l1_count, mse]
    l1_err = []
    l1_std_err = []
    l1_count_err = []
    mse_err = []
    for gnn in gnns:
        # `err` holds one averaged value per loss, in the order of `l`.
        err = evaluate_epoch(test_loader, gnn, l, device)
        l1_err.append(err[0].item())
        l1_std_err.append(err[1].item())
        l1_count_err.append(err[2].item())
        mse_err.append(err[3].item())
    # NOTE(review): the MSE aggregate uses the median while the L1 aggregates
    # use the mean, although the column is called "MSE var avg" -- confirm.
    test.loc[subgraph] = [line["Best_model"], mean(l1_err), mean(l1_std_err), mean(l1_count_err), median(mse_err), l1_err,l1_std_err, l1_count_err, mse_err]
    print(f'{subgraph}: {mean(l1_err)}, time: {time.time() - start}')
display(test)


Now we store the best models in a separate folder.

In [None]:
import shutil
import os

# Copy the checkpoints and hyper-parameter files of every best run into one
# folder; nothing is copied for the 'dataset_2' experiment.
if exp != 'dataset_2':

    destination_path = f"/nfs/students/campi/best_models/{arch}_{exp}"
    os.makedirs(destination_path, exist_ok=True)

    for best_id in best_l1['Best_model']:
        checkpoint_files = results.loc[best_id, 'result.model_paths']
        h_param_files = results.loc[best_id, 'result.h_param_paths']
        # Checkpoints first, then hyper-parameter files.
        for source_file in [*checkpoint_files, *h_param_files]:
            name_parts = os.path.basename(source_file).split('_')
            # Keep the first two underscore-separated parts and the last one.
            short_name = "_".join((name_parts[0], name_parts[1], name_parts[-1]))
            shutil.copyfile(source_file, os.path.join(destination_path, short_name))

Store the indices of the test samples that every model predicts correctly (L1 error below 0.5)

In [None]:
def generate_gnn_input(graph: nx.Graph, device)->torch_geometric.data.Data:
    """Build the PyTorch Geometric `Data` input of a model from a networkx graph.

    Every node gets the single constant feature 1. Each edge `(u, v)` of the
    graph is stored in both directions: the first half of `edge_index` holds
    `u -> v`, the second half holds `v -> u`, in the graph's edge order.
    The result is moved to `device`.
    """
    # No improvement was observed when using more channels in the first layer.
    node_features = torch.ones(graph.number_of_nodes(), 1)
    # Shape (2, num_edges); the reshape keeps this valid for a graph without edges.
    forward_edges = torch.tensor(list(graph.edges()), dtype=torch.long).reshape(-1, 2).t()
    # Flipping the two rows swaps source and target, giving the reverse edges.
    edge_index = torch.cat([forward_edges, forward_edges.flip(0)], dim=1)
    return torch_geometric.data.Data(x=node_features, edge_index=edge_index).to(device)

def pre_transform(g, hops):
    """Return `create_subgraphs2(g, hops)`.

    Supplied as the `pre_transform` argument of `I2GNNDataset`.
    """
    result = create_subgraphs2(g, hops)
    return result

# Value handed to I2GNNDataset as `hops` for every substructure
# (presumably the subgraph-extraction radius -- confirm against create_subgraphs2).
hops = {
    "Triangle": 1,
    "2-Path": 2,
    "4-Clique": 1,
    "Chordal cycle": 2,
    "Tailed triangle": 2,
    "3-Star": 2,
    "4-Cycle": 2,
    "3-Path": 3,
    "3-Star not ind.": 2,
}

# Stop collecting for a substructure once this many samples have been found.
n_samples = 100
# Number of trained models per substructure; files are named `<arch>_<subgraph>_<i>`.
n_models = 5
test_set = results['config.test_dataset'][1]
models_path = f"/nfs/students/campi/best_models/{arch}_{exp}"
graphs, counts = dgl.load_graphs(results['config.test_dataset'].iloc[0])
graphs = [nx.Graph(dgl.to_networkx(graph)) for graph in graphs]
# The JSON file is written next to the test dataset.
destination_path = os.path.dirname(results['config.test_dataset'].iloc[0])
file_base = os.path.basename(results['config.test_dataset'].iloc[0]).split('.')[0]
file_name = f'{file_base}_{arch}.json'
l1 = torch.nn.L1Loss()

# Model class for every supported architecture name.
architectures = {'GIN': GIN, 'PPGN': PPGN, 'I2GNN': I2GNN}

# Maps every substructure to the indices of the samples on which all models
# have an L1 error below 0.5.
adversarial_ids = {}
for subgraph in tqdm(best_l1.index):
    gnn_class = architectures.get(arch)
    if gnn_class is None:
        raise ValueError("The architecture is not supported!")

    # Load all trained models of this substructure in eval mode.
    gnns = []
    for i in range(n_models):
        model_dict = f"{models_path}/{arch}_{subgraph}_{i}.pth"
        model_params = f"{models_path}/{arch}_{subgraph}_{i}.json"
        with open(model_params, 'r') as fp:
            h_params = json.load(fp)
        gnn = gnn_class(**h_params).to(device)
        gnn.load_state_dict(torch.load(model_dict, map_location=torch.device(device)))
        gnn.eval()
        gnns.append(gnn)

    correctly_pred_ids = []
    # Pure inference: no gradients are needed for the error checks.
    with torch.no_grad():
        if arch in ['GIN', 'PPGN']:
            for i, (graph, count) in enumerate(zip(graphs, counts[subgraph])):
                # A sample counts as correct only if no model is off by 0.5 or more.
                if not any(l1(gnn(generate_gnn_input(graph, device)).flatten(), count) >= 0.5 for gnn in gnns):
                    correctly_pred_ids.append(i)
                if len(correctly_pred_ids) == n_samples:
                    print(f'{subgraph} has enough at {i}')
                    break
        elif arch == 'I2GNN':
            test_dataset = I2GNNDataset(root=os.path.dirname(test_set),dataset=os.path.basename(test_set),  subgraph_type=subgraph, pre_transform=pre_transform, hops=hops[subgraph])
            test_loader = I2GNNDataLoader(dataset=test_dataset, batch_size=1, shuffle=False)
            for i, data in enumerate(test_loader):
                # Keep the object returned by `.to`, as `evaluate_epoch` does.
                data = data.to(device)
                if not any(l1(gnn(data), data.y) >= 0.5 for gnn in gnns):
                    correctly_pred_ids.append(i)
                if len(correctly_pred_ids) == n_samples:
                    print(f'{subgraph} has enough at {i}')
                    break
    adversarial_ids[subgraph] = correctly_pred_ids
    print(f'{subgraph}: {len(correctly_pred_ids)}')

# write the json file
with open(os.path.join(destination_path, file_name), 'w') as f:
    json.dump(adversarial_ids, f)
    