In [2]:
import os
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import torch
from tqdm import tqdm
from torch_geometric.transforms import AddSelfLoops
from mscproject.transforms import RemoveSelfLoops

from mscproject.datasets import CompanyBeneficialOwners
from mscproject.transforms import RemoveSelfLoops
import mscproject.models as mod
import mscproject.experiment as exp

# Move the working directory to the project root: the nearest directory,
# starting from the current one and walking upwards, that contains a "data"
# entry. All relative "data/..." paths used below depend on this.
#
# The search is bounded by the filesystem root. A plain
# `while ...: os.chdir("..")` loop never terminates when no ancestor contains
# "data", because changing to ".." at the root stays at the root.
_start_dir = Path.cwd()
for _candidate in (_start_dir, *_start_dir.parents):
    if (_candidate / "data").exists():
        os.chdir(_candidate)
        break
else:
    raise FileNotFoundError(
        f"No 'data' directory found in {_start_dir} or any of its parent directories"
    )

In [3]:
# Names of the Optuna studies to compare: one model-selection study per GNN
# architecture. The combined "pyg_model_selection_ALL" study is currently
# disabled (add "ALL" to the tuple below to include it).
study_names = tuple(
    f"pyg_model_selection_{architecture}"
    for architecture in ("GCN", "GraphSAGE", "GAT", "HGT", "HAN")
)

In [4]:
from IPython.display import display

In [5]:
# Where the Optuna results live and where this notebook writes its artefacts.
OPTUNA_STORAGE = "sqlite:///data/optuna.db"
MODEL_DIR = Path("data/models/pyg")
PREDICTION_DIR = Path("data/predictions")

# Make sure both output directories exist before anything is saved to them.
for output_dir in (MODEL_DIR, PREDICTION_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# One trials table per study, in the same order as `study_names`, each tagged
# with the name of the study it was loaded from.
trials_dfs = []
for study_name in study_names:
    study = optuna.load_study(study_name=study_name, storage=OPTUNA_STORAGE)
    trials_dfs.append(study.trials_dataframe().assign(study_name=study.study_name))

# All studies stacked row-wise, restricted to the columns they have in common.
eval_df = pd.concat(trials_dfs, join="inner", axis=0)

In [6]:
# NOTE: disabled diagnostics, kept for reference only. When uncommented, this
# loads every study in `study_names` and, for each one, prints the mean of its
# ten lowest objective values and shows Optuna's optimisation-history, slice
# and parameter-importance plots (the contour plot is disabled separately).
# for study in (
#     optuna.load_study(study_name=study_name, storage=OPTUNA_STORAGE)
#     for study_name in study_names
# ):
#     print(study.study_name)
#     # Plot the results.
#     mean_top_10_loss = study.trials_dataframe()["value"].sort_values().head(10).mean()
#     print("Mean of top 10 best loss:", mean_top_10_loss)
#     optuna.visualization.plot_optimization_history(study).show()
#     # optuna.visualization.plot_contour(study).show()
#     optuna.visualization.plot_slice(study).show()
#     optuna.visualization.plot_param_importances(study).show()
#     print()
#     print()

In [7]:
def mark_outliers(df):
    """Return a copy of ``df`` with a boolean ``outlier`` column added.

    A row is flagged when its ``value`` lies more than three standard
    deviations above the mean of the ``value`` column. Only the upper tail is
    checked, and the frame passed in is left unmodified.
    """
    values = df["value"]
    cutoff = values.mean() + 3 * values.std()
    return df.assign(outlier=values > cutoff)

In [8]:
# Trials are ranked by this column, highest first. It holds the "aprc" user
# attribute of each trial (presumably average precision - confirm against the
# study objective).
metric = "user_attrs_aprc"
N_CANDIDATES = 30  # best trials per study that are considered at all
N_DISPLAYED = 10  # how many of those are shown for manual inspection

# Maps the model type (the last "_"-separated part of the study name) to the
# selected trial's row, stored as a plain dict.
best_trials = {}

for study_name, df in zip(study_names, trials_dfs):
    if study_name.endswith("_ALL"):
        continue
    print(study_name)
    top = mark_outliers(df.sort_values(metric, ascending=False).head(N_CANDIDATES))
    param_columns = [x for x in top.columns if x.startswith("params")]
    display(top[["outlier", "value", metric, *param_columns]].head(N_DISPLAYED).T)

    model_type = study_name.split("_")[-1]
    # Discard trials flagged as outliers and trials whose
    # hidden_channels_log2 * num_layers reaches 40; of the remainder, the
    # highest-ranked trial is selected.
    candidates = top.query(
        "not outlier and (params_hidden_channels_log2 * params_num_layers) < 40"
    )
    if candidates.empty:
        # Fail with a message naming the study instead of an IndexError
        # from `.iloc[0]` on an empty frame.
        raise ValueError(
            f"No trial of study {study_name!r} passes the outlier and size filters"
        )
    best_trials[model_type] = candidates.iloc[0].to_dict()
    best_trials[model_type]["model_type"] = model_type

    print()

pyg_model_selection_GCN


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_bias,params_dropout,params_edge_aggr,params_gcn_aggr,params_hidden_channels_log2,params_jk,params_num_layers,params_weight_decay
107,True,0.717129,0.717129,relu,False,True,0.176726,min,mean,8,none,3,5.8e-05
178,False,0.521349,0.521349,gelu,False,True,0.074155,sum,sum,7,none,3,0.000155
71,False,0.506553,0.506553,relu,False,False,0.252673,min,sum,7,none,3,0.000141
43,False,0.440476,0.440476,relu,False,True,0.266203,min,min,8,none,3,0.001613
61,False,0.362126,0.362126,relu,False,False,0.203275,max,min,8,none,3,0.001546
134,False,0.356192,0.356192,relu,True,True,0.080616,min,sum,7,none,3,0.003823
63,False,0.340579,0.340579,relu,False,False,0.260325,max,min,7,none,3,6.1e-05
26,False,0.335414,0.335414,relu,True,True,0.187773,min,sum,6,none,3,0.000853
103,False,0.329718,0.329718,relu,True,True,0.203291,min,sum,7,none,3,0.003462
86,False,0.327589,0.327589,relu,True,True,0.154225,min,sum,7,none,3,0.005081



pyg_model_selection_GraphSAGE


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_dropout,params_edge_aggr,params_hidden_channels_log2,params_jk,params_num_layers,params_weight_decay
121,True,0.342527,0.342527,relu,False,0.341147,max,7,none,2,8.1e-05
126,False,0.325065,0.325065,relu,False,0.331615,sum,7,none,2,0.000577
96,False,0.317189,0.317189,relu,False,0.038688,max,6,none,2,0.001376
94,False,0.310221,0.310221,relu,False,0.115187,max,6,none,2,0.000243
62,False,0.310048,0.310048,relu,False,0.146898,max,6,none,2,0.001266
110,False,0.309514,0.309514,relu,False,0.239721,max,7,none,2,1.4e-05
172,False,0.308062,0.308062,relu,False,0.245433,max,7,none,2,0.000384
191,False,0.307384,0.307384,relu,False,0.292773,sum,7,none,2,4.9e-05
112,False,0.305729,0.305729,relu,False,0.220047,max,7,none,2,0.000172
108,False,0.305253,0.305253,relu,False,0.176865,max,7,none,2,0.003586



pyg_model_selection_GAT


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_concat,params_dropout,params_edge_aggr,params_heads_log2,params_hidden_channels_log2,params_jk,params_num_layers,params_v2,params_weight_decay
198,False,0.343791,0.343791,relu,False,False,0.098545,sum,2,5,none,3,True,0.000246
194,False,0.32807,0.32807,relu,False,False,0.124463,sum,2,5,none,3,True,7.7e-05
184,False,0.320952,0.320952,relu,True,False,0.140558,sum,1,4,none,3,True,0.000174
174,False,0.301325,0.301325,relu,True,False,0.086613,sum,1,4,none,3,True,0.000292
146,False,0.298826,0.298826,relu,True,False,0.067838,sum,1,4,none,3,True,0.000199
139,False,0.294188,0.294188,relu,False,False,0.187118,sum,1,4,none,3,True,5.1e-05
165,False,0.292404,0.292404,relu,True,False,0.135194,sum,1,4,none,3,True,0.000539
86,False,0.28683,0.28683,relu,True,False,0.099089,max,1,4,none,3,True,0.000134
175,False,0.286583,0.286583,relu,True,False,0.104073,sum,1,4,none,3,True,2.6e-05
81,False,0.283789,0.283789,relu,True,False,0.149403,max,1,6,none,3,True,0.000754



pyg_model_selection_HGT


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_dropout,params_edge_aggr,params_group,params_heads_log2,params_hidden_channels_log2,params_jk,params_num_layers,params_weight_decay
95,False,0.312829,0.312829,gelu,False,0.67879,max,min,3,5,last,1,6.9e-05
84,False,0.310135,0.310135,gelu,False,0.793006,max,max,3,5,last,1,0.000178
138,False,0.308111,0.308111,gelu,False,0.443489,max,min,3,5,last,4,3.6e-05
151,False,0.306952,0.306952,gelu,False,0.459553,max,sum,3,5,last,1,0.000274
82,False,0.299447,0.299447,gelu,False,0.793519,max,max,3,5,last,1,0.000167
166,False,0.29722,0.29722,gelu,False,0.411539,max,sum,3,5,last,1,0.000174
162,False,0.296736,0.296736,gelu,False,0.401875,max,sum,3,5,last,1,0.00021
148,False,0.294608,0.294608,gelu,False,0.41953,max,sum,3,5,last,1,3.9e-05
186,False,0.294554,0.294554,gelu,False,0.45079,max,sum,3,5,last,1,0.000257
164,False,0.293815,0.293815,gelu,False,0.398174,max,sum,3,5,last,1,0.000619



pyg_model_selection_HAN


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_dropout,params_edge_aggr,params_han_dropout,params_heads_log2,params_hidden_channels_log2,params_jk,params_negative_slope,params_num_layers,params_weight_decay
186,False,0.287472,0.287472,gelu,True,0.019282,max,0.671257,2,4,last,0.454808,1,0.000228
195,False,0.28691,0.28691,gelu,True,0.012761,max,0.669714,2,4,last,0.427534,1,0.000105
182,False,0.282866,0.282866,gelu,True,0.002861,max,0.675573,2,4,last,0.4737,1,0.00037
116,False,0.279735,0.279735,gelu,True,0.066071,mean,0.056817,2,4,last,0.688181,1,1.4e-05
181,False,0.275998,0.275998,gelu,True,0.032338,max,0.870073,2,4,last,0.46918,1,4.9e-05
91,False,0.275153,0.275153,gelu,True,0.051403,mean,0.004445,2,4,last,0.605652,1,0.000413
185,False,0.274733,0.274733,gelu,True,0.031843,max,0.860483,2,4,last,0.451086,1,0.000163
92,False,0.273608,0.273608,gelu,True,0.131027,mean,0.013472,2,4,last,0.693585,1,8.1e-05
86,False,0.273064,0.273064,gelu,True,0.060547,mean,0.083546,2,4,last,0.59716,1,3.4e-05
194,False,0.270773,0.270773,gelu,True,0.000738,max,0.676205,2,4,last,0.435841,1,0.000113





In [9]:
import mscproject.experiment as exp

In [10]:
def remove_prefix(text, prefix):
    """Return ``text`` without a leading ``prefix``.

    ``text`` is returned unchanged when it does not start with ``prefix``.
    """
    if not text.startswith(prefix):
        return text
    return text[len(prefix) :]

In [11]:
def build_experiment_from_trial_params(trial_params, dataset, verbose=False):
    """Build the model and optimiser described by a selected Optuna trial.

    Args:
        trial_params: Mapping of trial columns to values. Every key starting
            with ``params`` becomes a model setting (with the ``params_``
            prefix removed). The keys ``model_type``,
            ``user_attrs_learning_rate`` and a ``jk`` parameter must be present.
        dataset: Passed through unchanged to ``exp.get_model_and_optimiser``.
        verbose: When true, print the final settings dictionary.

    Returns:
        The result of ``exp.get_model_and_optimiser``; callers in this
        notebook unpack it as ``(model, optimiser)``.
    """
    param_dict = {}
    for key, value in trial_params.items():
        if key.startswith("params"):
            param_dict[remove_prefix(key, "params_")] = value

    # Accept "n_layers" as an alternative spelling of "num_layers".
    if "n_layers" in param_dict:
        param_dict["num_layers"] = param_dict.pop("n_layers")

    # Fixed settings. These replace any value of the same name that came from
    # the trial (notably "add_self_loops" and "v2").
    param_dict.update(
        in_channels=-1,
        out_channels=1,
        act_first=True,
        add_self_loops=False,
        model_type=mod.get_model(trial_params["model_type"]),
        v2=True,
    )
    lr = trial_params["user_attrs_learning_rate"]

    # The trial stores the "no jk" choice as the string "none"; pass None on.
    if param_dict["jk"] == "none":
        param_dict["jk"] = None

    if verbose:
        print(param_dict)
    return exp.get_model_and_optimiser(param_dict, dataset, lr)

In [12]:
dataset_path = "data/pyg/"
N_TRAINING_RUNS = 10  # independent training runs per model; the best is kept
DEFAULT_EPOCHS = 200  # used when the trial did not record a best epoch

# Set the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {str(device).upper()}")

# Test metrics of the best run of every model trained in this cell.
model_metrics = {}

# Weights are saved to, and looked up in, MODEL_DIR. The unused "models/pyg"
# directory is no longer created.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for model_name, trial_dict in best_trials.items():

    model_path = MODEL_DIR / f"{model_name}.pt"
    if model_path.exists():
        print(f"Model {model_name} already exists, skipping")
        continue

    print("Training model:", model_name)

    # Load a fresh copy of the dataset for every model (as the evaluation cell
    # does), so that self-loop transforms applied for one model cannot carry
    # over to the next.
    dataset = CompanyBeneficialOwners(dataset_path, to_undirected=True)
    dataset = dataset.data.to(device)

    if trial_dict["params_add_self_loops"]:
        dataset = AddSelfLoops(fill_value=1.0)(dataset)  # type: ignore
    else:
        dataset = RemoveSelfLoops()(dataset)

    # Number of epochs to train for, taken from the selected trial.
    best_epoch = trial_dict["user_attrs_best_epoch"]
    best_epoch = int(best_epoch) if not np.isnan(best_epoch) else DEFAULT_EPOCHS

    # Train the model several times and keep the best run. Average precision
    # is higher-is-better, so start below every possible score.
    # NOTE(review): the best run is chosen on the test split, as before, which
    # makes the reported test metrics optimistic. `on_val=True` presumably
    # folds the validation split into training - confirm in `exp.train`.
    best_aprc = -np.inf

    for i in range(N_TRAINING_RUNS):
        # Build a new model and optimiser for each run so the runs are
        # independent restarts rather than one ever-longer training session.
        model, optimiser = build_experiment_from_trial_params(
            trial_dict, dataset, verbose=(i == 0)
        )

        # One progress bar per run.
        progress = tqdm(range(best_epoch))
        for epoch in progress:
            loss = exp.train(model, dataset, optimiser, on_val=True)
            progress.set_description(f"Train loss: {loss:.4f}")

        eval_metrics = exp.evaluate(
            model, dataset, on_train=False, on_val=False, on_test=True
        )
        print(i, eval_metrics.test)

        run_aprc = eval_metrics.test.average_precision
        if run_aprc > best_aprc:
            # New best run: remember its score and metrics, and overwrite the
            # saved weights with this model's.
            best_aprc = run_aprc
            model_metrics[model_name] = eval_metrics.test
            torch.save(model.state_dict(), model_path)

    print()

Using device: CPU
Model GCN already exists, skipping
Model GraphSAGE already exists, skipping
Model GAT already exists, skipping
Model HGT already exists, skipping
Model HAN already exists, skipping


In [17]:
# Load each saved model, evaluate it on the test split and save its test-set
# predictions to PREDICTION_DIR / "<model_name>.csv".
for model_name in best_trials.keys():

    print("Evaluating model:", model_name)

    model_path = MODEL_DIR / f"{model_name}.pt"
    if not model_path.exists():
        print(f"Model {model_name} does not exist, skipping")
        continue

    trial_params = best_trials[model_name]

    # Fresh copy of the dataset per model, with the same self-loop handling
    # that was used when the model was trained.
    dataset = CompanyBeneficialOwners(dataset_path, to_undirected=True)
    dataset = dataset.data.to(device)

    if trial_params["params_add_self_loops"]:
        dataset = AddSelfLoops(fill_value=1.0)(dataset)
    else:
        dataset = RemoveSelfLoops()(dataset)

    # Build the model on the transformed dataset (same order as in the
    # training cell), then restore the trained weights.
    model, _ = build_experiment_from_trial_params(trial_params, dataset)
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.to(device)

    eval_metrics = exp.evaluate(
        model, dataset, on_train=False, on_val=False, on_test=True
    )

    model_metrics[model_name] = eval_metrics.test
    print(model_name, eval_metrics.test)

    print("Making predictions...")
    # Inference only: switch off dropout and similar train-time behaviour, and
    # skip autograd bookkeeping, so the saved scores are deterministic.
    model.eval()
    with torch.no_grad():
        prediction_dict = model(dataset.x_dict, dataset.edge_index_dict)
    prediction_df_list = []

    print("Saving predictions...")
    for node_type in dataset.node_types:
        # Keep only the nodes in the test split of this node type.
        prediction = (
            prediction_dict[node_type][dataset[node_type].test_mask]
            .cpu()
            .detach()
            .numpy()
            .flatten()
        )
        actual = (
            dataset.y_dict[node_type][dataset[node_type].test_mask]
            .cpu()
            .detach()
            .numpy()
            .flatten()
        )
        df = pd.DataFrame({"pred_proba": prediction, "actual": actual})
        prediction_df_list.append(df)

    # All node types are stacked into one file; the node type is not recorded.
    prediction_df = pd.concat(prediction_df_list)
    prediction_df.to_csv(PREDICTION_DIR / f"{model_name}.csv", index=False)

GCN loss: 1.042, acc: 0.913, prc: 0.185, rec: 0.689, f1: 0.291, auc: 0.792, aprc: 0.408
GraphSAGE loss: 1.066, acc: 0.913, prc: 0.138, rec: 0.720, f1: 0.231, auc: 0.714, aprc: 0.287
GAT loss: 0.984, acc: 0.913, prc: 0.135, rec: 0.837, f1: 0.233, auc: 0.745, aprc: 0.349
HGT loss: 1.051, acc: 0.913, prc: 0.134, rec: 0.767, f1: 0.228, auc: 0.722, aprc: 0.303
HAN loss: 1.094, acc: 0.913, prc: 0.127, rec: 0.795, f1: 0.220, auc: 0.695, aprc: 0.264


In [18]:
# Collect the test metrics of all evaluated models into one table, one row per
# model, and save it as a CSV report.
performance_comparison = pd.DataFrame.from_dict(model_metrics, orient="index")

report_path = Path("reports/test-performance-pyg.csv")
# `to_csv` does not create missing directories, so make sure "reports" exists
# (the other output directories are created explicitly as well).
report_path.parent.mkdir(parents=True, exist_ok=True)
performance_comparison.to_csv(report_path, index_label="model")