In [3]:
import os
from pathlib import Path

import numpy as np
import optuna
import pandas as pd
import torch
import catboost as cb
from tqdm import tqdm

from mscproject.datasets import CompanyBeneficialOwners
from mscproject.transforms import RemoveSelfLoops
import mscproject.models as mod
import mscproject.experiment as exp

while not Path("data") in Path(".").iterdir():
    os.chdir("..")

In [4]:
study_names = (
    # "pyg_model_selection_ALL",
    "pyg_model_selection_GCN",
    # "pyg_model_selection_GraphSAGE",
    # "pyg_model_selection_GAT",
    # "pyg_model_selection_HGT",
    # "pyg_model_selection_HAN",
)

In [5]:
from IPython.display import display

In [6]:
OPTUNA_STORAGE = "sqlite:///data/optuna-aprc.db"

trials_dfs = [
    study.trials_dataframe().assign(study_name=study.study_name)
    for study in (
        optuna.load_study(study_name=study_name, storage=OPTUNA_STORAGE)
        for study_name in study_names
    )
]
eval_df = pd.concat(trials_dfs, join="inner", axis=0)

In [7]:
for study in (
    optuna.load_study(study_name=study_name, storage=OPTUNA_STORAGE)
    for study_name in study_names
):
    print(study.study_name)
    # Plot the results.
    mean_top_10_loss = study.trials_dataframe()["value"].sort_values().head(10).mean()
    print("Mean of top 10 best loss:", mean_top_10_loss)
    optuna.visualization.plot_optimization_history(study).show()
    # optuna.visualization.plot_contour(study).show()
    optuna.visualization.plot_slice(study).show()
    optuna.visualization.plot_param_importances(study).show()
    print()
    print()

pyg_model_selection_GCN
Mean of top 10 best loss: 0.0852541171014309






In [8]:
def mark_outliers(df):
    df = df.copy()
    upper_threshold = df.value.mean() + 2 * df.value.std()
    df["outlier"] = df.value > upper_threshold
    return df

In [9]:
metric = "user_attrs_aprc"
first = True

best_trials = {}

for study_name, df in zip(study_names, trials_dfs):
    if study_name.endswith("_ALL"):
        continue
    print(study_name)
    top = df.sort_values(metric, ascending=False)[:30]
    param_columns = [x for x in top.columns if x.startswith("params")]
    top = mark_outliers(top)
    display(top[["outlier", "value", metric, *param_columns]][:10])

    model_type = study_name.split("_")[-1]
    best_trials[model_type] = top.query("not outlier").iloc[0].to_dict()
    best_trials[model_type]["model_type"] = model_type

    print()

pyg_model_selection_GCN


Unnamed: 0,outlier,value,user_attrs_aprc,params_act,params_add_self_loops,params_bias,params_dropout,params_edge_aggr,params_gcn_aggr,params_hidden_channels_log2,params_jk,params_num_layers,params_weight_decay
107,True,0.717129,0.717129,relu,False,True,0.176726,min,mean,8,none,3,5.8e-05
178,False,0.521349,0.521349,gelu,False,True,0.074155,sum,sum,7,none,3,0.000155
71,False,0.506553,0.506553,relu,False,False,0.252673,min,sum,7,none,3,0.000141
43,False,0.440476,0.440476,relu,False,True,0.266203,min,min,8,none,3,0.001613
61,False,0.362126,0.362126,relu,False,False,0.203275,max,min,8,none,3,0.001546
134,False,0.356192,0.356192,relu,True,True,0.080616,min,sum,7,none,3,0.003823
63,False,0.340579,0.340579,relu,False,False,0.260325,max,min,7,none,3,6.1e-05
26,False,0.335414,0.335414,relu,True,True,0.187773,min,sum,6,none,3,0.000853
103,False,0.329718,0.329718,relu,True,True,0.203291,min,sum,7,none,3,0.003462
86,False,0.327589,0.327589,relu,True,True,0.154225,min,sum,7,none,3,0.005081





In [43]:
train_df = df.dropna()
feature_columns = [
    col for col in train_df.columns if col.startswith("params")
]
X = train_df[feature_columns]
y = train_df[metric]

In [44]:
cat_features = [i for i, d in enumerate(X.dtypes) if d != "float64"]

In [88]:
train_df = top
cb_model = cb.CatBoostRegressor(verbose=100, iterations=10, eval_metric="RMSE", max_depth=2)
# Train on top df
cb_model.fit(X, y, cat_features=cat_features)
cb_model.predict(X)
best = np.argmax(cb_model.predict(X))
print()
print(X.iloc[best], y.iloc[best])

Learning rate set to 0.5
0:	learn: 0.0778643	total: 302us	remaining: 2.73ms
9:	learn: 0.0591603	total: 1.64ms	remaining: 0us

params_act                         relu
params_add_self_loops             False
params_bias                        True
params_dropout                 0.243477
params_edge_aggr                    min
params_gcn_aggr                     sum
params_hidden_channels_log2           7
params_jk                          none
params_num_layers                     3
params_weight_decay            0.000147
Name: 21, dtype: object 0.2985683083534241


In [73]:
train_df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_act,params_add_self_loops,params_bias,params_dropout,params_edge_aggr,...,user_attrs_f1,user_attrs_learning_rate,user_attrs_loss,user_attrs_n_hidden,user_attrs_precision,user_attrs_recall,user_attrs_total_epochs,state,study_name,outlier
107,107,0.717129,2022-09-12 15:15:35.562570,2022-09-12 15:16:20.448680,0 days 00:00:44.886110,relu,False,True,0.176726,min,...,0.492175,0.01,0.530996,256,0.339266,0.896011,170.0,COMPLETE,pyg_model_selection_GCN,True
178,178,0.521349,2022-09-12 15:32:31.601182,2022-09-12 15:32:55.191742,0 days 00:00:23.590560,gelu,False,True,0.074155,sum,...,0.370631,0.01,0.730062,128,0.233444,0.89886,230.0,COMPLETE,pyg_model_selection_GCN,False
71,71,0.506553,2022-09-12 15:08:18.194454,2022-09-12 15:08:31.887772,0 days 00:00:13.693318,relu,False,False,0.252673,min,...,0.339881,0.01,0.8015,128,0.214823,0.81339,118.0,COMPLETE,pyg_model_selection_GCN,False
43,43,0.440476,2022-09-12 15:01:58.287941,2022-09-12 15:02:38.520952,0 days 00:00:40.233011,relu,False,True,0.266203,min,...,0.314334,0.01,0.897759,256,0.195989,0.793447,120.0,COMPLETE,pyg_model_selection_GCN,False
61,61,0.362126,2022-09-12 15:05:10.293969,2022-09-12 15:05:46.008563,0 days 00:00:35.714594,relu,False,False,0.203275,max,...,0.233209,0.01,0.999814,256,0.134178,0.890313,109.0,COMPLETE,pyg_model_selection_GCN,False
134,134,0.356192,2022-09-12 15:20:36.223176,2022-09-12 15:20:50.916958,0 days 00:00:14.693782,relu,True,True,0.080616,min,...,0.277601,0.01,1.003204,128,0.174245,0.682336,82.0,COMPLETE,pyg_model_selection_GCN,False
63,63,0.340579,2022-09-12 15:06:12.585291,2022-09-12 15:06:29.467451,0 days 00:00:16.882160,relu,False,False,0.260325,max,...,0.237052,0.01,1.007383,128,0.137795,0.847578,103.0,COMPLETE,pyg_model_selection_GCN,False
26,26,0.335414,2022-09-12 15:00:32.905652,2022-09-12 15:00:39.895966,0 days 00:00:06.990314,relu,True,True,0.187773,min,...,0.248035,0.01,1.00465,64,0.149852,0.719373,88.0,COMPLETE,pyg_model_selection_GCN,False
103,103,0.329718,2022-09-12 15:14:51.148426,2022-09-12 15:15:01.815471,0 days 00:00:10.667045,relu,True,True,0.203291,min,...,0.24605,0.01,1.02718,128,0.146191,0.776353,59.0,COMPLETE,pyg_model_selection_GCN,False
86,86,0.327589,2022-09-12 15:09:57.819314,2022-09-12 15:10:12.598412,0 days 00:00:14.779098,relu,True,True,0.154225,min,...,0.246841,0.01,1.020339,128,0.152138,0.653846,108.0,COMPLETE,pyg_model_selection_GCN,False


params_act                         relu
params_add_self_loops             False
params_bias                        True
params_dropout                 0.176726
params_edge_aggr                    min
params_gcn_aggr                    mean
params_hidden_channels_log2           8
params_jk                          none
params_num_layers                     3
params_weight_decay            0.000058
Name: 107, dtype: object 0.12472061067819595


In [63]:
import mscproject.experiment as exp

In [64]:
# remove prefix from string
def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix) :]
    return text

In [65]:
def build_experiment_from_trial_params(trial_params, dataset, verbose=False):
    param_dict = {
        remove_prefix(k, "params_"): v
        for k, v in trial_params.items()
        if k.startswith("params")
    }
    # Rename key from "n_layers" to "num_layers"
    if "n_layers" in param_dict:
        param_dict["num_layers"] = param_dict.pop("n_layers")
    param_dict["in_channels"] = -1
    param_dict["out_channels"] = 1
    param_dict["act_first"] = True
    param_dict["add_self_loops"] = False
    param_dict["model_type"] = mod.get_model(trial_params["model_type"])
    param_dict["v2"] = True
    lr = trial_params["user_attrs_learning_rate"]
    param_dict["jk"] = None if param_dict["jk"] == "none" else param_dict["jk"]
    if verbose:
        print(param_dict)
    return exp.get_model_and_optimiser(param_dict, dataset, lr)

In [66]:
dataset_path = "data/pyg/"

# Set the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {str(device).upper()}")

# Load the dataset.
dataset = CompanyBeneficialOwners(dataset_path, to_undirected=True)
dataset = dataset.data.to(device)

model_metrics = {}

models_dir = Path("models/pyg")
models_dir.mkdir(parents=True, exist_ok=True)

for model_name in best_trials.keys():
    print("Training model:", model_name)
    trial_dict = best_trials[model_name]

    if trial_dict["params_add_self_loops"]:
        dataset = AddSelfLoops(fill_value=1.0)(dataset)  # type: ignore
    else:
        dataset = RemoveSelfLoops()(dataset)

    model, optimiser = build_experiment_from_trial_params(
        trial_dict, dataset, verbose=True
    )

    # Train and evaluate the model.
    best_epoch = int(trial_dict["user_attrs_best_epoch"])

    progress = tqdm(range(best_epoch))

    for epoch in progress:
        loss = exp.train(model, dataset, optimiser, on_val=True)
        progress.set_description(f"Train loss: {loss:.4f}")

    eval_metrics = exp.evaluate(
        model, dataset, on_train=False, on_val=False, on_test=True
    )

    model_metrics[model_name] = eval_metrics.test
    print(eval_metrics.test)

    # Save the trained model.
    torch.save(model, f"models/pyg/{model_name}.pt")
    print()

Using device: CPU
Training model: GCN
{'act': 'gelu', 'add_self_loops': False, 'bias': True, 'dropout': 0.07415535691161837, 'edge_aggr': 'sum', 'gcn_aggr': 'sum', 'hidden_channels_log2': 7, 'jk': None, 'num_layers': 3, 'weight_decay': 0.00015514462183085446, 'in_channels': -1, 'out_channels': 1, 'act_first': True, 'model_type': <class 'mscproject.models.GCN'>, 'v2': True}


Train loss: 0.7777: 100%|██████████| 220/220 [03:41<00:00,  1.01s/it]


loss: 0.771, acc: 0.913, prc: 0.205, rec: 0.860, f1: 0.331, auc: 0.868, aprc: 0.496



In [67]:
model_metrics

{'GCN': EvalMetrics(loss=0.7713417410850525, accuracy=0.9127776622772217, precision=0.204684317111969, recall=0.8601996898651123, f1=0.33068275451660156, auroc=0.8681554198265076, average_precision=0.4956749677658081)}

In [68]:
import dataclasses as dc

In [69]:
pd.DataFrame.from_dict(model_metrics, orient="index")

Unnamed: 0,loss,accuracy,precision,recall,f1,auroc,average_precision
GCN,0.771342,0.912778,0.204684,0.8602,0.330683,0.868155,0.495675
