In [1]:
import model_helper as mh
import torch
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
import time
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support
import json
from datetime import datetime

2.0.1+cu118


In [2]:
def rf_gridsearch(ds):
    """Grid-search a random forest and score the best estimator on the test split.

    The rows selected by the train and validation masks are merged into a
    single fitting set, because ``GridSearchCV`` runs its own 3-fold
    cross-validation on whatever it is given.

    Args:
        ds: object exposing ``ds.data.x`` / ``ds.data.y`` tensors together with
            ``train_mask``, ``val_mask`` and ``test_mask`` tensors
            (presumably an ``mh.Dataset`` -- confirm against callers).

    Returns:
        dict with the test F1, AUC, precision and recall of the best estimator
        followed by the hyper-parameters that were selected.
    """
    # 'auto' is intentionally absent from max_features: for
    # RandomForestClassifier it was only an alias of 'sqrt' (so it doubled the
    # search for no gain) and scikit-learn >= 1.3 rejects it outright.
    param_grid = {'bootstrap': [True, False],
        'max_depth': [10, 30, 50, 70, 90, None],
        'max_features': ['sqrt'],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        'n_estimators': [20, 50, 100]}

    # Build the combined train+validation mask once and reuse it for x and y.
    fit_mask = ds.data.train_mask + ds.data.val_mask
    trainX = ds.data.x[fit_mask].detach().cpu().numpy()
    trainY = ds.data.y[fit_mask].detach().cpu().numpy()

    testX = ds.data.x[ds.data.test_mask].detach().cpu().numpy()
    testY = ds.data.y[ds.data.test_mask].detach().cpu().numpy()

    grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(trainX, trainY)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    test_pred = best_model.predict(testX)
    # Column 1 holds the probability of the positive class.
    y_pred_prob = best_model.predict_proba(testX)[:, 1]

    precision, recall, f1_score, _ = precision_recall_fscore_support(testY, test_pred, average='binary')
    fpr, tpr, _ = roc_curve(testY, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    results = {
        'test_f1': f1_score,
        'test_auc': roc_auc,
        'test_precision': precision,
        'test_recall': recall,
    }
    # Report the winning hyper-parameters next to the metrics.
    for name in ('bootstrap', 'max_depth', 'n_estimators', 'max_features',
                 'min_samples_leaf', 'min_samples_split'):
        results[name] = best_params[name]
    return results

In [3]:
def xg_gridsearch(ds):
    """Tune an XGBoost classifier by 3-fold grid search and evaluate it on the test split.

    Args:
        ds: object exposing ``ds.data.x`` / ``ds.data.y`` tensors together with
            ``train_mask``, ``val_mask`` and ``test_mask`` tensors.

    Returns:
        dict with the test F1, AUC, precision and recall of the best estimator
        plus the selected ``max_depth``, ``n_estimators`` and ``gamma``.
    """
    def as_numpy(tensor):
        # Detach from autograd and move to host memory before converting.
        return tensor.detach().cpu().numpy()

    graph = ds.data
    fit_mask = graph.train_mask + graph.val_mask
    trainX, trainY = as_numpy(graph.x[fit_mask]), as_numpy(graph.y[fit_mask])
    testX, testY = as_numpy(graph.x[graph.test_mask]), as_numpy(graph.y[graph.test_mask])

    search_space = {
        "gamma": [0,0.2,0.4,0.6],
        'max_depth': [10, 30, 50, 70, 90, None],
        'n_estimators': [20, 50, 100]}

    searcher = GridSearchCV(
        estimator=xgb.XGBClassifier(objective="binary:logistic", random_state=42),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
    )
    searcher.fit(trainX, trainY)
    chosen = searcher.best_params_
    winner = searcher.best_estimator_

    # Hard label predictions; the printed value is the number of predicted positives.
    test_pred = np.round(winner.predict(testX))
    print(np.sum(test_pred))

    # Probability of the positive class drives the ROC curve.
    positive_prob = winner.predict_proba(testX)[:, 1]

    precision, recall, f1_score, _ = precision_recall_fscore_support(testY, test_pred, average='binary')
    fpr, tpr, _ = roc_curve(testY, positive_prob)

    return {
        'test_f1': f1_score,
        'test_auc': auc(fpr, tpr),
        'test_precision': precision,
        'test_recall': recall,
        'max_depth': chosen['max_depth'],
        'n_estimators': chosen['n_estimators'],
        'gamma': chosen['gamma'],
    }

In [2]:
def gnn_gridsearch(ds_split_seeds, dataset_folder, method, gnn_params, ds_name, results_path, gs_results):
    """Grid-search a GNN over several dataset split seeds and keep the best configuration.

    Every hyper-parameter combination is trained once per split seed; the
    combination with the highest mean ``test_f1`` across seeds wins. After each
    combination the current best is copied into ``gs_results[ds_name][method]``
    and the whole ``gs_results`` dict is rewritten to ``results_path``, so an
    interrupted search keeps its progress.

    Args:
        ds_split_seeds: iterable of seeds passed as ``split_seed`` when loading
            the dataset.
        dataset_folder: path prefix; the dataset is loaded from
            ``dataset_folder + ds_name``.
        method: model name. ``"GINSAGE"`` searches ``K1``/``K2``/``F1``/``F2``;
            any other name searches ``K``/``F``.
        gnn_params: dict mapping hyper-parameter names to lists of candidates.
            ``'lr'`` and ``'w'`` are always read, plus the architecture keys
            for ``method``.
        ds_name: dataset name, also the first-level key in ``gs_results``.
        results_path: JSON file that receives ``gs_results``.
        gs_results: nested results dict, mutated in place.

    Returns:
        dict describing the best configuration (per-seed metric lists, their
        mean F1 and the hyper-parameters). If no combination reaches a mean F1
        above 0, the all-``None`` placeholder is returned and nothing is
        persisted.
    """
    from itertools import product

    epochs = 5000
    is_ginsage = method == "GINSAGE"
    arch_keys = ('K1', 'K2', 'F1', 'F2') if is_ginsage else ('K', 'F')

    # product() walks the candidates in the same order as nested loops would:
    # lr outermost, then w, then the architecture keys.
    grid = list(product(gnn_params['lr'], gnn_params['w'],
                        *(gnn_params[key] for key in arch_keys)))
    total_runs = len(grid) * len(ds_split_seeds)

    results = {
        'test_f1':None,
        'test_auc':None,
        'test_precision':None,
        'test_recall':None,
        'lr':None,
        'w':None,
        'K':None,
        'F':None,
        'K1':None,
        'K2':None,
        'F1':None,
        'F2':None,
        'epoch':None
    }
    best_average_test_f1 = 0
    found_best = False
    count = 1

    for lr, w, *arch_values in grid:
        arch = dict(zip(arch_keys, arch_values))
        # Per-seed scores of this configuration, keyed as they are reported.
        scores = {'test_f1': [], 'test_auc': [], 'test_precision': [],
                  'test_recall': [], 'epoch': []}

        for seed in ds_split_seeds:
            print(method, count, 'out of', total_runs)

            ds = mh.Dataset()
            ds.load_dataset(folder=dataset_folder+ds_name,splits=[0.5,0.2,0.3],split_type="normal",split_seed=seed)
            if is_ginsage:
                # Column positions of the two degree features handed to the model.
                gin_feature_indices = [ds.feature_labels.index(item) for item in ['node_deg_out_unique', 'node_deg_in_unique']]

            model = mh.Model(ds.data, gridsearch_flag=True)
            model.w = w
            model.lr = lr
            if is_ginsage:
                additional_params = {**arch, 'gin_feature_indices': gin_feature_indices}
                model.load_model("GINSAGE",additional_params=additional_params)
            else:
                model.load_model(method,K=arch['K'],F=arch['F'])
            model.train_model(epochs=epochs)

            run = model.gridsearch_results
            scores['test_f1'].append(run['test_f1'])
            scores['test_auc'].append(run['test_auc'])
            scores['test_precision'].append(run['test_precision'])
            scores['test_recall'].append(run['test_recall'])
            scores['epoch'].append(run['test_epoch'])
            count+=1

        mean_f1 = np.mean(scores['test_f1'])
        if mean_f1 > best_average_test_f1:
            best_average_test_f1 = mean_f1
            found_best = True
            results = {
                'mean_f1': mean_f1,
                **scores,
                'lr': lr,
                'w': w,
                'K': None,
                'F': None,
                'K1': None,
                'K2': None,
                'F1': None,
                'F2': None,
            }
            # Fill in the architecture keys that apply to this method.
            results.update(arch)

        # Checkpoint after every configuration, but never overwrite previously
        # stored results with the all-None placeholder.
        if found_best:
            method_results = gs_results.setdefault(ds_name, {}).setdefault(method, {})
            for ele in results:
                method_results[ele] = results[ele]
                print(ele,results[ele])
            with open(results_path, 'w') as file:
                json.dump(gs_results, file, indent=2)

    return results

In [3]:
# Hyper-parameter candidates for the GNN grid search. Only this single
# configuration is active: the wider grids that used to precede it were
# overwritten before ever being read, so they have been removed.
gnn_params = {
    'lr':[0.005],
    'w':[.96],
    'K':[3],
    'F':[32],
    'K1':[3],
    'K2':[5],
    'F1':[8],
    'F2':[64]
}

# Independent snapshot of the grid: the dict and its value lists are copied, so
# later edits to gnn_params do not leak into it (plain assignment only aliased).
gnn_params_copy = {key: list(values) for key, values in gnn_params.items()}

# Candidate values for 'w' -- presumably one per dataset; verify against the
# driver loop before relying on the ordering.
weights_temp = [.44, .26, .79, .96]

In [4]:
def rf_toposearch(ds,param_grid,folder):
    """Fit a random forest through ``GridSearchCV`` and return its test F1.

    Args:
        ds: object exposing ``ds.data.x`` / ``ds.data.y`` tensors together with
            ``train_mask``, ``val_mask`` and ``test_mask`` tensors.
        param_grid: grid handed to ``GridSearchCV``, for example::

                {'bootstrap': [True],
                 'max_depth': [70],
                 'max_features': ['sqrt'],
                 'min_samples_leaf': [4],
                 'min_samples_split': [5],
                 'n_estimators': [50]}

        folder: not used by this function; kept so existing calls keep working.

    Returns:
        dict with the single key ``"f1"``: the binary F1 score of the best
        estimator on the test split.
    """
    # Train and validation rows are fitted together; GridSearchCV does its own
    # 3-fold cross-validation.
    fit_mask = ds.data.train_mask + ds.data.val_mask
    trainX = ds.data.x[fit_mask].detach().cpu().numpy()
    trainY = ds.data.y[fit_mask].detach().cpu().numpy()

    testX = ds.data.x[ds.data.test_mask].detach().cpu().numpy()
    testY = ds.data.y[ds.data.test_mask].detach().cpu().numpy()

    grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=3, n_jobs=-1)
    grid_search.fit(trainX, trainY)

    test_pred = grid_search.best_estimator_.predict(testX)

    # Only F1 is reported, so the probability / ROC / AUC computation that used
    # to run here (and was then discarded) has been dropped.
    _, _, f1_score, _ = precision_recall_fscore_support(testY, test_pred, average='binary')

    return {"f1": f1_score}

def xg_toposearch(ds,params,folder):
    """Fit an XGBoost classifier through ``GridSearchCV`` and return its test F1.

    Args:
        ds: object exposing ``ds.data.x`` / ``ds.data.y`` tensors together with
            ``train_mask``, ``val_mask`` and ``test_mask`` tensors.
        params: grid handed to ``GridSearchCV``, for example
            ``{"gamma": [0], 'max_depth': [30], 'n_estimators': [100]}``.
        folder: not used by this function; kept so existing calls keep working.

    Returns:
        dict with the single key ``"f1"``: the binary F1 score of the best
        estimator on the test split.
    """
    # Train and validation rows are fitted together; GridSearchCV does its own
    # 3-fold cross-validation.
    fit_mask = ds.data.train_mask + ds.data.val_mask
    trainX = ds.data.x[fit_mask].detach().cpu().numpy()
    trainY = ds.data.y[fit_mask].detach().cpu().numpy()

    testX = ds.data.x[ds.data.test_mask].detach().cpu().numpy()
    testY = ds.data.y[ds.data.test_mask].detach().cpu().numpy()

    clf = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
    grid_search = GridSearchCV(estimator=clf, param_grid=params, cv=3, n_jobs=-1)
    grid_search.fit(trainX, trainY)

    test_pred = grid_search.best_estimator_.predict(testX)
    # Rounding is kept as a safeguard in case predict() yields non-integer values.
    test_pred = np.round(test_pred)
    # Progress output: sum of the predicted labels on the test split.
    print(np.sum(test_pred))

    # Only F1 is reported, so the probability / ROC / AUC computation that used
    # to run here (and was then discarded) has been dropped.
    _, _, f1_score, _ = precision_recall_fscore_support(testY, test_pred, average='binary')

    return {"f1": f1_score}

In [9]:
# Driver cell: for every dataset and every method, re-evaluate the
# hyper-parameters already stored in the results file over several split seeds
# and write the scores back to that file.

dataset_folder = 'v2/'
datasets = ['8K_5_v2','16K_5_v2','32K_5_v2','64K_5_v2','128K_5_v2_2','128K_10_v2_2','128K_1_v2_2','128K_05_v2_2']
methods = ['GINSAGE','GIN','SAGE','XG','RF']
split_seeds = [0,111,222,333,444,555,666,777,888,999]

# Methods routed through gnn_gridsearch and the hyper-parameters it searches.
gnn_methods = ['GIN','SAGE','MPNN','GCN','GAT','GINSAGE']
gnn_param_keys = ['lr','w','K','F','K1','K2','F1','F2']
# Tree-based methods: name -> (stored hyper-parameter names, evaluation function).
tree_searches = {
    'XG': (['gamma','max_depth','n_estimators'], xg_toposearch),
    'RF': (['bootstrap','max_depth','max_features','min_samples_leaf','min_samples_split','n_estimators'], rf_toposearch),
}

# Timestamp tag, available for writing run-specific result files.
now = datetime.now()
current_date = now.strftime("%Y-%m-%d")
current_time = now.strftime("%H-%M-%S")
result_string = f"{current_date}_{current_time}"

results_path = 'gridsearch_results.json'

try:
    # Continue from an existing results file when there is one.
    with open(results_path, 'r') as file:
        gs_results = json.load(file)
        print("Existing JSON gridsearch results file loaded.")
except FileNotFoundError:
    # Otherwise start empty and create the file right away.
    gs_results = {}
    print("JSON gridsearch results file not found. Creating a new one.")
    with open(results_path, 'w') as file:
        json.dump(gs_results, file, indent=2)

for ds_name in datasets:
    # The evaluations below reload the data for each split seed; this initial
    # load is kept because it logs the dataset summary and leaves `ds` bound
    # for inspection afterwards.
    ds = mh.Dataset()
    ds.load_dataset(folder=dataset_folder+ds_name,splits=[0.5,0.2,0.3],split_type="normal")
    print('Running Dataset',ds_name)
    dataset_results = gs_results.setdefault(ds_name, {})

    for method in methods:
        print('    Running method',method)

        if method not in gnn_methods and method not in tree_searches:
            print('method',method,'does not exist!')
            continue

        # Without stored hyper-parameters there is nothing to re-evaluate; say
        # so explicitly instead of failing with a KeyError (e.g. on a fresh file).
        stored = dataset_results.get(method)
        if not stored:
            print('    No stored hyper-parameters for',method,'on',ds_name,'- skipping.')
            continue

        if method in gnn_methods:
            # Single-candidate grid built from the stored best configuration.
            gnn_params = {key: [stored[key]] for key in gnn_param_keys}
            results = gnn_gridsearch(split_seeds, dataset_folder, method, gnn_params, ds_name, results_path, gs_results)
            if results.get('mean_f1') is None:
                # gnn_gridsearch hands back an all-None placeholder when no run
                # scored; do not overwrite the stored values with it.
                print('    No scoring run for',method,'on',ds_name,'- stored results left unchanged.')
                continue
        else:
            param_names, evaluate = tree_searches[method]
            params = {name: [stored[name]] for name in param_names}
            f1s = []
            for seed in split_seeds:
                print("seed",seed)
                ds = mh.Dataset()
                ds.load_dataset(folder=dataset_folder+ds_name,splits=[0.5,0.2,0.3],split_type="normal",split_seed=seed)
                f1s.append(evaluate(ds, params, dataset_folder+ds_name)["f1"])
            results = {"f1_scores": f1s, "mean_f1": np.mean(f1s)}

        for ele in results:
            stored[ele] = results[ele]
            print(ele,results[ele])

        # Persist after every method so an interrupted run keeps its progress.
        with open(results_path, 'w') as file:
            json.dump(gs_results, file, indent=2)








Existing JSON gridsearch results file loaded.
device: cuda:0
loading dataset v2/8K_5_v2 | length: 8032 | fraud percentage (%): 4.38
Running Dataset 8K_5_v2
    Running method GINSAGE
device: cuda:0
GINSAGE 1 out of 10
device: cuda:0
loading dataset v2/8K_5_v2 | length: 8032 | fraud percentage (%): 4.38
device: cuda:0
{'K1': 2, 'K2': 6, 'F1': 8, 'F2': 64, 'gin_feature_indices': [19, 11]}
model training starting...


In [15]:
# Shape of the feature tensor `x` of the dataset currently bound to `ds`.
ds.data.x.shape

torch.Size([128002, 27])

In [13]:
# Display whatever is currently bound to `results` (set by an earlier cell run).
results

{'mean_f1': 0.8276669931216768,
 'test_f1': [0.828786999419617,
  0.8412880966072455,
  0.8145281333730277,
  0.8248242127789668,
  0.828294687408277,
  0.8342089900758903,
  0.8217378321887131],
 'test_auc': [0.8678542016626297,
  0.8745943554443908,
  0.8545909685806059,
  0.864160909867488,
  0.8751723765077881,
  0.8707201902743981,
  0.8611052905544044],
 'test_precision': [0.9450694904037061,
  0.9562091503267974,
  0.9533101045296167,
  0.9473314606741573,
  0.9192182410423453,
  0.9501329787234043,
  0.9496204278812974],
 'test_recall': [0.737984496124031,
  0.7510266940451745,
  0.7110187110187111,
  0.7303735787763942,
  0.7537393162393162,
  0.7434963579604579,
  0.7242105263157895],
 'epoch': [4810, 4510, 1870, 2900, 4400, 4960, 4940],
 'lr': 0.005,
 'w': 0.44,
 'K': None,
 'F': None,
 'K1': 2,
 'K2': 5,
 'F1': 8,
 'F2': 32}

In [34]:
# NOTE(review): relies on `model` and `ds1` already existing in the kernel; the
# recorded output is a NameError for `ds1`, so this cell does not run top to bottom.
model.check_topology_performance(folder=ds1.folder, data=ds1.data)

NameError: name 'ds1' is not defined

In [65]:
# Display whatever is currently bound to `results` (set by an earlier cell run).
results

{'mean_f1': 0.8294841855240406,
 'test_f1': [0.8183807439824947, 0.8624708624708625, 0.8076009501187648],
 'test_auc': [0.8690524190029358, 0.8820119623429218, 0.8543327460911694],
 'test_precision': [0.9121951219512195,
  0.9893048128342246,
  0.9340659340659341],
 'test_recall': [0.7420634920634921, 0.7644628099173554, 0.7112970711297071],
 'epoch': [1040, 1220, 4850],
 'lr': 0.01,
 'w': 0.44,
 'K': None,
 'F': None,
 'K1': 3,
 'K2': 5,
 'F1': 8,
 'F2': 32}

In [17]:
# Load the 128K_05_v2_2 dataset with a 0.5/0.2/0.3 split (no split_seed given)
# and build a Model on it; relies on `dataset_folder` from the driver cell.
ds = mh.Dataset()
ds.load_dataset(folder=dataset_folder+'128K_05_v2_2',splits=[0.5,0.2,0.3],split_type="normal")
    
# `method` is only assigned here; this cell does not call load_model.
method = 'GINSAGE'
model = mh.Model(ds.data, gridsearch_flag=True)


device: cuda:0
loading dataset v2/128K_05_v2_2 | length: 128002 | fraud percentage (%): 0.49
device: cuda:0


In [53]:
# Show the installed torch package metadata (version, location, dependencies).
!pip show torch

Name: torch
Version: 2.0.1+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: d:\anaconda\envs\anaconda\envs\gdl2\lib\site-packages
Requires: filelock, jinja2, networkx, sympy, typing-extensions
Required-by: tensordict, torchaudio, torchrl, torchvision


In [3]:
def _load_16k_split(seed):
    """Create a Dataset and load 16K_5_v2 with a 0.5/0.2/0.3 split for `seed`."""
    loaded = mh.Dataset()
    loaded.load_dataset(folder='v2/'+'16K_5_v2',splits=[0.5,0.2,0.3],split_type="normal",split_seed=seed)
    return loaded

# ds1 and ds2 use the same seed; ds3 uses a different one for comparison.
ds1 = _load_16k_split(0)
ds2 = _load_16k_split(0)
ds3 = _load_16k_split(294)

device: cuda:0
loading dataset v2/16K_5_v2 | length: 15995 | fraud percentage (%): 5.0
device: cuda:0
loading dataset v2/16K_5_v2 | length: 15995 | fraud percentage (%): 5.0
device: cuda:0
loading dataset v2/16K_5_v2 | length: 15995 | fraud percentage (%): 5.0


In [20]:
# Number of positions where the test-split labels of ds1 and ds3 differ.
# Tensor.sum() reduces on the tensor's own device; the Python builtin sum()
# would iterate over the tensor one element at a time.
(ds1.data.y[ds1.data.test_mask] != ds3.data.y[ds3.data.test_mask]).sum()

tensor(440, device='cuda:0')

In [51]:


# Rebind ds1 to the same dataset loaded with split_seed=6 (replaces the earlier ds1).
ds1 = mh.Dataset()
ds1.load_dataset(folder='v2/'+'16K_5_v2',splits=[0.5,0.2,0.3],split_type="normal",split_seed=6)



device: cuda:0
loading dataset v2/16K_5_v2 | length: 15995 | fraud percentage (%): 5.0


In [52]:
# For the train, validation and test splits of ds1, print the label sum divided
# by the split size, followed by the label sum itself (presumably the share and
# number of positive samples -- confirm the label encoding).
for split_mask in (ds1.data.train_mask, ds1.data.val_mask, ds1.data.test_mask):
    # Tensor.sum() avoids the element-by-element Python sum(), and the label
    # sum is computed once instead of twice per split.
    label_sum = ds1.data.y[split_mask].sum()
    print(label_sum/split_mask.sum(), label_sum)

tensor(0.0525, device='cuda:0') tensor(420, device='cuda:0')
tensor(0.0416, device='cuda:0') tensor(133, device='cuda:0')
tensor(0.0515, device='cuda:0') tensor(247, device='cuda:0')
