### Import Required Libraries

In [1]:
from pathlib import Path

import torch
import torch_geometric
# Report the library / CUDA versions this run uses, so the recorded output
# documents the environment the results were produced in.
for _label, _version in (
    ('Torch', torch.__version__),
    ("PyTorch Geometric Version:", torch_geometric.__version__),
    ("CUDA Version:", torch.version.cuda),
):
    print(_label, _version)

# Release cached GPU memory held by PyTorch's allocator before starting.
torch.cuda.empty_cache()

from src import config_jupyter as config # import config_attack for attack scenarios
from src import sage_experiment, loader

from src.features import feature_labels
from src.score import roc, prec_recall, score
from utils import gpu_util

### Sample Data
Sample nodes and edges data can be accessed from the following link:
[Sample Data - Google Drive](https://drive.google.com/drive/folders/1rVDY26SO9xUp4DDZLQ7DuhSJ3B9r0wGQ?usp=sharing)

- Nodes File: The file containing nodes data (fqdn_apex_nodes.csv).
- Edges File: The file containing edges data (fqdn_apex_edges.csv).
- Model File: The path where the trained model will be saved (sage.pkl).

In [2]:
# Input graph files, given relative to the notebook's working directory.
# NOTE(review): the "Sample Data" notes name fqdn_apex_nodes.csv and sage.pkl,
# while the paths below use fqdn_apex_nodes2.csv and sample_sage.pkl —
# confirm which names are intended.
nodes_file, edges_file = (
    'data/fqdn_apex_nodes2.csv',
    'data/fqdn_apex_edges.csv',
)

# Destination file for the trained model.
model_file = 'models/sample_sage.pkl'

### Set Experiment Arguments
You can configure the number of layers, select the GPU ID, and override other parameters, as shown in the example below:

In [3]:
# Parse the base configuration, then apply this run's overrides in one place.
args = config.parse()

overrides = {
    'num_layers': 3,
    'dim': 256,
    'epoch': 100,
    # GPU chosen by the project helper; presumably the device with the least
    # memory in use — see utils.gpu_util to confirm.
    'gpu_id': gpu_util.pick_gpu_lowest_memory(),
}
for _name, _value in overrides.items():
    setattr(args, _name, _value)

In [4]:
# Display the effective configuration (parsed defaults plus the overrides set
# above) so the settings used for this run are recorded in the output.
args

Namespace(model_type='sage', epoch=100, num_layers=3, num_features=62, dim=256, fanout=[-1, 25, 10], outer_batch_size=500, inner_batch_size=40, train_percentage=0.8, seed=42, identifier='None', experiment_id=0, gpu_id=3, edge_type=[0, 1, 2], extra='s1', lstm=False, weighted=False, hetero=False, use_syn=False, syn_file=None, syn_labels=None, balance_labels=False, labelfeature_names=['feat_label_ben', 'feat_label_mal', 'feat_label_unknown'], label_source=['popular', 'alexa', 'tranco', 'umbrella', 'edugov', 'benign4', 'vt_seed5_3months', 'vt_active5_3months', 'vt_active5_12months', 'vt_seed5_12months', 'vt_mixed'], popularity_lists=['alexa', 'tranco', 'umbrella', 'crux'], lr=0.01, weight_decay=0.0005, model_file=None, balance_label_source='benign4', nodes_file=None, edges_file=None, epsilon=0.8)

### Load Data and Train Model
Use the loader to load the nodes and edges data.
You can initiate the training process using the SAGE_Experiment class.
Use loader_hetero and sage_hetero for running experiments with heterogeneous graphs.

In [5]:
# Load the nodes/edges files through the project loader and set up the
# experiment on the resulting graph data.
data_loader = loader.Loader(nodes_file, edges_file, feature_labels, args)
experiment = sage_experiment.SAGE_Experiment(data_loader.data, args)

# Create the output directory *before* training: torch.save does not create
# missing parent directories, and failing at the save step would throw away
# the whole (~30 minute) training run.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)

# Train, then persist only the learned parameters (state_dict), not the
# full model object.
model = experiment.train()
torch.save(model.state_dict(), model_file)

Label == 0: 12152
Label == 1: 10494
Train tensor(18116) Test tensor(4530)
Data converted to undirected: True
Data(x=[932721, 62], edge_index=[2, 5173716], y=[932721], num_nodes=932721, n_id=[932721], train_mask=[932721], test_mask=[932721], validation_mask=[932721], domain_mask=[932721], popular_ip_mask=[932721], edge_weight=[5173716], edge_type=[5173716])


Epoch: 090, Loss 0.0151, Train: 0.9829, Val: 0.9760: 100%|██████████| 100/100 [30:44<00:00, 18.44s/it]
