### Import Required Libraries

In [1]:
# PyTorch and PyTorch Geometric are the two libraries whose versions are reported below.
import torch
import torch_geometric
# Print the library and CUDA versions so the run environment is recorded in the cell output.
print('Torch', torch.__version__)
print("PyTorch Geometric Version:", torch_geometric.__version__)
print("CUDA Version:", torch.version.cuda)
# Ask PyTorch to drop its cached GPU memory before the experiment starts.
torch.cuda.empty_cache()

from src import config_jupyter as config # import config/config_attack for script/attack scenarios
from src import sage_hetero, loader_hetero

from src.features import hetero_features
from src.score import roc, prec_recall, score
from utils import gpu_util

### Sample Data
Sample nodes and edges data can be accessed from the following link:
[Sample Data - Google Drive](https://drive.google.com/drive/folders/1rVDY26SO9xUp4DDZLQ7DuhSJ3B9r0wGQ?usp=sharing)

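- Nodes File: The CSV file containing the node data (e.g. `fqdn_apex_nodes.csv`; the cell below loads `data/fqdn_apex_nodes_hetero.csv`).
- Edges File: The CSV file containing the edge data (e.g. `fqdn_apex_edges.csv`; the cell below loads `data/fqdn_apex_edges_hetero.csv`).
- Model File: The path where the trained model will be saved (e.g. `sage.pkl`; the cell below uses `models/sample_sage_hetero.pkl`).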

In [2]:
# Input graph CSVs, given as paths relative to the working directory.
nodes_file, edges_file = (
    'data/fqdn_apex_nodes_hetero.csv',
    'data/fqdn_apex_edges_hetero.csv',
)

# Destination path for the trained model's saved weights.
model_file = 'models/sample_sage_hetero.pkl'

### Set Experiment Arguments
You can configure the number of layers, select the GPU ID, and change other parameters, as shown in the example below:

In [4]:
# Build the argument object through the project's config module.
args = config.parse()
# Override selected settings for this run.
args.hetero = True
args.num_layers = 3
args.dim = 256
# NOTE(review): the outputs saved in this notebook show epoch=100 and a 100-step
# progress bar, so they appear to predate this value -- re-run to refresh them.
args.epoch = 800
# GPU selection is delegated to the project's helper
# (judging by its name, it picks the GPU with the least memory in use -- TODO confirm).
args.gpu_id = gpu_util.pick_gpu_lowest_memory()

# Bare expression: display the resulting argument set as the cell output.
args

Namespace(model_type='sage', epoch=100, num_layers=3, num_features=62, dim=256, fanout=[-1, 25, 10], outer_batch_size=500, inner_batch_size=40, train_percentage=0.8, seed=42, identifier='None', experiment_id=0, gpu_id=6, edge_type=[0, 1, 2], extra='s1', lstm=False, weighted=False, hetero=True, use_syn=False, syn_file=None, syn_labels=None, balance_labels=False, labelfeature_names=['feat_label_ben', 'feat_label_mal', 'feat_label_unknown'], label_source=['popular', 'alexa', 'tranco', 'umbrella', 'edugov', 'benign4', 'vt_seed5_3months', 'vt_active5_3months', 'vt_active5_12months', 'vt_seed5_12months', 'vt_mixed'], popularity_lists=['alexa', 'tranco', 'umbrella', 'crux'], lr=0.01, weight_decay=0.0005, model_file=None, balance_label_source='benign4', nodes_file=None, edges_file=None, epsilon=0.8)

### Load Data and Train Model
Use the loader to load the node and edge data.
You can start the training process with the SAGE_Experiment class.
For heterogeneous graphs, as in the cell below, use `loader_hetero.HeteroLoader` and `sage_hetero.Sage_Hetero`.

In [5]:
from pathlib import Path

# Load the node/edge CSVs through the heterogeneous loader and hand the
# resulting graph data to the heterogeneous SAGE experiment.
data_loader = loader_hetero.HeteroLoader(nodes_file, edges_file, hetero_features, args)
experiment = sage_hetero.Sage_Hetero(data_loader.data, args)

# Create the output directory *before* the long training run: torch.save does
# not create missing parent directories, and failing after training would
# throw the trained model away.
Path(model_file).parent.mkdir(parents=True, exist_ok=True)

model = experiment.train()
# Persist only the weights (state_dict), not the whole model object.
torch.save(model.state_dict(), model_file)

Label == 0: 12152
Label == 1: 10494
Train tensor(18116) Test tensor(4530)
Data converted to undirected: True


Epoch: 090, Loss: 0.1605, Train: 0.9742, Test: 0.9661: 100%|██████████| 100/100 [05:29<00:00,  3.29s/it]


  from .autonotebook import tqdm as notebook_tqdm


Torch 1.10.0
PyTorch Geometric Version: 2.0.4
CUDA Version: 10.2


In [None]:
import pickle as pc

from torch_geometric.loader import NeighborLoader

# Restore a previously pickled experiment object.
# NOTE(security): pickle.load can execute arbitrary code -- only open files you trust.
# ('data/experiment.pkl' is the alternative snapshot used elsewhere in this notebook.)
with open('data/experiment_condaexplainer.pkl', 'rb') as experiment_file:
    experiment = pc.load(experiment_file)

# Validation split of the 'domain' node type: the mask seeds the loader below;
# the index tensor is kept available as a notebook-level variable.
mask = experiment.data['domain'].validation_mask
input_nodes = experiment.data['domain'].val_index_tensor

# Neighbour-sampling loader seeded from the validation 'domain' nodes:
# 2 hops, up to 30 neighbours per hop for every edge type.
# NOTE(review): `weight_attr` / `subgraph_type` appear to exist only in PyG
# releases newer than the 2.0.4 recorded earlier in this notebook -- confirm
# the kernel's torch_geometric version.
val_loader = NeighborLoader(
    experiment.data,
    num_neighbors={key: [30] * 2 for key in experiment.data.edge_types},
    weight_attr=None,
    batch_size=128,
    input_nodes=('domain', mask),
    subgraph_type='induced',
)
# Bare expression so the loader is still displayed as the cell output.
val_loader

NameError: name 'experiment' is not defined

In [2]:
import pickle as pc

from torch_geometric.loader import NeighborLoader

# Restore the pickled experiment; the `with` block closes the file handle
# even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code -- only open files you trust.
with open('data/experiment.pkl', 'rb') as experiment_file:
    experiment = pc.load(experiment_file)

# Bare expression: display the edge types of the loaded graph data.
experiment.data.edge_types

[('domain', 'to', 'ipp'),
 ('ipp', 'to', 'subnet'),
 ('domain', 'fqdnapex', 'domain'),
 ('domain', 'similar_apex', 'domain'),
 ('domain', 'similar_all', 'domain'),
 ('ipp', 'rev_to', 'domain'),
 ('subnet', 'rev_to', 'ipp')]