<a href="https://colab.research.google.com/github/dp457/Graph-Neural-Network/blob/main/Creating_Graph_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Although PyG already contains a lot of useful datasets, you may wish to create your own dataset with self-recorded or non-publicly available data.

Implementing datasets yourself is straightforward, and you may want to take a look at the PyG source code to see how the various built-in datasets are implemented. Below is a brief introduction to what is needed to set up your own dataset.

In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 63.1/63.1 kB 1.7 MB/s eta 0:00:00
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 17.0 MB/s eta 0:00:00
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


# In-Memory Datasets

In [3]:
import os
import torch
from torch_geometric.data import InMemoryDataset, Data, download_url # InMemoryDataset - the aim is to fit the dataset entirely in memory
# Data - fundamental container for the single graph
# download_url - help fetch files from internet

class MyOwnDataset(InMemoryDataset):
    """Toy in-memory dataset made of two small, hand-built graphs.

    The graphs are constructed directly in ``process`` and written to a
    single processed file, which ``__init__`` then loads back into memory.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Set up the dataset rooted at ``root`` and load the processed file.

        Args:
            root: Directory where the dataset is stored.
            transform: Optional callable applied on the fly each time a graph
                is accessed.
            pre_transform: Optional callable applied to every graph once,
                in ``process``, before the graphs are saved.
            pre_filter: Optional predicate used in ``process`` to drop
                graphs before they are saved.
        """
        super().__init__(root, transform, pre_transform, pre_filter)
        # `load` is the PyG >= 2.4 API. On PyG < 2.4 use instead:
        #   self.data, self.slices = torch.load(self.processed_paths[0])
        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        """Names of the files expected to be present in ``raw_dir``."""
        return ['some_file_1.txt', 'some_file_2.txt']

    @property
    def processed_file_names(self):
        """Names of the files that ``process`` writes to ``processed_dir``."""
        return ['data.pt']

    def download(self):
        """Fetch the raw files; intentionally a no-op in this example.

        To download from a URL, something along these lines would go here:
            url = "https://example.com/data.txt"
            download_url(url, self.raw_dir)
        """
        pass

    def process(self):
        """Build the two example graphs, filter/transform them, and save."""
        # Graph 1: three nodes with one feature each, edges 0->1 and 1->2.
        # The edges are written as (source, target) pairs and transposed
        # into the [2, num_edges] layout.
        chain = Data(
            x=torch.tensor([[1], [2], [3]], dtype=torch.float),
            edge_index=torch.tensor(
                [[0, 1], [1, 2]], dtype=torch.long
            ).t().contiguous(),
            y=torch.tensor([0]),  # graph label
        )

        # Graph 2: four nodes; edge_index is already given in the
        # [2, num_edges] layout (edges 0->1, 1->0, 2->3, 3->2).
        pairs = Data(
            x=torch.tensor([[1], [0], [1], [0]], dtype=torch.float),
            edge_index=torch.tensor(
                [[0, 1, 2, 3], [1, 0, 3, 2]], dtype=torch.long
            ),
            y=torch.tensor([1]),  # graph label
        )

        graphs = [chain, pairs]

        # Drop the graphs rejected by the optional pre_filter.
        if self.pre_filter is not None:
            graphs = list(filter(self.pre_filter, graphs))

        # Run the optional pre_transform over the surviving graphs.
        if self.pre_transform is not None:
            graphs = list(map(self.pre_transform, graphs))

        # `save` is the PyG >= 2.4 API. On PyG < 2.4 use instead:
        #   torch.save(self.collate(graphs), self.processed_paths[0])
        self.save(graphs, self.processed_paths[0])


## Larger Dataset online

In [4]:
import os.path as osp

import torch
from torch_geometric.data import Dataset, download_url, Data


class MyOwnDataset(Dataset):
    """On-disk dataset holding a single graph read from ``karate.gml``.

    Unlike an in-memory dataset, each graph is stored in its own
    ``data_{idx}.pt`` file inside ``processed_dir`` and is read from disk
    every time ``get`` is called.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        """Set up the dataset rooted at ``root``.

        Args:
            root: Directory where the dataset is stored.
            transform: Optional callable forwarded to the base ``Dataset``.
            pre_transform: Optional callable applied to the graph in
                ``process`` before it is saved.
            pre_filter: Optional predicate; if it rejects the graph,
                ``process`` saves nothing.
        """
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        """Raw files PyG checks for in ``raw_dir`` after download."""
        return ['karate.gml']

    @property
    def processed_file_names(self):
        """Processed files that ``process`` is expected to write."""
        return ['data_0.pt']

    def download(self):
        """Download Zachary's Karate Club graph (GML format) into ``raw_dir``."""
        # NOTE(review): confirm this URL still resolves before relying on it.
        url = 'https://raw.githubusercontent.com/networkx/networkx/main/networkx/readwrite/gml/tests/fixtures/karate.gml'
        download_url(url, self.raw_dir)

    def process(self):
        """Convert the raw GML file into a PyG ``Data`` object and save it."""
        # Imported here so networkx is only needed when processing runs.
        import networkx as nx
        from torch_geometric.utils import from_networkx

        # Use the paths derived from raw_file_names / processed_file_names
        # instead of repeating the file names as string literals.
        graph = nx.read_gml(self.raw_paths[0], label='id')  # read GML graph
        data = from_networkx(graph)  # convert to PyG Data object

        # Placeholder features and labels: one-hot (identity) node features
        # and an all-zero label per node.
        data.x = torch.eye(data.num_nodes)
        data.y = torch.zeros(data.num_nodes, dtype=torch.long)

        # NOTE: if the pre_filter rejects the graph nothing is written, yet
        # `len()` still reports one graph, so `get(0)` would then fail.
        if self.pre_filter is not None and not self.pre_filter(data):
            return

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        torch.save(data, self.processed_paths[0])

    def len(self):
        """Return the number of graphs (one per processed file name)."""
        return len(self.processed_file_names)

    def get(self, idx):
        """Load and return the graph stored in ``data_{idx}.pt``.

        Args:
            idx: Index of the graph to load.

        Returns:
            The object saved for that index by ``process``.
        """
        path = osp.join(self.processed_dir, f'data_{idx}.pt')
        # PyTorch >= 2.6 defaults to weights_only=True, which refuses to
        # unpickle full Python objects such as a PyG `Data`. The file is
        # produced by `process()` above, so a full unpickle is acceptable.
        return torch.load(path, weights_only=False)
