# Imports

In [28]:
import os 

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

data_dir = '../data'
dataset = ''

# Investigate example: Reddit

In [None]:
import os
import os.path as osp
from typing import Callable, List, Optional

import numpy as np
import scipy.sparse as sp
import torch

from torch_geometric.data import (
    Data,
    InMemoryDataset,
    download_url,
    extract_zip,
)
from torch_geometric.utils import coalesce


class Reddit(InMemoryDataset):
    r"""The Reddit dataset from the `"Inductive Representation Learning on
    Large Graphs" <https://arxiv.org/abs/1706.02216>`_ paper, containing
    Reddit posts belonging to different communities.

    Args:
        root (str): Root directory where the dataset should be saved.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        force_reload (bool, optional): Whether to re-process the dataset.
            (default: :obj:`False`)

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10
        :header-rows: 1

        * - #nodes
          - #edges
          - #features
          - #classes
        * - 232,965
          - 114,615,892
          - 602
          - 41
    """

    url = 'https://data.dgl.ai/dataset/reddit.zip'

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        force_reload: bool = False,
    ) -> None:
        super().__init__(root, transform, pre_transform,
                         force_reload=force_reload)
        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self) -> List[str]:
        return ['reddit_data.npz', 'reddit_graph.npz']

    @property
    def processed_file_names(self) -> str:
        return 'data.pt'

    def download(self) -> None:
        path = download_url(self.url, self.raw_dir)
        extract_zip(path, self.raw_dir)
        os.unlink(path)

    def process(self) -> None:
        data = np.load(osp.join(self.raw_dir, 'reddit_data.npz'))
        x = torch.from_numpy(data['feature']).to(torch.float)
        y = torch.from_numpy(data['label']).to(torch.long)
        split = torch.from_numpy(data['node_types'])

        adj = sp.load_npz(osp.join(self.raw_dir, 'reddit_graph.npz'))
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)
        edge_index = coalesce(edge_index, num_nodes=x.size(0))

        data = Data(x=x, edge_index=edge_index, y=y)
        data.train_mask = split == 1
        data.val_mask = split == 2
        data.test_mask = split == 3

        data = data if self.pre_transform is None else self.pre_transform(data)

        self.save([data], self.processed_paths[0])

In [None]:
data_dir = '../data'
dataset_name = 'reddit'

In [None]:
data = np.load(osp.join(data_dir, dataset_name, 'reddit_data.npz'))
data # NpzFile '../data\\reddit\\reddit_data.npz' with keys: feature, node_types, node_ids, label
type(data) # numpy.lib.npyio.NpzFile

In [None]:
x = torch.from_numpy(data['feature']).to(torch.float)
x
# tensor([[ 1.2334,  9.0430, -0.9233,  ..., -0.2579,  0.3112, -0.3772],
#         [-0.1386, -0.2022,  0.1277,  ...,  0.1563,  0.1048, -0.6534],
#         [-0.1330, -0.1962, -0.0296,  ...,  0.0358,  0.2864,  0.2744],
#         ...,
#         [-0.0614, -0.2022,  0.9698,  ...,  1.1064, -1.4323, -0.2398],
#         [-0.1606, -0.2022, -0.0892,  ...,  0.7440, -0.5046, -2.2288],
#         [ 0.0929,  0.2822,  0.1768,  ...,  0.2196,  0.5967,  0.5588]])
x.shape
# torch.Size([232965, 602])

In [None]:
y = torch.from_numpy(data['label']).to(torch.long)
y # tensor([30, 17, 18,  ...,  3, 13, 13])
y.shape # torch.Size([232965])

In [None]:
split = torch.from_numpy(data['node_types'])
split

In [None]:
adj = sp.load_npz(osp.join(data_dir, dataset_name, 'reddit_graph.npz'))
adj
# <232965x232965 sparse matrix of type '<class 'numpy.int64'>'
# 	with 114615892 stored elements in COOrdinate format>

In [None]:
row = torch.from_numpy(adj.row).to(torch.long)
col = torch.from_numpy(adj.col).to(torch.long)
row # tensor([     0,      0,      0,  ..., 232920, 232931, 232952])
col # tensor([225202, 177307, 107546,  ..., 232897, 232907, 232910])

In [None]:
edge_index = torch.stack([row, col], dim=0)
edge_index
# tensor([[     0,      0,      0,  ..., 232920, 232931, 232952],
#         [225202, 177307, 107546,  ..., 232897, 232907, 232910]])
edge_index.shape
# torch.Size([2, 114615892])

In [None]:
edge_index = coalesce(edge_index, num_nodes=x.size(0))
edge_index
# tensor([[     0,      0,      0,  ..., 232964, 232964, 232964],
#         [   242,    249,    524,  ..., 231806, 232594, 232634]])
edge_index.shape

# Node classification

### Interaction Datasets

In [97]:
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import coalesce
from typing import Callable, List, Optional


class NodeClassificationInteractionDataset(InMemoryDataset):
    def __init__(
        self,
        root: str,
        node_file: str=None,
        edge_file: str=None, 
        processed_file: str=None,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        self.node_file = node_file
        self.edge_file = edge_file
        self.processed_file = processed_file
        
        super(NodeClassificationInteractionDataset, self).__init__(root, transform, pre_transform)

        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [self.node_file, self.edge_file]

    @property
    def processed_file_names(self):
        return [self.processed_file]

    def download(self):
        pass

    def process(self):
        labels_path = os.path.join(self.raw_dir, self.node_file)
        with open(labels_path, 'r') as f:
            labels = [list(map(int, line.strip().split())) for line in f]

        edgelist_path = os.path.join(self.raw_dir, self.edge_file)
        with open(edgelist_path, 'r') as f:
            edges = [
                [int(start_node), int(end_node), float(weight)]
                for line in f
                for start_node, end_node, weight in [map(str, line.strip().split())]
            ]

        max_num_labels = max(len(node_labels) for node_labels in labels)
        padded_labels = [node_labels + [0] * (max_num_labels - len(node_labels)) for node_labels in labels]
        x = torch.tensor(padded_labels, dtype=torch.float32)

        start_nodes = torch.tensor([edge[0] for edge in edges], dtype=torch.long)
        end_nodes = torch.tensor([edge[1] for edge in edges], dtype=torch.long)
        weights = torch.tensor([edge[2] for edge in edges], dtype=torch.float32)
        edge_index = torch.stack([start_nodes, end_nodes], dim=0)
        edge_attr = weights.view(-1, 1) 
        edge_index, edge_attr = coalesce(edge_index, edge_attr=edge_attr, num_nodes=x.size(0))

        y = None

        data = Data(
            x=x, 
            edge_index=edge_index, 
            edge_attr=edge_attr,
            y=y
        )
        
        data = data if self.pre_transform is None else self.pre_transform(data)
        self.save([data], self.processed_paths[0])

##### Node2Vec PPI

In [99]:
class Node2VecPPIDataset(NodeClassificationInteractionDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(Node2VecPPIDataset, self).__init__(
            root, 
            'node2vec_PPI_labels.txt', 'node2vec_PPI.edgelist', 'node2vec_PPI_data.pt',
            transform, pre_transform
        )


data_dir = '../data'
dataset_name = 'node2vec_PPI'

dataset = Node2VecPPIDataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Processing...
Done!


Data(x=[3890, 11], edge_index=[2, 76584], edge_attr=[76584, 1])

##### Mashup PPI

In [101]:
class MashupPPIDataset(NodeClassificationInteractionDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(MashupPPIDataset, self).__init__(
            root, 
            'Mashup_PPI_labels.txt', 'Mashup_PPI.edgelist', 'Mashup_PPI_data.pt',
            transform, pre_transform
        )


data_dir = '../data'
dataset_name = 'Mashup_PPI'

dataset = MashupPPIDataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Processing...
Done!


Data(x=[3831, 12], edge_index=[2, 596457], edge_attr=[596457, 1])

### CoOccurences

In [134]:
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import coalesce
from typing import Callable, List, Optional


class CoocurenceDataset(InMemoryDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):       
        self.node_file = 'Clin_Term_COOC_labels.txt'
        self.edge_file = 'Clin_Term_COOC.edgelist'
        self.processed_file = 'Clin_Term_COOC_processed.pt'
        super(InMemoryDataset, self).__init__(root, transform, pre_transform)

        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [self.node_file, self.edge_file]

    @property
    def processed_file_names(self):
        return [self.processed_file]

    def download(self):
        pass

    def process(self):
        labels_path = os.path.join(self.raw_dir, self.node_file)
        with open(labels_path, 'r') as f:
            labels = [list(map(int, line.strip().split())) for line in f]

        edgelist_path = os.path.join(self.raw_dir, self.edge_file)
        with open(edgelist_path, 'r') as f:
            edges = [
                [int(start_node), int(end_node), float(weight)]
                for line in f
                for start_node, end_node, weight in [map(str, line.strip().split())]
            ]

        max_num_labels = max(len(node_labels) for node_labels in labels)
        padded_labels = [node_labels + [0] * (max_num_labels - len(node_labels)) for node_labels in labels]
        x = torch.tensor(padded_labels, dtype=torch.float32)

        start_nodes = torch.tensor([edge[0] for edge in edges], dtype=torch.long)
        end_nodes = torch.tensor([edge[1] for edge in edges], dtype=torch.long)
        weights = torch.tensor([edge[2] for edge in edges], dtype=torch.float32)
        edge_index = torch.stack([start_nodes, end_nodes], dim=0)
        edge_attr = weights.view(-1, 1) 
        edge_index, edge_attr = coalesce(edge_index, edge_attr=edge_attr, num_nodes=x.size(0))

        y = None

        data = Data(
            x=x, 
            edge_index=edge_index, 
            edge_attr=edge_attr,
            y=y
        )
        
        data = data if self.pre_transform is None else self.pre_transform(data)
        self.save([data], self.processed_paths[0])


data_dir = '../data'
dataset_name = 'Clin_Term_COOC'

dataset = CoocurenceDataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Processing...
Done!


Data(x=[25120, 6], edge_index=[2, 1658695], edge_attr=[1658695, 1])

In [136]:
data.x[0]

tensor([17378.,     0.,     0.,     0.,     0.,     0.])

# Link prediction

### Interaction Datasets

In [89]:
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import coalesce
from typing import Callable, List, Optional


class LinkPredictionInteractionDataset(InMemoryDataset):
    def __init__(
        self,
        root: str,
        node_file: str=None,
        edge_file: str=None, 
        processed_file: str=None,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        self.node_file = node_file
        self.edge_file = edge_file
        self.processed_file = processed_file
        
        super(LinkPredictionInteractionDataset, self).__init__(root, transform, pre_transform)

        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [self.node_file, self.edge_file]

    @property
    def processed_file_names(self):
        return [self.processed_file]

    def process(self):
        labels_path = os.path.join(self.raw_dir, self.node_file)
        with open(labels_path, 'r') as f:
            next(f)
            labels = [
                int(node)
                for line in f
                for node, _ in [map(str, line.strip().split())]
            ]

        edgelist_path = os.path.join(self.raw_dir, self.edge_file)
        with open(edgelist_path, 'r') as f:
            edges = [
                [int(start_node), int(end_node)]
                for line in f
                for start_node, end_node in [map(str, line.strip().split())]
            ]

        x = torch.tensor(labels, dtype=torch.int32).view(-1, 1)

        start_nodes = torch.tensor([edge[0] for edge in edges], dtype=torch.long)
        end_nodes = torch.tensor([edge[1] for edge in edges], dtype=torch.long)
        edge_index = torch.stack([start_nodes, end_nodes], dim=0)
        edge_index = coalesce(edge_index, num_nodes=x.size(0))

        y = None

        data = Data(
            x=x, 
            edge_index=edge_index, 
            y=y
        )
        
        data = data if self.pre_transform is None else self.pre_transform(data)
        self.save([data], self.processed_paths[0])

##### String PPI

In [94]:
class StringPPIDataset(LinkPredictionInteractionDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(StringPPIDataset, self).__init__(
            root, 
            'node_list.txt', 'STRING_PPI.edgelist', 'string_PPI_data.pt',
            transform, pre_transform
        )
  
data_dir = '../data'
dataset_name = 'STRING_PPI'

dataset = StringPPIDataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Data(x=[15131, 1], edge_index=[2, 359776])

##### DrugBank DDI

In [95]:
class DrugBankDDIDataset(LinkPredictionInteractionDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(DrugBankDDIDataset, self).__init__(
            root, 
            'node_list.txt', 'DrugBank_DDI.edgelist', 'drugbank_DDI_data.pt',
            transform, pre_transform
        )

data_dir = '../data'
dataset_name = 'DrugBank_DDI'

dataset = DrugBankDDIDataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Data(x=[2191, 1], edge_index=[2, 242027])

### Association Datasets

In [118]:
data_dir = '../data'
dataset_name = 'NDFRT_DDA'

labels_path = os.path.join(data_dir, dataset_name, 'node_list.txt')
with open(labels_path, 'r') as f:
    next(f)
    labels = [
        [node_id, UMLS_CUI, agent_type]
        for line in f
        for node_id, UMLS_CUI, agent_type in [map(str, line.strip().split())]
    ]

node_id_list = [label[0] for label in labels]
UMLS_CUI_list = [label[1] for label in labels]
agent_type_list = [label[2] for label in labels]

print(len(np.unique(node_id_list)))
print(len(np.unique(UMLS_CUI_list)))
print(len(np.unique(agent_type_list)))


13545
13545
2


In [112]:
data_dir = '../data'
dataset_name = 'CTD_DDA'

labels_path = os.path.join(data_dir, dataset_name, 'node_list.txt')
with open(labels_path, 'r') as f:
    next(f)
    labels = [
        [node_id, CTD_id, UMLS_CUI, agent_type]
        for line in f
        for node_id, CTD_id, UMLS_CUI, agent_type in [map(str, line.strip().split())]
    ]

node_id_list = [label[0] for label in labels]
CTD_id_list = [label[1] for label in labels]
UMLS_CUI_list = [label[2] for label in labels]
agent_type_list = [label[3] for label in labels]


print(len(np.unique(node_id_list)))
print(len(np.unique(CTD_id_list)))
print(len(np.unique(UMLS_CUI_list)))
print(len(np.unique(agent_type_list)))


12765
12765
12765
2


In [124]:
data_dir = '../data'
dataset_name = 'CTD_DDA'

labels_path = os.path.join(data_dir, dataset_name, 'node_list.txt')
with open(labels_path, 'r') as f:
    next(f)
    labels = [
        list(map(str, line.strip().split()))
        for line in f        
    ]
labels = [1 if label[-1] == 'disease' else 0 for label in labels]

sum(labels)/len(labels)


0.24951037994516256

In [125]:
import os.path as osp
import torch
from torch_geometric.data import InMemoryDataset, Data
from torch_geometric.utils import coalesce
from typing import Callable, List, Optional


class LinkPredictionAssociationDataset(InMemoryDataset):
    def __init__(
        self,
        root: str,
        node_file: str=None,
        edge_file: str=None, 
        processed_file: str=None,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        self.node_file = node_file
        self.edge_file = edge_file
        self.processed_file = processed_file
        
        super(LinkPredictionAssociationDataset, self).__init__(root, transform, pre_transform)

        self.load(self.processed_paths[0])

    @property
    def raw_file_names(self):
        return [self.node_file, self.edge_file]

    @property
    def processed_file_names(self):
        return [self.processed_file]

    def process(self):
        labels_path = os.path.join(self.raw_dir, self.node_file)
        with open(labels_path, 'r') as f:
            next(f)
            labels = [
                list(map(str, line.strip().split()))
                for line in f        
            ]
        labels = [1 if label[-1] == 'disease' else 0 for label in labels]

        edgelist_path = os.path.join(self.raw_dir, self.edge_file)
        with open(edgelist_path, 'r') as f:
            edges = [
                [int(start_node), int(end_node)]
                for line in f
                for start_node, end_node in [map(str, line.strip().split())]
            ]

        x = torch.tensor(labels, dtype=torch.int32).view(-1, 1)

        start_nodes = torch.tensor([edge[0] for edge in edges], dtype=torch.long)
        end_nodes = torch.tensor([edge[1] for edge in edges], dtype=torch.long)
        edge_index = torch.stack([start_nodes, end_nodes], dim=0)
        edge_index = coalesce(edge_index, num_nodes=x.size(0))

        y = None

        data = Data(
            x=x, 
            edge_index=edge_index, 
            y=y
        )
        
        data = data if self.pre_transform is None else self.pre_transform(data)
        self.save([data], self.processed_paths[0])

##### CTD DDA

In [127]:
class CTDDDADataset(LinkPredictionAssociationDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(CTDDDADataset, self).__init__(
            root, 
            'node_list.txt', 'CTD_DDA.edgelist', 'CTD_DDA.pt',
            transform, pre_transform
        )

data_dir = '../data'
dataset_name = 'CTD_DDA'

dataset = CTDDDADataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Processing...
Done!


Data(x=[12765, 1], edge_index=[2, 92813])

##### NDFRT DDA

In [130]:
class NDFRTDDADataset(LinkPredictionAssociationDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(NDFRTDDADataset, self).__init__(
            root, 
            'node_list.txt', 'NDFRT_DDA.edgelist', 'NDFRT_DDA.pt',
            transform, pre_transform
        )

data_dir = '../data'
dataset_name = 'NDFRT_DDA'

dataset = NDFRTDDADataset(
    osp.join(data_dir, dataset_name)
)
data = dataset[0]
data

Processing...
Done!


Data(x=[13545, 1], edge_index=[2, 56515])

# Notes 

In [13]:
import torch

from torch_geometric.data import (
    Data,
    InMemoryDataset
)


class Node2VecPPI(InMemoryDataset):
    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        super(InMemoryDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = self.load(self.processed_paths[0])
    
    @property
    def raw_file_names(self) -> List[str]:
        return ['node2vec_PPI_labels.txt', 'node2vec_PPI.edgelist']
    
    @property
    def processed_file_names(self):
        # Return a list of processed file names if applicable
        return ['node2vec_PPI.pt']
    

    def process(self):
        data_list = []  # List to store Data objects
        
        # Your data processing logic here
        # Construct Data objects and append to data_list
        for i in range(num_samples):
            # Example: Construct a Data object
            x = ...  # Node features
            edge_index = ...  # Edge indices
            y = ...  # Target label
            
            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        # max_num_labels = max(len(node_labels) for node_labels in labels)
        # padded_labels = [node_labels + [0] * (max_num_labels - len(node_labels)) for node_labels in labels]
        # x = torch.tensor(padded_labels, dtype=torch.float32) # Node features (assuming unbalanced lists)
        # x = torch.tensor(labels, dtype=torch.float32).view(-1, 1)  # Node features (assuming one-dimensional labels)
        # x = torch.tensor(labels, dtype=torch.float32)  # Node features (assuming labels are integers)



            
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])


import os.path as osp

data_dir = '../data'
dataset_name = 'node2vec_PPI'


dataset = Node2VecPPI(
    osp.join(data_dir, dataset_name)
)


Processing...


NameError: name 'num_samples' is not defined

Data(x=[3890, 11], edge_index=[2, 76584], edge_attr=[76584, 1])