# Graph Classification

### Proteins

In [21]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="PROTEINS", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 1113
Average Number of Nodes: 39.05750224618149
Average Number of Edges: 72.8158131176999
Average Clustering Coefficient: 0.5141966922501326
Average Degree: 3.7346421711505515
Average Diameter: 11.14375561545373


### NCI1

In [22]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="NCI1", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")



Graph Count: 4110
Average Number of Nodes: 29.8654501216545
Average Number of Edges: 32.3
Average Clustering Coefficient: 0.0030999008340164545
Average Degree: 2.155013792267047
Average Diameter: 11.450608272506082


### NCI109

In [23]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="NCI109", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 4127
Average Number of Nodes: 29.681124303368065
Average Number of Edges: 32.13084565059365
Average Clustering Coefficient: 0.003073238935522851
Average Degree: 2.1564461686190715
Average Diameter: 11.206929973346256


### MUTAG

In [24]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="MUTAG", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 188
Average Number of Nodes: 17.930851063829788
Average Number of Edges: 19.79255319148936
Average Clustering Coefficient: 0.0
Average Degree: 2.1887720785524962
Average Diameter: 8.21808510638298


### DD

In [25]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="DD", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 740
Average Number of Nodes: 169.94324324324324
Average Number of Edges: 421.3540540540541
Average Clustering Coefficient: 0.4827248137860852
Average Degree: 4.920788188580781
Average Diameter: 16.4527027027027


### IMDB-BINARY

In [26]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="IMDB-BINARY", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 1000
Average Number of Nodes: 19.773
Average Number of Edges: 96.531
Average Clustering Coefficient: 0.9470755285248149
Average Degree: 8.885904503361854
Average Diameter: 1.861


### IMDB-MULTI

In [27]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="IMDB-MULTI", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 1500
Average Number of Nodes: 13.001333333333333
Average Number of Edges: 65.93533333333333
Average Clustering Coefficient: 0.9691436936400352
Average Degree: 8.101174869045057
Average Diameter: 1.474


### COLLAB

In [28]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="COLLAB", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

Graph Count: 5000
Average Number of Nodes: 74.4948
Average Number of Edges: 2457.2158
Average Clustering Coefficient: 0.8907720079424438
Average Degree: 37.36960015354266
Average Diameter: 1.864


### ENZYMES

In [1]:
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.utils import to_networkx
import networkx as nx
import math
data_path = "/data/XXX/Pooling/"
dataset = TUDataset(root=data_path, name="ENZYMES", use_node_attr=True)
num_graphs = len(dataset)
total_num_nodes = 0
total_num_edges = 0
total_clustering_coefficient = 0
total_degree = 0
total_diameter = 0
for data in dataset:
    G = to_networkx(data, to_undirected=True)
    num_nodes = G.number_of_nodes()
    num_edges = G.number_of_edges()
    clustering_coefficient = nx.average_clustering(G)
    avg_degree = sum(dict(G.degree()).values()) / num_nodes
    diameter = nx.diameter(G) if nx.is_connected(G) else float('nan')
    total_num_nodes += num_nodes
    total_num_edges += num_edges
    total_clustering_coefficient += clustering_coefficient
    total_degree += avg_degree
    total_diameter += diameter if not math.isnan(diameter) else 0
avg_clustering_coefficient = total_clustering_coefficient / num_graphs
avg_degree = total_degree / num_graphs
avg_diameter = total_diameter / num_graphs
print(f"Graph Count: {num_graphs}")
print(f"Average Number of Nodes: {total_num_nodes / num_graphs}")
print(f"Average Number of Edges: {total_num_edges / num_graphs}")
print(f"Average Clustering Coefficient: {avg_clustering_coefficient}")
print(f"Average Degree: {avg_degree}")
print(f"Average Diameter: {avg_diameter}")

  from .autonotebook import tqdm as notebook_tqdm


Graph Count: 600
Average Number of Nodes: 32.63333333333333
Average Number of Edges: 62.13666666666666
Average Clustering Coefficient: 0.4533912862690094
Average Degree: 3.862625314410416
Average Diameter: 10.358333333333333


# Graph Regression

### QM7

In [1]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
qm7_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/qm7.csv"
data = pd.read_csv(qm7_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'
Failed to featurize datapoint 0, C. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 5505, [H]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Exception message: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (6834,) + inhomogeneous part

Average number of nodes: 6.79
Average number of edges: 6.44
Average clustering coefficient: 0.06
Average diameter: 4.21
Average degree: 1.89


### QM8

In [2]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
qm8_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/qm8.csv"
data = pd.read_csv(qm8_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

Failed to featurize datapoint 0, C. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 1, N. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 2, O. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Exception message: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (21786,) + inhomogeneous part.


Average number of nodes: 7.77
Average number of edges: 8.09
Average clustering coefficient: 0.09
Average diameter: 4.35
Average degree: 2.08


### BACE

In [3]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
bace_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/bace.csv"
data = pd.read_csv(bace_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

Average number of nodes: 34.09
Average number of edges: 36.86
Average clustering coefficient: 0.01
Average diameter: 15.22
Average degree: 2.17


### ESOL

In [4]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
esol_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/esol.csv"
data = pd.read_csv(esol_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

Failed to featurize datapoint 934, C. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Exception message: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1128,) + inhomogeneous part.


Average number of nodes: 13.30
Average number of edges: 13.69
Average clustering coefficient: 0.00
Average diameter: 7.02
Average degree: 1.98


### Freesolv

In [5]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
freesolv_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/freesolv.csv"
data = pd.read_csv(freesolv_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

Failed to featurize datapoint 61, N. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 195, S. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 286, C. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Exception message: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (642,) + inhomogeneous part.


Average number of nodes: 8.76
Average number of edges: 8.43
Average clustering coefficient: 0.00
Average diameter: 5.06
Average degree: 1.84


### Lipophilicity

In [7]:
import deepchem as dc
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import NumpyDataset
lipo_dataset = "/data/XXX/Pooling/Graph_Pooling_Benchmark/Regression/datasets/Lipophilicity.csv"
data = pd.read_csv(lipo_dataset)
smiles = data['smiles'].values
featurizer = MolGraphConvFeaturizer()
mols = featurizer.featurize(smiles)
def graphdata_to_nx(graph_data):
    G = nx.Graph()
    num_nodes = graph_data.node_features.shape[0]
    G.add_nodes_from(range(num_nodes))
    for i in range(graph_data.edge_index.shape[1]):
        source = graph_data.edge_index[0, i]
        target = graph_data.edge_index[1, i]
        G.add_edge(source, target)
    return G
total_nodes = 0
total_edges = 0
total_clustering_coefficient = 0
total_diameter = 0
total_degree = 0
connected_graph_count = 0
total_graph_count = 0
for i, mol in enumerate(mols):
    if isinstance(mol, dc.feat.graph_data.GraphData):  
        if mol.node_features.size == 0:  
            continue
        G = graphdata_to_nx(mol)
        num_nodes = G.number_of_nodes()
        num_edges = G.number_of_edges()
        total_nodes += num_nodes
        total_edges += num_edges
        total_clustering_coefficient += nx.average_clustering(G)
        total_degree += sum(dict(G.degree()).values()) / num_nodes  
        if nx.is_connected(G):
            total_diameter += nx.diameter(G)
            connected_graph_count += 1 
        total_graph_count += 1
if total_graph_count > 0:
    avg_nodes = total_nodes / total_graph_count
    avg_edges = total_edges / total_graph_count
    avg_clustering_coefficient = total_clustering_coefficient / total_graph_count
    avg_diameter = total_diameter / connected_graph_count if connected_graph_count > 0 else float('nan')
    avg_degree = total_degree / total_graph_count
    print(f"Average number of nodes: {avg_nodes:.2f}")
    print(f"Average number of edges: {avg_edges:.2f}")
    print(f"Average clustering coefficient: {avg_clustering_coefficient:.2f}")
    print(f"Average diameter: {avg_diameter:.2f}")
    print(f"Average degree: {avg_degree:.2f}")
else:
    print("No valid graphs found in the dataset.")

Average number of nodes: 27.04
Average number of edges: 29.50
Average clustering coefficient: 0.00
Average diameter: 13.85
Average degree: 2.18


# Node Clasiffication

### Cora

In [12]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = Planetoid(root="/data/XXX/Pooling", name='Cora')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 2708
Number of edges: 5278
Average clustering coefficient: 0.24067329850193728
Degree distribution: [3, 3, 5, 1, 5, 3, 4, 1, 3, 2, 2, 2, 4, 2, 5, 4, 4, 5, 5, 1, 5, 2, 5, 1, 7, 4, 5, 4, 1, 2, 6, 1, 4, 9, 1, 3, 8, 3, 4, 7, 3, 4, 2, 6, 3, 6, 2, 2, 9, 2, 1, 6, 5, 3, 2, 12, 4, 1, 1, 10, 3, 5, 1, 1, 3, 10, 1, 3, 3, 7, 2, 3, 2, 12, 9, 6, 3, 2, 3, 2, 3, 4, 2, 2, 5, 3, 4, 3, 36, 10, 4, 6, 2, 4, 11, 21, 1, 2, 1, 6, 2, 3, 8, 6, 4, 4, 1, 4, 3, 32, 5, 6, 4, 3, 3, 1, 2, 2, 19, 6, 5, 7, 4, 4, 5, 2, 6, 1, 4, 2, 3, 2, 5, 9, 1, 3, 1, 4, 3, 8, 2, 3, 5, 3, 4, 6, 5, 2, 5, 4, 3, 4, 4, 6, 12, 7, 6, 4, 4, 2, 7, 2, 5, 1, 2, 4, 2, 4, 3, 2, 2, 3, 5, 4, 4, 7, 3, 4, 1, 3, 10, 4, 2, 3, 1, 2, 6, 1, 2, 4, 2, 6, 1, 2, 5, 2, 3, 3, 1, 2, 2, 7, 4, 5, 1, 2, 2, 1, 1, 1, 10, 5, 1, 4, 1, 3, 2, 3, 11, 3, 3, 1, 1, 3, 2, 1, 1, 2, 5, 4, 5, 9, 5, 1, 2, 1, 11, 1, 1, 13, 5, 1, 4, 5, 4, 2, 4, 1, 3, 3, 1, 7, 4, 3, 2, 4, 1, 4, 7, 2, 1, 4, 2, 3, 4, 1, 4, 3, 2, 5, 5, 5, 1, 2, 2, 2,

### CiteCeer

In [13]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = Planetoid(root="/data/XXX/Pooling", name='CiteSeer')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 3327
Number of edges: 4552
Average clustering coefficient: 0.14147102442629086
Degree distribution: [1, 5, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1, 12, 5, 2, 1, 14, 2, 3, 3, 1, 1, 3, 1, 2, 2, 2, 10, 11, 2, 2, 4, 2, 5, 1, 1, 1, 1, 2, 2, 2, 3, 1, 10, 2, 1, 1, 5, 1, 6, 2, 8, 2, 1, 3, 3, 2, 2, 5, 7, 3, 6, 1, 1, 1, 1, 3, 1, 1, 3, 7, 2, 1, 3, 1, 4, 6, 1, 4, 2, 3, 6, 1, 16, 2, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 6, 4, 3, 10, 3, 2, 7, 4, 1, 2, 1, 5, 1, 2, 4, 6, 1, 1, 4, 6, 2, 1, 1, 3, 4, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 6, 1, 1, 3, 11, 4, 9, 2, 1, 1, 5, 1, 4, 1, 2, 8, 3, 1, 10, 4, 3, 4, 5, 4, 4, 4, 3, 1, 5, 3, 1, 1, 1, 3, 1, 2, 1, 2, 7, 1, 3, 1, 1, 1, 2, 3, 2, 4, 4, 2, 1, 1, 1, 1, 3, 1, 1, 3, 3, 2, 0, 1, 7, 2, 1, 2, 3, 1, 2, 1, 1, 3, 3, 7, 1, 1, 11, 1, 1, 2, 6, 3, 3, 1, 3, 3, 6, 2, 7, 1, 6, 0, 1, 5, 2, 4, 6, 10, 3, 1, 2, 1, 3, 1, 2, 4, 1, 3, 1, 5, 6, 2, 1, 5, 2, 8, 2, 1, 2, 1, 1, 3, 12, 1, 1, 2, 1, 2, 1, 1, 1, 5, 1, 3, 8, 2, 4, 6, 1, 1, 2, 5, 1, 2, 0, 

### Pubmed

In [14]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = Planetoid(root="/data/XXX/Pooling", name='PubMed')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 19717
Number of edges: 44324
Average clustering coefficient: 0.060175209437523615
Degree distribution: [5, 3, 3, 1, 1, 2, 22, 17, 1, 9, 6, 1, 10, 1, 1, 6, 29, 6, 8, 8, 1, 2, 2, 1, 4, 1, 5, 3, 1, 2, 1, 1, 1, 1, 3, 17, 1, 1, 4, 1, 8, 4, 1, 1, 1, 1, 11, 31, 18, 2, 1, 1, 1, 1, 6, 2, 1, 3, 7, 3, 25, 1, 22, 1, 2, 3, 1, 2, 1, 7, 1, 3, 5, 1, 1, 1, 3, 9, 1, 2, 3, 2, 1, 2, 3, 7, 6, 1, 14, 1, 1, 2, 1, 1, 1, 4, 1, 2, 1, 13, 2, 19, 3, 2, 1, 15, 1, 1, 5, 2, 23, 1, 17, 2, 1, 2, 24, 27, 2, 1, 1, 1, 1, 1, 16, 1, 1, 2, 1, 7, 1, 2, 11, 1, 1, 1, 12, 1, 1, 3, 2, 6, 4, 2, 15, 9, 3, 7, 2, 1, 10, 1, 20, 1, 1, 2, 2, 1, 1, 7, 2, 1, 1, 2, 9, 2, 1, 3, 17, 2, 1, 1, 1, 4, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 14, 2, 1, 1, 1, 1, 2, 2, 5, 8, 2, 13, 1, 1, 1, 3, 5, 5, 1, 10, 9, 3, 1, 1, 1, 28, 6, 8, 2, 1, 1, 3, 1, 5, 1, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 1, 7, 1, 1, 1, 4, 9, 7, 1, 2, 1, 1, 25, 1, 1, 6, 1, 9, 1, 2, 2, 2, 1, 10, 2, 1, 1, 2, 18, 1, 3, 1, 2, 4, 2, 7, 3, 3, 8, 2

### Cornell

In [15]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = WebKB(root="/data/XXX/Pooling", name='Cornell')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 183
Number of edges: 280
Average clustering coefficient: 0.16708048040169962
Degree distribution: [2, 3, 2, 5, 2, 7, 4, 2, 8, 1, 5, 1, 1, 3, 1, 1, 1, 1, 2, 1, 5, 5, 3, 2, 8, 2, 2, 4, 5, 2, 5, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 4, 2, 3, 2, 1, 2, 3, 2, 3, 2, 1, 1, 4, 1, 1, 1, 94, 1, 1, 1, 1, 2, 4, 1, 2, 11, 9, 2, 4, 3, 1, 3, 1, 2, 4, 1, 3, 2, 2, 1, 1, 3, 7, 3, 1, 1, 1, 1, 3, 1, 1, 3, 8, 1, 2, 5, 5, 1, 5, 1, 6, 1, 3, 4, 2, 1, 1, 1, 6, 3, 1, 1, 1, 1, 1, 2, 1, 4, 1, 3, 2, 5, 1, 1, 2, 1, 1, 3, 4, 3, 1, 1, 4, 1, 4, 1, 4, 5, 1, 2, 1, 1, 1, 2, 2, 8, 4, 4, 4, 5, 2, 1, 4, 4, 3, 1, 2, 6, 2, 3, 1, 2, 1, 4, 4, 1, 3, 3, 2, 1, 6, 1, 1, 5, 3, 10, 2, 1, 1, 2, 2, 1]
Diameter: 8
Average degree: 3.06


### Texas

In [16]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = WebKB(root="/data/XXX/Pooling", name='Texas')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 183
Number of edges: 295
Average clustering coefficient: 0.1979261921103117
Degree distribution: [2, 1, 1, 1, 3, 3, 2, 2, 2, 2, 1, 3, 1, 3, 1, 10, 9, 2, 2, 1, 4, 3, 4, 4, 3, 2, 1, 1, 2, 6, 3, 3, 1, 1, 9, 1, 2, 2, 1, 3, 1, 5, 2, 1, 2, 3, 2, 3, 2, 2, 3, 1, 1, 2, 2, 4, 104, 8, 8, 3, 3, 2, 3, 2, 3, 3, 13, 3, 2, 1, 1, 1, 1, 4, 3, 2, 2, 1, 2, 3, 5, 2, 5, 3, 17, 3, 3, 2, 2, 2, 4, 1, 1, 2, 3, 6, 1, 2, 1, 3, 1, 1, 5, 1, 2, 1, 1, 1, 5, 1, 2, 1, 1, 2, 1, 1, 9, 2, 2, 2, 2, 1, 1, 1, 1, 2, 3, 9, 1, 3, 1, 7, 2, 4, 3, 1, 1, 1, 1, 2, 4, 2, 1, 1, 3, 2, 8, 4, 4, 1, 3, 3, 1, 3, 1, 1, 4, 1, 1, 6, 1, 2, 2, 3, 2, 3, 1, 4, 2, 1, 2, 7, 2, 7, 1, 3, 1, 3, 2, 1, 5, 1, 1]
Diameter: 8
Average degree: 3.22


### Wisconsin

In [17]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = WebKB(root="/data/XXX/Pooling", name='Wisconsin')
data = dataset[0]
G = to_networkx(data, to_undirected=True)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()
avg_clustering_coefficient = nx.average_clustering(G)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
diameter = nx.diameter(G) if nx.is_connected(G) else "Graph is not connected"
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Average clustering coefficient: {avg_clustering_coefficient}")
print(f"Degree distribution: {degree_distribution}")
print(f"Diameter: {diameter}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Number of nodes: 251
Number of edges: 466
Average clustering coefficient: 0.20767919426170567
Degree distribution: [6, 1, 2, 3, 2, 10, 3, 2, 3, 1, 2, 4, 8, 2, 1, 2, 1, 1, 2, 1, 8, 2, 4, 1, 2, 4, 2, 3, 6, 7, 2, 4, 7, 3, 3, 4, 1, 9, 2, 1, 1, 10, 1, 1, 11, 4, 3, 2, 2, 2, 6, 4, 11, 4, 10, 1, 1, 5, 2, 1, 1, 1, 2, 5, 3, 1, 2, 4, 3, 5, 2, 2, 1, 1, 2, 10, 1, 2, 3, 9, 1, 3, 2, 2, 2, 2, 3, 4, 1, 5, 1, 3, 6, 5, 1, 1, 3, 10, 122, 2, 1, 3, 3, 3, 6, 2, 1, 4, 1, 2, 1, 2, 4, 2, 3, 10, 1, 3, 2, 3, 5, 3, 5, 2, 2, 2, 5, 2, 4, 6, 3, 2, 4, 1, 1, 2, 1, 2, 3, 2, 4, 3, 1, 2, 5, 4, 2, 8, 2, 3, 2, 2, 6, 4, 2, 4, 7, 2, 2, 3, 5, 3, 1, 3, 1, 3, 1, 4, 5, 4, 8, 8, 3, 13, 1, 2, 1, 1, 9, 1, 1, 1, 3, 5, 4, 1, 6, 2, 3, 3, 3, 2, 12, 2, 2, 3, 1, 5, 3, 5, 2, 4, 5, 4, 15, 3, 3, 5, 3, 4, 2, 7, 1, 2, 1, 2, 4, 5, 1, 1, 1, 1, 1, 2, 1, 2, 2, 1, 1, 4, 2, 4, 3, 2, 1, 1, 3, 1, 3, 1, 1, 4, 20, 3, 3, 3, 3, 1, 1, 2, 1]
Diameter: 8
Average degree: 3.71


### Github

In [19]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import pandas as pd
import torch
visible_devices = os.environ["CUDA_VISIBLE_DEVICES"]
print(f"Using GPU(s): {visible_devices}")
print(torch.cuda.is_available())
num_gpus = torch.cuda.device_count()
print(f'Available GPUs: {num_gpus}')
import torch_geometric.transforms as T
from typing import Optional
import torch
from torch import Tensor
from torch_geometric.data import Data
from torch_geometric.data.datapipes import functional_transform
from torch_geometric.transforms import BaseTransform
import networkx as nx
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import torch_geometric.transforms as T
import deepchem as dc
import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import WebKB
from torch_geometric.datasets import Actor
from torch_geometric.datasets import CitationFull
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from torch_geometric.datasets import GitHub
dataset = GitHub(root="/data/XXX/Pooling/GitHub")
data = dataset[0]
G = to_networkx(data, to_undirected=True)
degree_distribution = [d for n, d in G.degree()]
average_degree = round(sum(degree_distribution) / num_nodes, 2)
print(f"Degree distribution: {degree_distribution}")
print(f"Average degree: {average_degree}")

Using GPU(s): 1
True
Available GPUs: 1
Degree distribution: [1, 8, 1, 5, 2, 1, 6, 8, 8, 7, 66, 2, 6, 3, 18, 18, 1, 3, 3, 22, 4, 33, 7, 12, 18, 14, 7, 70, 1, 2, 6, 7, 2, 2, 4, 27, 23, 1, 127, 1, 5, 7, 55, 8, 2, 3, 3, 1, 2, 5, 2, 5, 31, 2, 1, 3, 88, 1, 7, 1, 1, 21, 1, 1, 1, 3, 4, 27, 4, 12, 1, 3, 6, 1267, 6, 18, 4, 1, 15, 1, 1, 19, 2, 7, 2, 5, 54, 24, 44, 5, 5, 2, 55, 1, 144, 1, 1, 9, 23, 19, 1, 47, 2, 3, 8, 13, 5, 47, 23, 4, 6, 2, 1, 7, 7, 5, 9, 2, 6, 135, 25, 3, 3, 1, 12, 17, 6, 2, 1, 13, 5, 27, 3, 4, 18, 6, 3, 5, 2, 10, 32, 17, 3, 7, 78, 4, 4, 2, 3, 2, 9, 2, 3, 1, 3, 2, 16, 9, 2, 103, 1, 3, 6, 2, 8, 2, 53, 2, 27, 1, 1, 2, 1, 5, 7, 29, 3, 6, 7, 20, 5, 19, 39, 3, 61, 28, 3, 3, 1, 3, 9, 2, 16, 18, 25, 2, 1, 40, 5, 2, 8, 23, 3, 13, 1, 10, 3, 2, 6, 1, 5, 1, 2, 8, 6, 7, 7, 1, 35, 36, 1, 1, 4, 9, 7, 1, 3, 10, 19, 1, 23, 6, 23, 4, 1, 9, 7, 3, 1, 52, 29, 12, 17, 11, 11, 1, 9, 1, 3, 1, 3, 2, 1, 45, 4, 9, 2, 4, 6, 2, 9, 11, 9, 2, 13, 12, 11, 1, 2, 18, 5, 2, 5, 11, 6, 5, 7, 24, 16, 5, 2, 25, 17, 