In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from stellargraph import StellarGraph
from stellargraph.datasets import Cora, CiteSeer, PubMedDiabetes

import matplotlib.pyplot as plt

from scipy import linalg
from scipy.spatial.distance import pdist, squareform
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler
from scipy.sparse.csgraph import shortest_path

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

from IPython.display import display, HTML

In [2]:
class CitationNetworkVisualizer:
    """Load a StellarGraph citation dataset and draw it coloured by class.

    ``load_data`` must be called before ``visualize``; it populates
    ``graph``, ``node_subjects`` and ``node_features``.
    """

    def __init__(self, dataset_name):
        """Remember which dataset to load.

        Args:
            dataset_name: 'cora', 'citeseer' or 'pubmed'. The value is only
                validated when ``load_data`` is called.
        """
        self.dataset_name = dataset_name
        self.graph = None          # graph object returned by dataset.load()
        self.node_subjects = None  # per-node class labels returned by dataset.load()
        self.node_features = None  # feature matrix, set by load_data()

    def load_data(self):
        """Load the specified citation dataset and prepare data for spectral clustering.

        Returns:
            tuple: ``(feature_matrix, adjacency_matrix, labels)`` where
            ``feature_matrix`` is ``graph.node_features()``,
            ``adjacency_matrix`` is the unweighted adjacency matrix of the
            graph and ``labels`` is ``node_subjects.values``.

        Raises:
            ValueError: If ``dataset_name`` is not a supported dataset.
        """
        if self.dataset_name == 'cora':
            dataset = Cora()
        elif self.dataset_name == 'citeseer':
            dataset = CiteSeer()
        elif self.dataset_name == 'pubmed':
            dataset = PubMedDiabetes()
        else:
            raise ValueError("Unsupported dataset name. Choose 'cora', 'citeseer', or 'pubmed'.")

        display(HTML(dataset.description))
        self.graph, self.node_subjects = dataset.load()

        # Unweighted adjacency matrix of the loaded graph.
        # NOTE(review): callers wrap this in sp.csr_matrix, so it is presumably
        # a SciPy sparse matrix - confirm against the installed StellarGraph.
        adjacency_matrix = self.graph.to_adjacency_matrix(weighted=False)

        labels = self.node_subjects.values

        feature_matrix = self.graph.node_features()
        # Keep the features on the instance so the attribute declared in
        # __init__ is actually populated.
        self.node_features = feature_matrix

        return feature_matrix, adjacency_matrix, labels

    def visualize(self, seed=None):
        """Visualize the citation dataset network.

        Args:
            seed: Optional seed forwarded to ``nx.spring_layout`` so the
                layout can be reproduced. ``None`` keeps a random layout.

        Raises:
            RuntimeError: If ``load_data`` has not been called yet.
        """
        if self.graph is None or self.node_subjects is None:
            raise RuntimeError("No data loaded: call load_data() before visualize().")

        # Convert to NetworkX graph for visualization
        G_nx = self.graph.to_networkx()

        # Create a network layout
        pos = nx.spring_layout(G_nx, seed=seed)

        # Resolve one explicit RGBA colour per class. Passing integer class
        # indices together with `cmap` would rescale them to the full colormap
        # range, so the nodes would not match the colours used in the legend.
        cmap = plt.get_cmap('Set1')
        unique_labels = np.unique(self.node_subjects)
        label_colors = {label: cmap(i % cmap.N) for i, label in enumerate(unique_labels)}

        # Bind each colour to its node explicitly instead of relying on the
        # iteration order of G_nx.
        # Assumes node_subjects is indexed by node id (as StellarGraph dataset
        # loaders return it) - TODO confirm for custom data.
        nodelist = list(self.node_subjects.index)
        node_colors = [label_colors[label] for label in self.node_subjects]

        # Draw nodes and edges
        plt.figure(figsize=(12, 12))
        nx.draw_networkx_nodes(G_nx, pos, nodelist=nodelist, node_size=50,
                               node_color=node_colors, alpha=0.6)
        nx.draw_networkx_edges(G_nx, pos, alpha=0.3)

        # Create a legend using the very same colours as the nodes
        handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,
                              markerfacecolor=label_colors[label], markersize=10)
                   for label in unique_labels]
        plt.legend(handles=handles, title="Classes")

        plt.title(f"{self.dataset_name.capitalize()} Dataset Citation Network")
        plt.axis('off')
        plt.show()

In [3]:
# Load the PubMedDiabetes dataset ('pubmed' selects PubMedDiabetes in load_data).
dataset_name = 'pubmed'
citation_visualizer = CitationNetworkVisualizer(dataset_name)
# load_data returns (feature_matrix, adjacency_matrix, labels); the adjacency
# matrix is bound to the name `data` here and used under that name below.
feature_matrix, data, labels = citation_visualizer.load_data()
# Distinct class labels present in the loaded dataset.
unique_labels = set(labels)

In [4]:
# citation_visualizer.visualize()

In [5]:
# Display the set of distinct class labels computed in the loading cell.
unique_labels

{1, 2, 3}

In [6]:
def generate_subdataset(adjacency_matrix, labels, feature_matrix, target_fraction,
                        random_state=42, tolerance=0.02, max_iterations=1000):
    """Sample a node-induced subgraph of roughly ``target_fraction`` of the nodes.

    A random set of nodes is drawn and every neighbour of a drawn node (column
    indices of its adjacency row) is added. If the resulting node count falls
    outside ``target * (1 +/- tolerance)`` the number of drawn nodes is moved by
    5% of the total node count and the draw is repeated.

    Args:
        adjacency_matrix: Square adjacency matrix (dense or SciPy sparse). It
            is not modified.
        labels: Per-node labels; must support ``len`` and boolean-mask indexing.
        feature_matrix: Per-node features; must support boolean-mask indexing
            on its first axis.
        target_fraction: Desired fraction of nodes, in ``[0, 1]``.
        random_state: Seed for a private random generator; the global NumPy
            random state is left untouched.
        tolerance: Accepted deviation from the target size, expressed as a
            fraction of the total node count.
        max_iterations: Upper bound on resampling attempts.

    Returns:
        tuple: ``(sub_adj_matrix, sub_labels, sub_features)`` restricted to the
        selected nodes, in original node order. ``sub_adj_matrix`` is CSR with
        self-loops removed.

    Raises:
        ValueError: If ``target_fraction`` is outside ``[0, 1]`` or the
            adjacency matrix shape does not match ``len(labels)``.
        RuntimeError: If no sample within tolerance is found in
            ``max_iterations`` attempts.
    """
    if not 0 <= target_fraction <= 1:
        raise ValueError("target_fraction must be between 0 and 1.")

    total_nodes = len(labels)

    # Build a self-loop-free CSR matrix from COO triplets. Unlike
    # csr_matrix(x).setdiag(0), this never writes into buffers shared with the
    # caller's matrix and does not trigger a SparseEfficiencyWarning.
    coo = sp.csr_matrix(adjacency_matrix).tocoo()
    if coo.shape != (total_nodes, total_nodes):
        raise ValueError("adjacency_matrix must be square with one row per label.")
    keep = (coo.row != coo.col) & (coo.data != 0)
    adjacency = sp.csr_matrix(
        (coo.data[keep], (coo.row[keep], coo.col[keep])), shape=coo.shape
    )

    target_size = int(total_nodes * target_fraction)
    margin = int(total_nodes * tolerance)
    # The acceptance window is fixed by the initial target; only the number of
    # drawn nodes is adjusted between attempts.
    min_size = target_size - margin
    max_size = target_size + margin
    # At least 1 so that small graphs still make progress.
    step = max(1, int(0.05 * total_nodes))

    # A private legacy generator yields the same stream as np.random.seed()
    # followed by np.random.choice(), without touching global state.
    rng = np.random.RandomState(random_state)
    node_indices = np.arange(total_nodes)

    for _ in range(max_iterations):
        sampled = rng.choice(node_indices, size=target_size, replace=False)

        all_nodes_mask = np.zeros(total_nodes, dtype=bool)
        all_nodes_mask[sampled] = True
        if sampled.size:
            # Column indices of all sampled rows = neighbours of sampled nodes.
            all_nodes_mask[adjacency[sampled].indices] = True

        selected_count = int(np.count_nonzero(all_nodes_mask))
        if min_size <= selected_count <= max_size:
            break
        if selected_count < min_size:
            target_size += step  # too few nodes: draw more next time
        else:
            target_size -= step  # too many nodes: draw fewer next time
        # Keep the draw size valid for sampling without replacement.
        target_size = min(max(target_size, 0), total_nodes)
    else:
        raise RuntimeError(
            f"Could not reach a subdataset of {min_size}-{max_size} nodes "
            f"in {max_iterations} attempts."
        )

    # Extract sub-adjacency matrix, labels, and features
    sub_adj_matrix = adjacency[all_nodes_mask][:, all_nodes_mask]
    sub_labels = labels[all_nodes_mask]
    sub_features = feature_matrix[all_nodes_mask]

    return sub_adj_matrix, sub_labels, sub_features

In [7]:
def preprocess_data(adjacency_matrix, labels, feature_matrix):
    """Drop self-loops and isolated nodes, and encode labels as integers.

    Args:
        adjacency_matrix: Square adjacency matrix (dense or SciPy sparse). It
            is not modified.
        labels: Per-node labels supporting boolean-mask indexing.
        feature_matrix: 2-D per-node feature array.

    Returns:
        tuple: ``(adjacency, encoded_labels, features)`` restricted to nodes
        that still have at least one stored edge in their row after self-loops
        are removed. ``adjacency`` is CSR; ``encoded_labels`` holds integer
        codes ``0..K-1`` assigned in sorted order of the remaining labels.
    """
    # Build a self-loop-free CSR matrix from COO triplets. Unlike
    # csr_matrix(x).setdiag(0), this never writes into buffers shared with the
    # caller's matrix and does not trigger a SparseEfficiencyWarning.
    coo = sp.csr_matrix(adjacency_matrix).tocoo()
    keep = (coo.row != coo.col) & (coo.data != 0)
    adjacency = sp.csr_matrix(
        (coo.data[keep], (coo.row[keep], coo.col[keep])), shape=coo.shape
    )

    # A node is isolated when its row stores no edge. Counting entries rather
    # than summing weights keeps signed weights from cancelling to zero.
    non_isolated_mask = adjacency.getnnz(axis=1) > 0

    # Filter adjacency matrix, labels and features for non-isolated nodes
    adjacency_matrix_non_isolated = adjacency[non_isolated_mask][:, non_isolated_mask]
    labels_non_isolated = labels[non_isolated_mask]
    feature_matrix_non_isolated = feature_matrix[non_isolated_mask, :]

    # Encode labels as 0..K-1 in sorted label order; this gives the same codes
    # a freshly fitted LabelEncoder would, without keeping an unused encoder.
    _, encoded_labels_non_isolated = np.unique(labels_non_isolated, return_inverse=True)
    encoded_labels_non_isolated = encoded_labels_non_isolated.reshape(-1)

    return adjacency_matrix_non_isolated, encoded_labels_non_isolated, feature_matrix_non_isolated

In [8]:
def save_to_csv(adjacency_matrix, labels, feature_matrix, nodes_file="nodes.csv", edges_file="edges.csv"):
    """Save the node and edge data into separate CSV files.

    Args:
        adjacency_matrix: Adjacency matrix (dense or SciPy sparse); every
            entry greater than zero is written as one edge row.
        labels: One label per node, in row order of ``feature_matrix``.
        feature_matrix: Per-node feature array; each row is stored as a list
            in the ``features`` column.
        nodes_file: Output path for the node table (``id, features, label``).
        edges_file: Output path for the edge list (``source_id, target_id``).
    """
    features = np.asarray(feature_matrix)

    # Save node information (ID, features, labels) to CSV.
    # tolist() converts to plain Python numbers; list(row) would keep NumPy
    # scalars, which NumPy >= 2 serialises as e.g. "np.float32(0.1)".
    node_data = pd.DataFrame({
        'id': np.arange(features.shape[0]),
        'features': [row.tolist() for row in features],
        'label': labels
    })
    node_data.to_csv(nodes_file, index=False)

    # Save adjacency matrix (edge list) to CSV: read the positive entries
    # straight from COO triplets, ordered by source then target.
    coo = sp.csr_matrix(adjacency_matrix).tocoo()
    positive = coo.data > 0
    rows, cols = coo.row[positive], coo.col[positive]
    order = np.lexsort((cols, rows))
    edges = np.column_stack((rows[order], cols[order]))
    edge_list = pd.DataFrame(edges, columns=['source_id', 'target_id'])
    edge_list.to_csv(edges_file, index=False)

    print(f"Nodes saved to {nodes_file} and edges saved to {edges_file}.")

In [9]:
import os

fractions = [1/3, 2/3, 1.0]
subdatasets = []
tolerance = 0.02  # accepted deviation from each target size: 2% of all nodes

# Output folder for each entry of `fractions`, in the same order
folder_names = ["pubmed_33", "pubmed_66", "pubmed_full"]

for idx, (fraction, folder_name) in enumerate(zip(fractions, folder_names)):
    print(f"\nGenerating subdataset with target fraction {fraction}")

    # Sample the subgraph, then strip self-loops / isolated nodes and encode labels
    sub_adj_matrix, sub_labels, sub_features = preprocess_data(
        *generate_subdataset(
            data, labels, feature_matrix, target_fraction=fraction, tolerance=tolerance
        )
    )

    # Share of the original nodes that survived sampling and preprocessing
    actual_fraction = len(sub_labels) / len(labels)
    print(f"Actual fraction after preprocessing: {actual_fraction:.2f}")

    # Keep the in-memory copy alongside the files written below
    subdatasets.append((sub_adj_matrix, sub_labels, sub_features))

    # Write nodes.csv and edges.csv into this fraction's folder
    os.makedirs(folder_name, exist_ok=True)
    nodes_file = f"{folder_name}/nodes.csv"
    edges_file = f"{folder_name}/edges.csv"
    save_to_csv(sub_adj_matrix, sub_labels, sub_features, nodes_file=nodes_file, edges_file=edges_file)
    print(f"Subdataset saved to folder: {folder_name}")



Generating subdataset with target fraction 0.3333333333333333


  self._set_arrayXarray(i, j, x)


Actual fraction after preprocessing: 0.32
Nodes saved to pubmed_33/nodes.csv and edges saved to pubmed_33/edges.csv.
Subdataset saved to folder: pubmed_33

Generating subdataset with target fraction 0.6666666666666666


  self._set_arrayXarray(i, j, x)


Actual fraction after preprocessing: 0.65
Nodes saved to pubmed_66/nodes.csv and edges saved to pubmed_66/edges.csv.
Subdataset saved to folder: pubmed_66

Generating subdataset with target fraction 1.0


  self._set_arrayXarray(i, j, x)


Actual fraction after preprocessing: 1.00
Nodes saved to pubmed_full/nodes.csv and edges saved to pubmed_full/edges.csv.
Subdataset saved to folder: pubmed_full


In [10]:
# fractions = [1/3, 2/3, 1.0]
# subdatasets = []
# tolerance = 0.02  # Allow 2% deviation

# for fraction in fractions:
#     print(f"\nGenerating subdataset with target fraction {fraction}")
#     sub_adj_matrix, sub_labels, sub_features = generate_subdataset(
#         data, labels, feature_matrix, target_fraction=fraction, tolerance=tolerance
#     )
    
#     # Preprocess the subdataset
#     sub_adj_matrix, sub_labels, sub_features = preprocess_data(
#         sub_adj_matrix, sub_labels, sub_features
#     )
    
#     # Calculate actual fraction
#     actual_fraction = len(sub_labels) / len(labels)
#     print(f"Actual fraction after preprocessing: {actual_fraction:.2f}")
    
#     subdatasets.append((sub_adj_matrix, sub_labels, sub_features))

In [11]:
# # Save subdatasets to CSV files
# for idx, (sub_adj_matrix, sub_labels, sub_features) in enumerate(subdatasets):
#     nodes_file = f"../nodes_pubmed_{idx}.csv"
#     edges_file = f"../edges_pubmed_{idx}.csv"
#     save_to_csv(sub_adj_matrix, sub_labels, sub_features, nodes_file=nodes_file, edges_file=edges_file)