In [1]:
import os
import numpy as np
import pandas as pd
import networkx as nx
from stellargraph import StellarGraph
from stellargraph.datasets import Cora, CiteSeer, PubMedDiabetes
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as sp
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [2]:
class CitationNetworkVisualizer:
    """Load one of the StellarGraph citation datasets and draw it coloured by class.

    ``load_data`` must be called before ``visualize``: it populates ``graph``,
    ``node_subjects`` and ``node_features``.
    """

    def __init__(self, dataset_name):
        """Record which dataset to use; nothing is loaded here.

        Args:
            dataset_name: 'cora', 'citeseer' or 'pubmed' (validated in ``load_data``).
        """
        self.dataset_name = dataset_name
        self.graph = None           # graph object returned by dataset.load()
        self.node_subjects = None   # per-node class labels returned by dataset.load()
        self.node_features = None   # result of graph.node_features()

    def load_data(self):
        """Load the configured citation dataset and return its arrays.

        Displays the dataset description, then stores the graph, the node
        labels and the node features on the instance.

        Returns:
            tuple: ``(feature_matrix, adjacency_matrix, labels)`` where
            ``feature_matrix`` is ``graph.node_features()``, ``adjacency_matrix``
            is the unweighted ``graph.to_adjacency_matrix(weighted=False)`` and
            ``labels`` is ``node_subjects.values``.

        Raises:
            ValueError: If ``dataset_name`` is not one of the supported names.
        """
        if self.dataset_name == 'cora':
            dataset = Cora()
        elif self.dataset_name == 'citeseer':
            dataset = CiteSeer()
        elif self.dataset_name == 'pubmed':
            dataset = PubMedDiabetes()
        else:
            raise ValueError("Unsupported dataset name. Choose 'cora', 'citeseer', or 'pubmed'.")

        display(HTML(dataset.description))
        self.graph, self.node_subjects = dataset.load()

        # Unweighted adjacency matrix as produced by StellarGraph
        # (NOTE(review): a SciPy sparse matrix per the StellarGraph docs, not a tensor).
        adjacency_matrix = self.graph.to_adjacency_matrix(weighted=False)
        # Raw (un-encoded) class labels, one per node
        labels = self.node_subjects.values

        feature_matrix = self.graph.node_features()
        # Keep the features on the instance too; the attribute was previously never filled in.
        self.node_features = feature_matrix

        return feature_matrix, adjacency_matrix, labels

    def visualize(self, seed=None):
        """Draw the loaded citation network with nodes coloured by class.

        Args:
            seed: Optional seed forwarded to ``nx.spring_layout`` so the layout
                can be reproduced. ``None`` (default) keeps the layout random.

        Raises:
            RuntimeError: If ``load_data`` has not been called yet.
        """
        if self.graph is None or self.node_subjects is None:
            raise RuntimeError("No dataset loaded; call load_data() before visualize().")

        # Convert to NetworkX graph for visualization
        G_nx = self.graph.to_networkx()

        # Create a network layout
        pos = nx.spring_layout(G_nx, seed=seed)

        # Give every class one fixed palette entry. Indexing the colormap with an
        # integer selects that entry directly; the modulo wraps around if there
        # are more classes than palette colours.
        cmap = plt.get_cmap('Set1')
        unique_labels = np.unique(self.node_subjects)
        color_map = {label: i % cmap.N for i, label in enumerate(unique_labels)}

        # Look labels up by node id in the order NetworkX will draw the nodes,
        # rather than assuming node_subjects is stored in that same order.
        node_order = list(G_nx.nodes())
        ordered_subjects = self.node_subjects.loc[node_order]
        # Explicit RGBA colours (instead of integer codes normalised through the
        # colormap) guarantee that nodes and legend markers use the same colour.
        colors = [cmap(color_map[label]) for label in ordered_subjects]

        # Draw nodes and edges
        plt.figure(figsize=(12, 12))
        nx.draw_networkx_nodes(G_nx, pos, nodelist=node_order, node_size=50,
                               node_color=colors, alpha=0.6)
        nx.draw_networkx_edges(G_nx, pos, alpha=0.3)

        # Create a legend with one marker per class
        handles = [plt.Line2D([0], [0], marker='o', color='w', label=label,
                              markerfacecolor=cmap(color_map[label]), markersize=10)
                   for label in unique_labels]
        plt.legend(handles=handles, title="Classes")

        plt.title(f"{self.dataset_name.capitalize()} Dataset Citation Network")
        plt.axis('off')
        plt.show()

In [3]:
# Load the CiteSeer dataset (set dataset_name to 'cora' or 'pubmed' to switch)
dataset_name = 'citeseer'
citation_visualizer = CitationNetworkVisualizer(dataset_name)
# load_data() returns (feature_matrix, adjacency_matrix, labels);
# the adjacency matrix is bound to `data` here.
feature_matrix, data, labels = citation_visualizer.load_data()
# Map the raw class labels to integer ids
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)
# Distinct encoded class ids
unique_labels = set(encoded_labels)

In [4]:
# citation_visualizer.visualize()

In [5]:
# Show the distinct integer class ids produced by the label encoder
unique_labels

{0, 1, 2, 3, 4, 5}

In [6]:
def preprocess_data(adjacency_matrix, labels, feature_matrix):
    """Drop self-loops and isolated nodes, then integer-encode the remaining labels.

    The caller's ``adjacency_matrix`` is never modified: the cleaned matrix is
    rebuilt from the off-diagonal entries instead of zeroing the diagonal of a
    CSR matrix in place (which could write through to the input's buffers and
    triggers ``SparseEfficiencyWarning`` when diagonal entries must be inserted).

    Args:
        adjacency_matrix: Square adjacency matrix; a SciPy sparse matrix or
            anything ``scipy.sparse.coo_matrix`` accepts (e.g. a dense 2-D array).
        labels: One label per node, ordered like the rows of ``adjacency_matrix``.
        feature_matrix: Per-node features, indexable as ``feature_matrix[mask, :]``.

    Returns:
        tuple: ``(adjacency, encoded_labels, features)`` restricted to the nodes
        that are kept. ``adjacency`` is a CSR matrix without self-loops and
        ``encoded_labels`` is the ``LabelEncoder`` encoding of the kept labels.

    Note:
        A node counts as isolated when its *row* sum is zero after self-loops
        are removed, so for a non-symmetric matrix a node with only incoming
        edges is dropped as well.
    """
    # Rebuild the matrix from its off-diagonal entries only (removes self-loops).
    coo = sp.coo_matrix(adjacency_matrix)
    off_diagonal = coo.row != coo.col
    adjacency = sp.csr_matrix(
        (coo.data[off_diagonal], (coo.row[off_diagonal], coo.col[off_diagonal])),
        shape=coo.shape,
    )
    adjacency.eliminate_zeros()  # Remove explicitly stored zeros

    # Degree of every node as a flat vector (row sums)
    degree_vector = np.asarray(adjacency.sum(axis=1)).ravel()

    # Keep only nodes that still have at least one edge
    non_isolated_mask = degree_vector != 0

    # Filter adjacency matrix, labels and features down to the kept nodes
    adjacency_matrix_non_isolated = adjacency[non_isolated_mask][:, non_isolated_mask]
    labels_non_isolated = np.asarray(labels)[non_isolated_mask]
    feature_matrix_non_isolated = feature_matrix[non_isolated_mask, :]

    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels_non_isolated = label_encoder.fit_transform(labels_non_isolated)

    return adjacency_matrix_non_isolated, encoded_labels_non_isolated, feature_matrix_non_isolated

In [7]:
def save_to_csv(adjacency_matrix, labels, feature_matrix, nodes_file="nodes.csv", edges_file="edges.csv"):
    """Save the node and edge data into separate CSV files.

    Args:
        adjacency_matrix: Adjacency matrix, SciPy sparse or dense 2-D array.
            Every entry greater than zero is written as one edge row, so a
            symmetric matrix yields both (i, j) and (j, i).
        labels: One label per node, ordered like the rows of ``feature_matrix``.
        feature_matrix: Per-node features (dense 2-D array or SciPy sparse).
        nodes_file: Path of the node CSV (columns: id, features, label).
        edges_file: Path of the edge CSV (columns: source_id, target_id).

    Node ids are the row positions ``0..n-1``, which are the same indices used
    in the edge list.
    """
    if sp.issparse(feature_matrix):
        feature_matrix = feature_matrix.toarray()
    # tolist() converts NumPy scalars to plain Python numbers, so each cell is
    # written as "[0.0, 1.0, ...]" instead of "[np.float32(0.0), ...]" (the
    # scalar repr used by NumPy >= 2), which would not be parseable on read-back.
    feature_rows = np.asarray(feature_matrix).tolist()

    # Save node information (ID, features, labels) to CSV
    node_data = pd.DataFrame({
        'id': np.arange(len(feature_rows)),
        'features': feature_rows,
        'label': labels
    })
    node_data.to_csv(nodes_file, index=False)

    # Save adjacency matrix (edge list) to CSV: indices of the positive entries
    if sp.issparse(adjacency_matrix):
        source_ids, target_ids = (adjacency_matrix > 0).nonzero()
    else:
        source_ids, target_ids = np.nonzero(np.asarray(adjacency_matrix) > 0)
    edge_list = pd.DataFrame({'source_id': source_ids, 'target_id': target_ids})
    edge_list.to_csv(edges_file, index=False)

    print(f"Nodes saved to {nodes_file} and edges saved to {edges_file}.")

In [8]:
# Clean the loaded graph with preprocess_data, then export the result to CSV.
(
    adjacency_matrix_non_isolated,
    labels_non_isolated,
    feature_matrix_non_isolated,
) = preprocess_data(data, labels, feature_matrix)

save_to_csv(
    adjacency_matrix_non_isolated,
    labels_non_isolated,
    feature_matrix_non_isolated,
)

  self._set_arrayXarray(i, j, x)


Nodes saved to nodes.csv and edges saved to edges.csv.
