In this notebook, we will extract topological features from the Citeseer graph dataset and use them to train classification models, including Random Forest, Logistic Regression, and SVM.

In [2]:
from torch_geometric.datasets import Planetoid # type: ignore

dataset = Planetoid(root='data/CiteSeer', name='CiteSeer')
data = dataset[0]

print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Number of features: {data.num_node_features}')
print(f'Number of classes: {dataset.num_classes}')


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index


Number of nodes: 3327
Number of edges: 9104
Number of features: 3703
Number of classes: 6


Processing...
Done!
  self.data, self.slices = torch.load(self.processed_paths[0])


In [3]:
data

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])

In [None]:
import torch
from torch_geometric.data import Data
from torch_geometric.utils import to_networkx
import networkx as nx
import matplotlib.pyplot as plt

G = to_networkx(data, node_attrs=['x'], to_undirected=True)
G



<networkx.classes.graph.Graph at 0x1345e5977c0>

In [None]:
import pandas as pd
df = pd.DataFrame(index= range(3327))
df['degree'] = [nx.degree_centrality(G) [i] for i in range(3327)] 


In [139]:
#df['eigenvector'] = [nx.eigenvector_centrality(G)[i] for i in range(3327)] 
list=[]
for i in range(3327):
    list.append(nx.eigenvector_centrality(G)[i])
    
df["eigenvector"]=list



In [141]:
dictt=nx.closeness_centrality(G)
list=[]
for i in range(3327):
    list.append(dictt[i])
    
df["closeness"]=list

In [144]:
dictt=nx.betweenness_centrality(G)

list=[]
for i in range(3327):
    list.append(dictt[i])
    
df["betweenness"]=list

In [145]:
df

Unnamed: 0,degree,eigenvector,closeness,betweenness
0,0.000301,5.356771e-20,0.000301,0.000000e+00
1,0.001503,9.526895e-04,0.086969,1.411372e-04
2,0.000301,5.356771e-20,0.000301,0.000000e+00
3,0.000601,1.377180e-15,0.000835,1.808490e-07
4,0.000301,5.356771e-20,0.000301,0.000000e+00
...,...,...,...,...
3322,0.000301,5.356771e-20,0.000301,0.000000e+00
3323,0.000301,5.356771e-20,0.000301,0.000000e+00
3324,0.000902,4.106946e-03,0.079784,4.372916e-05
3325,0.000301,5.356771e-20,0.000301,0.000000e+00


This dataframe contains graph centrality measures for 3327 nodes, commonly used in network analysis:

degree : Normalized degree centrality, indicating the fraction of nodes a given node is directly connected to.

eigenvector : Eigenvector centrality, reflecting the influence of a node based on the influence of its neighbors.

closeness : Closeness centrality, showing how quickly information can spread from a node to others in the network.

betweenness : Betweenness centrality, measuring the number of shortest paths passing through a node, indicating its role as a bridge.

Each row corresponds to a node, with values for these metrics.

In [146]:
data.y

tensor([3, 1, 5,  ..., 3, 1, 5])

In [149]:

array = data.y.numpy()
array 


array([3, 1, 5, ..., 3, 1, 5], dtype=int64)

In [150]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df,array,test_size=0.2, random_state=42)

In [152]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc = sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [154]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train,y_train )
y_pred = rf.predict(X_test)

In [155]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.44      0.08      0.14        49
           1       0.26      0.45      0.33        97
           2       0.65      0.63      0.64       147
           3       0.40      0.41      0.41       143
           4       0.57      0.66      0.61       115
           5       0.46      0.27      0.34       115

    accuracy                           0.46       666
   macro avg       0.47      0.42      0.41       666
weighted avg       0.48      0.46      0.45       666



In [156]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# SVM Classifier
svm_model = SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm))






SVM Accuracy: 0.3213213213213213
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.00      0.00      0.00        97
           2       0.86      0.41      0.55       147
           3       0.26      0.45      0.33       143
           4       0.26      0.77      0.39       115
           5       0.00      0.00      0.00       115

    accuracy                           0.32       666
   macro avg       0.23      0.27      0.21       666
weighted avg       0.29      0.32      0.26       666



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [157]:
# Logistic Regression
lr_model = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.33183183183183185
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        49
           1       0.00      0.00      0.00        97
           2       0.84      0.41      0.55       147
           3       0.28      0.52      0.36       143
           4       0.26      0.74      0.39       115
           5       0.00      0.00      0.00       115

    accuracy                           0.33       666
   macro avg       0.23      0.28      0.22       666
weighted avg       0.29      0.33      0.27       666



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
