In [147]:
import importlib
import seaborn as sns
import numpy as np

import sys
sys.path.append('../')

import training
importlib.reload(training)
import graph_vis
importlib.reload(graph_vis)
import graph_creation
importlib.reload(graph_creation)
import utils
importlib.reload(utils)
import train_n2v
importlib.reload(train_n2v)

import networkx as nx
import pandas as pd
import torch

from sklearn.model_selection import train_test_split

from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import random


Using device: cpu
Random seed set to: 42
p_values: [1, 2]
q_values: [1, 2]


In [148]:
dataset_cora,data_cora = training.load_dataset('Cora', "../training_data/datasets")
dataset_citeseer,data_citeseer = training.load_dataset('Citeseer', "../training_data/datasets")
dataset_pubmed,data_pubmed = training.load_dataset('Pubmed', "../training_data/datasets")

list_of_datasets = [dataset_cora, dataset_citeseer, dataset_pubmed]
list_of_data = [data_cora, data_citeseer, data_pubmed]


# graph_vis.print_graph_info_cluster(data_cora)
# graph_vis.pyg_graph_data_visualizer(data_cora)

In [149]:
def pyg_data_to_dataframe(data):
    """
    Converts a PyG data object to a pandas DataFrame.
    Each row is a node, columns are features (and label if present).
    """
    # Node features
    x = data.x.cpu().numpy() if isinstance(data.x, torch.Tensor) else data.x
    df = pd.DataFrame(x, columns=[f'feat_{i}' for i in range(x.shape[1])])
    
    # Node labels (if present)
    if hasattr(data, 'y') and data.y is not None:
        df['label'] = data.y.cpu().numpy()
    
    # Node indices as index
    df.index.name = 'node_id'
    return df

df = pyg_data_to_dataframe(data_cora)
# print(df.head())

In [150]:
def add_new_features(
    data_cora, 
    df, 
    clustering_coeff: bool = True, 
    node_centrality: bool = True,
    node_centrality_eigen: bool = True,
    node_centrality_betweenness: bool = False, 
    node_centrality_closeness: bool = False, 
    node_degree: bool = True,
    neighbor_label_avg: bool = True
):
    """
    Adds new features to the DataFrame:
    - clustering_coef_m: clustering coefficient of each node (optional)
    - degree_centrality_m: degree centrality of each node (optional)
    - degree_m: degree of each node (optional)
    - neighbor_labels_avg: average of neighbor labels (optional)
    """

    # node_centrality_betweenness = True

    G = to_networkx(data_cora, to_undirected=True)
    
    if clustering_coeff:
        clustering_dict = nx.clustering(G)
        df['clustering_coef_m'] = pd.Series(clustering_dict)

    if node_centrality:
        centrality_dict = nx.degree_centrality(G)
        df['degree_centrality_m'] = pd.Series(centrality_dict)
    
    if node_centrality_eigen:
        eigen_centrality_dict = nx.eigenvector_centrality(G, max_iter=1000)
        df['eigen_centrality_m'] = pd.Series(eigen_centrality_dict)

    if node_centrality_betweenness:
        betweenness_dict = nx.betweenness_centrality(G, normalized=True)
        df['betweenness_centrality_m'] = pd.Series(betweenness_dict)

    if node_centrality_closeness:
        closeness_dict = nx.closeness_centrality(G)
        df['closeness_centrality_m'] = pd.Series(closeness_dict)    

    if node_degree:
        degree_dict = dict(G.degree())
        df['degree_m'] = pd.Series(degree_dict)
    
    if neighbor_label_avg:
        # Get labels as a Series for fast lookup
        label_series = df['label']
        neighbor_label_avg_dict = {}
        for node in G.nodes():
            neighbors = list(G.neighbors(node))
            if neighbors:
                neighbor_labels = label_series.loc[neighbors].values
                neighbor_label_avg_dict[node] = neighbor_labels.mean()
            else:
                neighbor_label_avg_dict[node] = float('nan')  # or 0, or the node's own label
        df['neighbor_labels_avg_m'] = pd.Series(neighbor_label_avg_dict)
    
    return df

# Example usage:
df = add_new_features(data_cora, df, clustering_coeff=True, node_centrality=True, node_degree=True, neighbor_label_avg=True)
print(df.head())

         feat_0  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  \
node_id                                                                   
0           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0   

         feat_8  feat_9  ...  feat_1429  feat_1430  feat_1431  feat_1432  \
node_id                  ...                                               
0           0.0     0.0  ...        0.0        0.0        0.0        0.0   
1           0.0     0.0  ...        0.0        0.0        0.0        0.0   
2           0.0     0.0  ...        0.0        0.0        0.0        0.0   
3           0.0     0.0  ...        0.0        0.0        0.0        0.0   
4           0.0   

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Features and labels
X = df.drop(columns=['label'], inplace=False)
X = X[['neighbor_labels_avg_m']]
# X = X[[col for col in X.columns if not col.endswith('_m') or col != 'neighbor_labels_avg_m']]
# X = X[[col for col in X.columns if not (col.endswith('_m') and col != 'neighbor_labels_avg_m')]]
# X.drop(columns=['neighbor_labels_avg_m'], inplace=True)
# X.drop(columns=['degree_centrality_m'], inplace=True)
# X.drop(columns=['eigen_centrality_m'], inplace=True)
# X.drop(columns=['betweenness_centrality_m'], inplace=True)
y = df['label']
print(len(X.columns.tolist()),X.columns.tolist()[-7:])

print("Splitting data into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=500)
}

results = {}

for name, model in models.items():
    try:
        print(f"\nTraining {name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"{name} Test accuracy: {acc:.4f}")
        print(classification_report(y_test, y_pred))
        results[name] = acc
    except Exception as e:
        print(f"{name} failed: {e}")

print("\nSummary of test accuracies:")
for name, acc in results.items():
    print(f"{name}: {acc:.4f}")

1 ['neighbor_labels_avg_m']
Splitting data into train and test sets...

Training Logistic Regression...
Logistic Regression Test accuracy: 0.6052
              precision    recall  f1-score   support

           0       0.54      0.63      0.58        70
           1       0.00      0.00      0.00        43
           2       0.22      0.10      0.13        84
           3       0.54      0.90      0.67       164
           4       0.85      0.75      0.80        85
           5       0.81      0.72      0.76        60
           6       0.92      0.61      0.73        36

    accuracy                           0.61       542
   macro avg       0.55      0.53      0.53       542
weighted avg       0.55      0.61      0.56       542


Training SVM...
SVM Test accuracy: 0.7472
              precision    recall  f1-score   support

           0       0.82      0.47      0.60        70
           1       0.57      0.77      0.65        43
           2       0.66      0.92      0.77        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Test accuracy: 0.7546
              precision    recall  f1-score   support

           0       0.63      0.56      0.59        70
           1       0.74      0.65      0.69        43
           2       0.72      0.88      0.79        84
           3       0.77      0.84      0.80       164
           4       0.86      0.73      0.79        85
           5       0.78      0.77      0.77        60
           6       0.79      0.64      0.71        36

    accuracy                           0.75       542
   macro avg       0.75      0.72      0.73       542
weighted avg       0.76      0.75      0.75       542


Training Gradient Boosting...
Gradient Boosting Test accuracy: 0.7638
              precision    recall  f1-score   support

           0       0.78      0.51      0.62        70
           1       0.61      0.77      0.68        43
           2       0.73      0.88      0.80        84
           3       0.77      0.85      0.81       164
           4       0.85  



In [152]:
importances = model.feature_importances_
importances_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print(importances_df)

AttributeError: 'MLPClassifier' object has no attribute 'feature_importances_'