In [17]:
import importlib

# Helper modules are imported and then immediately reloaded so that a re-run of
# this cell re-executes their source without restarting the kernel.
# NOTE(review): these look like project-local modules; the text printed under
# this cell (device, seed, p/q values) presumably comes from their import-time
# code -- confirm against the module sources.
import training
importlib.reload(training)
import graph_vis
importlib.reload(graph_vis)
import graph_creation
importlib.reload(graph_creation)
import utils
importlib.reload(utils)
import train_n2v
importlib.reload(train_n2v)

# Third-party libraries used by the cells below.
import networkx as nx
import pandas as pd
import torch


Using device: cpu
Random seed set to: 42
p_values: [1, 2]
q_values: [1, 2]


In [18]:
# Load Cora, Citeseer and Pubmed in one pass; each call yields a
# (dataset, data) pair that is unpacked into the names used further down.
loaded_pairs = [
    training.load_dataset(name) for name in ('Cora', 'Citeseer', 'Pubmed')
]
(
    (dataset_cora, data_cora),
    (dataset_citeseer, data_citeseer),
    (dataset_pubmed, data_pubmed),
) = loaded_pairs

# Parallel lists, kept in the same Cora / Citeseer / Pubmed order.
list_of_datasets = [dataset_cora, dataset_citeseer, dataset_pubmed]
list_of_data = [data_cora, data_citeseer, data_pubmed]

# Print the summary statistics for the Cora graph.
graph_vis.print_graph_info_cluster(data_cora)

[INFO] Starting graph analytics
[INFO] Graph is a PyTorch Geometric Data object, converting to NetworkX graph.

----------Basic graph information-----------
Type:  <class 'networkx.classes.digraph.DiGraph'>
DiGraph with 2708 nodes and 10556 edges
Number of nodes:  2708
Number of edges:  10556
Average node degree:  7.796159527326441
Has isolated nodes:  0
Has self loops:  0
Is directed:  True
Error calculating Average Shortest Path (Largest Component): local variable 'largest_cc' referenced before assignment
Error calculating Number of Connected Components: not implemented for directed type
----------Graph extra statistics-----------
Number of connected components: N/A (directed graph)
Number of nodes in largest component: N/A (directed graph)
Average Clustering Coefficient: 0.24067329850193728
Transitivity/Global clustering coeff: 0.09349725626661058



In [19]:
def pyg_data_to_dataframe(data):
    """
    Convert a PyG-style data object into a pandas DataFrame of node rows.

    Parameters
    ----------
    data : object
        Must expose ``x``, a 2-D node-feature matrix given either as a
        ``torch.Tensor`` or as an array with a ``.shape`` attribute. May
        expose ``y`` with one label per node (tensor or array-like); a
        missing or ``None`` ``y`` simply yields no ``label`` column.

    Returns
    -------
    pandas.DataFrame
        One row per node with columns ``feat_0 .. feat_{d-1}`` and, when
        labels are available, a trailing ``label`` column. The index is
        named ``node_id``.
    """
    def _as_numpy(values):
        # Tensors are detached (so ones that require grad convert too) and
        # moved to host memory; anything else is passed through unchanged.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().numpy()
        return values

    # Node features
    features = _as_numpy(data.x)
    df = pd.DataFrame(
        features,
        columns=[f'feat_{i}' for i in range(features.shape[1])],
    )

    # Node labels (if present) go through the same conversion as the
    # features, so array labels no longer fail on ``.cpu()``.
    labels = getattr(data, 'y', None)
    if labels is not None:
        df['label'] = _as_numpy(labels)

    # Node indices as index
    df.index.name = 'node_id'
    return df

# Build the node table for Cora: one row per node, feature columns plus label.
df = pyg_data_to_dataframe(data_cora)
# print(df.head())

In [20]:
# Show the row stored under index label 0 (the first node).
df.loc[0]

feat_0       0.0
feat_1       0.0
feat_2       0.0
feat_3       0.0
feat_4       0.0
            ... 
feat_1429    0.0
feat_1430    0.0
feat_1431    0.0
feat_1432    0.0
label        3.0
Name: 0, Length: 1434, dtype: float64

In [21]:
import networkx as nx
import pandas as pd
from torch_geometric.utils import to_networkx

def add_new_features(
    data_cora,
    df,
    clustering_coeff: bool = True,
    node_centrality: bool = True
):
    """
    Return a copy of ``df`` extended with graph-structural node features.

    ``data_cora`` is converted to an undirected NetworkX graph and each
    requested metric is attached as a new column. Values are aligned on the
    DataFrame index, so the index is expected to hold the graph's node ids.

    Parameters
    ----------
    data_cora : PyG data object accepted by ``to_networkx``.
    df : pandas.DataFrame
        Node table to extend. It is NOT modified; use the returned frame.
    clustering_coeff : bool, default True
        Add a ``clustering_coef`` column computed with ``nx.clustering``.
    node_centrality : bool, default True
        Add a ``degree_centrality`` column computed with
        ``nx.degree_centrality``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the requested ones.
    """
    G = to_networkx(data_cora, to_undirected=True)

    # Collect the new columns first so the caller's frame is never touched.
    new_columns = {}

    if clustering_coeff:
        new_columns['clustering_coef'] = pd.Series(nx.clustering(G))

    if node_centrality:
        new_columns['degree_centrality'] = pd.Series(nx.degree_centrality(G))

    # ``assign`` returns a new DataFrame and aligns each Series on the index,
    # matching the column-assignment semantics used previously.
    return df.assign(**new_columns)

# Augment the Cora node table with both structural features and preview it.
df = add_new_features(
    data_cora,
    df,
    clustering_coeff=True,
    node_centrality=True,
)
print(df.head(n=5))

         feat_0  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  \
node_id                                                                   
0           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0   

         feat_8  feat_9  ...  feat_1426  feat_1427  feat_1428  feat_1429  \
node_id                  ...                                               
0           0.0     0.0  ...        0.0        0.0        0.0        0.0   
1           0.0     0.0  ...        0.0        0.0        0.0        0.0   
2           0.0     0.0  ...        0.0        0.0        0.0        0.0   
3           0.0     0.0  ...        0.0        0.0        0.0        0.0   
4           0.0   

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Features and labels: every column except 'label' is a predictor.
X = df.drop(columns=['label'])
y = df['label']

print("Splitting data into train and test sets...")
# Stratified 80/20 split so both parts keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Starting hyperparameter tuning with GridSearchCV...")
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# 3-fold cross-validated search over the grid, using all available cores.
grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    n_jobs=-1
)
grid.fit(X_train, y_train)

print("Best parameters found:", grid.best_params_)

print("Training best Random Forest model on training data...")
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# retrained on all of X_train with the winning parameters. Fitting it again
# would only repeat that work and produce the same model (fixed random_state).
best_rf = grid.best_estimator_

print("Predicting on test set...")
y_pred = best_rf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {acc:.4f}")
print("Classification report:")
print(classification_report(y_test, y_pred))

Splitting data into train and test sets...
Starting hyperparameter tuning with GridSearchCV...
Best parameters found: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Training best Random Forest model on training data...
Predicting on test set...
Test accuracy: 0.7749
Classification report:
              precision    recall  f1-score   support

           0       0.62      0.57      0.60        70
           1       0.91      0.70      0.79        43
           2       0.88      0.87      0.87        84
           3       0.72      0.88      0.79       164
           4       0.82      0.80      0.81        85
           5       0.80      0.73      0.77        60
           6       0.91      0.56      0.69        36

    accuracy                           0.77       542
   macro avg       0.81      0.73      0.76       542
weighted avg       0.78      0.77      0.77       542



In [23]:
# Rank every predictor by the importance score of the fitted forest,
# highest first.
importances = best_rf.feature_importances_
feature_ranking = pd.DataFrame(
    {'feature': X.columns, 'importance': importances}
)
importances_df = feature_ranking.sort_values(by='importance', ascending=False)

print(importances_df)

        feature    importance
495    feat_495  4.645216e-02
581    feat_581  2.541807e-02
774    feat_774  2.411607e-02
19      feat_19  2.198482e-02
1254  feat_1254  1.728652e-02
...         ...           ...
950    feat_950  2.957123e-06
977    feat_977  1.009714e-07
742    feat_742  0.000000e+00
444    feat_444  0.000000e+00
1323  feat_1323  0.000000e+00

[1435 rows x 2 columns]
