In [1]:
import torch
from torch_geometric.datasets import ExplainerDataset
from torch_geometric.datasets.graph_generator import BAGraph
from torch_geometric.datasets.motif_generator import HouseMotif
from torch_geometric.datasets.motif_generator import CycleMotif

def add_node_features(dataset):
    new_data_list = []
    for data in dataset:
        # Thêm feature: 10 chiều, tất cả giá trị là 1
        data.x = torch.ones((data.num_nodes, 10))
        new_data_list.append(data)
    return new_data_list

def extract_graph_features(data):
    num_nodes = data.num_nodes
    num_edges = data.num_edges
    avg_degree = 2 * num_edges / num_nodes
    # Tính trung bình của các đặc trưng nút nếu có
    if data.x is not None:
        node_feature_mean = data.x.mean(dim=0).numpy()
    else:
        node_feature_mean = np.zeros(10)  # Giả sử vector đặc trưng có 10 chiều
    return [num_nodes, num_edges, avg_degree] + node_feature_mean.tolist()

# Tạo dataset ban đầu
dataset1 = ExplainerDataset(
    graph_generator=BAGraph(num_nodes=25, num_edges=1),
    motif_generator=HouseMotif(),
    num_motifs=1,
    num_graphs=500,
)

dataset2 = ExplainerDataset(
    graph_generator=BAGraph(num_nodes=25, num_edges=1),
    motif_generator=CycleMotif(5),
    num_motifs=1,
    num_graphs=500,
)

new_dataset1 = []
for data in dataset1:
    data = data.clone()  # Clone để có thể sửa đổi
    data.graph_label = torch.tensor([0])
    new_dataset1.append(data)

new_dataset2 = []
for data in dataset2:
    data = data.clone()
    data.graph_label = torch.tensor([1])
    new_dataset2.append(data)

# Thêm node features (với clone bên trong hàm add_node_features nếu cần)
new_dataset1 = add_node_features(new_dataset1)
new_dataset2 = add_node_features(new_dataset2)

# Kết hợp dataset
dataset = new_dataset1 + new_dataset2

In [2]:
import random
random.shuffle(dataset)

In [3]:
features = []
labels = []
for data in dataset:
    features.append(extract_graph_features(data))
    labels.append(data.graph_label.item())

In [4]:
import pandas as pd

columns = ['num_nodes', 'num_edges', 'avg_degree'] + [f'feature_{i}' for i in range(10)]
df = pd.DataFrame(features, columns=columns)
df['y'] = labels
df

Unnamed: 0,num_nodes,num_edges,avg_degree,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,y
0,30,60,4.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
1,30,50,3.333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,30,62,4.133333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
3,30,58,3.866667,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
4,30,62,4.133333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,30,60,4.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
996,30,60,4.000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
997,30,56,3.733333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
998,30,56,3.733333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [5]:
from util import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import lore
from prepare_dataset import *
from neighbor_generator import *



def prepare_dataset(df):
    columns = df.columns.tolist()
    class_name = 'y'

    possible_outcomes = list(df[class_name].unique())

    type_features, features_type = recognize_features_type(df, class_name)
    print(type_features)
    discrete = ['y']
    discrete, continuous = set_discrete_continuous(columns, type_features, class_name, discrete=discrete,
                                                   continuous=None)
    print(discrete, continuous)

    columns_tmp = list(columns)
    columns_tmp.remove(class_name)
    idx_features = {i: col for i, col in enumerate(columns_tmp)}

    # Dataset Preparation for Scikit Alorithms
    df_le, label_encoder = label_encode(df, discrete)
    X = df_le.loc[:, df_le.columns != class_name].values
    y = df_le[class_name].values


    dataset = {
        'df': df, 
        'columns': columns, 
        'class_name': class_name,  
        'possible_outcomes': possible_outcomes, 
        'type_features': type_features, 
        'features_type': features_type,
        'discrete': discrete,
        'continuous': continuous, 
        'label_encoder': label_encoder,   
        'idx_features': idx_features,  
        'X': X,
        'y': y 
    }

    return dataset

In [6]:
dataset = prepare_dataset(df)
dataset

{'integer': ['num_nodes', 'num_edges', 'y'], 'double': ['avg_degree', 'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9'], 'string': []}
['y'] ['num_nodes', 'num_edges', 'avg_degree', 'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9']


{'df':      num_nodes  num_edges  avg_degree  feature_0  feature_1  feature_2  \
 0           30         60    4.000000        1.0        1.0        1.0   
 1           30         50    3.333333        1.0        1.0        1.0   
 2           30         62    4.133333        1.0        1.0        1.0   
 3           30         58    3.866667        1.0        1.0        1.0   
 4           30         62    4.133333        1.0        1.0        1.0   
 ..         ...        ...         ...        ...        ...        ...   
 995         30         60    4.000000        1.0        1.0        1.0   
 996         30         60    4.000000        1.0        1.0        1.0   
 997         30         56    3.733333        1.0        1.0        1.0   
 998         30         56    3.733333        1.0        1.0        1.0   
 999         30         56    3.733333        1.0        1.0        1.0   
 
      feature_3  feature_4  feature_5  feature_6  feature_7  feature_8  \
 0          1.0   

In [7]:
X, y = dataset['X'], dataset['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# blackbox
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

path_data = 'datasets/'
idx_record2explain = 2
X2E = X_test

# tạo neighbors = cách thêm nhiễu nhỏ random
explanation, infos = lore.explain(idx_record2explain, X2E, dataset, model,
                                    ng_function=genetic_neighborhood,
                                    discrete_use_probabilities=True,
                                    continuous_function_estimation=False,
                                    returns_infos=True,
                                    path=path_data, sep=';', log=False)

x = [30.         56.          3.73333333  1.          1.          1.
  1.          1.          1.          1.          1.          1.
  1.        ]
Các cột phân loại (discrete): ['y']
Giá trị ban đầu của cột y: [0 1]
feature_values size 13
Đang mã hóa cột: y
Giá trị sau khi mã hóa: [0 1]




Đang mã hóa cột: y
Giá trị sau khi mã hóa: [0 1]


OSError: [Errno 8] Exec format error: 'yadt/dTcmd'

In [16]:
# Lấy đồ thị đầu tiên
data = dataset[813]
print(data)
# In thông tin chi tiết
print("Number of nodes:", data.num_nodes)
print("Number of edges:", data.num_edges)
print("Node features shape:", data.x.shape if data.x is not None else None)
print("Edge indices:\n", data.edge_index)
print("Graph label (if available):", data.graph_label)


Explanation(edge_index=[2, 52], y=[30], edge_mask=[52], node_mask=[30], graph_label=[1], x=[30, 10])
Number of nodes: 30
Number of edges: 52
Node features shape: torch.Size([30, 10])
Edge indices:
 tensor([[ 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  4,  5,  5,  5,  5,  5,
          6,  7,  8,  9,  9, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 20, 20, 22,
         23, 24, 25, 25, 25, 26, 26, 26, 27, 27, 28, 28, 29, 29,  5, 27],
        [11, 20,  8,  9, 14, 19,  3,  4,  7, 15, 18,  2,  2,  6, 12, 13, 17, 24,
          5,  2,  1,  1, 22,  0,  5, 16,  5,  1,  2, 12,  5,  2,  1,  0, 23,  9,
         20,  5, 26, 28, 29, 29, 27, 25, 26, 28, 27, 25, 25, 26, 27,  5]])
Graph label (if available): tensor([0])


In [5]:
graph_to_explain = dataset[0]  # Chọn đồ thị đầu tiên

# Chuyển đổi dữ liệu đồ thị thành đầu vào cho mô hình giải thích
X2E = graph_to_explain.x  # Các đặc trưng của nodes
y2E = graph_to_explain.y

print(X2E)
y2E

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 2, 2, 3])

In [None]:
import torch
from torch_geometric.utils import to_dense_adj, degree
import pandas as pd

def extract_graph_features(data):
    num_nodes = data.num_nodes
    num_edges = data.num_edges
    avg_degree = degree(data.edge_index[0]).mean().item()

    adj_matrix = to_dense_adj(data.edge_index).squeeze(0).numpy()
    return {
        'num_nodes': num_nodes,
        'num_edges': num_edges,
        'avg_degree': avg_degree
    }

def convert_graphs_to_dataframe(dataset):
    records = []
    for data in dataset:
        features = extract_graph_features(data)
        label = data.y.item() if hasattr(data, 'y') else -1  
        features['label'] = label
        records.append(features)
    return pd.DataFrame(records)

df = convert_graphs_to_dataframe(dataset)
print(df.head())


RuntimeError: a Tensor with 30 elements cannot be converted to Scalar