In [1]:
import numpy as np
import torch
from torch.autograd import Variable
from math_support import graph_random_walk, convert_sequence_to_graph, compute_index_subsample, graph_random_walk_fixed_start
from data_loader import read_data, perform_ttv_split
from tqdm.auto import tqdm
import os
import re

In [2]:
import numpy as np
import torch
from torch.autograd import Variable

def extract_features_for_classifier(nnet, xembed_numpy, sembed_numpy, ylabel_numpy, eglist, 
                                    idx_train, idx_valid, idx_test, 
                                    nbatch=64):
    """
    Build per-vertex classifier features by averaging message-passing outputs
    over random walks, then slice them into train/validation/test sets.

    One random walk is started from every vertex. Each walk is turned into a
    subgraph, `perform_message_passing` is run on it, and the resulting row of
    every subgraph node that `compute_index_subsample` reports for the train,
    validation or test split is added to that vertex's running sum. Sums are
    divided by the number of contributions at the end.

    Parameters:
    - nnet: trained model; switched to eval mode here and forwarded to
      `perform_message_passing`.
    - xembed_numpy: per-vertex feature matrix (numpy array or torch tensor).
      Numpy input is converted to a float64 tensor; tensor input is cloned as is.
    - sembed_numpy: second per-vertex matrix (numpy array or torch tensor),
      converted the same way and forwarded to `perform_message_passing`.
      Its first dimension defines the number of vertices.
    - ylabel_numpy: per-vertex labels; only indexed, never modified.
    - eglist: edge list handed to `graph_random_walk_fixed_start`.
    - idx_train / idx_valid / idx_test: vertex indices of the three splits.
    - nbatch: second argument of `graph_random_walk_fixed_start`
      (presumably the walk length - TODO confirm against math_support).

    Returns:
    - dict with keys 'features', 'labels', '<split>_features', '<split>_labels'
      for train/val/test, and the three index arrays under 'idx_train',
      'idx_valid', 'idx_test'.

    Notes:
    - A vertex that never receives a contribution keeps its row of `xembed`
      (the original code copied the row of `sembed`, which does not belong to
      the feature space of `final_features`) and a warning is printed.
    - A vertex listed in more than one split contributes once per split per
      walk, exactly as before.
    """
    # Inference only: disable dropout / batch-norm updates.
    nnet.eval()

    def _to_tensor(values):
        # numpy -> float64 tensor; anything else is assumed to be a tensor and is copied.
        if isinstance(values, np.ndarray):
            return torch.from_numpy(values).double()
        return values.clone()

    xembed = _to_tensor(xembed_numpy)
    sembed = _to_tensor(sembed_numpy)

    num_vertices = sembed.shape[0]

    # Running sum of subgraph outputs per vertex and number of contributions.
    final_features = torch.zeros_like(xembed)
    feature_counts = torch.zeros(num_vertices)

    for start_vertex in tqdm(range(num_vertices)):
        # Random walk anchored at this vertex, then its subgraph representation.
        # idx_node maps a local subgraph position to a global vertex index.
        random_walk_data = graph_random_walk_fixed_start(eglist, nbatch, start_vertex)
        wgraph_numpy, idx_node = convert_sequence_to_graph(random_walk_data)
        wgraph = torch.from_numpy(wgraph_numpy).double()

        # Local positions of the subgraph nodes for each split
        # (presumably positions into idx_node - TODO confirm against math_support).
        idx_subsample_train = compute_index_subsample(idx_node, idx_train)
        idx_subsample_valid = compute_index_subsample(idx_node, idx_valid)
        idx_subsample_test = compute_index_subsample(idx_node, idx_test)

        with torch.no_grad():
            subgraph_features = perform_message_passing(nnet, xembed, sembed, wgraph, idx_node)

        # Scatter the subgraph rows back to their global vertices.
        for idx_subset in (idx_subsample_train, idx_subsample_valid, idx_subsample_test):
            for local_idx in idx_subset:
                global_idx = idx_node[local_idx]
                final_features[global_idx] += subgraph_features[local_idx]
                feature_counts[global_idx] += 1

    # Turn sums into means; untouched vertices fall back to their input features.
    for vertex, count in enumerate(feature_counts.tolist()):
        if count > 0:
            final_features[vertex] /= count
        else:
            final_features[vertex] = xembed[vertex]
            print(f"Warning: Vertex {vertex} not included in any random walk. Using original features.")

    # Classifiers downstream consume numpy arrays.
    final_features_np = final_features.detach().cpu().numpy()

    features_dict = {
        # Full matrices for all vertices.
        'features': final_features_np,
        'labels': ylabel_numpy,

        # Per-split views.
        'train_features': final_features_np[idx_train],
        'train_labels': ylabel_numpy[idx_train],

        'val_features': final_features_np[idx_valid],
        'val_labels': ylabel_numpy[idx_valid],

        'test_features': final_features_np[idx_test],
        'test_labels': ylabel_numpy[idx_test],

        # Indices kept for reference.
        'idx_train': idx_train,
        'idx_valid': idx_valid,
        'idx_test': idx_test
    }

    print("Features extracted successfully")
    print(f"Train set: {len(idx_train)} samples")
    print(f"Validation set: {len(idx_valid)} samples")
    print(f"Test set: {len(idx_test)} samples")

    return features_dict

def perform_message_passing(nnet, xembed, sembed, wgraph, idx_node):
    """
    Run the model's edge-wise message passing on one subgraph.

    Parameters:
    - nnet: trained model. Must expose `nconv` (number of rounds) and
      `get_edge_matrix(source_features, target_features)`, which returns one
      matrix per edge as a batch consumed by `torch.bmm`.
    - xembed: feature tensor of the full graph, rows indexed by global vertex id.
    - sembed: second per-vertex tensor of the full graph; its rows for the two
      endpoints of every edge are the inputs of `get_edge_matrix`.
    - wgraph: square weight matrix of the subgraph, indexed by local position;
      an entry > 0 at [i, j] is an edge from local node i to local node j.
    - idx_node: sequence mapping local position -> global vertex id.

    Returns:
    - Tensor with len(idx_node) rows, row k belonging to vertex idx_node[k].
      With no edges (or nconv <= 0) this is a copy of the nodes' `xembed` rows.
      Otherwise each row is the weight-normalised sum of the incoming messages;
      nodes without incoming edges get a zero row.

    Fixes relative to the previous version:
    - Edge endpoints are local positions; they are now translated through
      `idx_node` before indexing the global `sembed` / `xembed` tensors
      (previously the first rows of the global tensors were read instead).
    - The output is sized to the subgraph instead of `zeros_like(xembed)`,
      which allocated a full-graph buffer on every round. Callers that index
      the result by local position are unaffected.
    - The starting features come from `xembed` rather than `sembed`, so the
      no-edge result lives in the same feature space as the message output.
    - Edge discovery and accumulation are vectorised.
    """
    num_subgraph_vertices = len(idx_node)
    node_ids = torch.as_tensor(idx_node, dtype=torch.long, device=xembed.device)

    # Starting point: the subgraph nodes' own features.
    xmaped = xembed[node_ids].clone()

    # Edges in row-major order, as (source, target) pairs of local positions.
    weights = torch.as_tensor(wgraph, device=xembed.device)
    weights = weights[:num_subgraph_vertices, :num_subgraph_vertices]
    edges = torch.nonzero(weights > 0, as_tuple=False)
    if edges.shape[0] == 0 or nnet.nconv <= 0:
        # Nothing to propagate: features stay unchanged.
        return xmaped

    src_local, dst_local = edges[:, 0], edges[:, 1]
    src_global, dst_global = node_ids[src_local], node_ids[dst_local]
    edge_weights = weights[src_local, dst_local]

    # One matrix per edge; depends only on sembed, so it is computed once.
    edge_matrices_batch = nnet.get_edge_matrix(sembed[src_global], sembed[dst_global])

    for _ in range(nnet.nconv):
        # NOTE(review): as in the previous version, every round reads the source
        # features from `xembed`, not from the previous round's `xmaped`, so rounds
        # after the first reproduce the same result. Confirm against the training
        # loop whether rounds are meant to chain before changing this.
        messages = torch.bmm(
            edge_matrices_batch,
            xembed[src_global].unsqueeze(2)
        ).squeeze(2)
        scaled = edge_weights.to(messages.dtype)

        # Weighted sum of incoming messages per target node.
        new_xmaped = torch.zeros(
            num_subgraph_vertices, messages.shape[1],
            dtype=messages.dtype, device=messages.device,
        )
        new_xmaped.index_add_(0, dst_local, scaled.unsqueeze(1) * messages)

        # Total incoming weight per target node, used as the normaliser.
        node_counts = torch.zeros(
            num_subgraph_vertices, dtype=messages.dtype, device=messages.device
        )
        node_counts.index_add_(0, dst_local, scaled)

        receivers = node_counts > 0
        new_xmaped[receivers] /= node_counts[receivers].unsqueeze(1)

        xmaped = new_xmaped

    return xmaped

In [3]:
# Read the PubMed data with a 1-dimensional embedding; in this cell the result
# is only used to report the label shape and to obtain the number of samples.
xembed, eglist, ylabel, ylprob, xsvd = read_data(
    embedding_dimension=1,
    dataset_name='PubMed',
    eps=1.0e-6,
)
print(f"ylabel.shape = {ylabel.shape}")

# 60 / 20 / 20 split over all samples. Note the return order of
# perform_ttv_split as unpacked here: train, test, validation.
nsample = xembed.shape[0]
idx_train, idx_ttest, idx_valid = perform_ttv_split(
    nsample,
    ftrain=0.6,
    fttest=0.2,
    fvalid=0.2,
)

compute_user_item_embedding
ylabel.shape = (19717,)


In [None]:
# Where the serialized models live and where the per-model feature arrays are written.
NNET_DIR = "/home/ubuntu/simulations_my_svd/nnet_folder"
FEATURES_DIR = "/home/ubuntu/simulations_my_svd/classificator_features"

# os.listdir returns entries in arbitrary order; sorting makes the run order reproducible.
for filename in sorted(os.listdir(NNET_DIR)):
    if '_PubMed_' not in filename:
        continue

    # The embedding dimension is encoded in the file name as "dime_<N>".
    dime_match = re.search(r"dime_(\d+)", filename)
    if dime_match is None:
        print(f"Skipping {filename}: no 'dime_<N>' token in the file name")
        continue
    embedding_dimension = int(dime_match.group(1))

    # Decide whether to skip before paying for deserialising the model.
    if embedding_dimension <= 1:
        continue

    dataset_name = filename.split('_')[2]

    model_path = os.path.join(NNET_DIR, filename)
    # weights_only=False unpickles arbitrary Python objects and can execute code
    # embedded in the file; only load checkpoints from a trusted source.
    model = torch.load(model_path, map_location=torch.device('cpu'), weights_only=False)

    print("embedding_dimension", embedding_dimension)
    xembed, eglist, ylabel, ylprob, xsvd = read_data(embedding_dimension=embedding_dimension,
                                                     dataset_name=dataset_name, eps=1.0e-6)

    # The split indices are the module-level idx_train / idx_valid / idx_ttest.
    extracted_features_result = extract_features_for_classifier(model, xembed, xsvd, ylabel, eglist,
                                                                idx_train, idx_valid, idx_ttest,
                                                                nbatch=64)

    # NOTE(review): split('.')[0] cuts at the FIRST dot, so a file name that contains
    # a dot before its extension is truncated and two such files could map to the
    # same folder. Kept as is so existing output locations do not change.
    folder_name = filename.split('.')[0]
    output_dir = os.path.join(FEATURES_DIR, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    # One .npy file per dictionary entry (np.save appends the extension).
    for key, arr in extracted_features_result.items():
        np.save(os.path.join(output_dir, key), arr)

embedding_dimension 64
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 128
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 16
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 128
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 16
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 128
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 64
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 128
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 64
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 64
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 16
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

Features extracted successfully
Train set: 11831 samples
Validation set: 3943 samples
Test set: 3943 samples
embedding_dimension 16
compute_user_item_embedding


  0%|          | 0/19717 [00:00<?, ?it/s]

In [5]:
!pip install torch-sparse torch-scatter

Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: torch-sparse, torch-scatter
[33m  DEPRECATION: Building 'torch-sparse' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'torch-sparse'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for torch-sparse (setup.py) ... [?25ldone
[?25h  Created wheel for torch-sparse: filename=torch_sparse-0.6.18-cp311-cp311-linux_x86_64.whl size=1065968 sha256=5d4a91b2b9d42d43b