In [1]:
from pocketbase import PocketBase
import json

POCKETBASE_URL = "http://localhost:8090"

In [2]:
# Client for the PocketBase instance at POCKETBASE_URL.
pb = PocketBase(POCKETBASE_URL)

# Load the "similar_movies" collection and index it as
# {record.movie: set(record.recommendations)}.
# NOTE(review): presumably get_full_list() returns every record of the
# collection — confirm against the PocketBase SDK. If several records share
# the same `movie`, the last one wins (dict-comprehension semantics).
similar_movies_raw = pb.collection("similar_movies").get_full_list()
similar_movies = {r.movie: set(r.recommendations) for r in similar_movies_raw}

In [3]:
# Build the graph as {node: set(neighbors)}.
# The neighbor sets are copied so that the symmetrization below does not
# silently mutate `similar_movies` (the data as loaded from the database).
graph = {node: set(neighbors) for node, neighbors in similar_movies.items()}

# Symmetrize: for every edge node -> neighbor also store neighbor -> node.
# Iterate over a snapshot of the items because new keys are added on the fly.
for node, neighbors in list(graph.items()):
    # A node recommending itself is not a meaningful similarity edge, and a
    # self-loop would break the "every edge is counted twice" assumption below.
    neighbors.discard(node)
    for neighbor in neighbors:
        graph.setdefault(neighbor, set()).add(node)

# Drop sparsely connected nodes (single pass, not an iterative k-core).
NODE_EDGES_THS = 0
dropped_nodes = {
    node for node, neighbors in graph.items() if len(neighbors) < NODE_EDGES_THS
}
if dropped_nodes:
    # Remove the dropped nodes from the remaining neighbor sets as well;
    # otherwise the graph keeps dangling, one-directional references to them
    # and the edge count below is wrong.
    graph = {
        node: neighbors - dropped_nodes
        for node, neighbors in graph.items()
        if node not in dropped_nodes
    }

# Graph statistics.
num_nodes = len(graph)
num_edges = sum(len(neighbors) for neighbors in graph.values()) // 2  # every edge is stored twice
max_edges = num_nodes * (num_nodes - 1) // 2
density = num_edges / max_edges if max_edges > 0 else 0

print(f"Нод: {num_nodes}")
print(f"Рёбер: {num_edges}")
print(f"Макс. рёбер: {max_edges}")
print(f"Плотность: {density:.6f} ({density*100:.4f}%)")

Нод: 14276
Рёбер: 35326
Макс. рёбер: 101894950
Плотность: 0.000347 (0.0347%)


In [4]:
import numpy as np
import random
from collections import defaultdict

# Seed the global RNGs of both the stdlib `random` module and NumPy.
random.seed(42)
np.random.seed(42)

def split_edges(graph, test_fraction=0.2, seed=None):
    """
    Split the edges of an undirected graph into a train graph and test edges.

    Up to ``test_fraction`` of the edges are moved to the test set. An edge is
    only removed while both of its endpoints keep at least one train edge, so
    every node that had an edge in ``graph`` still has one in the train graph
    (the test set may therefore end up smaller than requested).

    Args:
        graph: mapping ``node -> set of neighbor nodes``. Nodes must be
            mutually orderable because edges are canonicalized by sorting.
        test_fraction: share of edges to hold out.
        seed: optional seed for a private RNG. When ``None`` the module-level
            ``random`` state is used, as before.

    Returns:
        ``(train_graph, test_edges)``: ``train_graph`` is a plain dict with
        the same nodes as ``graph``; ``test_edges`` is a list of ``(u, v)``
        tuples with ``u <= v``.
    """
    # Collect every undirected edge exactly once, in canonical (sorted) form.
    unique_edges = {
        tuple(sorted((node, neighbor)))
        for node, neighbors in graph.items()
        for neighbor in neighbors
    }

    # Sort before shuffling: the iteration order of a set of str tuples
    # depends on per-process hash randomization, so shuffling the raw set
    # order would give a different split on every kernel restart even with a
    # fixed seed.
    all_edges = sorted(unique_edges)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(all_edges)

    # Remaining train degree of every node; decremented when an edge is held out.
    current_degree = defaultdict(int)
    for u, v in all_edges:
        current_degree[u] += 1
        current_degree[v] += 1

    test_edges = []
    train_edges = []
    target_test = int(len(all_edges) * test_fraction)

    for u, v in all_edges:
        # An edge may be held out only if both endpoints keep >= 1 train edge.
        if len(test_edges) < target_test and current_degree[u] > 1 and current_degree[v] > 1:
            test_edges.append((u, v))
            current_degree[u] -= 1
            current_degree[v] -= 1
        else:
            train_edges.append((u, v))

    # Build the symmetric train graph.
    train_graph = defaultdict(set)
    for u, v in train_edges:
        train_graph[u].add(v)
        train_graph[v].add(u)

    # Keep every original node, including ones that had no edges at all.
    for node in graph:
        if node not in train_graph:
            train_graph[node] = set()

    # Guard the ratio: a graph without edges used to raise ZeroDivisionError.
    test_share = len(test_edges) / len(all_edges) if all_edges else 0.0

    print(f"Всего рёбер: {len(all_edges)}")
    print(f"Train рёбер: {len(train_edges)}")
    print(f"Test рёбер: {len(test_edges)}")
    print(f"Доля test: {test_share:.2%}")

    return dict(train_graph), test_edges

# Hold out up to 20% of the edges as the test set; the rest forms the train graph.
train_graph, test_edges = split_edges(graph, test_fraction=0.2)

Всего рёбер: 35326
Train рёбер: 28261
Test рёбер: 7065
Доля test: 20.00%


In [5]:
import tempfile
import os
import logging
from pecanpy.pecanpy import SparseOTF
from gensim.models import Word2Vec

# Configure root logging at INFO level with a "time : level : message" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def graph_to_edgelist(g, path):
    """Dump graph ``g`` to ``path`` as a tab-separated edgelist for pecanpy.

    ``g`` maps each node to an iterable of its neighbors. Every undirected
    edge is emitted exactly once, with its two endpoints in sorted order.
    """
    seen = set()
    with open(path, "w") as out:
        for src, targets in g.items():
            for dst in targets:
                lo, hi = sorted((src, dst))
                if (lo, hi) in seen:
                    # The reverse direction of this edge was already written.
                    continue
                out.write(f"{lo}\t{hi}\n")
                seen.add((lo, hi))

def train_node2vec(g, dimensions, window, walk_length, num_walks, p=1.0, q=1.0, workers=4,
                   epochs=5, seed=1):
    """Train node2vec embeddings with pecanpy (SparseOTF) walks + gensim Word2Vec.

    Args:
        g: graph as a mapping ``node -> set of neighbor nodes``.
        dimensions: embedding size (Word2Vec ``vector_size``).
        window: Word2Vec context window.
        walk_length: length of every random walk.
        num_walks: number of walks started from every node.
        p, q: node2vec return / in-out parameters passed to SparseOTF.
        workers: worker count used for both walk generation and Word2Vec.
        epochs: number of Word2Vec training epochs (previously hard-coded to 5).
        seed: Word2Vec seed; the default 1 equals gensim's own default.
            NOTE(review): this does not seed the pecanpy walks, and gensim is
            only fully deterministic with a single worker.

    Returns:
        The trained gensim ``Word2Vec`` model.
    """
    # Reserve a temp file name; the file itself is written by graph_to_edgelist
    # and removed in the `finally` block below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".edgelist", delete=False) as f:
        edgelist_path = f.name

    try:
        graph_to_edgelist(g, edgelist_path)

        # Sanity-check the file: count lines and keep the first three, streaming
        # instead of loading the whole edgelist into memory.
        line_count = 0
        first_lines = []
        with open(edgelist_path) as f:
            for line in f:
                if line_count < 3:
                    first_lines.append(line)
                line_count += 1
        print(f"Edgelist: {line_count} lines, first 3: {first_lines}")

        pecanpy_graph = SparseOTF(p=p, q=q, workers=workers)
        pecanpy_graph.read_edg(edgelist_path, weighted=False, directed=False)

        walks = pecanpy_graph.simulate_walks(num_walks=num_walks, walk_length=walk_length)

        # Skip-gram Word2Vec over the walks; min_count=0 keeps every node.
        model = Word2Vec(
            walks,
            vector_size=dimensions,
            window=window,
            min_count=0,
            sg=1,  # skip-gram
            workers=workers,
            epochs=epochs,
            seed=seed,
        )
        return model
    finally:
        os.unlink(edgelist_path)

# Train 128-dimensional node2vec embeddings on the train graph only, then
# report how many nodes received an embedding.
print("Training node2vec on train graph...")
model = train_node2vec(train_graph, dimensions=128, window=5, walk_length=20, num_walks=10, p=1, q=1)
print(f"Vocabulary size: {len(model.wv)}")

Training node2vec on train graph...
Edgelist: 28261 lines, first 3: ['75y6sp48613id9n\toykxnpe1jf3c2di\n', '75y6sp48613id9n\tkrxr7vgpypqzx0v\n', '0nv2e39y202ngl2\tj3g6c1romcu2r71\n']


2026-02-26 13:14:24,701 : INFO : collecting all words and their counts
2026-02-26 13:14:24,702 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2026-02-26 13:14:24,722 : INFO : PROGRESS: at sentence #10000, processed 210000 words, keeping 13910 word types
2026-02-26 13:14:24,743 : INFO : PROGRESS: at sentence #20000, processed 420000 words, keeping 14236 word types
2026-02-26 13:14:24,768 : INFO : PROGRESS: at sentence #30000, processed 630000 words, keeping 14272 word types
2026-02-26 13:14:24,793 : INFO : PROGRESS: at sentence #40000, processed 840000 words, keeping 14276 word types
2026-02-26 13:14:24,816 : INFO : PROGRESS: at sentence #50000, processed 1050000 words, keeping 14276 word types
2026-02-26 13:14:24,842 : INFO : PROGRESS: at sentence #60000, processed 1260000 words, keeping 14276 word types
2026-02-26 13:14:24,867 : INFO : PROGRESS: at sentence #70000, processed 1470000 words, keeping 14276 word types
2026-02-26 13:14:24,897 : INFO : PROGRESS: 

Vocabulary size: 14276


In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

def evaluate(model, test_edges, graph, ks=(5, 10, 20)):
    """
    Evaluate link prediction on held-out edges and print a metrics table.

    For every node that lost at least one edge to the test set:
    - rank all other nodes by cosine similarity of their embeddings;
    - compute Recall@K for every K in ``ks`` and the reciprocal rank of the
      first held-out neighbor (MRR);
    - aggregate over all nodes and per degree bucket, where the degree is
      taken from ``graph`` (the original, unsplit graph).

    Args:
        model: trained model exposing embeddings via ``model.wv``.
        test_edges: iterable of held-out ``(u, v)`` edges.
        graph: original graph as a mapping ``node -> set of neighbor nodes``.
        ks: cut-offs for Recall@K.

    Returns:
        None; the table is printed.

    NOTE(review): neighbors that stayed in the train graph are not removed
    from the candidate ranking, so they compete with the held-out neighbors
    for the top-K slots. This may depress the scores of high-degree nodes;
    consider a filtered ranking.
    """
    ks = list(ks)

    # Held-out neighbors per node.
    test_neighbors = defaultdict(set)
    for u, v in test_edges:
        test_neighbors[u].add(v)
        test_neighbors[v].add(u)

    # Node degrees in the original graph.
    original_degree = {node: len(neighbors) for node, neighbors in graph.items()}

    # Degree buckets as half-open intervals [lo, hi): the label "1-3" covers
    # degrees 1 and 2, "3-6" covers 3..5, and so on.
    buckets = {
        "1-3": (1, 3),
        "3-6": (3, 6),
        "6-10": (6, 10),
        "10-15": (10, 15),
        "15+": (15, float("inf")),
    }

    # Nodes that have an embedding, and their row index in the matrix.
    all_nodes = [n for n in graph if n in model.wv]
    node_to_idx = {n: i for i, n in enumerate(all_nodes)}

    # Embedding matrix, one row per node in `all_nodes`.
    embeddings = np.array([model.wv[n] for n in all_nodes])

    def _empty_metrics():
        # One list of per-node values for every metric.
        return {f"recall@{k}": [] for k in ks} | {"mrr": []}

    results = {bucket: _empty_metrics() for bucket in buckets}
    results["all"] = _empty_metrics()

    for node, true_neighbors in tqdm(test_neighbors.items(), desc="Evaluating"):
        if node not in node_to_idx:
            continue

        # Row indices of the held-out neighbors that have an embedding.
        # Membership is checked against the dict instead of rebuilding
        # set(all_nodes) for every evaluated node.
        true_idx = [node_to_idx[n] for n in true_neighbors if n in node_to_idx]
        if not true_idx:
            continue

        node_idx = node_to_idx[node]
        node_emb = embeddings[node_idx].reshape(1, -1)

        # Cosine similarity to every node; push the node itself to the bottom.
        sims = cosine_similarity(node_emb, embeddings)[0]
        sims[node_idx] = -1

        ranked_indices = np.argsort(-sims)

        # hits[i] is True when the node at rank i + 1 is a held-out neighbor.
        # Indices in the ranking are unique, so counting hits in the first k
        # positions equals the size of the top-k / true-set intersection.
        hits = np.isin(ranked_indices, true_idx)
        node_metrics = {
            f"recall@{k}": int(hits[:k].sum()) / len(true_idx) for k in ks
        }
        # true_idx is non-empty and every index appears in the ranking, so a
        # first hit always exists.
        node_metrics["mrr"] = 1.0 / (int(hits.argmax()) + 1)

        # The metrics are computed once and recorded for "all" and for the
        # first bucket whose interval contains the node's original degree.
        deg = original_degree.get(node, 0)
        targets = ["all"]
        for bucket_name, (lo, hi) in buckets.items():
            if lo <= deg < hi:
                targets.append(bucket_name)
                break
        for target in targets:
            for metric_name, value in node_metrics.items():
                results[target][metric_name].append(value)

    # Aggregate and print the table.
    print(f"\n{'Bucket':<10} {'Count':>6} ", end="")
    for k in ks:
        print(f"{'R@'+str(k):>8} ", end="")
    print(f"{'MRR':>8}")
    print("-" * (10 + 7 + 9 * len(ks) + 9))

    for bucket_name in ["all"] + list(buckets.keys()):
        data = results[bucket_name]
        count = len(data["mrr"])
        if count == 0:
            continue
        print(f"{bucket_name:<10} {count:>6} ", end="")
        for k in ks:
            val = np.mean(data[f"recall@{k}"]) if data[f"recall@{k}"] else 0
            print(f"{val:>8.4f} ", end="")
        mrr = np.mean(data["mrr"]) if data["mrr"] else 0
        print(f"{mrr:>8.4f}")

# Score the embeddings on the held-out edges; degree buckets use the original graph.
evaluate(model, test_edges, graph)

Evaluating:   0%|          | 0/6910 [00:00<?, ?it/s]


Bucket      Count      R@5     R@10     R@20      MRR
-----------------------------------------------------
all          6910   0.1511   0.2685   0.3988   0.1244
1-3           825   0.3091   0.4279   0.5139   0.1815
3-6          2171   0.2353   0.3802   0.5100   0.1563
6-10         1924   0.1074   0.2423   0.3982   0.1086
10-15        1257   0.0463   0.1361   0.2822   0.0868
15+           733   0.0186   0.0535   0.1411   0.0715
