In [1]:
from pocketbase import PocketBase
import json

# Base URL handed to the PocketBase client that is created in the next cell.
POCKETBASE_URL = "http://localhost:8090"

In [2]:
# Open a client against the configured PocketBase instance and fetch every
# record of the "similar_movies" collection.
pb = PocketBase(POCKETBASE_URL)
similar_movies_raw = pb.collection("similar_movies").get_full_list()

# Map each record's `movie` field to the set of its `recommendations`.
similar_movies = dict(
    (record.movie, set(record.recommendations)) for record in similar_movies_raw
)

In [3]:
# Build the graph on a copy, so `similar_movies` keeps the raw (directed) data
# and re-running this cell always starts from the same input.
graph = {node: set(neighbors) for node, neighbors in similar_movies.items()}

# Symmetrise: for every edge node -> neighbor make sure neighbor -> node exists.
# Iterate over snapshots because keys are added to `graph` along the way.
for node, neighbors in list(graph.items()):
    for neighbor in list(neighbors):
        graph.setdefault(neighbor, set()).add(node)

# Drop poorly connected nodes (single pass; with a threshold of 0 nothing is removed).
NODE_EDGES_THS = 0
sparse_nodes = {node for node, neighbors in graph.items() if len(neighbors) < NODE_EDGES_THS}
if sparse_nodes:
    for node in sparse_nodes:
        del graph[node]
    # Also remove the dropped nodes from the remaining adjacency sets; otherwise
    # they would still be counted as edges and degrees below.
    for neighbors in graph.values():
        neighbors.difference_update(sparse_nodes)

# Graph statistics: every undirected edge is stored twice (once per endpoint).
num_nodes = len(graph)
num_edges = sum(len(neighbors) for neighbors in graph.values()) // 2
max_edges = num_nodes * (num_nodes - 1) // 2
density = num_edges / max_edges if max_edges > 0 else 0

print(f"Нод: {num_nodes}")
print(f"Рёбер: {num_edges}")
print(f"Макс. рёбер: {max_edges}")
print(f"Плотность: {density:.6f} ({density*100:.4f}%)")

Нод: 14276
Рёбер: 35326
Макс. рёбер: 101894950
Плотность: 0.000347 (0.0347%)


In [4]:
import numpy as np
import random
from collections import defaultdict

# Seed the global RNGs of the `random` module and of NumPy.
# `split_edges` below shuffles the edge list with `random.shuffle`.
random.seed(42)
np.random.seed(42)

def split_edges(graph, test_fraction=0.2):
    """
    Hold out a fraction of the undirected edges of ``graph`` as a test set.

    Edges are deduplicated as sorted ``(u, v)`` tuples, put into a canonical
    (sorted) order and then shuffled with the global ``random`` module. The
    split therefore depends only on the RNG state and not on set iteration
    order, which is not stable across interpreter runs for string ids
    (hash randomisation).

    An edge is moved to the test set only while both of its endpoints still
    have more than one remaining edge, so every node that had at least one
    edge keeps at least one edge in the train graph. Because of this
    constraint the test set can end up smaller than requested.

    Args:
        graph: mapping ``node -> iterable of neighbour nodes``. Node ids must
            be mutually orderable (they are sorted).
        test_fraction: share of the unique edges to aim for in the test set.

    Returns:
        ``(train_graph, test_edges)`` where ``train_graph`` is a plain dict
        ``node -> set of neighbours`` containing every key of ``graph``
        (possibly with an empty set) and ``test_edges`` is a list of
        ``(u, v)`` tuples.
    """
    unique_edges = {
        tuple(sorted((node, neighbor)))
        for node, neighbors in graph.items()
        for neighbor in neighbors
    }
    # Canonical order first, then shuffle: makes the split a pure function of
    # the RNG state.
    all_edges = sorted(unique_edges)
    random.shuffle(all_edges)

    # Remaining (train) degree of every node; decremented as edges move to test.
    current_degree = defaultdict(int)
    for u, v in all_edges:
        current_degree[u] += 1
        current_degree[v] += 1

    test_edges = []
    train_edges = []
    target_test = int(len(all_edges) * test_fraction)

    for u, v in all_edges:
        can_remove = current_degree[u] > 1 and current_degree[v] > 1
        if len(test_edges) < target_test and can_remove:
            test_edges.append((u, v))
            current_degree[u] -= 1
            current_degree[v] -= 1
        else:
            train_edges.append((u, v))

    train_graph = defaultdict(set)
    for u, v in train_edges:
        train_graph[u].add(v)
        train_graph[v].add(u)

    # Keep nodes without any train edge in the mapping (with an empty set).
    for node in graph:
        if node not in train_graph:
            train_graph[node] = set()

    # Guard the share against an edge-less graph instead of dividing by zero.
    test_share = len(test_edges) / len(all_edges) if all_edges else 0.0

    print(f"Всего рёбер: {len(all_edges)}")
    print(f"Train рёбер: {len(train_edges)}")
    print(f"Test рёбер: {len(test_edges)}")
    print(f"Доля test: {test_share:.2%}")

    return dict(train_graph), test_edges

# Hold out 20% of the edges as the test set; the rest forms the train graph.
train_graph, test_edges = split_edges(graph, test_fraction=0.2)

Всего рёбер: 35326
Train рёбер: 28261
Test рёбер: 7065
Доля test: 20.00%


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch as well: embedding initialisation, DataLoader shuffling and the
# torch.randint negative sampling draw from torch's RNG, which the
# random / numpy seeds set earlier in the notebook do not cover.
torch.manual_seed(42)

print(f"Device: {device}")


class SkipGramDataset(Dataset):
    """
    Skip-gram training pairs ``(target, context)`` extracted from walks.

    Each walk is first translated to indices through ``node_to_idx``; nodes
    missing from the mapping are dropped, which makes their former
    neighbours adjacent. Every position is then paired with all other
    positions that lie at most ``window`` steps away in the translated walk.
    """

    def __init__(self, walks, node_to_idx, window):
        collected = []
        for walk in walks:
            ids = [node_to_idx[node] for node in walk if node in node_to_idx]
            length = len(ids)
            for pos in range(length):
                centre = ids[pos]
                lo = max(0, pos - window)
                hi = min(length, pos + window + 1)
                # Left context first, then right context; the centre is skipped.
                collected.extend((centre, ids[j]) for j in range(lo, hi) if j != pos)
        self.pairs = collected

    def __len__(self):
        """Number of (target, context) pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return the pair stored at position ``idx`` as a tuple of indices."""
        return self.pairs[idx]


class SkipGramModel(nn.Module):
    """
    Skip-gram with negative sampling.

    Holds two embedding tables of shape ``(vocab_size, embedding_dim)``: one
    for target nodes and one for context nodes. ``forward`` returns the mean
    negative-sampling loss of a batch.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Word2Vec-style initialisation: small uniform targets, zero contexts.
        bound = 0.5 / embedding_dim
        nn.init.uniform_(self.target_embeddings.weight, -bound, bound)
        nn.init.zeros_(self.context_embeddings.weight)

    def forward(self, target_idx, context_idx, neg_idx):
        """
        Compute the batch loss.

        Args:
            target_idx: target indices, expected shape ``(batch,)``.
            context_idx: positive context indices, expected shape ``(batch,)``.
            neg_idx: negative context indices, expected shape ``(batch, num_neg)``.

        Returns:
            Scalar tensor: mean over the batch of
            ``-logsigmoid(pos) - sum(logsigmoid(-neg))``.
        """
        log_sigmoid = nn.functional.logsigmoid

        centre = self.target_embeddings(target_idx)        # (batch, dim)
        positive = self.context_embeddings(context_idx)    # (batch, dim)
        negatives = self.context_embeddings(neg_idx)       # (batch, num_neg, dim)

        # Dot product of each target with its positive context.
        pos_logits = torch.sum(centre * positive, dim=1)
        # Dot product of each target with each of its negatives: (batch, num_neg).
        neg_logits = torch.matmul(negatives, centre.unsqueeze(-1)).squeeze(-1)

        per_example = -log_sigmoid(pos_logits) - log_sigmoid(-neg_logits).sum(dim=1)
        return per_example.mean()

# Marker printed once the dataset and model classes above have been defined.
print("SkipGram model ready")

Device: cuda
SkipGram model ready


In [6]:
import tempfile
import os
from pecanpy.pecanpy import SparseOTF


def graph_to_edgelist(g, path):
    """
    Write the graph ``g`` to ``path`` as a tab-separated edge list.

    ``g`` maps a node to an iterable of its neighbours. Each undirected edge
    is emitted once, as ``smaller<TAB>larger``, on a line of its own, no
    matter how many times or in which direction it appears in ``g``.
    """
    seen = set()
    with open(path, "w") as out:
        for src in g:
            for dst in g[src]:
                # Canonical orientation so (u, v) and (v, u) collapse into one edge.
                lo, hi = (dst, src) if dst < src else (src, dst)
                if (lo, hi) in seen:
                    continue
                out.write(f"{lo}\t{hi}\n")
                seen.add((lo, hi))


def train_node2vec_gpu(
    g,
    dimensions=128,
    window=5,
    walk_length=20,
    num_walks=10,
    p=1.0,
    q=1.0,
    num_neg=5,
    batch_size=4096,
    epochs=5,
    lr=0.005,
    workers=4,
    seed=None,
):
    """
    Train node2vec embeddings: pecanpy random walks + a PyTorch skip-gram model.

    The graph is dumped to a temporary edge list, pecanpy's ``SparseOTF``
    simulates the walks, the walks are turned into (target, context) pairs
    and a ``SkipGramModel`` is trained on them with Adam and uniformly drawn
    negatives. The model and the batches live on the module-level ``device``.

    Args:
        g: mapping ``node -> iterable of neighbours`` (treated as undirected).
        dimensions: embedding size.
        window: skip-gram context window, in walk steps.
        walk_length: passed to ``simulate_walks``.
        num_walks: passed to ``simulate_walks``.
        p, q: node2vec bias parameters passed to ``SparseOTF``.
        num_neg: number of negative samples per positive pair.
        batch_size: number of pairs per optimisation step.
        epochs: passes over the pair dataset.
        lr: Adam learning rate.
        workers: worker count passed to ``SparseOTF``.
        seed: optional int; when given, ``torch.manual_seed(seed)`` is called
            before training so that embedding initialisation, batch
            shuffling and negative sampling are repeatable. Walk generation
            is done by pecanpy and is not seeded here.

    Returns:
        ``(all_nodes, node_to_idx, embeddings)``: the sorted node list, the
        ``node -> row`` mapping and a NumPy array of target embeddings with
        one row per node of ``g``.

    Raises:
        ValueError: if the walks produce no training pairs.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # 1. Generate walks with pecanpy. The file is created up front (and kept
    #    on close) so that it can be reopened by path, then removed in `finally`.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".edgelist", delete=False) as f:
        edgelist_path = f.name

    try:
        graph_to_edgelist(g, edgelist_path)
        pecanpy_graph = SparseOTF(p=p, q=q, workers=workers)
        pecanpy_graph.read_edg(edgelist_path, weighted=False, directed=False)
        print("Generating walks...")
        walks = pecanpy_graph.simulate_walks(num_walks=num_walks, walk_length=walk_length)
        print(f"Generated {len(walks)} walks")
    finally:
        os.unlink(edgelist_path)

    # 2. Node <-> row index mapping (sorted for a stable row order).
    all_nodes = sorted(g.keys())
    node_to_idx = {n: i for i, n in enumerate(all_nodes)}
    vocab_size = len(all_nodes)

    # 3. Pair dataset.
    print("Building dataset...")
    dataset = SkipGramDataset(walks, node_to_idx, window)
    print(f"Pairs: {len(dataset):,}")
    if len(dataset) == 0:
        # Without this check the run would end in an unrelated error further
        # down (no batches to average over).
        raise ValueError(
            "No (target, context) pairs were built from the walks; "
            "check that the graph has edges and that walk node ids match the keys of `g`."
        )
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

    # 4. Model and optimiser.
    model = SkipGramModel(vocab_size, dimensions).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # 5. Training loop.
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in pbar:
            target, context = batch
            target = target.to(device)
            context = context.to(device)

            # Negatives are drawn uniformly over the whole vocabulary.
            neg = torch.randint(0, vocab_size, (target.size(0), num_neg), device=device)

            loss = model(target, context, neg)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1
            pbar.set_postfix(loss=f"{total_loss/num_batches:.4f}")

        print(f"Epoch {epoch+1}: avg loss = {total_loss/num_batches:.4f}")

    # 6. The target table is what is exported as node embeddings.
    embeddings = model.target_embeddings.weight.detach().cpu().numpy()

    return all_nodes, node_to_idx, embeddings


# Baseline run on the train split; the hyperparameters are collected in one
# place and unpacked into the trainer.
baseline_params = {
    "dimensions": 128,
    "window": 5,
    "walk_length": 20,
    "num_walks": 10,
    "p": 1,
    "q": 1,
    "epochs": 5,
}
all_nodes, node_to_idx, embeddings = train_node2vec_gpu(train_graph, **baseline_params)
print(f"Embeddings shape: {embeddings.shape}")

Generating walks...
Generated 142760 walks
Building dataset...
Pairs: 25,696,800


Epoch 1/5:   0%|          | 0/6274 [00:00<?, ?it/s]

Epoch 1: avg loss = 0.4514


Epoch 2/5:   0%|          | 0/6274 [00:00<?, ?it/s]

Epoch 2: avg loss = 0.3494


Epoch 3/5:   0%|          | 0/6274 [00:00<?, ?it/s]

Epoch 3: avg loss = 0.3442


Epoch 4/5:   0%|          | 0/6274 [00:00<?, ?it/s]

Epoch 4: avg loss = 0.3413


Epoch 5/5:   0%|          | 0/6274 [00:00<?, ?it/s]

Epoch 5: avg loss = 0.3396
Embeddings shape: (14276, 128)


In [7]:
import torch.nn.functional as F


def evaluate_gpu(
    all_nodes,
    node_to_idx,
    embeddings,
    test_edges,
    graph,
    ks=(5, 10, 20),
    mask_train_neighbors=False,
):
    """
    Link-prediction evaluation with cosine similarity, batched on ``device``.

    For every node that has held-out neighbours, all other nodes are ranked
    by cosine similarity of their embeddings. Recall@k and the reciprocal
    rank of the best-ranked held-out neighbour are collected overall
    (``"all"``) and per bucket of the node's degree in ``graph``.

    NOTE(review): by default only the node itself is removed from the
    ranking, so neighbours that were kept in the train graph compete with
    the held-out ones. Pass ``mask_train_neighbors=True`` to exclude them.

    Args:
        all_nodes: node ids that have an embedding row.
        node_to_idx: mapping ``node -> row index`` into ``embeddings``.
        embeddings: array-like of shape ``(len(all_nodes), dim)``.
        test_edges: iterable of held-out ``(u, v)`` edges.
        graph: full graph ``node -> set of neighbours``; used for the degree
            buckets and, optionally, for masking known neighbours.
        ks: cut-offs for recall@k.
        mask_train_neighbors: when true, neighbours present in ``graph`` but
            not held out for the node are excluded from its ranking.

    Returns:
        dict ``bucket -> {"recall@k": [...], "mrr": [...]}`` with one value
        per evaluated node; the summary table is printed as a side effect.
    """
    ks = list(ks)

    # Held-out neighbours of every node (both directions of each test edge).
    test_neighbors = defaultdict(set)
    for u, v in test_edges:
        test_neighbors[u].add(v)
        test_neighbors[v].add(u)

    original_degree = {node: len(neighbors) for node, neighbors in graph.items()}

    # Half-open degree ranges [lo, hi).
    buckets = {
        "1-3": (1, 3),
        "3-6": (3, 6),
        "6-10": (6, 10),
        "10-15": (10, 15),
        "15+": (15, float("inf")),
    }

    def bucket_of(deg):
        """Name of the first bucket whose range contains ``deg``, else None."""
        for bucket_name, (lo, hi) in buckets.items():
            if lo <= deg < hi:
                return bucket_name
        return None

    # Unit-normalised embeddings: a dot product is then the cosine similarity.
    emb_tensor = torch.tensor(embeddings, device=device, dtype=torch.float32)
    emb_norm = F.normalize(emb_tensor, dim=1)

    metric_names = [f"recall@{k}" for k in ks] + ["mrr"]
    results = {
        bucket_name: {metric: [] for metric in metric_names}
        for bucket_name in list(buckets) + ["all"]
    }

    # torch.topk fails when k exceeds the number of candidates.
    max_k = min(max(ks), emb_norm.size(0))
    # Built once; rebuilding it for every evaluated node is quadratic.
    known_nodes = set(all_nodes)

    eval_nodes = [n for n in test_neighbors if n in node_to_idx]
    eval_indices = torch.tensor(
        [node_to_idx[n] for n in eval_nodes], device=device, dtype=torch.long
    )

    batch_size = 512
    for start in tqdm(range(0, len(eval_nodes), batch_size), desc="Evaluating"):
        end = min(start + batch_size, len(eval_nodes))
        batch_nodes = eval_nodes[start:end]
        batch_idx = eval_indices[start:end]

        # (batch, dim) @ (dim, vocab) -> (batch, vocab)
        sims = emb_norm[batch_idx] @ emb_norm.T

        # Exclude the node itself: -2 is below any cosine similarity.
        sims[torch.arange(len(batch_idx), device=device), batch_idx] = -2.0

        if mask_train_neighbors:
            rows, cols = [], []
            for row, node in enumerate(batch_nodes):
                held_out = test_neighbors[node]
                for neighbor in graph.get(node, ()):
                    if neighbor not in held_out and neighbor in node_to_idx:
                        rows.append(row)
                        cols.append(node_to_idx[neighbor])
            if rows:
                sims[
                    torch.tensor(rows, device=device),
                    torch.tensor(cols, device=device),
                ] = -2.0

        _, topk_indices = torch.topk(sims, max_k, dim=1)
        topk_indices = topk_indices.cpu().numpy()

        # Full ranking, needed for the rank of the first held-out neighbour.
        ranked_all = torch.argsort(sims, dim=1, descending=True).cpu().numpy()

        for i, node in enumerate(batch_nodes):
            true_set = test_neighbors[node] & known_nodes
            if not true_set:
                continue
            true_idx = [node_to_idx[n] for n in true_set]
            true_idx_set = set(true_idx)

            # Metrics are computed once per node and then appended to both
            # the overall lists and the node's degree bucket.
            node_metrics = {}
            for k in ks:
                hits = len(set(topk_indices[i, :k].tolist()) & true_idx_set)
                node_metrics[f"recall@{k}"] = hits / len(true_idx_set)

            hit_positions = np.flatnonzero(np.isin(ranked_all[i], true_idx))
            if hit_positions.size:
                node_metrics["mrr"] = 1.0 / (int(hit_positions[0]) + 1)

            targets = [results["all"]]
            bucket_name = bucket_of(original_degree.get(node, 0))
            if bucket_name is not None:
                targets.append(results[bucket_name])
            for target in targets:
                for metric, value in node_metrics.items():
                    target[metric].append(value)

    # Summary table.
    print(f"\n{'Bucket':<10} {'Count':>6} ", end="")
    for k in ks:
        print(f"{'R@'+str(k):>8} ", end="")
    print(f"{'MRR':>8}")
    print("-" * (10 + 7 + 9 * len(ks) + 9))

    for bucket_name in ["all"] + list(buckets.keys()):
        data = results[bucket_name]
        count = len(data["mrr"])
        if count == 0:
            continue
        print(f"{bucket_name:<10} {count:>6} ", end="")
        for k in ks:
            val = np.mean(data[f"recall@{k}"]) if data[f"recall@{k}"] else 0
            print(f"{val:>8.4f} ", end="")
        mrr = np.mean(data["mrr"]) if data["mrr"] else 0
        print(f"{mrr:>8.4f}")

    return results


# Evaluate the baseline embeddings on the held-out edges; degree buckets are
# taken from the full `graph`.
results = evaluate_gpu(all_nodes, node_to_idx, embeddings, test_edges, graph)

Evaluating:   0%|          | 0/14 [00:00<?, ?it/s]


Bucket      Count      R@5     R@10     R@20      MRR
-----------------------------------------------------
all          7031   0.1444   0.2912   0.4504   0.1102
1-3           903   0.3023   0.4219   0.5028   0.1493
3-6          2174   0.2444   0.4210   0.5409   0.1348
6-10         1964   0.0807   0.2679   0.4686   0.0942
10-15        1257   0.0339   0.1463   0.3680   0.0846
15+           733   0.0133   0.0561   0.2096   0.0763


In [None]:
import itertools
import pandas as pd

# Hyperparameter grid. The keys match keyword arguments of `train_node2vec_gpu`.
# NOTE(review): every combination is scored on the same `test_edges` that the
# baseline was evaluated on, so the best row is selected on the test set.
param_grid = {
    "dimensions": [64, 128],
    "walk_length": [20, 30],
    "window": [5, 10],
    "p": [0.5, 1, 2],
    "q": [0.5, 1, 2],
    "num_walks": [10],
}

keys = list(param_grid.keys())
combos = list(itertools.product(*param_grid.values()))
print(f"Всего комбинаций: {len(combos)}")

search_results = []

for i, combo in enumerate(combos):
    params = dict(zip(keys, combo))
    print(f"\n=== [{i+1}/{len(combos)}] {params} ===")

    try:
        nodes, n2i, embs = train_node2vec_gpu(train_graph, epochs=5, **params)
        res = evaluate_gpu(nodes, n2i, embs, test_edges, graph)
    except Exception as e:
        # Best effort: one failing combination must not abort the sweep.
        # The exception type is printed too, since str(e) alone can be empty.
        print(f"ERROR: {type(e).__name__}: {e}")
        continue

    # One row per combination: the parameters plus the mean of every overall metric.
    row = dict(params)
    for metric, values in res["all"].items():
        row[metric] = np.mean(values) if values else float("nan")
    search_results.append(row)

# With no successful run the frame has no "mrr" column, so only sort when
# there is something to sort.
df_results = pd.DataFrame(search_results)
if not df_results.empty:
    df_results = df_results.sort_values("mrr", ascending=False)
print("\n=== TOP RESULTS ===")
df_results.head(10)