I guess autograd is magic. Our goal is to try to optimize this algorithm that does Node2Vec but has modifications to use dot similarity or cosine or Euclidean distance. This algorithm is nowhere close to matching embcom times. Currently, the algorithm takes 16 mins to embed a 1000 node lfr network, node2vec from gensim takes 5 seconds :(((

How to speed this up?

In [3]:
import time
import random
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import lfr # from https://github.com/skojaku/community-detection-via-neural-embedding/tree/master
import sys
import cProfile, pstats

In [4]:
##############################################
# Block 1: Setup and Random Walk Generation  #
##############################################


def generate_random_walks(G, num_walks=10, walk_length=10):
    """
    Generate random walks over the graph G.
    This needs optimization but we've got bigger fish to fry. 
    Sampling pairs from random walks take 5 seconds out of the 18 minutes.
    """
    walks = []
    nodes = list(G.nodes())
    for _ in range(num_walks):
        random.shuffle(nodes)
        for node in nodes:
            walk = [node]
            for i in range(walk_length - 1):
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if len(neighbors) == 0:
                    break
                walk.append(random.choice(neighbors))
            walks.append(walk)
    return walks

##############################################
# Block 2: Generating SkipGram Pairs         #
##############################################

def generate_skipgram_pairs(walks, window_size=2):
    """
    For each walk, generate (center, context) pairs using a sliding window.
    Consider profiling this block using line_profiler if walks or window_size are large.
    """
    pairs = []
    for walk in walks:
        L = len(walk)
        for i, center in enumerate(walk):
            start = max(0, i - window_size)
            end = min(L, i + window_size + 1)
            for j in range(start, end):
                if i == j:
                    continue
                pairs.append((center, walk[j]))
    return pairs

##############################################
# Block 3: Dataset and DataLoader            #
##############################################

class SkipGramDataset(Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        center, context = self.pairs[idx]
        return center, context

##############################################
# Block 4: SkipGram Model and Loss Function    #
##############################################

class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, sim_type="dot"):
        """
        Parameters:
          vocab_size: int – number of nodes.
          embedding_dim: int – embedding dimensionality.
          sim_type: str – similarity measure; options: "dot", "euclidean", "cosine".
        """
        super(SkipGramModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sim_type = sim_type.lower()
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        initrange = 0.5 / embedding_dim
        self.center_embeddings.weight.data.uniform_(-initrange, initrange)
        self.context_embeddings.weight.data.zero_()
        
        # Scaling factor for Euclidean distances, rn at just 1
        self.beta = nn.Parameter(torch.tensor(1.0))  

    def forward(self, center_idxs, context_idxs, negative_idxs):
        """
        center_idxs: (B,) tensor for center nodes.
        context_idxs: (B,) tensor for positive context nodes.
        negative_idxs: (B, K) tensor for negative samples.
        Returns:
          pos_scores: (B,) tensor of scores for positive pairs.
          neg_scores: (B, K) tensor of scores for negative pairs.
        """
        center_vecs = self.center_embeddings(center_idxs)         # (B, D)
        pos_context_vecs = self.context_embeddings(context_idxs)    # (B, D)
        neg_context_vecs = self.context_embeddings(negative_idxs)   # (B, K, D)
        
        # Compute positive pair scores based on similarity type.
        if self.sim_type == "dot":
            pos_scores = torch.sum(center_vecs * pos_context_vecs, dim=1)
        elif self.sim_type == "euclidean":
            pos_scores = -torch.norm(center_vecs - pos_context_vecs, dim=1) * self.beta
        elif self.sim_type == "cosine":
            pos_scores = F.cosine_similarity(center_vecs, pos_context_vecs, dim=1)
        else:
            raise ValueError("Unknown similarity type. Choose from 'dot', 'euclidean', or 'cosine'.")

        # Compute negative pair scores.
        if self.sim_type == "dot":
            neg_scores = torch.bmm(neg_context_vecs, center_vecs.unsqueeze(2)).squeeze(2)  # (B, K)
        elif self.sim_type == "euclidean":
            neg_scores = -torch.norm(center_vecs.unsqueeze(1) - neg_context_vecs, dim=2)  # (B, K)
        elif self.sim_type == "cosine":
            neg_scores = F.cosine_similarity(center_vecs.unsqueeze(1).expand_as(neg_context_vecs), neg_context_vecs, dim=2)
        return pos_scores, neg_scores

def skipgram_loss(pos_scores, neg_scores):
    """
    Compute loss:
      - For a positive pair: L_pos = -log(sigmoid(score))
      - For negative samples: L_neg = -sum(log(sigmoid(-score)))
    Optimization tip: You can use F.logsigmoid for numerical stability.
    """
    loss_pos = -F.logsigmoid(pos_scores)
    loss_neg = -torch.sum(F.logsigmoid(-neg_scores), dim=1)
    loss = loss_pos + loss_neg
    return loss.mean()

##############################################
# Block 5: Training Loop                     #
##############################################

def train_model(G, num_walks = 10, walk_length = 10, 
                window_size = 2, embedding_dim = 128, 
                negative_samples = 5, num_epochs = 5, sim_type = "dot"):
    
    '''
    Options for sim_type: "dot", "euclidean", "cosine"
    loss.backward automatically calculates gradients for any form of similarity
    function that we give it! Which makes it easier to modify node2vec to use 
    Euclidean similarity or anything else
    '''
    
    # Use GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

   
   
    
    # Generate random walks
    start_time = time.time()
    walks = generate_random_walks(G, num_walks, walk_length)
    print("Number of walks:", len(walks))
    print("Example walk:", walks[0])



    pairs = generate_skipgram_pairs(walks, window_size)
    print("Number of positive pairs:", len(pairs))
    print("Example pairs:", pairs[:5])
    print("Skipgram walks and pairs generated in {:.4f} seconds.".format(time.time() - start_time))

    # Create dataset and dataloader
    dataset = SkipGramDataset(pairs)
    # For larger datasets, consider setting num_workers > 0 for parallel data loading.
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0)

    

    # Hyperparameters
    vocab_size = G.number_of_nodes()   # Assuming node IDs are 0-indexed.
    

    # Initialize model and optimizer
    model = SkipGramModel(vocab_size, embedding_dim, sim_type=sim_type).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print("\nStarting training...")
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        epoch_start = time.time()
        for center_idxs, context_idxs in dataloader:
            center_idxs = center_idxs.to(device)
            context_idxs = context_idxs.to(device)
            batch_size = center_idxs.size(0)
            # Use device variable to ensure compatibility even if the device is CPU.
            negative_idxs = torch.randint(0, vocab_size, (batch_size, negative_samples), device=device)
            optimizer.zero_grad()
            pos_scores, neg_scores = model(center_idxs, context_idxs, negative_idxs)
            loss = skipgram_loss(pos_scores, neg_scores)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * batch_size
        avg_loss = epoch_loss / len(dataset)
        print(f"Epoch {epoch+1}/{num_epochs} completed in {time.time() - epoch_start:.2f}s, Loss: {avg_loss:.4f}")

    # Save or return the trained model and embeddings as needed.
    return model, device

##############################################
# Block 6: Evaluating and Extracting         #
#         Embeddings                         #
##############################################

def evaluate_model(model, device):
    model.eval()
    with torch.no_grad():
        embeddings = model.center_embeddings.weight.cpu().numpy()
    return embeddings

In [13]:
##############################################
# Main Function: Run the Training and Eval   #
##############################################

import lfr
def create_network(params={
    "N": 1000,     # number of nodes
    "k": 6,        # average degree
    "maxk": 25,    # maximum degree
    "minc": 25,    # minimum community size
    "maxc": 250,   # maximum community size
    "tau": 2,      # degree exponent
    "tau2": 1.5,   # community size exponent
    "mu": 0.01,     # mixing rate
}):
    ng = lfr.NetworkGenerator()
    data = ng.generate(**params)
    net = data["net"]                  # scipy.csr_sparse matrix
    community_table = data["community_table"]  # pandas DataFrame
    seed = data["seed"]                # Seed value
    return net, community_table, seed


# Create a sample graph (Karate Club)

G = nx.karate_club_graph() # Try this for faster analysis

# A, community_labels, _ = create_network()
# G = nx.from_scipy_sparse_array(A)

#-----------------------
profiler = cProfile.Profile()
profiler.enable()

#-----------------------

model, device = train_model(G, num_walks = 10, walk_length = 40, 
                            window_size = 6, embedding_dim = 64, 
                            negative_samples = 5, num_epochs = 3, sim_type = "euclidean" )

embeddings = evaluate_model(model, device)

#-----------------------
profiler.disable()

Using device: cuda
Number of walks: 340
Example walk: [22, 32, 2, 28, 2, 7, 2, 7, 2, 3, 7, 3, 12, 0, 13, 2, 8, 33, 13, 33, 18, 32, 2, 28, 33, 28, 2, 3, 2, 13, 1, 13, 3, 13, 3, 12, 3, 7, 3, 13]
Number of positive pairs: 148920
Example pairs: [(22, 32), (22, 2), (22, 28), (22, 2), (22, 7)]
Skipgram walks and pairs generated in 0.1649 seconds.

Starting training...
Epoch 1/3 completed in 6.94s, Loss: 0.7099
Epoch 2/3 completed in 11.01s, Loss: 0.0061
Epoch 3/3 completed in 9.38s, Loss: 0.0012


In [8]:
stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats()

         5478538 function calls (5317895 primitive calls) in 30.317 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     6981    9.512    0.001    9.512    0.001 {method 'run_backward' of 'torch._C._EngineBase' objects}
     6981    1.472    0.000    4.271    0.001 /tmp/ipykernel_1795993/3812967147.py:88(forward)
    20943    1.106    0.000    1.106    0.000 {built-in method torch.embedding}
    20947    0.948    0.000    0.948    0.000 {built-in method torch.tensor}
    13962    0.793    0.000    0.793    0.000 {built-in method torch._C._linalg.linalg_vector_norm}
     6981    0.762    0.000    1.824    0.000 /tmp/ipykernel_1795993/3812967147.py:120(skipgram_loss)
    13965    0.674    0.000    0.674    0.000 {method 'to' of 'torch._C.TensorBase' objects}
    20946    0.665    0.000    0.665    0.000 {built-in method torch._ops.profiler._record_function_enter_new}
        1    0.576    0.576   30.308   30.308 /tmp/ipykerne

<pstats.Stats at 0x7ca05a8cbc70>

In [22]:
import embcom


def create_embedding(G, emb_params = {
                                            "window_length": 10,
                                            "walk_length": 40,
                                            "num_walks": 6,
                                            "dim" : 64,
                                        }):
  
    model = embcom.embeddings.Node2Vec(window_length = emb_params['window_length'], walk_length=emb_params['walk_length'], num_walks=emb_params['num_walks'])
            
    net = nx.adjacency_matrix(G).astype(float)
    model.fit(net)
    emb = model.transform(dim=emb_params['dim'])

    return emb

G = nx.karate_club_graph()

start_t = time.time()
emb_p = create_embedding(G)
print("Time in seconds:", time.time() - start_t)

Time in seconds: 0.02255702018737793


In [21]:
from sklearn.metrics.pairwise import cosine_similarity


def calculate_normalized_embedding_loss(V_a, V_b):
    
    
    def center_embeddings(V):
        # Subtract the mean of each column from the corresponding entries
        return V - np.mean(V, axis=0)

    
    # Step 1: Center both embedding matrices
    V_a_centered = center_embeddings(V_a)
    V_b_centered = center_embeddings(V_b)
    
    # Step 2: Calculate cosine similarity matrices for centered embeddings
    C_a = cosine_similarity(V_a_centered)
    C_b = cosine_similarity(V_b_centered)
    
    # Step 3: Calculate the absolute differences between cosine similarities
    N = V_a.shape[0]
    loss = 0
    
    # Only sum over the upper triangular part of the matrix (i < j)
    for i in range(N):
        for j in range(i+1, N):
            loss += np.abs(C_a[i, j] - C_b[i, j])
    
    # Step 4: Normalize the loss
    normalized_loss = (2 / (N * (N - 1))) * loss
    
    return normalized_loss



loss = calculate_normalized_embedding_loss(emb_p, embeddings)
loss

0.3451614016445388

This is the normalized embedding loss between the embeddings generated by the code and by node2vec that uses gensim.