In this notebook, we use the node2vecs package as modified in "Final - Modding Sadamori's Torch N2V.ipynb" in the previously timestamped folder. There, we extended the loss functions to support cosine and Euclidean distances in addition to dot-product similarity, and used autograd to verify our manual Jacobian calculations.
We now generate embeddings through this modified node2vec to suit our needs later. We want to use the embeddings to run clustering with different distance metrics to see if there's something interesting.


I want to use another set of networks here, because the previous run showed some unexplained aberrations in the low-mixing (small mu) limit.

In [None]:
import os
import numpy as np
import scipy.sparse as sp
import pandas as pd
import pickle
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
from node2vecs import TorchNode2Vec

def process_task(task):
    """
    Train node2vec embeddings for one (k, run, mu) LFR network and pickle them.

    The network for the task is loaded from the input directory, one TorchNode2Vec
    model is trained per similarity metric (currently only "cosine"), and the
    resulting ``{metric: embedding}`` dict is pickled into the matching
    ``Run_<run>`` folder of the output directory. The network and the community
    table are NOT copied to the output directory.

    Missing/unreadable inputs, training errors and save errors are reported with
    print() instead of being raised. Nothing is written (no directory, no pickle)
    when the inputs are unusable or when no model trained successfully.

    task: tuple (k, run, mu, task_idx)
        k        -- value substituted for ``k`` in the directory and file names
        run      -- run number, selects the ``Run_<run>`` sub-directory
        mu       -- mixing parameter; ``str(mu)`` must match the input file names
        task_idx -- running task counter, used for log prefixes and GPU selection

    Returns None.
    """
    k, run, mu, task_idx = task
    # Alternate GPU devices: tasks with even task_idx use cuda:0, odd use cuda:1
    # NOTE(review): this assumes two CUDA devices are visible -- confirm on the target machine.
    device = f"cuda:{task_idx % 2}"

    # Define input and output directories for this k value
    input_base = f"/nobackup/gogandhi/alt_means_sans_k/data/experiment_mu_change_10000_{k}_3.0_minc50"
    output_base = f"/nobackup/gogandhi/alt_means_sans_k/data/experiment_n2v_metric_cosine_change_10000_{k}_3.0_minc50"

    input_run_dir = os.path.join(input_base, f"Run_{run}")
    output_run_dir = os.path.join(output_base, f"Run_{run}")

    mu_str = f"{mu}"
    net_filename = f"net_LFR_n_10000_tau1_3.0_tau2_1.0_mu_{mu_str}_k_{k}_mincomm_50.npz"
    comm_filename = f"community_table_LFR_n_10000_tau1_3.0_tau2_1.0_mu_{mu_str}_k_{k}_mincomm_50.npz"

    net_path = os.path.join(input_run_dir, net_filename)
    comm_path = os.path.join(input_run_dir, comm_filename)

    if not os.path.exists(net_path):
        print(f"[Task {task_idx}] Network file not found: {net_path}")
        return
    if not os.path.exists(comm_path):
        print(f"[Task {task_idx}] Community file not found: {comm_path}")
        return

    # Load the network as a scipy sparse matrix
    try:
        A = sp.load_npz(net_path)
    except Exception as e:
        print(f"[Task {task_idx}] Error loading network file {net_path}: {e}")
        return

    # Parse the community table purely as an input sanity check: the table itself
    # is not used below, but a task whose table cannot be read is skipped.
    # NOTE(review): the file carries an .npz extension yet is parsed as CSV --
    # confirm the on-disk format of the community tables.
    try:
        pd.read_csv(comm_path)
    except Exception as e:
        print(f"[Task {task_idx}] Error loading community file {comm_path}: {e}")
        return

    # Define the similarity metrics to test
    similarity_measures = ["cosine"]
    embeddings_dict = {}

    for sim in similarity_measures:
        print(f"[Task {task_idx}] Training model using {sim} similarity for Run {run}, mu {mu_str}, k {k} on {device}...")
        try:
            model = TorchNode2Vec(
                vector_size=64,
                similarity_metric=sim,
                device=device,
                num_workers=1
            )
            model.fit(A)
            # Stored exactly as returned by the model; one entry per metric.
            embeddings_dict[sim] = model.transform()
        except Exception as e:
            # A failed metric must not prevent the remaining metrics from training.
            print(f"[Task {task_idx}] Error training {sim} model for Run {run}, mu {mu_str}, k {k} on {device}: {e}")

    # An empty pickle would look like a finished task to downstream code, so do
    # not write anything when every training attempt failed.
    if not embeddings_dict:
        print(f"[Task {task_idx}] No embeddings were trained for Run {run}, mu {mu_str}, k {k}; nothing saved.")
        return

    # Save the embeddings dictionary as a pickle file
    out_filename = f"embeddings_LFR_n_10000_tau1_3.0_tau2_1.0_mu_{mu_str}_k_{k}_mincomm_50.pkl"
    out_path = os.path.join(output_run_dir, out_filename)
    try:
        # Create the output folder only now that there is something to store in it.
        os.makedirs(output_run_dir, exist_ok=True)
        with open(out_path, "wb") as f:
            pickle.dump(embeddings_dict, f)
        print(f"[Task {task_idx}] Saved embeddings to {out_path}")
    except Exception as e:
        print(f"[Task {task_idx}] Error saving embeddings to {out_path}: {e}")

# Process k values sequentially while parallelizing runs and mu-values.
# For each k, one task is created per (run, mu) pair: runs 1..10 and
# mu = 0.05, 0.10, ..., 1.00 (rounded to 2 decimals so str(mu) is a clean value).
for k in [50]:
    tasks = []
    mu_values = [round(x, 2) for x in np.arange(0.05, 1.01, 0.05)]
    #mu_values = [0.05]
    # task_idx is a running counter over all (run, mu) pairs; process_task uses
    # it for log prefixes and to pick the GPU.
    task_idx = 0
    for run in range(1, 11):
        for mu in mu_values:
            tasks.append((k, run, mu, task_idx))
            task_idx += 1

    # Use a ProcessPoolExecutor to run tasks in parallel; adjust max_workers as needed
    failed_tasks = []
    with ProcessPoolExecutor(max_workers=20) as executor:
        # Map each future back to its task so a failure can be attributed.
        futures = {executor.submit(process_task, task): task for task in tasks}
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing k={k}"):
            # Calling result() re-raises anything that escaped the worker (or a
            # crashed worker process); without it such failures pass unnoticed.
            try:
                future.result()
            except Exception as e:
                failed_k, failed_run, failed_mu, failed_idx = futures[future]
                failed_tasks.append(futures[future])
                print(f"[Task {failed_idx}] Worker failed for Run {failed_run}, mu {failed_mu}, k {failed_k}: {e!r}")

    if failed_tasks:
        print(f"k={k}: {len(failed_tasks)} of {len(tasks)} tasks raised an exception in the worker.")

Processing k=50:   0%|                                                                                | 0/200 [00:00<?, ?it/s]

[Task 2] Training model using cosine similarity for Run 1, mu 0.15, k 50 on cuda:0...
[Task 11] Training model using cosine similarity for Run 1, mu 0.6, k 50 on cuda:1...[Task 6] Training model using cosine similarity for Run 1, mu 0.35, k 50 on cuda:0...

[Task 12] Training model using cosine similarity for Run 1, mu 0.65, k 50 on cuda:0...[Task 1] Training model using cosine similarity for Run 1, mu 0.1, k 50 on cuda:1...

[Task 8] Training model using cosine similarity for Run 1, mu 0.45, k 50 on cuda:0...
[Task 9] Training model using cosine similarity for Run 1, mu 0.5, k 50 on cuda:1...
[Task 4] Training model using cosine similarity for Run 1, mu 0.25, k 50 on cuda:0...
[Task 0] Training model using cosine similarity for Run 1, mu 0.05, k 50 on cuda:0...[Task 13] Training model using cosine similarity for Run 1, mu 0.7, k 50 on cuda:1...

[Task 10] Training model using cosine similarity for Run 1, mu 0.55, k 50 on cuda:0...
[Task 5] Training model using cosine similarity for Ru

 39%|████████████████████████████▌                                            | 12210/31250 [11:45<10:38, 29.81it/s, loss=1.3]