In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from math import ceil
import numpy as np
from tqdm.auto import tqdm
from os import makedirs

# Ensure the output directory for the processed edge lists exists before any
# later cell writes into it; exist_ok=True makes re-running this cell harmless.
makedirs("data/processed", exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os

# Point both the link-time (LIBRARY_PATH) and run-time (LD_LIBRARY_PATH) library
# search paths at the LLVM 20 install. Child processes started from this kernel
# (the shell commands in later cells) inherit these values.
# NOTE(review): any value these variables already had is overwritten, not extended.
os.environ.update(
    {
        "LIBRARY_PATH": "/usr/lib/llvm-20/lib",
        "LD_LIBRARY_PATH": "/usr/lib/llvm-20/lib",
    }
)

# Run `make` in the notebook's working directory via an IPython shell escape.
# Presumably this builds the ./diff.out binary invoked further down — confirm
# against the project's Makefile.
!make

In [3]:
# Run configuration. All three values are interpolated into the ./diff.out
# command line in the benchmarking cell further down.

# Second argument passed to ./diff.out; presumably its worker-thread count.
NUM_THREADS = 24
# Third argument passed to ./diff.out; selects the cost function by name.
# NOTE(review): the set of accepted names is defined by the binary — confirm there.
COST_FN = "uniform"
# Dataset stem: the raw edges are read from data/raw/{DATASET_NAME}_edges.csv and
# the cleaned edge list is written to data/processed/{DATASET_NAME}_edges.csv.
DATASET_NAME = "lastfm_asia"

## 1. Data preparation

### Read the data

In [4]:
# Build an undirected graph from the raw edge list: one header line followed by
# one "a,b" pair of integer node ids per line.
raw_graph = nx.Graph()
with open(f"data/raw/{DATASET_NAME}_edges.csv", "r") as f:
    f.readline()  # Skip header
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a stray empty line at the end of the
            # file); they would otherwise crash the int() conversion below.
            continue
        a, b = map(int, line.split(","))
        raw_graph.add_edge(a, b)
print(f"Number of nodes: {raw_graph.number_of_nodes()}")

if raw_graph.number_of_nodes() == 0:
    # Fail with a clear message instead of max() complaining about an empty sequence.
    raise ValueError(f"No edges found in data/raw/{DATASET_NAME}_edges.csv")

# Keep only the largest connected component. The .copy() turns the read-only
# subgraph view into an independent graph that does not keep referencing raw_graph.
largest_component = max(nx.connected_components(raw_graph), key=len)
G = raw_graph.subgraph(largest_component).copy()
print(f"Number of nodes in largest component: {G.number_of_nodes()}")
print("Number of edges:", G.number_of_edges())

Number of nodes: 7624
Number of nodes in largest component: 7624
Number of edges: 27806


### Save the largest connected component

In [5]:
# Write the edges of G as headerless "u,v" rows, one edge per line, to the
# processed-data file for this dataset.
with open(f"data/processed/{DATASET_NAME}_edges.csv", "w") as edge_file:
    edge_file.writelines(f"{u},{v}\n" for u, v in G.edges())

In [9]:
# Run the external ./diff.out binary five times, passing k + 1 = 1..5 as its last
# argument, and print one timing value per run.
# NOTE(review): the saved output under this cell shows growing integer lists
# (which look like S), not timing values, so it appears to come from an earlier
# version of the cell — re-run to refresh it.
# NOTE(review): S, overall_k and desired_k are computed but not used in this cell.
for k in range(5):
    # `output = !cmd` captures what the command prints as a list of lines; the
    # {...} placeholders are filled in from Python variables before the shell runs.
    output = !time ./diff.out ./data/processed/{DATASET_NAME}_edges.csv {NUM_THREADS} {COST_FN} {k + 1}
    # The fixed offsets below assume a specific layout: two leading lines, then
    # one integer per line, then a six-line tail that includes the report of the
    # shell's `time`. Any extra or missing line shifts them silently — confirm
    # against the actual output of ./diff.out.
    S = list(map(int, output[2:-6]))
    # Fifth line from the end is expected to read "<overall>/<desired>".
    overall_k, desired_k = output[-5].split("/")
    # Second whitespace-separated token of the third line from the end;
    # presumably the `real` elapsed time — verify.
    # NOTE(review): this name would shadow the stdlib `time` module if it is
    # ever imported in this notebook.
    time = output[-3].split()[1]
    print(time)

[7237]
[7237, 524]
[7237, 524, 3530]
[7237, 524, 3530, 3450]
[7237, 524, 3530, 3450, 4785]


### Save the largest connected component