In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from math import ceil
import numpy as np
from tqdm.auto import tqdm
from os import makedirs
import pickle as pkl

# Make sure the directory for the processed edge lists exists before anything writes to it.
makedirs("data/processed", exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

os.environ["LIBRARY_PATH"] = "/usr/lib/llvm-20/lib"
os.environ["LD_LIBRARY_PATH"] = "/usr/lib/llvm-20/lib"

!make

make: Nothing to be done for 'all'.


In [3]:
# Run configuration for the experiment in this notebook.
# Thread count handed to ./diff.out as a command-line argument.
NUM_THREADS = 24
# Target number of k values in the sweep; determines the step of the k range.
NUM_STEPS = 50
# Cost function name; "uniform" and "degree" are the values accepted further down.
COST_FN = "uniform"
# Dataset identifier used to build the data/ and results/ file paths.
DATASET_NAME = "lastfm_asia"

In [4]:
# Create the per-dataset output directory (no error if it already exists).
makedirs(f"results/{DATASET_NAME}", exist_ok=True)

## 1. Data preparation

### Read the data

In [5]:
# Build an undirected graph from the raw edge list: a header row followed by
# one "a,b" pair of integer node ids per line.
G = nx.Graph()
with open(f"data/raw/{DATASET_NAME}_edges.csv", "r") as f:
    f.readline()  # Skip header
    for line in f:
        line = line.strip()
        if not line:
            # A blank line (e.g. an extra newline at the end of the file) would
            # otherwise make the int() conversion below raise ValueError.
            continue
        a, b = map(int, line.split(","))
        G.add_edge(a, b)
print(f"Number of nodes: {G.number_of_nodes()}")

# Keep only the largest connected component, materialised as a standalone graph
# rather than a view onto the original one.
largest_component = max(nx.connected_components(G), key=len)
G = nx.Graph(G.subgraph(largest_component))
print(f"Number of nodes in largest component: {G.number_of_nodes()}")
print("Number of edges:", G.number_of_edges())

Number of nodes: 7624
Number of nodes in largest component: 7624
Number of edges: 27806


### Save the largest connected component

In [6]:
# Write the processed graph as a headerless CSV edge list, one "u,v" line per edge.
with open(f"data/processed/{DATASET_NAME}_edges.csv", "w") as f:
    f.writelines(f"{u},{v}\n" for u, v in G.edges())

## 2. Cost function definition

In [7]:
from collections import defaultdict

# Node -> degree lookup; nodes that are not in the graph default to 0.
degrees = defaultdict(int, nx.degree(G))

# max_k is the exclusive upper bound of the k sweep and depends on the cost model:
#   "uniform": the number of nodes;
#   "degree":  half of the summed node degrees (kept as a float).
if COST_FN == "uniform":
    max_k = G.number_of_nodes()
elif COST_FN == "degree":
    degree_values = np.array(list(degrees.values()))
    max_k = np.sum(degree_values / 2).item()
else:
    raise ValueError(f"Unknown cost function: {COST_FN}")

In [8]:
# Sweep k from 1 up to (excluding) int(max_k) in about NUM_STEPS evenly spaced
# steps. For each k, run the compiled ./diff.out binary on the processed edge
# list and pickle what it reports.
# NOTE(review): the step is ceil((max_k - 1) / NUM_STEPS), which is 0 when
# max_k == 1 and would make range() raise ValueError.
for k in tqdm(range(1, int(max_k), ceil((max_k - 1) / NUM_STEPS))):
    # IPython shell capture: `output` is the list of text lines produced by the command.
    output = !time ./diff.out ./data/processed/{DATASET_NAME}_edges.csv {NUM_THREADS} {COST_FN} {k}
    # NOTE(review): the fixed indices below assume a specific layout of the
    # diff.out output followed by the shell's `time` report -- confirm against
    # the binary; a failed run would surface here as a parsing error.
    # Every line from index 2 up to (excluding) the sixth-from-last is parsed as
    # an integer; presumably these are the ids of the selected nodes.
    S = set(map(int, output[2:-6]))
    # Fifth-from-last line has the form "<overall_k>/<desired_k>"; both parts stay strings.
    overall_k, desired_k = output[-5].split("/")
    # Second whitespace-separated field of the third-from-last line; presumably
    # the `real` elapsed time printed by `time`. Note this binds the global name `time`.
    time = output[-3].split()[1]
    # One pickle per k, holding the tuple (S, overall_k, desired_k, time).
    with open(f"results/{DATASET_NAME}/{COST_FN}_greedy_{k}.pkl", "wb") as f:
        pkl.dump((S, overall_k, desired_k, time), f)

100%|██████████| 50/50 [8:31:56<00:00, 614.34s/it]   


### Save the largest connected component