In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import embedders

DEVICE = torch.device("cuda", 1) # Use the 2nd GPU

In [3]:
# Get some embeddings to work with

# Get graph
D, _, adj = embedders.dataloaders.load("karate_club")
D, adj = D.to(DEVICE), adj.to(DEVICE)
pm = embedders.manifolds.ProductManifold(
    signature=[(-1, 2), (0, 2), (1, 2)], device=DEVICE
)

# Get embeddings
X, _ = embedders.coordinate_learning.train_coords(
    pm=pm, dists=D, device=DEVICE, burn_in_iterations=400, training_iterations=1600
)

# Get stereographic version
pm_stereo, X_stereo = pm.stereographic(X)

Top CC has 34 nodes; original graph has 34 nodes.


  0%|          | 0/2000 [00:00<?, ?it/s]

In [4]:
# Get an adjacency matrix that's not leaky
dists = pm.pdist2(X)
max_dist = dists[dists.isfinite()].max()
dists /= max_dist
A = torch.exp(-dists)
A_hat = embedders.predictors.kappa_gcn.get_A_hat(A).float().to(DEVICE)

In [5]:
from sklearn.model_selection import train_test_split

# Train a KappaGCN on everything
kgcn = embedders.predictors.kappa_gcn.KappaGCN(
    pm=pm_stereo, output_dim=1, hidden_dims=[pm.dim, pm.dim], task="link_prediction"
).to(DEVICE)

# Split on indices, since this is a weird graph thing
y = adj.float().flatten()
train_idx, test_idx = train_test_split(list(range(len(y))), test_size=0.2)

# Fit on train indices using all X, but only some of the y - this masks out certain edges from the loss
kgcn.fit(X, y=y[train_idx], A=A_hat, lr=1e-2, lp_indices=train_idx, use_tqdm=True, epochs=100)

# Predict on test indices
y_pred = kgcn.predict(X, A_hat)[test_idx]

(y_pred == y[test_idx]).float().mean()


  0%|          | 0/100 [00:00<?, ?it/s]

tensor(0.8405, device='cuda:1')

In [6]:
# Copied from notebook 22

def make_link_prediction_dataset(X_embed, pm, adj, add_dists=True):
    # Stack embeddings
    emb = []
    for i in range(len(X_embed)):
        for j in range(len(X_embed)):
            joint_embed = torch.cat([X_embed[i], X_embed[j]])
            emb.append(joint_embed)

    X = torch.stack(emb)

    # Add distances
    if add_dists:
        dists = pm.pdist(X_embed)
        X = torch.cat([X, dists.flatten().unsqueeze(1)], dim=1)

    # y = torch.tensor(adj.flatten())
    if not torch.is_tensor(adj):
        adj = torch.tensor(adj)
    y = adj.flatten()

    # Make a new signature
    new_sig = pm.signature + pm.signature
    if add_dists:
        new_sig.append((0, 1))
    new_pm = embedders.manifolds.ProductManifold(signature=new_sig)

    return X, y, new_pm

In [None]:
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import time

DATASETS = ["dolphins", "football", "karate_club", "lesmis", "polbooks"]#, "adjnoun"]
SIGNATURE = [(-1, 2), (0, 2), (1, 2)]
N_TRIALS = 10
TOTAL_ITERATIONS = 1_000
USE_DISTS = True
USE_TQDM = True
MODELS = [
    "sklearn_dt",
    "sklearn_rf",
    "product_dt",
    "product_rf",
    "tangent_dt",
    "tangent_rf",
    "knn",
    "ps_perceptron",
    "ambient_mlp",
    # "ambient_gnn",
    # "kappa_gcn",
    # "product_mlr",
]
LR = 1e-4
EPOCHS = 4_000

results = []

# for dataset in ["karate_club"]:
my_tqdm = tqdm(total=N_TRIALS * len(DATASETS))
for dataset in DATASETS:
    dists, _, adj = embedders.dataloaders.load(dataset)
    dists, adj = dists.to(DEVICE), adj.to(DEVICE)
    dists = dists / dists[dists.isfinite()].max()

    while len(results) < N_TRIALS:
        pm = embedders.manifolds.ProductManifold(signature=SIGNATURE, device=DEVICE)
        X, _ = embedders.coordinate_learning.train_coords(
            pm=pm,
            dists=dists,
            burn_in_iterations=int(0.1 * TOTAL_ITERATIONS),
            training_iterations=int(0.9 * TOTAL_ITERATIONS),
            scale_factor_learning_rate=0.02,
            device=DEVICE,
        )
        assert not torch.isnan(X).any()

        # Get data for classification variants
        XX, yy, pm_new = make_link_prediction_dataset(X, pm, adj, add_dists=USE_DISTS)
        X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
            XX, yy, list(range(len(yy))), test_size=0.2
        )
        res = embedders.benchmarks.benchmark(
            XX, yy, pm_new, task="classification", score=["accuracy", "f1-micro"], device=DEVICE, models=MODELS
        )

        # Other manifolds we'll need
        pm_stereo, X_stereo = pm.stereographic(X)
        pm_stereo_euc = embedders.manifolds.ProductManifold(
            signature=[(0, X.shape[1])], stereographic=True, device=DEVICE
        )

        # Get an adjacency matrix that's not leaky
        dists = pm.pdist2(X)
        max_dist = dists[dists.isfinite()].max()
        dists /= max_dist
        A = torch.exp(-dists)
        A_hat = embedders.predictors.kappa_gcn.get_A_hat(A).float().to(DEVICE)

        # Ambient GNN
        agnn = embedders.predictors.kappa_gcn.KappaGCN(
            pm=pm_stereo_euc, output_dim=1, hidden_dims=[pm_stereo_euc.dim], task="link_prediction"
        ).to(DEVICE)
        t1 = time.time()
        agnn.fit(X=X, y=y_train, A=A_hat, lr=LR, epochs=EPOCHS, lp_indices=idx_train, use_tqdm=USE_TQDM)
        t2 = time.time()
        y_pred = agnn.predict(X, A_hat)[idx_test]
        res["ambient_gnn_accuracy"] = (y_pred == y_test).float().mean().item()
        res["ambient_gnn_f1_micro"] = f1_score(y_test.cpu(), y_pred.cpu(), average="micro")
        res["ambient_gnn_time"] = t2 - t1

        # Kappa GCN
        kgcn = embedders.predictors.kappa_gcn.KappaGCN(
            pm=pm_stereo, output_dim=1, hidden_dims=[pm_stereo.dim], task="link_prediction"
        ).to(DEVICE)
        t1 = time.time()
        kgcn.fit(X=X_stereo, y=y_train, A=A_hat, lr=LR, epochs=EPOCHS, lp_indices=idx_train, use_tqdm=USE_TQDM)
        t2 = time.time()
        y_pred = kgcn.predict(X_stereo, A_hat)[idx_test]
        res["kappa_gcn_accuracy"] = (y_pred == y_test).float().mean().item()
        res["kappa_gcn_f1_micro"] = f1_score(y_test.cpu(), y_pred.cpu(), average="micro")
        res["kappa_gcn_time"] = t2 - t1

        # Product MLR
        mlr = embedders.predictors.kappa_gcn.KappaGCN(
            pm=pm_stereo, output_dim=1, hidden_dims=[], task="link_prediction"
        ).to(DEVICE)
        t1 = time.time()
        kgcn.fit(X=X_stereo, y=y_train, A=A_hat, lr=LR, epochs=EPOCHS, lp_indices=idx_train, use_tqdm=USE_TQDM)
        t2 = time.time()
        y_pred = kgcn.predict(X_stereo, A_hat)[idx_test]
        res["product_mlr_accuracy"] = (y_pred == y_test).float().mean().item()
        res["product_mlr_f1_micro"] = f1_score(y_test.cpu(), y_pred.cpu(), average="micro")
        res["product_mlr_time"] = t2 - t1

        # Other details
        res["d_avg"] = embedders.metrics.d_avg(pm.pdist(X), dists).item()
        results.append(res)
        my_tqdm.update(1)
        # except Exception as e:
        #     print(e)
        #     pass

  0%|          | 0/50 [00:00<?, ?it/s]

Top CC has 62 nodes; original graph has 62 nodes.


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

  0%|          | 0/4000 [00:00<?, ?it/s]

In [None]:
results_df = pd.DataFrame(results)
results_df.to_csv("../data/results_icml/link_prediction.tsv", sep="\t", index=False)