In [None]:

!pip install dill
!pip install dotmap
!pip install  dgl -f https://data.dgl.ai/wheels/cu117/repo.html

Collecting dill
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.7/119.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.4.0
Collecting dotmap
  Downloading dotmap-1.3.30-py3-none-any.whl.metadata (3.2 kB)
Downloading dotmap-1.3.30-py3-none-any.whl (11 kB)
Installing collected packages: dotmap
Successfully installed dotmap-1.3.30
Looking in links: https://data.dgl.ai/wheels/cu117/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu117/dgl-2.1.0%2Bcu117-cp311-cp311-manylinux1_x86_64.whl (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting torchdata>=0.5.0 (from dgl)
  Downloading 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip uninstall -y dgl


Found existing installation: dgl 2.1.0+cu117
Uninstalling dgl-2.1.0+cu117:
  Successfully uninstalled dgl-2.1.0+cu117


In [None]:
!pip install dgl==1.1.2 -f https://data.dgl.ai/wheels/repo.html


Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl==1.1.2
  Downloading dgl-1.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (530 bytes)
Downloading dgl-1.1.2-cp311-cp311-manylinux1_x86_64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dgl
Successfully installed dgl-1.1.2


In [None]:
# === Basic libraries ===
import numpy as np
import pandas as pd
import random
from collections import defaultdict

# === PyTorch ===
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# === DGL for graph operations ===
import dgl
from dgl.nn import GraphConv

# === ML Evaluation ===
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

# === Misc ===
import os
import warnings
warnings.filterwarnings("ignore")

# === Reproducibility: seed every RNG this notebook draws from ===
seed = 42
random.seed(seed)        # Python's stdlib generator
np.random.seed(seed)     # NumPy's legacy global generator
torch.manual_seed(seed)  # PyTorch generator

# === Compute device: use the GPU when one is visible, otherwise the CPU ===
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
import pandas as pd
import torch

# Labelled signed edges. The columns read by this notebook are 'src', 'dst' and 'label'.
label_df = pd.read_csv("/content/drive/MyDrive/bitcoin_alpha/label_train.csv")

# Node ids are assumed to be 0-based, so the node count is the largest endpoint id plus one.
largest_id = max(label_df['src'].max(), label_df['dst'].max())
num_nodes = int(largest_id + 1)

# No real node attributes are read from disk: every node gets a random
# 64-dimensional standard-normal vector (depends on the torch seed set earlier).
features = torch.randn(num_nodes, 64, dtype=torch.float32)

# Both lookups are keyed by the single node type 'user', as dgl.heterograph expects.
num_nodes_dict = dict(user=num_nodes)
node_features = dict(user=features)

print(f"✅ User features loaded. Shape: {features.shape}")


✅ User features loaded. Shape: torch.Size([3783, 64])


In [None]:
import dgl

# Split the labelled edges by sign: label 1 -> positive, label 0 -> negative.
pos_edges = label_df.loc[label_df['label'].eq(1)]
neg_edges = label_df.loc[label_df['label'].eq(0)]

# Endpoint ids of each sign as int64 tensors.
src_pos, dst_pos = (
    torch.tensor(pos_edges[col].values, dtype=torch.int64) for col in ('src', 'dst')
)
src_neg, dst_neg = (
    torch.tensor(neg_edges[col].values, dtype=torch.int64) for col in ('src', 'dst')
)

# One node type ('user') and one edge type per sign.
graph_data = {
    ('user', 'positive', 'user'): (src_pos, dst_pos),
    ('user', 'negative', 'user'): (src_neg, dst_neg),
}
graph = dgl.heterograph(graph_data, num_nodes_dict=num_nodes_dict)

# Attach the node features under the key the encoder reads ('feature').
graph.nodes['user'].data['feature'] = node_features['user']

print("✅ Graph loaded with:")
print(f"  Nodes: {graph.num_nodes('user')}")
print(f"  Positive edges: {graph.num_edges(('user', 'positive', 'user'))}")
print(f"  Negative edges: {graph.num_edges(('user', 'negative', 'user'))}")


✅ Graph loaded with:
  Nodes: 3783
  Positive edges: 22650
  Negative edges: 1536


In [None]:
from torch.utils.data import Dataset

class LabelPairs(Dataset):
    """Map-style dataset over labelled node pairs.

    Built from a DataFrame with 'src', 'dst' and 'label' columns. Item ``i`` is
    ``((src_i, dst_i), label_i)``: the two endpoints as int64 scalar tensors and
    the label as a float32 scalar tensor.
    """

    def __init__(self, df):
        endpoints = {
            col: torch.tensor(df[col].values, dtype=torch.long)
            for col in ('src', 'dst')
        }
        self.u = endpoints['src']
        self.v = endpoints['dst']
        self.labels = torch.tensor(df['label'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        pair = (self.u[index], self.v[index])
        return pair, self.labels[index]

# Wrap every row of label_df as one ((src, dst), label) sample.
label_dataset = LabelPairs(label_df)
print("✅ Label dataset ready. Total pairs:", len(label_dataset))


✅ Label dataset ready. Total pairs: 24186


In [None]:
import networkx as nx

def get_unstable_edges_from_balance_theory(dgl_graph):
    """
    Return the undirected edges that take part in at least one unbalanced triad.

    A triad (triangle) is unbalanced when the product of its three edge signs
    is negative. Edge direction is ignored and self-loops are skipped. Edges
    of a canonical edge type containing 'positive' count as +1, edges of every
    other type as -1. When the same undirected pair occurs more than once, the
    sign seen last (in ``canonical_etypes`` / edge order) wins.

    Triangles are enumerated directly from neighbour sets, so only 3-node
    cliques are ever visited (no enumeration of larger cliques).

    Args:
        dgl_graph: graph exposing ``canonical_etypes`` and ``edges(etype=...)``
            returning two id sequences with a ``tolist()`` method.

    Returns:
        set of ``(a, b)`` tuples with ``a < b``.
    """

    # === Step 1: collapse to an undirected signed edge map + adjacency sets ===
    edge_sign = {}
    neighbours = {}

    for canonical_etype in dgl_graph.canonical_etypes:
        src, dst = dgl_graph.edges(etype=canonical_etype)
        sign = +1 if 'positive' in canonical_etype else -1

        for u, v in zip(src.tolist(), dst.tolist()):
            if u == v:
                continue
            a, b = (u, v) if u < v else (v, u)  # consistent undirected key
            edge_sign[(a, b)] = sign
            neighbours.setdefault(a, set()).add(b)
            neighbours.setdefault(b, set()).add(a)

    # === Step 2: triangle enumeration ===
    # Every triangle a < b < c is visited exactly once, through its smallest
    # edge (a, b) and the common neighbours c that are larger than b.
    unstable_edges = set()

    for (a, b), s_ab in edge_sign.items():
        for c in neighbours[a] & neighbours[b]:
            if c <= b:
                continue
            sign_product = s_ab * edge_sign[(b, c)] * edge_sign[(a, c)]

            # If unbalanced (i.e., negative product), mark all three edges
            if sign_product < 0:
                unstable_edges.update(((a, b), (b, c), (a, c)))

    print(f"✅ Found {len(unstable_edges)} unstable edges based on balance theory.")
    return unstable_edges


In [None]:
# Computed once on the original graph; augment_graph() reads this module-level set.
unstable_edges = get_unstable_edges_from_balance_theory(graph)



✅ Found 3230 unstable edges based on balance theory.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
from dgl.nn import GraphConv

# === GCN encoder with 2 layers ===
class GCNEncoder(nn.Module):
    """Two stacked GraphConv layers; ReLU follows the first layer only.

    The input features are read from ``g.ndata['feature']`` and the output is
    restricted to the rows listed in ``nids``.
    """

    def __init__(self, in_feats, hidden_feats, out_feats):
        super(GCNEncoder, self).__init__()
        self.conv1 = GraphConv(in_feats, hidden_feats, activation=F.relu)
        self.conv2 = GraphConv(hidden_feats, out_feats)

    def forward(self, g, nids):
        # Both layers run over every node of g; the selection happens afterwards.
        hidden = self.conv1(g, g.ndata['feature'])
        embeddings = self.conv2(g, hidden)
        return embeddings[nids]

# === SGCL Model: encoder + MLP + loss functions ===
class SGCLModel(nn.Module):
    """GCN encoder plus an MLP edge scorer, with contrastive and label losses.

    Args:
        in_dim: width of the node features fed to the encoder.
        hidden_dim: hidden width of the encoder.
        out_dim: embedding width; the scorer consumes two concatenated embeddings.
    """

    def __init__(self, in_dim=64, hidden_dim=128, out_dim=64):
        super().__init__()
        self.encoder = GCNEncoder(in_dim, hidden_dim, out_dim)
        self.mlp = nn.Sequential(
            nn.Linear(out_dim * 2, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def get_embeddings(self, g, nids):
        """Encode ``g`` and return the embedding rows selected by ``nids``.

        The graph is converted to a homogeneous graph (keeping the 'feature'
        node data) and self-loops are added so no node has zero in-degree.

        NOTE(review): homogenising merges all edge types and the encoder is
        called without edge weights, so positive and negative edges appear to
        be treated identically here. If so, two graphs that differ only in
        edge sign produce the same embeddings — confirm this is intended.

        Raises:
            KeyError: if ``g`` carries no 'feature' node data.
        """
        if 'feature' not in g.ndata:
            raise KeyError("Input graph is missing node features before homogenization.")

        # Convert to homogeneous and keep features
        g = dgl.to_homogeneous(g, ndata=['feature'])

        # Add self-loops to fix the zero in-degree problem
        g = dgl.add_self_loop(g)

        return self.encoder(g, nids)

    def compute_contrastive_loss(self, z1, z2):
        """Return ``2 - 2 * mean(cosine(z1[i], z2[i]))`` over the rows."""
        z1 = F.normalize(z1, dim=1)
        z2 = F.normalize(z2, dim=1)
        return 2 - 2 * (z1 * z2).sum(dim=1).mean()

    def predict(self, z, u, v):
        """Score the pairs ``(u[i], v[i])``; ``u`` and ``v`` index rows of ``z``.

        Returns one logit per pair as a 1-D tensor. Only the trailing size-1
        dimension is squeezed, so a batch of one pair keeps shape ``[1]``
        instead of collapsing to a 0-d tensor.
        """
        h_u = z[u]
        h_v = z[v]
        h = torch.cat([h_u, h_v], dim=1)
        return self.mlp(h).squeeze(-1)

    def compute_label_loss(self, scores, labels):
        """Binary cross-entropy between raw logits and float 0/1 labels."""
        return F.binary_cross_entropy_with_logits(scores, labels)

# === Augmentation: feature masking ===
def generate_attr_graph(g, mask_ratio=0.3):
    """Return a clone of ``g`` whose 'feature' entries are partly replaced by noise.

    Each feature entry is independently kept when a uniform draw exceeds
    ``mask_ratio``; otherwise it is replaced by a standard-normal sample.
    The input graph's own features are left as they are.

    Raises:
        KeyError: if the graph has no 'feature' node data.
    """
    view = g.clone()
    if 'feature' not in view.ndata:
        raise KeyError("Original graph is missing 'feature' in node data.")

    original = view.ndata['feature']
    gaussian = torch.randn_like(original)                     # replacement values
    keep = (torch.rand_like(original) > mask_ratio).float()   # 1 = keep, 0 = replace

    view.ndata['feature'] = original * keep + gaussian * (1 - keep)
    return view

# === Augmentation: structure perturbation ===
def generate_stru_graph(g, drop_ratio=0.2):
    """Return a structurally perturbed copy of ``g``.

    For every canonical edge type, ``int(num_edges * drop_ratio)`` randomly
    chosen edges are removed and the same number of edges with uniformly
    random 'user' endpoints are added, so the edge count per type is
    preserved. The random edges are not checked for self-loops or duplicates.
    Node features are shared with ``g`` (not copied).

    The replacement endpoints are cast to the dtype and device of the graph's
    own edge tensors, so the concatenation also works for int32 or CUDA graphs.

    Raises:
        KeyError: if ``g`` has no 'feature' node data.
    """
    if 'feature' not in g.ndata:
        raise KeyError("Original graph is missing 'feature' in node data.")

    num_users = g.num_nodes('user')
    new_data = {}

    for canonical_etype in g.canonical_etypes:
        src, dst = g.edges(etype=canonical_etype)
        num_edges = len(src)
        num_drop = int(num_edges * drop_ratio)

        # Drop a random portion: keep everything after the first num_drop
        # positions of a random permutation.
        keep = torch.randperm(num_edges)[num_drop:]
        src_keep = src[keep]
        dst_keep = dst[keep]

        # Draw replacement endpoints, then match the existing tensors.
        new_src = torch.randint(0, num_users, (num_drop,)).to(device=src.device, dtype=src.dtype)
        new_dst = torch.randint(0, num_users, (num_drop,)).to(device=dst.device, dtype=dst.dtype)

        new_data[canonical_etype] = (
            torch.cat([src_keep, new_src]),
            torch.cat([dst_keep, new_dst]),
        )

    g_stru = dgl.heterograph(new_data, num_nodes_dict={'user': num_users})

    # Share features with the original graph
    g_stru.ndata['feature'] = g.ndata['feature']
    return g_stru
#sign perturbation
def generate_sign_flip_graph(g, flip_ratio=0.2):
    """Return a copy of ``g`` in which a random share of edges changes sign.

    For every canonical edge type, ``int(flip_ratio * num_edges)`` randomly
    chosen edges are moved to the opposite-sign edge type ('positive' <->
    'negative'); the rest stay where they are. Edge types that are neither
    positive nor negative keep all their edges. Node features are shared with
    ``g`` (not copied).

    The new graph is built from scratch, so the input graph is never
    deep-copied.
    """
    # etype -> ([src chunks], [dst chunks]), concatenated once at the end
    collected = {}

    def _add(etype, src_ids, dst_ids):
        src_chunks, dst_chunks = collected.setdefault(etype, ([], []))
        src_chunks.append(src_ids)
        dst_chunks.append(dst_ids)

    for canonical_etype in g.canonical_etypes:
        src, dst = g.edges(etype=canonical_etype)
        num_edges = len(src)
        num_flip = int(flip_ratio * num_edges)

        # Randomly select edges to flip
        perm = torch.randperm(num_edges)
        flip_idx = perm[:num_flip]
        keep_idx = perm[num_flip:]

        # Determine the edge type the flipped edges move to
        if 'positive' in canonical_etype:
            flipped_type = ('user', 'negative', 'user')
        elif 'negative' in canonical_etype:
            flipped_type = ('user', 'positive', 'user')
        else:
            flipped_type = canonical_etype  # fallback (no flip)

        _add(canonical_etype, src[keep_idx], dst[keep_idx])
        _add(flipped_type, src[flip_idx], dst[flip_idx])

    new_data = {
        etype: (torch.cat(src_chunks), torch.cat(dst_chunks))
        for etype, (src_chunks, dst_chunks) in collected.items()
    }

    # Reconstruct heterograph
    g_flipped = dgl.heterograph(new_data, num_nodes_dict={'user': g.num_nodes('user')})
    g_flipped.ndata['feature'] = g.ndata['feature']

    return g_flipped

#using the balance theory

def generate_balance_theory_augmented_graph(g, unstable_edges, flip_prob=0.5):
    """
    Flip the sign of edges that belong to unbalanced triads with probability flip_prob.

    Args:
        g: heterograph with 'positive' / 'negative' edge types between 'user' nodes.
        unstable_edges: container of undirected ``(min_id, max_id)`` pairs; only
            edges whose endpoints match one of these pairs can be flipped.
        flip_prob: probability with which each such edge moves to the
            opposite-sign edge type. One ``random.random()`` draw is made per
            candidate edge, so the result follows Python's ``random`` seed.

    Returns:
        A new heterograph on the same number of 'user' nodes, sharing the
        'feature' node data of ``g``.

    Edge id tensors are always created as int64, including when a list is
    empty (e.g. every edge of a type was flipped); without the explicit dtype
    an empty list would become a float tensor.
    """
    import random

    new_data = {}

    def _ids(values):
        # Explicit dtype: torch.tensor([]) would otherwise default to float32.
        return torch.tensor(values, dtype=torch.int64)

    def _extend(etype, src_ids, dst_ids):
        # Append to an edge type that may already hold edges from the other sign.
        if etype in new_data:
            old_src, old_dst = new_data[etype]
            new_data[etype] = (torch.cat([old_src, src_ids]), torch.cat([old_dst, dst_ids]))
        else:
            new_data[etype] = (src_ids, dst_ids)

    # Loop through each edge type (positive, negative)
    for canonical_etype in g.canonical_etypes:
        src, dst = g.edges(etype=canonical_etype)
        sign = 'positive' if 'positive' in canonical_etype else 'negative'
        new_type = ('user', 'positive', 'user') if sign == 'negative' else ('user', 'negative', 'user')

        keep_src, keep_dst = [], []
        flip_src, flip_dst = [], []

        for u, v in zip(src.tolist(), dst.tolist()):
            a, b = min(u, v), max(u, v)  # undirected comparison

            if (a, b) in unstable_edges and random.random() < flip_prob:
                # Flip this edge to the opposite sign
                flip_src.append(u)
                flip_dst.append(v)
            else:
                # Keep edge in original sign
                keep_src.append(u)
                keep_dst.append(v)

        # Unflipped edges stay under their original type (registered even when empty)
        _extend(canonical_etype, _ids(keep_src), _ids(keep_dst))

        # Flipped edges go to the opposite-sign type
        if flip_src:
            _extend(new_type, _ids(flip_src), _ids(flip_dst))

    # Rebuild the graph
    g_new = dgl.heterograph(new_data, num_nodes_dict={'user': g.num_nodes('user')})
    g_new.ndata['feature'] = g.ndata['feature']

    return g_new



# === Wrapper to generate both augmentations ===
def augment_graph(g):
    """Build the two augmented views of ``g`` used for contrastive learning.

    Returns ``(structure_view, balance_view)``:
      * ``structure_view`` — ``generate_stru_graph(g, drop_ratio=0.3)``.
        ``generate_attr_graph`` and ``generate_sign_flip_graph`` are drop-in
        alternatives for this first view.
      * ``balance_view`` — ``generate_balance_theory_augmented_graph`` with
        ``flip_prob=0.3``.

    Both views are derived from the argument ``g`` (not from a module-level
    graph). The set of unstable edges is read from the module-level
    ``unstable_edges``, so it should have been computed on the same graph.
    """
    structure_view = generate_stru_graph(g, drop_ratio=0.3)
    balance_view = generate_balance_theory_augmented_graph(g, unstable_edges, flip_prob=0.3)
    return structure_view, balance_view


In [None]:
# === Hyperparameters ===
# Widths: node features in, GCN hidden layer, final embedding.
in_dim, hidden_dim, out_dim = 64, 128, 64
batch_size = 256    # labelled pairs per optimisation step
epochs = 20         # passes over the labelled pairs
alpha = 0.5         # weight of the contrastive term in the joint loss
lr = 1e-3           # Adam learning rate

# === Model and optimizer ===
model = SGCLModel(in_dim=in_dim, hidden_dim=hidden_dim, out_dim=out_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# === Shuffled mini-batches of labelled pairs ===
from torch.utils.data import DataLoader
train_loader = DataLoader(dataset=label_dataset, batch_size=batch_size, shuffle=True)

print("✅ Model and DataLoader ready.")

✅ Model and DataLoader ready.


In [None]:
# The clean graph never changes during training, so place it on the device once.
graph_dev = graph.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0  # sum of the per-batch joint losses (not an average)

    for (pair, label) in train_loader:
        u, v = pair
        u = u.to(device)
        v = v.to(device)
        label = label.to(device)

        # === Two fresh stochastic views per batch.
        # augment_graph returns the structure-perturbed view first and the
        # balance-theory sign-flipped view second.
        view_a, view_b = augment_graph(graph)
        view_a = view_a.to(device)
        view_b = view_b.to(device)

        # === Unique node ids in this batch, plus each endpoint's row within `nids`
        nids, local = torch.unique(torch.cat([u, v]), return_inverse=True)
        u_local = local[:u.numel()]
        v_local = local[u.numel():]

        # === Embeddings of the batch nodes under both views
        z1 = model.get_embeddings(view_a, nids)
        z2 = model.get_embeddings(view_b, nids)

        # === Contrastive loss between views
        loss_cl = model.compute_contrastive_loss(z1, z2)

        # === Link prediction logits from the clean graph.
        # reshape(-1) keeps a 1-D score vector even for a final batch of one pair.
        z_batch = model.get_embeddings(graph_dev, nids)
        scores = model.predict(z_batch, u_local, v_local).reshape(-1)

        loss_pred = model.compute_label_loss(scores, label)

        # === Joint loss
        loss = loss_pred + alpha * loss_cl

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f}")


Epoch 1/20 | Loss: 31.1148
Epoch 2/20 | Loss: 24.3636
Epoch 3/20 | Loss: 23.2418
Epoch 4/20 | Loss: 21.9858
Epoch 5/20 | Loss: 19.8112
Epoch 6/20 | Loss: 18.7219
Epoch 7/20 | Loss: 17.4291
Epoch 8/20 | Loss: 16.3470
Epoch 9/20 | Loss: 15.6584
Epoch 10/20 | Loss: 15.0984
Epoch 11/20 | Loss: 14.6209
Epoch 12/20 | Loss: 14.0638
Epoch 13/20 | Loss: 13.6804
Epoch 14/20 | Loss: 13.3383
Epoch 15/20 | Loss: 13.0944
Epoch 16/20 | Loss: 12.7786
Epoch 17/20 | Loss: 12.4967
Epoch 18/20 | Loss: 12.2543
Epoch 19/20 | Loss: 12.2871
Epoch 20/20 | Loss: 12.0811


In [None]:
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score
)

def evaluate(model, graph, dataset, device):
    """Score every labelled pair in ``dataset`` and print classification metrics.

    Embeddings for all 'user' nodes are computed once on ``graph``; the pairs
    are then scored in batches, entirely under ``torch.no_grad()``.
    Probabilities are the sigmoid of the model's logits and are thresholded at
    0.5 for the F1 / precision / recall figures; AUC uses the raw probabilities.

    Args:
        model: object providing ``eval()``, ``get_embeddings(graph, nids)`` and
            ``predict(z, u, v)``.
        graph: graph with a 'user' node type; moved to ``device`` for encoding.
        dataset: map-style dataset yielding ``((u, v), label)`` items.
        device: device on which embeddings and scores are computed.

    Returns:
        None; the metrics are printed.
    """
    from torch.utils.data import DataLoader

    model.eval()

    preds = []
    trues = []

    with torch.no_grad():
        all_nids = torch.arange(graph.num_nodes('user')).to(device)
        z = model.get_embeddings(graph.to(device), all_nids)

        # Score in large batches instead of one pair at a time.
        for (u, v), label in DataLoader(dataset, batch_size=4096, shuffle=False):
            scores = model.predict(z, u.to(device), v.to(device))
            # reshape(-1) also covers a batch of one whose score is 0-d
            preds.extend(torch.sigmoid(scores).reshape(-1).tolist())
            trues.extend(label.reshape(-1).tolist())

    preds_bin = [1 if p >= 0.5 else 0 for p in preds]

    auc = roc_auc_score(trues, preds)
    binary_f1 = f1_score(trues, preds_bin)
    micro_f1 = f1_score(trues, preds_bin, average='micro')
    macro_f1 = f1_score(trues, preds_bin, average='macro')
    precision = precision_score(trues, preds_bin)
    recall = recall_score(trues, preds_bin)

    print(f"\n🎯 Evaluation Results:")
    print(f"Micro-F1:   {micro_f1:.4f}")
    print(f"Binary-F1:  {binary_f1:.4f}")
    print(f"Macro-F1:   {macro_f1:.4f}")
    print(f"AUC:        {auc:.4f}")
    print(f"Precision:  {precision:.4f}")
    print(f"Recall:     {recall:.4f}")


In [None]:
# NOTE: label_dataset is built from label_train.csv and is the same data the model was
# trained on, so the figures below are training-set metrics, not held-out performance.
evaluate(model, graph, label_dataset, device)



🎯 Evaluation Results:
Micro-F1:   0.9528
Binary-F1:  0.9751
Macro-F1:   0.7653
AUC:        0.9593
Precision:  0.9645
Recall:     0.9860
