In [45]:
import dataloader
import argparse
import importlib
importlib.reload(dataloader)

<module 'dataloader' from '/u/nyw6dh/HCDM/Experiment/LearnabilityLock/dataloader.py'>

In [46]:
# Arguments
# The same flags would work unchanged if this notebook were turned into a script.
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str, default='cora')
parser.add_argument('--gpu_id', type=int, default=0)
# Default matches the configuration file name the notebook opens ('config.yml').
parser.add_argument('--config', type=str, default='config.yml')
# Parse an explicit empty argv list so the kernel's own command line is ignored
# and every option takes its default.
args = parser.parse_args([])

In [47]:
# Load configuration
import yaml
from yaml import SafeLoader
# Open the YAML file in a context manager so the handle is closed as soon as it
# has been parsed. The path is hard-coded here; `--config` is not consulted.
# Only the entry for the selected dataset is kept.
with open('config.yml') as config_file:
    config = yaml.load(config_file, Loader=SafeLoader)[args.dataset]

In [48]:
# Load the data
# Use the dataset selected by --dataset (default 'cora') so the graph and the
# per-dataset configuration entry always refer to the same dataset.
# load_DGL returns an indexable result; element 0 is used as the graph.
dataset = dataloader.load_DGL(args.dataset)[0]

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [49]:
# 3. Attack ===================

# Set up surrogate model
# surrogate = models.GCN(
#   in_size=dataset.ndata['feat'].shape[1],
#   out_size=int(dataset.ndata['label'].max()) + 1,
#   hid_size=config['hid_size'],
#   lr=config['lr'],
#   dropout=config['dropout'],
#   weight_decay=config['weight_decay']
# )

# surrogate(dataset, dataset.ndata['feat'])
# surrogate.fit(dataset, dataset.ndata['feat'], dataset.ndata['label'], 100)


In [50]:
import dgl
import torch
import models
import importlib
importlib.reload(models)
from tqdm import tqdm

class LearnabilityLock():
  """
  Dense view of a graph plus a protected / unprotected node split.

  Attributes
    adj:       dense adjacency matrix, from ``graph.adj().to_dense()``
    num_nodes: number of nodes in the graph
    feat:      ``graph.ndata['feat']``
    labels:    ``graph.ndata['label']``
    g0:        boolean mask of protected nodes (``None`` until designated)
    gX:        boolean mask of the remaining nodes (``None`` until designated)
  """

  # Annotations that mention dgl are quoted so they are not evaluated when the
  # class body executes.
  def __init__(self, graph: "dgl.DGLGraph"):
    self.adj = graph.adj().to_dense()
    self.num_nodes = graph.num_nodes()
    self.feat = graph.ndata['feat']
    self.labels = graph.ndata['label']
    # No protected set yet; filled in by set_protected().
    self.g0 = None
    self.gX = None

  def set_protected(self, protected: torch.Tensor) -> None:
    """
    Record which nodes are protected.

    protected must be a boolean tensor with one entry per node; ``g0`` becomes
    that mask and ``gX`` its complement.
    """
    assert self.num_nodes == protected.shape[0], "protected mask must have one entry per node"
    assert protected.dtype == torch.bool, "protected mask must be a boolean tensor"
    self.g0 = protected
    self.gX = ~protected

  def designate_protected_random(self, node_ct: int) -> None:
    """
    Randomly select node_ct nodes to be the protected set
    """
    # The indices of the node_ct largest values of a uniform random vector form
    # a random subset of distinct nodes.
    sample = torch.rand(self.num_nodes).topk(node_ct)
    protected = torch.zeros(self.num_nodes, dtype=torch.bool)
    protected[sample.indices] = True
    self.set_protected(protected)

  def get_sample_idx(self, edge_ct: int) -> torch.Tensor:
    """
    Draw a random set of adjacency-matrix positions.

    Every entry of the adjacency matrix is kept independently with probability
    edge_ct / num_nodes**2, so edge_ct is the expected number of positions
    returned, not an exact count.

    Returns a (2, k) integer tensor: row 0 holds row indices, row 1 holds
    column indices, in row-major order.
    """
    cutoff = 1 - (edge_ct / self.num_nodes ** 2)
    keep = torch.rand(self.adj.shape) > cutoff
    # nonzero() yields (k, 2) coordinates; transpose to the (2, k) layout used
    # for indexing with idx[0], idx[1].
    return keep.nonzero().t().contiguous()

  def create_locked_graph(self, attack_epochs: int, surrogate: torch.nn.Module) -> "dgl.data.DGLDataset":
    """
    Placeholder for the perturbation procedure.

    Requires a protected set to have been designated first. At present the
    method only drives a progress bar for attack_epochs iterations; it performs
    no perturbation and returns None.

    Raises RuntimeError if no protected set has been designated.
    """
    # The protected split lives in self.g0 / self.gX (see set_protected), so
    # that is what the precondition checks.
    if getattr(self, 'g0', None) is None:
      raise RuntimeError(
        "No protected set: call set_protected() or designate_protected_random() first")

    t = tqdm(range(attack_epochs), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    t.set_description("Perturbing")

    for epoch in t:
      pass  # TODO: perturbation step not implemented yet


# Seed torch's RNG so that, when the cells are run in order, the protected set
# and the random draws that follow are the same on every run.
torch.manual_seed(0)

attack = LearnabilityLock(dataset)
attack.designate_protected_random(node_ct=100)
# Smoke check: display one draw of sampled (row, col) index pairs as the cell output.
attack.get_sample_idx(1000)


# attack.create_locked_graph(attack_epochs=10, surrogate=surrogate)

tensor([[   4,    5,    7,  ..., 2702, 2707, 2707],
        [2448, 2025, 1065,  ..., 2418,  405, 2022]])

In [57]:
import models
import importlib
importlib.reload(models)

# Surrogate model: input width = number of feature columns, output width =
# largest label id + 1; the remaining hyper-parameters come from the config entry.
surrogate = models.DenseGCN(
  in_size=dataset.ndata['feat'].size(1),
  out_size=int(dataset.ndata['label'].max().item()) + 1,
  hid_size=config['hid_size'],
  lr=config['lr'],
  dropout=config['dropout'],
  weight_decay=config['weight_decay'],
)

import torch.nn.functional as F
from tqdm import tqdm
import numpy as np

# Attack hyper-parameters
num_perturbations = 2000    # the projection rescales sum(|perturbations|) toward this value
attack_epochs = 10          # outer iterations (perturbation update + one surrogate training step)
samples_per_epoch = 5       # gradient samples accumulated per outer iteration
edges_per_sample = 10000    # expected number of adjacency entries drawn per sample

perturbations = torch.zeros_like(attack.adj).float()

t = tqdm(range(attack_epochs), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
t.set_description("Perturbing")

for epoch in t:
    # Per-epoch accumulators. Both are reset together so that `count` is the
    # number of times each entry was sampled in THIS epoch, which is the
    # correct denominator for this epoch's gradient sum.
    adj_grad = torch.zeros_like(attack.adj).float()
    count = torch.zeros_like(attack.adj).float()

    # Get modified adj: entries move from adj toward their flipped value as
    # perturbations grows (adj=0 -> +p, adj=1 -> 1-p).
    modified_adj = attack.adj + perturbations - ((attack.adj * perturbations) * 2)

    for sample_epoch in range(samples_per_epoch):
        # Get sample indices
        idx = attack.get_sample_idx(edge_ct=edges_per_sample)

        # Make the sampled entries a fresh leaf tensor and splice it into a
        # copy of the adjacency, so gradients are taken w.r.t. those entries
        # only and no autograd history is carried between sample rounds.
        sample = modified_adj[idx[0], idx[1]].clone().detach().requires_grad_(True)
        sampled_adj = modified_adj.clone()
        sampled_adj[idx[0], idx[1]] = sample

        # Objective: cross-entropy on the protected nodes minus cross-entropy
        # on the remaining nodes.
        predictions = surrogate(sampled_adj, attack.feat)
        loss = F.cross_entropy(predictions[attack.g0], attack.labels[attack.g0]) \
            - F.cross_entropy(predictions[attack.gX], attack.labels[attack.gX])

        grad = torch.autograd.grad(loss, sample)[0]

        # Accumulate for averaging
        adj_grad[idx[0], idx[1]] += grad
        count[idx[0], idx[1]] += 1

    # Average the gradient; entries never sampled give 0/0 = NaN and are zeroed.
    adj_grad = torch.div(adj_grad, count)
    adj_grad[torch.isnan(adj_grad)] = 0

    # Update perturbations with a step size that decays as 1 / (epoch + 1)
    lr = (num_perturbations) / (epoch + 1)
    perturbations = perturbations + (lr * adj_grad)

    # Projection: alternately rescale the total absolute mass to
    # num_perturbations and clamp entries to [-1, 1], stopping once the mass is
    # within 10% of the target (at most 10 rounds).
    for _ in range(10):
        total = perturbations.abs().sum()
        if total.item() == 0:
            # Nothing to rescale (all gradients were zero); avoid dividing by zero.
            break
        perturbations = perturbations * (num_perturbations / total)
        perturbations.clamp_(-1, 1)
        if abs(1 - (perturbations.abs().sum().item() / num_perturbations)) < 0.1:
            break

    # Train the model for one step on the currently perturbed graph
    modified_adj = attack.adj + perturbations - ((attack.adj * perturbations) * 2)
    model_loss = surrogate.fit(modified_adj, attack.feat, attack.labels, epochs=1)

    # `loss` here is the objective from the last sample round of this epoch.
    t.set_postfix({"adj_l": loss.item(),
                    "adj_g": int(adj_grad.sum()),
                    "model_loss": model_loss})

GCN Training: 100%|██████████| 1/1 [00:00<00:00, 79.39it/s, loss=1.95]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 84.78it/s, loss=1.92]58, adj_g=0, model_loss=1.95]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 86.04it/s, loss=1.9]34, adj_g=0, model_loss=1.92] 
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 89.68it/s, loss=1.89], adj_g=0, model_loss=1.9]  
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 75.37it/s, loss=1.87], adj_g=-1, model_loss=1.89]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 76.04it/s, loss=1.85], adj_g=-2, model_loss=1.87]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 78.46it/s, loss=1.83], adj_g=-2, model_loss=1.85]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 84.42it/s, loss=1.8]8, adj_g=-3, model_loss=1.83]
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 86.12it/s, loss=1.78], adj_g=-4, model_loss=1.8] 
GCN Training: 100%|██████████| 1/1 [00:00<00:00, 92.16it/s, loss=1.76], adj_g=-4, model_loss=1.78]
Perturbing: 100%|██████████| 10/10 [

In [52]:
def s(t: torch.Tensor):
  """Print a one-line, tab-separated summary of t: shape, sum, min, max and mean."""
  fields = [
    f'shape:{t.shape}',
    f'sum:{t.sum()}',
    f'min:{t.min()}',
    f'max: {t.max()}',
    f'mean:{t.mean()}',
  ]
  print('\t'.join(fields))

In [53]:
# Sanity check: print shape, sum, min, max and mean of |perturbations|.
s(perturbations.abs())

shape:torch.Size([2708, 2708])	sum:2000.0	min:0.0	max: 0.7944938540458679	mean:0.00027272984152659774


In [58]:
# Draw a few discrete flip matrices from the perturbation probabilities and keep
# the one with the highest objective (protected-node loss minus remaining-node loss).
with torch.no_grad():
    # Start below any attainable loss so the first finite sample is always accepted.
    max_loss = float('-inf')
    # Reset so a stale result from an earlier run of this cell cannot be reused.
    best = None
    best_mod = None

    for k in range(0,3):
        # Each entry is flipped independently with probability |perturbation|.
        sample = torch.bernoulli(perturbations.abs())
        modified_adj = attack.adj + sample - ((attack.adj * sample) * 2)
        # modified_adj = Utils.make_symmetric(modified_adj) # Removing this creates "impossible" adj, but works well

        predictions = surrogate(modified_adj, attack.feat) 

        loss = F.cross_entropy(predictions[attack.g0], attack.labels[attack.g0]) \
            - F.cross_entropy(predictions[attack.gX], attack.labels[attack.gX])

        if loss > max_loss:
            max_loss = loss
            best = sample
            best_mod = modified_adj

    if best is None:
        # Only reachable when every sampled loss failed the comparison (e.g. NaN).
        raise RuntimeError("No sample produced a comparable loss; check the surrogate outputs")

    # Report the loss of the retained sample, not of the last one drawn.
    print(f"Best sample loss: {max_loss:.2f}\t Edges: {best.abs().sum():.0f}")

Best sample loss: -0.06	 Edges: 2033


In [59]:
# Evaluate
import sklearn.metrics as metrics

# Fresh model with the same architecture and hyper-parameters as the surrogate,
# trained from scratch on the selected perturbed adjacency.
lock_model = models.DenseGCN(
  in_size=dataset.ndata['feat'].size(1),
  out_size=int(dataset.ndata['label'].max().item()) + 1,
  hid_size=config['hid_size'],
  lr=config['lr'],
  dropout=config['dropout'],
  weight_decay=config['weight_decay'],
)

lock_model.fit(g=best_mod, feat=attack.feat, labels=attack.labels, epochs=200, verbose=True)

# NOTE(review): if DenseGCN applies dropout at call time, confirm the model is
# in evaluation mode for this forward pass.
pred = lock_model(best_mod, attack.feat)
predicted_class = pred.argmax(dim=1)

# Micro-averaged F1 on the protected nodes (g0) and on the remaining nodes (gX).
f1_g0, f1_gX = (
  metrics.f1_score(attack.labels[mask], predicted_class[mask], average='micro')
  for mask in (attack.g0, attack.gX)
)
f1_g0, f1_gX

# match = pred.argmax(dim=1) == dataset.ndata['label']
# match.sum() / match.shape[0]

GCN Training: 100%|██████████| 200/200 [00:01<00:00, 124.73it/s, loss=0.65]


(0.85, 0.8320552147239264)