# Graph Contrastive Learning Lab Project

## Setting up the packages - not trivial

In [1]:
# pip uninstall -y dgl dglgo PyGCL pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv

In [2]:
import torch
# print(torch.__version__)

  from .autonotebook import tqdm as notebook_tqdm


The pip wheels of the torch-related packages must be installed for the same torch version as the one printed above.

In [3]:
# !pip install torch_geometric

# # Optional dependencies:
# !pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-1.13.1+cu117.html
# !pip install  dgl==1.1.2  dglgo
# !pip install PyGCL

import GCL.losses as L

L

<module 'GCL.losses' from '/home/felix/.local/lib/python3.10/site-packages/GCL/losses/__init__.py'>

## JSON to GCL

In [18]:
import json

# Load the benchmark definition. The context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open("benchmark.json") as benchmark_file:
    benchmark = json.load(benchmark_file)

In [12]:
# Peek at the first entry of the benchmark's "strategies" list to see the
# fields a strategy definition carries.
strategy = benchmark["strategies"][0]
strategy

{'name': 'Default',
 'strategy': 'DualBranch',
 'mode': 'L2L',
 'augmentations': ['EdgeRemoving', 'FeatureMasking', 'NodeDropping'],
 'negative': None,
 'architecture': ['DefaultGNN'],
 'epochs': 1000,
 'objective': 'InfoNCE'}

In [13]:
from torch import nn
from tqdm import tqdm
from torch.optim import Adam
from GCL.eval import get_split, SVMEvaluator
from GCL.models import DualBranchContrast
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.data import DataLoader
from torch_geometric.datasets import TUDataset


def make_gin_conv(input_dim, out_dim):
    """Build a GINConv layer wrapping a Linear -> ReLU -> Linear network.

    Args:
        input_dim: Input size of the first linear layer.
        out_dim: Output size of both linear layers.

    Returns:
        A ``GINConv`` constructed around the sequential network.
    """
    update_net = nn.Sequential(
        nn.Linear(input_dim, out_dim),
        nn.ReLU(),
        nn.Linear(out_dim, out_dim),
    )
    return GINConv(update_net)


class GConv(nn.Module):
    """Stack of GIN convolution layers with a projection head.

    Each layer is followed by ReLU and batch normalisation. The outputs of all
    layers are concatenated, both at node level and after sum-pooling per graph.
    """

    def __init__(self, input_dim, hidden_dim, num_layers):
        """Create the convolution stack and the projection head.

        Args:
            input_dim: Feature size consumed by the first convolution.
            hidden_dim: Output size of every convolution layer.
            num_layers: Number of convolution layers.
        """
        super().__init__()
        self.layers = nn.ModuleList()
        self.batch_norms = nn.ModuleList()

        for i in range(num_layers):
            # Only the first layer consumes the raw input features.
            in_dim = input_dim if i == 0 else hidden_dim
            self.layers.append(make_gin_conv(in_dim, hidden_dim))
            self.batch_norms.append(nn.BatchNorm1d(hidden_dim))

        # The embeddings concatenate every layer's output, hence this width.
        project_dim = hidden_dim * num_layers
        self.project = torch.nn.Sequential(
            nn.Linear(project_dim, project_dim),
            nn.ReLU(inplace=True),
            nn.Linear(project_dim, project_dim))

    def forward(self, x, edge_index, batch):
        """Encode a batch of graphs.

        Args:
            x: Node feature matrix passed to the first convolution.
            edge_index: Edge index passed to every convolution.
            batch: Node-to-graph assignment used by ``global_add_pool``.

        Returns:
            Tuple ``(z, g)``: the per-layer node embeddings concatenated along
            dim 1, and the per-layer sum-pooled graph embeddings concatenated
            along dim 1.
        """
        z = x
        zs = []
        for conv, bn in zip(self.layers, self.batch_norms):
            z = conv(z, edge_index)
            # nn.functional is available from this cell's own imports; the
            # alias ``F`` is only imported in a later cell.
            z = nn.functional.relu(z)
            z = bn(z)
            zs.append(z)
        gs = [global_add_pool(layer_z, batch) for layer_z in zs]
        z = torch.cat(zs, dim=1)
        g = torch.cat(gs, dim=1)
        return z, g


class Encoder(torch.nn.Module):
    """Runs one encoder on the original input and on two augmented views."""

    def __init__(self, encoder, augmentor):
        """Keep the shared encoder and the pair of augmentors.

        Args:
            encoder: Callable ``(x, edge_index, batch) -> (z, g)``.
            augmentor: Pair of callables, each
                ``(x, edge_index) -> (x, edge_index, edge_weight)``.
        """
        super().__init__()
        self.encoder = encoder
        self.augmentor = augmentor

    def forward(self, x, edge_index, batch):
        """Encode the input and both of its augmented views.

        Returns:
            ``(z, g, z1, z2, g1, g2)`` where the unsuffixed pair comes from the
            original input and the suffixed pairs from the first and second
            augmented view. The edge weights returned by the augmentors are
            not forwarded to the encoder.
        """
        first_aug, second_aug = self.augmentor
        view1_x, view1_edges, _ = first_aug(x, edge_index)
        view2_x, view2_edges, _ = second_aug(x, edge_index)

        node_emb, graph_emb = self.encoder(x, edge_index, batch)
        node_emb1, graph_emb1 = self.encoder(view1_x, view1_edges, batch)
        node_emb2, graph_emb2 = self.encoder(view2_x, view2_edges, batch)

        return node_emb, graph_emb, node_emb1, node_emb2, graph_emb1, graph_emb2


In [16]:
import GCL.losses as L
import GCL.augmentors as A
import GCL.models as M

class GCLPipeline:
    def __init__(self, mode, strategy, objective, augmentations, negative, architecture, epochs):
        self.mode = mode
        self.strategy = strategy
        self.objective = objective
        self.augmentations = augmentations
        self.negative = negative
        self.architecture = architecture
        self.epochs = epochs

    def gcl_objective_from_str(objective_name):
        match objective_name:
            case "InfoNCE":
                return L.InfoNCE(tau=0.2)
            case "JSD":
                return L.JSD()
            case "Triplet":
                return L.TripletMargin()
            case "BootstrapLatent":
                return L.BootstrapLatent()
            case "BarlowTwins":
                return L.BarlowTwins()
            case "VICReg":
                return L.VICReg()
            case _:
                raise NameError(f"Unknown objective name: {objective_name}")
    
    def gcl_strategy_from_str(strategy_name, objective, mode):
        match strategy_name:
            case "SingleBranch":
                return M.SingleBranchContrast(objective, mode)
            case "DualBranch":
                return M.DualBranchContrast(objective, mode)
            case "BootstrapBranch":
                return M.BootstrapContrast(objective, mode)
            case "WithinEmbed":
                return M.WithinEmbedContrast(objective)
            case _:
                raise NameError(f"Unknown strategy name: {strategy_name}")
        
    def gcl_augmentations_from_str(augmentation_name):
        match augmentation_name:
            case "EdgeAdding":
                return A.EdgeAdding(pe=0.2) 
            case "EdgeRemoving":
                return A.EdgeRemoving(pe=0.1)
            case "FeatureMasking":
                return A.FeatureMasking(pf=0.2)
            case "FeatureDropout":
                return A.FeatureDropout(pf=0.2)
            case "EdgeAttrMasking":
                return A.EdgeAttrMasking(pf=0.1)
            case "PPRDiffusion":
                return A.PPRDiffusion()
            case "MDK":
                return A.MarkovDiffusion() 
            case "NodeDropping":
                return A.NodeDropping(pn=0.2)
            case "NodeShuffling":
                return A.NodeShuffling()
            case "RWSampling":
                return A.RWSampling()
            case "EgoNet":
                return A.Identity()
            case _:
                raise NameError(f"Unknown augmentation name: {augmentation_name}")
        
        
    @classmethod
    def from_strategy(cls, strategy):
        # Retrieve arguments ; Provide default values / complete missing TODO
        
        strat = strategy["strategy"] # ok
        mode = strategy["mode"] # TODO : training loop ?
        objective = strategy["objective"] # ok
        augmentations = strategy["augmentations"] # ok 
        negative = strategy["negative"] # ok (no parsing to be done)
        architecture = strategy["architecture"] # TODO, import from other places ?
        epochs = strategy["epochs"] # ok (no parsing to be done)
        
        # Check validity of combination
        # Architecture and objective
         # Single branch and G2L only
        assert(not (strat == "SingleBranch" and mode != "G2L"))
         # DualBranch / Bootstrap and L2L, G2G, G2L only
        assert(not (strat in ["DualBranch", "Bootstrap"] and mode not in ["L2L", "G2G", "G2L"]))
        # Within embedding and  L2L / G2G only
        assert(not (strat == "WithinEmbedding" and mode not in ["L2L", "G2G"]))
        
        # Convert objective
        objective = GCLPipeline.gcl_objective_from_str(objective_name=objective)
        # Convert strategy
        strategy = GCLPipeline.gcl_strategy_from_str(strategy_name=strat, objective=objective, mode=mode) # This is a class to be initialized with args
        # Convert augmentations ; TODO : how to tune their parameters ? Declare presets in another py file ?
        # Use of a dictionary params:{} in the json for each augmentation
        augmentations = A.RandomChoice([GCLPipeline.gcl_augmentations_from_str(aug) for aug in augmentations], 1)
        
        return cls(mode=mode,
                   strategy=strat,
                   objective=objective,
                   augmentations=augmentations,
                   negative=negative,
                   architecture=architecture,
                   epochs=epochs)
    
    def train(self, dataset):
#         device = torch.device('cuda')
        device = torch.device('cpu')
        match self.strategy:
            case "SingleBranch":
                return GCL.models.SingleBranchContrast(objective, mode)
            case "DualBranch":
                
                dataloader = DataLoader(dataset, batch_size=128)
                input_dim = max(dataset.num_features, 1)

                aug1 = A.Identity()
                aug2 = self.augmentations
                gconv = GConv(input_dim=input_dim, hidden_dim=32, num_layers=2).to(device)
                # Model should be loaded from above as well
                encoder_model = Encoder(encoder=gconv, augmentor=(aug1, aug2)).to(device)
                contrast_model = M.DualBranchContrast(loss=L.InfoNCE(tau=0.2), mode='G2G').to(device)

                optimizer = Adam(encoder_model.parameters(), lr=0.01)

                with tqdm(total=self.epochs, desc='(T)') as pbar:
                    for epoch in range(1, self.epochs + 1):
                        loss = self.__train(encoder_model, contrast_model, dataloader, optimizer)
                        pbar.set_postfix({'loss': loss})
                        pbar.update()

                test_result = self.test(encoder_model, dataloader)
                print(f'(E): Best test F1Mi={test_result["micro_f1"]:.4f}, F1Ma={test_result["macro_f1"]:.4f}')
                return test_result
            case "BootstrapBranch":
                return M.BootstrapContrast(objective, mode)
            case "WithinEmbed":
                return M.WithinEmbedContrast(objective)
            case _:
                raise NameError(f"Unknown strategy name: {strategy}")
        
        raise NotImplementedError()
    
    def __train(self, encoder_model, contrast_model, dataloader, optimizer):
        encoder_model.train()
        epoch_loss = 0
        for data in dataloader:
#             data = data.to('cuda')
            optimizer.zero_grad()

            if data.x is None:
                print("data x is None")
                num_nodes = data.batch.size(0)
                data.x = torch.ones((num_nodes, 1), dtype=torch.float32, device=data.batch.device)

            _, _, _, _, g1, g2 = encoder_model(data.x, data.edge_index, data.batch)
            g1, g2 = [encoder_model.encoder.project(g) for g in [g1, g2]]
            loss = contrast_model(g1=g1, g2=g2, batch=data.batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
        return epoch_loss
    
    def test(self, encoder_model, dataloader):
    
        encoder_model.eval()
        x = []
        y = []
        for data in dataloader:
#             data = data.to('cuda')
            if data.x is None:
                num_nodes = data.batch.size(0)
                data.x = torch.ones((num_nodes, 1), dtype=torch.float32, device=data.batch.device)
            _, g, _, _, _, _ = encoder_model(data.x, data.edge_index, data.batch)
            x.append(g)
            y.append(data.y)
        x = torch.cat(x, dim=0)
        y = torch.cat(y, dim=0)

        split = get_split(num_samples=x.size()[0], train_ratio=0.8, test_ratio=0.1)
        result = SVMEvaluator(linear=True)(x, y, split)
        return result

    def evaluate(self):
        raise NotImplementedError()
        

In [24]:
import os.path as osp
import GCL.losses as L
import GCL.augmentors as A
import torch.nn.functional as F
import pandas as pd
# Demo setup: the PTC_MR TUDataset, rooted at ~/datasets.
path = osp.join(osp.expanduser('~'), 'datasets')
dataset = TUDataset(path, name='PTC_MR')

# Five independent runs per benchmark strategy; ``results`` maps each
# strategy name to the list of its per-run evaluation results.
results = {}
for strategy in benchmark["strategies"]:
    strategy_runs = results[strategy["name"]] = []
    for run_i in range(5):
        pipeline = GCLPipeline.from_strategy(strategy)
        pipeline_results = pipeline.train(dataset)
        strategy_runs.append(pipeline_results)

# One column per strategy, one row per run.
df_results = pd.DataFrame(results)
df_results

(T): 100%|███████████████████████| 1000/1000 [00:57<00:00, 17.42it/s, loss=3.17]


(E): Best test F1Mi=0.4857, F1Ma=0.4293


(T): 100%|███████████████████████| 1000/1000 [00:59<00:00, 16.95it/s, loss=3.23]


(E): Best test F1Mi=0.4571, F1Ma=0.4571


(T): 100%|███████████████████████| 1000/1000 [00:58<00:00, 17.00it/s, loss=3.09]


(E): Best test F1Mi=0.6000, F1Ma=0.5717


(T): 100%|███████████████████████| 1000/1000 [00:58<00:00, 17.09it/s, loss=3.07]


(E): Best test F1Mi=0.6571, F1Ma=0.6196


(T): 100%|███████████████████████| 1000/1000 [00:55<00:00, 18.10it/s, loss=2.81]


(E): Best test F1Mi=0.5714, F1Ma=0.5714


(T): 100%|████████████████████████| 1000/1000 [00:59<00:00, 16.83it/s, loss=3.1]


(E): Best test F1Mi=0.5143, F1Ma=0.5079


(T): 100%|███████████████████████| 1000/1000 [01:00<00:00, 16.65it/s, loss=4.53]


(E): Best test F1Mi=0.5429, F1Ma=0.5105


(T): 100%|███████████████████████| 1000/1000 [00:56<00:00, 17.74it/s, loss=3.62]


(E): Best test F1Mi=0.6286, F1Ma=0.5578


(T): 100%|███████████████████████| 1000/1000 [00:57<00:00, 17.47it/s, loss=2.72]


(E): Best test F1Mi=0.4857, F1Ma=0.4750


(T): 100%|███████████████████████| 1000/1000 [00:59<00:00, 16.90it/s, loss=2.88]


(E): Best test F1Mi=0.5143, F1Ma=0.4218


Unnamed: 0,Default,Default - JSD
0,"{'micro_f1': 0.4857142857142857, 'macro_f1': 0...","{'micro_f1': 0.5142857142857142, 'macro_f1': 0..."
1,"{'micro_f1': 0.45714285714285713, 'macro_f1': ...","{'micro_f1': 0.5428571428571428, 'macro_f1': 0..."
2,"{'micro_f1': 0.6, 'macro_f1': 0.5716783216783217}","{'micro_f1': 0.6285714285714286, 'macro_f1': 0..."
3,"{'micro_f1': 0.6571428571428571, 'macro_f1': 0...","{'micro_f1': 0.4857142857142857, 'macro_f1': 0..."
4,"{'micro_f1': 0.5714285714285714, 'macro_f1': 0...","{'micro_f1': 0.5142857142857142, 'macro_f1': 0..."


In [20]:
# from torch_geometric.datasets import PPI

# dataset_ppi = PPI("/home/felix/mscai/.data")