In [13]:
import os
import pickle
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import tqdm

os.getcwd()

'/home/kylemoy/GithubGraphML'

In [147]:
from GithubGraphML.analyze import load_networks
pa_network = load_networks('./data', ['Pascal'])[0]
pa_network.list_properties()
pa_network.save('pascal.graphml')
pa_network

Loading cached developers...
Loading cached Pascal network...
created_at                  (vertex)  (type: double)
deleted                     (vertex)  (type: bool)
developer_id                (vertex)  (type: int32_t)
fake                        (vertex)  (type: bool)
is_org                      (vertex)  (type: bool)
login                       (vertex)  (type: string)
begin_contribution_date     (edge)    (type: double)
contribution_days           (edge)    (type: int32_t)
end_contribution_date       (edge)    (type: double)
number_commits              (edge)    (type: int32_t)
programming_language_id     (edge)    (type: int32_t)
repository_id               (edge)    (type: int32_t)


<Graph object, undirected, with 4241 vertices and 12008 edges, 6 internal vertex properties, 6 internal edge properties, at 0x7fd6f1c423f0>

In [135]:
from GithubGraphML.parsing.loading import load_bipartite
bipartite_network = load_bipartite('./data', ['Pascal'], cache='test_bipartite.pkl', combined='Pascal.pkl')
bipartite_network.clear_filters()
def thing(x):
    try:
        return int(x)
    except:
        return -1
    
bipartite_network.vp['number_commits'] = bipartite_network.vp['number_commits'].t(thing, value_type='int')
bipartite_network.vp['number_commiters'] = bipartite_network.vp['number_commiters'].t(thing, value_type='int')
bipartite_network.list_properties()
bipartite_network.save('pascal_bipartite.graphml')
bipartite_network


Loading cached bipartite...
create_date                 (vertex)  (type: string)
created_at                  (vertex)  (type: double)
deleted                     (vertex)  (type: bool)
description                 (vertex)  (type: string)
developer_id                (vertex)  (type: int32_t)
duration_days               (vertex)  (type: string)
end_date                    (vertex)  (type: string)
fake                        (vertex)  (type: bool)
is_org                      (vertex)  (type: bool)
is_repository               (vertex)  (type: bool)
login                       (vertex)  (type: string)
name                        (vertex)  (type: string)
number_commiters            (vertex)  (type: int32_t)
number_commits              (vertex)  (type: int32_t)
programming_language_id     (vertex)  (type: string)
repository_id               (vertex)  (type: string)
url                         (vertex)  (type: string)
begin_contribution_date     (edge)    (type: double)
contribution_days      

<Graph object, undirected, with 5738 vertices and 5238 edges, 17 internal vertex properties, 6 internal edge properties, at 0x7fd6dda3af60>

In [148]:
from torch_geometric.utils import to_undirected
def trasfrom_graph_tool(graph, eprops, reduce="mean"):
    edges = graph.get_edges(eprops)
    edge_index = torch.tensor(edges[:, :2], dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edges[:, 2:], dtype=torch.float)
    return to_undirected(edge_index, edge_attr=edge_attr, reduce=reduce, )

indices, attrs = trasfrom_graph_tool(pa_network,[pa_network.ep['number_commits']], "sum")
indices, attrs

(tensor([[   0,    0,    0,  ..., 4238, 4239, 4240],
         [   1,    2,    3,  ..., 4237, 4240, 4239]]),
 tensor([[805.],
         [799.],
         [891.],
         ...,
         [ 30.],
         [  5.],
         [  5.]]))

In [149]:
from torch_geometric.data import HeteroData
from graph_tool.all import *
from collections import defaultdict
import numpy as np

def transform_data(graph, vclass=[], vprops=[], eclass=[], eprops=[]):
    def filter_nodes(graph, vcls, vprops):
        return graph.get_vertices(vprops)[vcls.a.astype(bool)]
    def filter_edges(graph, ecls, vprops):
        return graph.get_edges(eprops)[ecls.a.astype(bool)]
    
    nodes = [filter_nodes(graph, vcls, vprops) for vcls in vclass] if vclass else [graph.get_vertices(vprops).reshape(-1, len(vprops) + 1)]
    edges = [filter_edges(graph, ecls, eprops) for ecls in eclass] if eclass else [graph.get_edges(eprops)]
    data = HeteroData()
    nmap = {}
    for idx, node_array in enumerate(nodes):
        nmap.update({int(oidx): (int(nidx), f'v{idx}') for nidx, oidx in enumerate(node_array[:, 0])})
        data[f'v{idx}'].x = torch.tensor(node_array[:, 1:], dtype=torch.float)
        data[f'v{idx}'].num_nodes = len(node_array)
        
    for idx, edge_array in enumerate(edges):
        # print(edge_array)
        src_nodes = edge_array[:, 0]
        dst_nodes = edge_array[:, 1]
        emap = defaultdict(list)
        for jdx in range(len(edge_array)):
            try:
                src, dst = edge_array[jdx, :2]
                src, src_cls = nmap[int(src)] 
                dst, dst_cls = nmap[int(dst)] 
                edge_array[jdx][0] = src
                edge_array[jdx][1] = dst
                emap[(src_cls, f'e{idx}', dst_cls)].append((src, dst))
            except:
                pass

        for rel, indices in emap.items():
            data[*rel].edge_index = torch.tensor(np.array(indices)[:, :2].T, dtype=torch.long)
            data[*rel].edge_attrs = torch.tensor(np.array(indices)[:, 2:].T, dtype=torch.long)
            data[*rel].edge_label = torch.ones(len(indices), dtype=torch.long)
            data[*rel].num_edges = len(indices)

    return data

data = transform_data(bipartite_network, vprops=[bipartite_network.vp['number_commits']], vclass=[bipartite_network.vp['is_repository'], bipartite_network.vp['is_repository'].t(np.logical_not)])
data

HeteroData(
  v0={
    x=[1497, 1],
    num_nodes=1497,
  },
  v1={
    x=[4241, 1],
    num_nodes=4241,
  },
  (v1, e0, v0)={
    edge_index=[2, 5238],
    edge_attrs=[0, 5238],
    edge_label=[5238],
    num_edges=5238,
  }
)

In [161]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import HeteroConv, GATConv
from torch_geometric.data import HeteroData
from torch_geometric.utils import negative_sampling
from collections import defaultdict
import numpy as np
from graph_tool.all import *

# ---- 1. Transform graph-tool bipartite graph into HeteroData ---- #
def transform_data(graph, vclass=[], vprops=[], eclass=[], eprops=[]):
    def filter_nodes(graph, vcls, vprops):
        return graph.get_vertices(vprops)[vcls.a.astype(bool)]
    def filter_edges(graph, ecls, vprops):
        return graph.get_edges(eprops)[ecls.a.astype(bool)]
    
    nodes = [filter_nodes(graph, vcls, vprops) for vcls in vclass] if vclass else [graph.get_vertices(vprops).reshape(-1, len(vprops) + 1)]
    edges = [filter_edges(graph, ecls, eprops) for ecls in eclass] if eclass else [graph.get_edges(eprops)]
    data = HeteroData()
    nmap = {}
    for idx, node_array in enumerate(nodes):
        nmap.update({int(oidx): (int(nidx), f'v{idx}') for nidx, oidx in enumerate(node_array[:, 0])})
        data[f'v{idx}'].x = torch.tensor(node_array[:, 1:], dtype=torch.float)
        data[f'v{idx}'].num_nodes = len(node_array)
        
    for idx, edge_array in enumerate(edges):
        # print(edge_array)
        src_nodes = edge_array[:, 0]
        dst_nodes = edge_array[:, 1]
        emap = defaultdict(list)
        for jdx in range(len(edge_array)):
            try:
                src, dst = edge_array[jdx, :2]
                src, src_cls = nmap[int(src)] 
                dst, dst_cls = nmap[int(dst)] 
                edge_array[jdx][0] = src
                edge_array[jdx][1] = dst
                emap[(src_cls, f'e{idx}', dst_cls)].append((src, dst))
            except:
                pass

        for rel, indices in emap.items():
            data[*rel].edge_index = torch.tensor(np.array(indices)[:, :2].T, dtype=torch.long)
            data[*rel].edge_attrs = torch.tensor(np.array(indices)[:, 2:].T, dtype=torch.long)
            data[*rel].edge_label = torch.ones(len(indices), dtype=torch.long)
            data[*rel].num_edges = len(indices)

    return data
    
# ---- 2. GAT-based Heterogeneous Link Prediction Model ---- #
class HeteroGAT(torch.nn.Module):
    def __init__(self, metadata, hidden_channels=64, heads=2):
        super().__init__()
        self.conv1 = HeteroConv({
            ('v1', 'e0', 'v0'): GATConv((-1, -1), hidden_channels, heads=heads, add_self_loops=False),
            ('v0', 'rev_e0', 'v1'): GATConv((-1, -1), hidden_channels, heads=heads, add_self_loops=False),
        }, aggr='sum')

        self.lin_pred = torch.nn.Linear(heads * hidden_channels * 2, 1)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv1(x_dict, edge_index_dict)
        return x_dict

    def decode(self, z_dict, edge_index, etype):
        src, dst = edge_index
        z_src = z_dict[etype[0]][src]
        z_dst = z_dict[etype[2]][dst]
        z = torch.cat([z_src, z_dst], dim=1)
        return self.lin_pred(z).view(-1)

# ---- 3. Training and Evaluation Utilities ---- #
def get_positive_negative_edges(data, etype):
    pos_edge_index = data[etype].edge_index
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=(data[etype[0]].num_nodes, data[etype[2]].num_nodes),
        num_neg_samples=pos_edge_index.size(1)
    )
    return pos_edge_index, neg_edge_index

def train(model, optimizer, data, etype):
    model.train()
    optimizer.zero_grad()

    z_dict = model(data.x_dict, data.edge_index_dict)
    pos_edge_index, neg_edge_index = get_positive_negative_edges(data, etype)

    pos_pred = model.decode(z_dict, pos_edge_index, etype)
    neg_pred = model.decode(z_dict, neg_edge_index, etype)

    pred = torch.cat([pos_pred, neg_pred], dim=0)
    label = torch.cat([torch.ones(pos_pred.size(0)), torch.zeros(neg_pred.size(0))], dim=0).to(pred.device)

    loss = F.binary_cross_entropy_with_logits(pred, label)
    loss.backward()
    optimizer.step()

    return loss.item()

@torch.no_grad()
def test(model, data, etype):
    model.eval()
    z_dict = model(data.x_dict, data.edge_index_dict)
    pos_edge_index, neg_edge_index = get_positive_negative_edges(data, etype)

    pos_pred = model.decode(z_dict, pos_edge_index, etype).sigmoid()
    neg_pred = model.decode(z_dict, neg_edge_index, etype).sigmoid()

    preds = torch.cat([pos_pred, neg_pred], dim=0)
    labels = torch.cat([torch.ones(pos_pred.size(0)), torch.zeros(neg_pred.size(0))], dim=0)

    acc = ((preds > 0.5) == labels).float().mean().item()
    return acc

# ---- 4. Main Runner ---- #
# ADD YOUR GRAPH LOADING LOGIC HERE
# The following assumes:
# - bipartite_network.vp['is_repository'] and its negation distinguish types
# - bipartite_network.vp['number_commits'] is a feature

# Create the hetero data object
data = transform_data(
    bipartite_network,
    vprops=[bipartite_network.vp['number_commits']],
    vclass=[
        bipartite_network.vp['is_repository'],
        bipartite_network.vp['is_repository'].t(np.logical_not)
    ],
    eprops=[bipartite_network.ep['number_commits']]
)
rev_etype = ('v0', 'rev_e0', 'v1')
data[rev_etype].edge_index = data[('v1', 'e0', 'v0')].edge_index.flip(0)
print(data)


# Prepare model and training setup
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HeteroGAT(data.metadata())
data = data
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

etype = list(data.edge_index_dict.keys())[0]  # use first edge type for link prediction

for epoch in range(1, 100):
    loss = train(model, optimizer, data, etype)
    acc = test(model, data, etype)
    print(f"Epoch {epoch:02d} | Loss: {loss:.4f} | Acc: {acc:.4f}")


HeteroData(
  v0={
    x=[1497, 2],
    num_nodes=1497,
  },
  v1={
    x=[4241, 2],
    num_nodes=4241,
  },
  (v1, e0, v0)={
    edge_index=[2, 5238],
    edge_attrs=[0, 5238],
    edge_label=[5238],
    num_edges=5238,
  },
  (v0, rev_e0, v1)={ edge_index=[2, 5238] }
)
Epoch 01 | Loss: 15193633.0000 | Acc: 0.5000
Epoch 02 | Loss: 50839272.0000 | Acc: 0.5000
Epoch 03 | Loss: 47431760.0000 | Acc: 0.5000
Epoch 04 | Loss: 21239948.0000 | Acc: 0.5000
Epoch 05 | Loss: 15533540.0000 | Acc: 0.5000
Epoch 06 | Loss: 27758888.0000 | Acc: 0.5000
Epoch 07 | Loss: 24470144.0000 | Acc: 0.5000
Epoch 08 | Loss: 10167825.0000 | Acc: 0.5000
Epoch 09 | Loss: 12100879.0000 | Acc: 0.5000
Epoch 10 | Loss: 20868750.0000 | Acc: 0.5000
Epoch 11 | Loss: 19011074.0000 | Acc: 0.5000
Epoch 12 | Loss: 8743549.0000 | Acc: 0.5000
Epoch 13 | Loss: 8305343.0000 | Acc: 0.5000
Epoch 14 | Loss: 15117750.0000 | Acc: 0.5000
Epoch 15 | Loss: 13358387.0000 | Acc: 0.5000
Epoch 16 | Loss: 4384615.5000 | Acc: 0.5000
Epoch 17 |