In [1]:
from model import *
from utils import *
from torch_geometric.loader import DataLoader
from torch_geometric.loader import ClusterData, ClusterLoader, NeighborSampler
import torch.nn.functional as F

import pandas as pd
import numpy as np
from tqdm import tqdm
import random
import pickle
import os
from sklearn.metrics import f1_score

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

from functools import partial
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

# Seed PyTorch's RNG. Only torch is seeded here; `random` and `numpy` are not.
# NOTE(review): `torch` is never imported explicitly in this cell, so it
# presumably arrives through one of the star imports above — confirm.
seed= 10
torch.manual_seed(seed)

# autoreload mode 2: picks up edits to imported project modules live.
%load_ext autoreload
%autoreload 2

In [2]:

def masked_edge_index(edge_index, edge_mask):
    """Return the columns of ``edge_index`` selected by ``edge_mask``.

    Args:
        edge_index: Tensor that is indexed as ``edge_index[:, edge_mask]``
            (presumably a ``(2, num_edges)`` edge list — TODO confirm).
        edge_mask: Mask or index applied along the second dimension.

    Returns:
        The tensor ``edge_index[:, edge_mask]``.

    Raises:
        TypeError: If ``edge_index`` is not a ``Tensor``. The previous version
            printed ``'Error'`` and returned ``None``, which only failed later
            at an unrelated call site.
    """
    if not isinstance(edge_index, Tensor):
        raise TypeError(
            "masked_edge_index expects edge_index to be a Tensor, got %s"
            % type(edge_index).__name__
        )
    return edge_index[:, edge_mask]

def one_hot_encoding(l):
    """One-hot encode a tensor of class labels.

    Column ``j`` of the result corresponds to the ``j``-th smallest distinct
    label in ``l``. The previous version used the raw label value as the column
    index, which is only correct when the labels are exactly ``0..K-1``; for
    labels such as ``{1, 3}`` it set the wrong column or raised ``IndexError``.
    For labels that already are ``0..K-1`` the output is unchanged.

    Args:
        l: Tensor holding one label per entry.

    Returns:
        Tensor of shape ``(len(l), num_distinct_labels)`` in torch's default
        floating dtype, with a single ``1.`` per row. An empty input yields an
        empty 1-D tensor, as before.
    """
    if len(l) == 0:
        return torch.tensor([])
    label_types = torch.unique(l)  # sorted distinct labels
    # Compare every label against every distinct label: the one match per row
    # is the hot column.
    matches = l.reshape(-1, 1) == label_types.reshape(1, -1)
    return matches.to(torch.get_default_dtype())

def load_files(node_file_path, links_file_path, label_file_path, embedding_file_path, dataset):
    """Read the tab-separated, header-less node, link, label (and embedding) files.

    Args:
        node_file_path: Node file; columns 0 and 1 are renamed to ``node`` and
            ``color``. Columns that are entirely NaN are dropped first.
        links_file_path: Link file; columns 0-2 are renamed to ``node_1``,
            ``relation_type`` and ``node_2``.
        label_file_path: Label file; columns 0 and 1 are renamed to ``node``
            and ``label``.
        embedding_file_path: Embedding file; only read when ``dataset`` is
            ``'complex'`` or ``'simple'``.
        dataset: Dataset name; selects which of the two return shapes is used.

    Returns:
        For ``'complex'`` / ``'simple'``:
            ``(labels, colors, links, embedding)``.
        Otherwise:
            ``(labels, colors, links, source_nodes_with_labels, labels_multi)``,
            where ``source_nodes_with_labels`` is the list of node ids from the
            label file and ``labels_multi`` is ``one_hot_encoding(labels)``.
        In both cases ``labels`` is a tensor built from the ``label`` column.
    """
    colors = pd.read_csv(node_file_path, sep='\t', header=None)
    colors = colors.dropna(axis=1, how='all')
    labels = pd.read_csv(label_file_path, sep='\t', header=None)
    links = pd.read_csv(links_file_path, sep='\t', header=None)

    labels = labels.rename(columns={0: 'node', 1: 'label'})
    source_nodes_with_labels = labels['node'].values.tolist()
    labels = torch.tensor(labels['label'].values)
    colors = colors.rename(columns={0: 'node', 1: 'color'})
    links = links.rename(columns={0: 'node_1', 1: 'relation_type', 2: 'node_2'})

    if dataset == 'complex' or dataset == 'simple':
        embedding = pd.read_csv(embedding_file_path, sep='\t', header=None)
        embedding_number = len(embedding.columns) - 2
        # Names for the leading columns, keyed by (column count - 2). Trailing
        # columns beyond these keep their integer names, as before.
        # Fix: the 5-entry case used to name its column 'first_embdding'.
        leading_names = {
            2: ['index', 'first embedding', 'labels'],
            3: ['index', 'second embedding', 'first embedding', 'labels'],
            4: ['index', 'third embedding', 'second embedding', 'first embedding', 'labels'],
            5: ['index', 'fourth embedding', 'third embedding', 'second embedding',
                'first embedding', 'labels'],
        }
        names = leading_names.get(embedding_number)
        if names is not None:
            embedding = embedding.rename(columns=dict(enumerate(names)))
        return labels, colors, links, embedding

    labels_multi = one_hot_encoding(labels)
    return labels, colors, links, source_nodes_with_labels, labels_multi




def splitting_node_and_labels(lab, feat, src, dataset):
    """Cut node ids and labels into a leading 80% train part and a trailing test part.

    No shuffling is performed: the first ``int(0.8 * n)`` entries become the
    training split and the remaining entries the test split.

    Args:
        lab: Labels, sliced with the same counts as the node ids.
        feat: Frame whose ``node`` column supplies the node ids for the
            ``'complex'`` and ``'simple'`` datasets.
        src: Node ids used for every other dataset.
        dataset: Dataset name.

    Returns:
        ``(node_idx, train_idx, train_y, test_idx, test_y)``.
    """
    if dataset in ('complex', 'simple'):
        id_source = feat['node'].values
    else:
        id_source = src
    node_idx = torch.tensor(id_source)

    n_total = len(node_idx)
    n_train = int(n_total * 0.8)
    n_test = n_total - n_train

    return (
        node_idx,
        node_idx[:n_train],
        lab[:n_train],
        node_idx[-n_test:],
        lab[-n_test:],
    )


def get_node_features(colors):
    """Build the float32 node-feature tensor from the node frame.

    The frame is dummy-encoded with ``pd.get_dummies``, the ``node`` column is
    removed, and the remaining columns are returned in reversed order.

    Args:
        colors: Frame containing a ``node`` column plus the columns to encode.

    Returns:
        ``torch.float32`` tensor with one row per frame row.
    """
    dummies = pd.get_dummies(colors).drop(columns=["node"])
    feature_matrix = dummies.to_numpy().astype(np.float32)
    # Reverse the column order; the copy gives torch a positively-strided array.
    reversed_columns = feature_matrix[:, ::-1].copy()
    return torch.from_numpy(reversed_columns)

def get_edge_index_and_type_no_reverse(links):
    """Convert the link frame into edge tensors, without adding reverse edges.

    Args:
        links: Frame with ``node_1``, ``relation_type`` and ``node_2`` columns.

    Returns:
        ``(edge_index, edge_type)``: ``edge_index`` has the ``node_1`` values in
        row 0 and the ``node_2`` values in row 1; ``edge_type`` holds the
        ``relation_type`` values in the same order.
    """
    # Go through NumPy rather than handing a pandas Series to torch.tensor:
    # torch treats a Series as a generic sequence and indexes it by *label*,
    # which fails when the frame's index is not 0..n-1 (e.g. after filtering).
    endpoints = links[['node_1', 'node_2']].to_numpy().T
    edge_index = torch.tensor(endpoints).contiguous()
    edge_type = torch.tensor(links['relation_type'].to_numpy())
    return edge_index, edge_type


In [7]:

def mpgnn_train(model, optimizer, data):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Module called as ``model(data.x, data.edge_index, data.edge_type)``;
            its output is fed to ``F.nll_loss``.
        optimizer: Optimizer over the model's parameters.
        data: Object exposing ``x``, ``edge_index``, ``edge_type``,
            ``train_idx`` and ``train_y``.

    Returns:
        The NLL loss on the training nodes, as a Python float, computed before
        the parameter update.
    """
    model.train()
    optimizer.zero_grad()

    log_probs = model(data.x, data.edge_index, data.edge_type)
    train_log_probs = log_probs[data.train_idx].squeeze(-1)
    train_loss = F.nll_loss(train_log_probs, data.train_y)

    train_loss.backward()
    optimizer.step()
    return train_loss.item()

@torch.no_grad()
def mpgnn_test(model, data):
    """Evaluate the model on the train and test nodes without tracking gradients.

    Args:
        model: Module called as ``model(data.x, data.edge_index, data.edge_type)``;
            its output is fed to ``F.nll_loss`` and arg-maxed over dimension 1.
        data: Object exposing ``x``, ``edge_index``, ``edge_type``,
            ``train_idx``, ``train_y``, ``test_idx`` and ``test_y``.

    Returns:
        ``(f1_train, f1_test_micro, f1_test_macro, loss_test,
        train_predictions, test_predictions)`` where ``f1_train`` is the
        micro-averaged F1 on the training nodes, ``loss_test`` is the NLL loss
        tensor on the test nodes and the predictions are Python lists.
    """
    model.eval()
    log_probs = model(data.x, data.edge_index, data.edge_type)
    loss_test = F.nll_loss(log_probs[data.test_idx].squeeze(-1), data.test_y)

    train_predictions = torch.argmax(log_probs[data.train_idx], 1).tolist()
    test_predictions = torch.argmax(log_probs[data.test_idx], 1).tolist()
    train_y = data.train_y.tolist()
    test_y = data.test_y.tolist()

    # scikit-learn's signature is f1_score(y_true, y_pred); the arguments used
    # to be passed the other way round. F1 is symmetric in the two, so the
    # scores are the same, but the call now matches the documented API.
    f1_train = f1_score(train_y, train_predictions, average='micro')
    f1_test_macro = f1_score(test_y, test_predictions, average='macro')
    f1_test_micro = f1_score(test_y, test_predictions, average='micro')
    return f1_train, f1_test_micro, f1_test_macro, loss_test, train_predictions, test_predictions


def mpgnn_parallel_multiple(data_mpgnn, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, metapaths):
    """Train an ``MPNetm`` on the given metapaths and report the best test micro-F1.

    Args:
        data_mpgnn: Data object passed to ``mpgnn_train`` / ``mpgnn_test``.
        input_dim, hidden_dim, num_rel, output_dim, ll_output_dim: Forwarded to
            the ``MPNetm`` constructor.
        metapaths: List of metapaths (each a list of relation ids). When empty
            or ``None`` the IMDB default ``[[2, 0], [3, 1]]`` is used. The
            argument used to be overwritten unconditionally.

    Returns:
        ``(best_micro, ptr, pte)``: the best test micro-F1 seen at an
        evaluation step, plus the train and test predictions from the *last*
        evaluation (not necessarily the best one).
    """
    if not metapaths:
        # Other presets previously tried here:
        #   [[0, 4, 2], [4, 3, 0], [1, 0]]  (synthetic multi)
        #   [[2, 1, 0]]                     (complex)
        metapaths = [[2, 0], [3, 1]]  # IMDB
    mpgnn_model = MPNetm(input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, len(metapaths), metapaths)
    print(mpgnn_model)

    # NOTE(review): the run recorded in this notebook stays at loss 1.099
    # (about ln 3, i.e. a uniform prediction over three classes) with lr=0.1;
    # the inline note suggests lr=0.01 was also tried. Worth re-checking.
    mpgnn_optimizer = torch.optim.Adam(mpgnn_model.parameters(), lr=0.1, weight_decay=0.0005)
    best_macro, best_micro = 0., 0.
    ptr, pte = None, None  # stay defined even if no evaluation step runs
    for epoch in range(1, 300):
        loss = mpgnn_train(mpgnn_model, mpgnn_optimizer, data_mpgnn)
        if epoch % 20 == 0:
            # The first value is the micro-F1 on the training nodes.
            train_acc, f1_test_micro, f1_test_macro, loss_test, ptr, pte = mpgnn_test(mpgnn_model, data_mpgnn)
            print(epoch, "train loss %0.3f" % loss, "test loss %0.3f" % loss_test,
                  'train micro: %0.3f'% train_acc, 'test micro: %0.3f'% f1_test_micro)
            if f1_test_macro > best_macro:
                # Fix: this used to store the micro score in best_macro.
                best_macro = f1_test_macro
            if f1_test_micro > best_micro:
                best_micro = f1_test_micro
    return best_micro, ptr, pte



def main(node_file_path, link_file_path, label_file_path, embedding_file_path, metapath_length, pickle_filename, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, dataset):
    """Load a dataset, build the graph data object and train the MPGNN model.

    Args:
        node_file_path, link_file_path, label_file_path, embedding_file_path:
            Paths forwarded to ``load_files``.
        metapath_length: Not used by this function; kept for interface
            compatibility.
        pickle_filename: Not used by this function; kept for interface
            compatibility.
        input_dim, hidden_dim, num_rel, output_dim, ll_output_dim: Model sizes
            forwarded to ``mpgnn_parallel_multiple``.
        dataset: Dataset name; ``'complex'`` and ``'simple'`` use the
            four-value return of ``load_files``, everything else the
            five-value one.

    Returns:
        ``(mpgnn_f1_micro, ptr, pte)`` as returned by
        ``mpgnn_parallel_multiple``.
    """
    # Labels, node frame and links; `sources` are the labelled node ids and
    # stay empty for the synthetic datasets.
    if dataset == 'complex' or dataset == 'simple':
        sources = []
        true_labels, features, edges, _embedding = load_files(node_file_path, link_file_path, label_file_path, embedding_file_path, dataset)
    else:
        true_labels, features, edges, sources, _labels_multi = load_files(node_file_path, link_file_path, label_file_path, embedding_file_path, dataset)

    # Feature matrix and edge tensors.
    x = get_node_features(features)
    edge_index, edge_type = get_edge_index_and_type_no_reverse(edges)

    # Train / test split over the labelled nodes.
    _node_idx, train_idx, train_y, test_idx, test_y = splitting_node_and_labels(true_labels, features, sources, dataset)

    # Dataset for MPGNN
    data_mpgnn = Data()
    data_mpgnn.x = x
    data_mpgnn.edge_index = edge_index
    data_mpgnn.edge_type = edge_type
    data_mpgnn.train_idx = train_idx
    data_mpgnn.test_idx = test_idx
    data_mpgnn.train_y = train_y
    data_mpgnn.test_y = test_y
    # Fix: this used to be len(node_idx), which for the non-synthetic datasets
    # counts only the labelled nodes. The graph has one node per row of `x`.
    data_mpgnn.num_nodes = x.size(0)

    # An empty metapath list is passed on, as before. The second `Data` object
    # and the `relations` / `source_nodes_mask` / `metapath` locals that were
    # built here were never read, so they have been removed.
    mp = []
    mpgnn_f1_micro, ptr, pte = mpgnn_parallel_multiple(data_mpgnn, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, mp)
    return mpgnn_f1_micro, ptr, pte



In [8]:
# Dataset selector. Supported values: True ("complex" synthetic data),
# False ("simple" synthetic data), "synthetic_multi", "IMDB", "DBLP".
# (This cell used to assign COMPLEX three times in a row; only the last
# assignment, "IMDB", ever took effect.)
COMPLEX = "IMDB"

metapath_length= 3
tot_rel=5

if COMPLEX == True:
    input_dim = 6
    ll_output_dim = 2
    dataset = "complex"
    folder= "data/" + dataset + "/length_m_" + str(metapath_length) + "__tot_rel_" + str(tot_rel) + "/"
elif COMPLEX == False:
    input_dim = 6
    ll_output_dim = 2
    dataset = "simple"
    folder= "data/" + dataset + "/length_m_" + str(metapath_length) + "__tot_rel_" + str(tot_rel) + "/"
elif COMPLEX == 'IMDB':
    tot_rel=4
    input_dim = 3066
    ll_output_dim = 3
    dataset = 'IMDB' ## 5
    folder= "data/" + dataset + "/"
elif COMPLEX == 'DBLP':
    input_dim = 4231
    tot_rel=6
    ll_output_dim = 4
    dataset = 'DBLP' ## 7
    folder= "data/" + dataset + "/"
elif COMPLEX == 'synthetic_multi':
    input_dim=6
    tot_rel=5
    ll_output_dim=2
    dataset = 'tot_rel_5'
    folder="data/synthetic_multi/" + dataset + "/"
else:
    # Fail here instead of with a NameError on `folder` further down.
    raise ValueError("Unsupported COMPLEX value: %r" % (COMPLEX,))

# Input files inside the dataset folder.
node_file= folder + "node.dat"
link_file= folder + "link.dat"
label_file= folder + "label.dat"
embedding_file = folder + "embedding.dat"
# Define the filename for saving the variables
pickle_filename = folder + "iteration_variables.pkl"
# mpgnn variables
hidden_dim = 32
num_rel = tot_rel
output_dim = 64

# Returns the best test micro-F1 plus the last train / test predictions.
meta,ptr,pte = main(node_file, link_file, label_file, embedding_file, metapath_length, pickle_filename, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, dataset)

MPNetm(
  (layers_list): ModuleList(
    (0): ModuleList(
      (0): CustomRGCNConv(3066, 32, num_relations=4)
      (1): CustomRGCNConv(32, 32, num_relations=4)
    )
    (1): ModuleList(
      (0): CustomRGCNConv(3066, 32, num_relations=4)
      (1): CustomRGCNConv(32, 32, num_relations=4)
    )
  )
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=3, bias=True)
  (log_softmax): LogSoftmax(dim=1)
)
20 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
40 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
60 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
80 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
100 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
120 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
140 train loss 1.099 test loss 1.099 train micro: 0.301 test micro: 0.124
160 train loss 1.099 test l

KeyboardInterrupt: 

In [None]:
# Pasted log line, kept for reference (not executable code):
# 280 train loss 0.224 test loss 0.230 train micro: 0.907 test micro: 0.915

In [None]:
# Sizes of the train and test label tensors, displayed as a tuple.
# NOTE(review): `data_mpgnn` is only created at top level by a later cell in
# the visible part of this notebook, so this fails on a fresh Run All.
len(data_mpgnn.train_y),len(data_mpgnn.test_y),

In [None]:
# Sum of the training label values (needs the notebook-level `data_mpgnn`).
sum(data_mpgnn.train_y)

In [None]:
# Sum of the test label values (needs the notebook-level `data_mpgnn`).
sum(data_mpgnn.test_y)

In [None]:
# Bind main()'s parameter names at notebook level so that its body can be run
# cell by cell below. Only the four *_file_path names are new aliases; every
# other name is assigned to itself.
node_file_path, link_file_path, label_file_path, embedding_file_path, metapath_length, pickle_filename, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, dataset = node_file, link_file, label_file, embedding_file, metapath_length, pickle_filename, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, dataset

In [None]:
# Top-level replay of the first half of main(), so the intermediate objects
# (x, edge_index, edge_type, data_mpgnn, ...) can be inspected in later cells.

# Labels, node frame and links; `sources` are the labelled node ids and stay
# empty for the synthetic datasets.
if dataset == 'complex' or dataset == 'simple':
    sources = []
    true_labels, features, edges, embedding = load_files(node_file_path, link_file_path, label_file_path, embedding_file_path, dataset)
else:
    true_labels, features, edges, sources, labels_multi = load_files(node_file_path, link_file_path, label_file_path, embedding_file_path, dataset)
# Get features' matrix
x = get_node_features(features)
# Get edge_index and types
edge_index, edge_type = get_edge_index_and_type_no_reverse(edges)

# Split data into train and test
node_idx, train_idx, train_y, test_idx, test_y = splitting_node_and_labels(true_labels, features, sources, dataset)

# Dataset for MPGNN
data_mpgnn = Data()
data_mpgnn.x = x
data_mpgnn.edge_index = edge_index
data_mpgnn.edge_type = edge_type
data_mpgnn.train_idx = train_idx
data_mpgnn.test_idx = test_idx
data_mpgnn.train_y = train_y
data_mpgnn.test_y = test_y
# Fix: this used to be len(node_idx), which for the non-synthetic datasets
# counts only the labelled nodes. The graph has one node per row of `x`.
data_mpgnn.num_nodes = x.size(0)
# Variables
source_nodes_mask = sources if sources else []
metapath = []

In [None]:
# Metapaths (lists of relation ids) used to build the model in the next cell.
# Alternative preset: [[2, 4, 0], [0, 3, 4], [0, 1]]
metapaths = [[2,0],[3,1]] #IMDB

# `mp` is the name main() uses for the same list. The earlier `mp = []` store
# and the tuple of self-assignments that only rebound `mp` have been removed.
mp = metapaths

In [None]:

# Build the notebook-level model from the sizes and `metapaths` defined in
# earlier cells, then print its module structure.
mpgnn_model = MPNetm(input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, len(metapaths), metapaths)
print(mpgnn_model)

In [None]:
def mpgnn_parallel_multiple(data_mpgnn, input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, metapaths):
    """Train an ``MPNetm`` on the given metapaths and report the best test micro-F1.

    NOTE(review): this redefines a function of the same name from an earlier
    cell and shadows it once this cell is executed; consider keeping only one.

    Fixes relative to the previous text of this cell:
      * the optimizer is created here for the freshly built model, instead of
        using the notebook-level ``mpgnn_optimizer``, which is bound to a
        different model instance;
      * all six values returned by ``mpgnn_test`` are unpacked (unpacking four
        raised ``ValueError`` at the first evaluation step);
      * ``best_macro`` tracks the macro score rather than the micro score;
      * the ``metapaths`` argument is honoured when it is non-empty;
      * the return value is ``(best_micro, ptr, pte)``, matching what ``main``
        unpacks. The previous version could never return normally.

    Args:
        data_mpgnn: Data object passed to ``mpgnn_train`` / ``mpgnn_test``.
        input_dim, hidden_dim, num_rel, output_dim, ll_output_dim: Forwarded to
            the ``MPNetm`` constructor.
        metapaths: List of metapaths; when empty or ``None`` the synthetic
            multi-class default ``[[2, 4, 0], [0, 3, 4], [0, 1]]`` is used.

    Returns:
        ``(best_micro, ptr, pte)``: best test micro-F1 seen at an evaluation
        step, plus the train and test predictions of the last evaluation.
    """
    if not metapaths:
        # Other presets: [[2, 0], [3, 1]] (IMDB), [[2, 1, 0]] (complex)
        metapaths = [[2, 4, 0], [0, 3, 4], [0, 1]]  # synthetic multi
    mpgnn_model = MPNetm(input_dim, hidden_dim, num_rel, output_dim, ll_output_dim, len(metapaths), metapaths)
    print(mpgnn_model)

    # Same hyper-parameters as the notebook-level optimizer cell.
    mpgnn_optimizer = torch.optim.Adam(mpgnn_model.parameters(), lr=0.1, weight_decay=0.0005)
    best_macro, best_micro = 0., 0.
    ptr, pte = None, None
    for epoch in range(1, 1000):
        loss = mpgnn_train(mpgnn_model, mpgnn_optimizer, data_mpgnn)
        if epoch % 20 == 0:
            train_acc, f1_test_micro, f1_test_macro, loss_test, ptr, pte = mpgnn_test(mpgnn_model, data_mpgnn)
            print(epoch, "train loss %0.3f" % loss, "test loss %0.3f" % loss_test,'train micro: ', train_acc, 'test micro: ', f1_test_micro)
            if f1_test_macro > best_macro:
                best_macro = f1_test_macro
            if f1_test_micro > best_micro:
                best_micro = f1_test_micro
    return best_micro, ptr, pte

In [None]:
# Notebook-level optimizer for `mpgnn_model`; the trailing note records that
# lr=0.01 was an alternative setting.
mpgnn_optimizer = torch.optim.Adam(mpgnn_model.parameters(), lr=0.1, weight_decay=0.0005) #lr  0.01

# One manual full-batch training step, mirroring the body of mpgnn_train().
mpgnn_model.train()
mpgnn_optimizer.zero_grad()
# NOTE(review): `weight_loss` is created but never passed to the loss below.
weight_loss = torch.tensor([1., 100.])
out = mpgnn_model(data_mpgnn.x, data_mpgnn.edge_index, data_mpgnn.edge_type)
loss = F.nll_loss(out[data_mpgnn.train_idx].squeeze(-1), data_mpgnn.train_y)
loss.backward()
mpgnn_optimizer.step()

In [None]:
# Metapath preset labelled "IMDB": two metapaths of two relation ids each.
metapaths = [[2,0],[3,1]] #IMDB

In [None]:
# Call the first layer of branch 0 directly on the raw features. The leading
# argument 2 matches metapaths[0][0].
h = mpgnn_model.layers_list[0][0](2,data_mpgnn.x,data_mpgnn.edge_index,data_mpgnn.edge_type)

In [None]:
# Feed `h` from the previous cell through the second layer of branch 0. The
# leading argument 0 matches metapaths[0][1].
mpgnn_model.layers_list[0][1](0,h,data_mpgnn.edge_index,data_mpgnn.edge_type)

In [None]:
# Step-by-step replay of a forward pass on the notebook-level `mpgnn_model`.
# This cell was pasted from a method body: it referenced `self` and ended in a
# top-level `return`, so it could not run. It now uses `mpgnn_model`, the
# notebook-level `metapaths` list and the `x`, `edge_index`, `edge_type`
# tensors built by the earlier top-level cells.
embeddings = []
for path_index, relation_path in enumerate(metapaths):
    for layer_index, relation in enumerate(relation_path):
        layer = mpgnn_model.layers_list[path_index][layer_index]
        # The first layer of a branch reads the raw features; later layers
        # read the previous layer's output.
        layer_input = x if layer_index == 0 else h
        h = F.relu(layer(relation, layer_input, edge_index, edge_type))
    embeddings.append(h)
print(embeddings)
concatenated_embedding = torch.cat(embeddings, dim=1)
h = F.relu(mpgnn_model.fc1(concatenated_embedding))

# NOTE(review): a ReLU on the final logits means that once every fc2 output is
# negative, log_softmax sees all zeros (a uniform prediction) and no gradient
# flows back. That would be consistent with the loss stuck at 1.099 (about
# ln 3) in the recorded run. Confirm against the model's forward().
h = F.relu(mpgnn_model.fc2(h))
h = mpgnn_model.log_softmax(h)
h

In [None]:

# Notebook-level training loop over `mpgnn_model` / `mpgnn_optimizer`,
# evaluating every 10 epochs.
# Fixes: mpgnn_test returns six values (four were unpacked); best_macro stored
# the micro score; a top-level `return` made the cell a SyntaxError.
best_macro, best_micro = 0., 0.
for epoch in range(1, 1000):
    loss = mpgnn_train(mpgnn_model, mpgnn_optimizer, data_mpgnn)
    if epoch % 10 == 0:
        train_acc, f1_test_micro, f1_test_macro, loss_test, ptr, pte = mpgnn_test(mpgnn_model, data_mpgnn)
        print(epoch, "train loss %0.3f" % loss, "test loss %0.3f" % loss_test,'train micro: ', train_acc, 'test micro: ', f1_test_micro)
        if f1_test_macro > best_macro:
            best_macro = f1_test_macro
        if f1_test_micro > best_micro:
            best_micro = f1_test_micro
# Display the best test micro-F1 as the cell output.
best_micro