In [11]:
import networkx as nx
from node2vec import Node2Vec
from igraph import Graph
import igraph as ig
import numpy as np
import json
import pandas as pd
from gensim.models import Word2Vec
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, log_loss, matthews_corrcoef, precision_recall_curve, average_precision_score
from sklearn.metrics import precision_recall_fscore_support as prf_support
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from torch_geometric.nn import MetaPath2Vec
from typing import Dict, Tuple
import torch
import tqdm

graphss = r'C:\Users\George\Desktop\ISEF-2023\Network construction\Het_graph_final.graphml'
HetG = ig.Graph.Load(graphss, format='graphml')
graph = HetG


def create_edge_index_dict(graph: ig.Graph):
    edge_index_dict = {}
    unique_to_original = {}
    original_to_unique = {}

    # Variables to keep track of unique IDs for each node type
    current_ids = {}

    for vertex in graph.vs:
        node_type = vertex["type"]
        original_id = vertex.index

        # Assign unique IDs for each node type
        if node_type not in current_ids:
            current_ids[node_type] = 0
        unique_id = current_ids[node_type]
        current_ids[node_type] += 1

        # Update mappings
        unique_to_original[(node_type, unique_id)] = original_id
        original_to_unique[original_id] = (node_type, unique_id)

    for edge in graph.es:
        src_node_type = graph.vs[edge.source]["type"]
        rel_type = edge["type"]
        dst_node_type = graph.vs[edge.target]["type"]

        key = (src_node_type, rel_type, dst_node_type)

        if key not in edge_index_dict:
            edge_index_dict[key] = []

        # Use unique IDs in edge indices
        unique_src_id = original_to_unique[edge.source][1]
        unique_dst_id = original_to_unique[edge.target][1]
        edge_index_dict[key].append((unique_src_id, unique_dst_id))

    # Convert the lists of edge indices to torch.Tensor
    for key in edge_index_dict:
        edge_index_dict[key] = torch.tensor(edge_index_dict[key], dtype=torch.long).t()

    return edge_index_dict, unique_to_original, original_to_unique

In [12]:
edge_index_dict, unique_to_original, original_to_unique = create_edge_index_dict(HetG)
print(edge_index_dict)

{('Protein', 'Protein-Protein-Physical', 'Protein'): tensor([[2582, 2582, 2582,  ...,  723,  723,  723],
        [7168, 5221, 1923,  ...,  591, 3126, 2752]]), ('Protein', 'Protein-Protein-STP', 'Protein'): tensor([[2555, 2555, 2555,  ..., 3493, 3493, 3493],
        [4190, 6645, 5829,  ..., 6923, 5475, 2182]]), ('Compound', 'Metabolite-Reaction', 'Reaction'): tensor([[  4, 171, 334,  ..., 446, 101, 358],
        [  0,   0,   1,  ..., 444, 444, 444]]), ('Protein', 'Enzyme-Reaction', 'Reaction'): tensor([[ 389, 3826, 6099,  ..., 2706,  773,  773],
        [   0,    1,    1,  ...,  443,  443,  444]]), ('Drug', 'Drug-Drug', 'Drug'): tensor([[3645, 3645, 3645,  ...,  583,   64, 3667],
        [ 972, 1256, 2674,  ..., 4442,  824, 2533]]), ('Drug', 'Drug-Target', 'Protein'): tensor([[3128, 3128, 3128,  ..., 1427, 3388, 1871],
        [5981, 2925, 6006,  ..., 3904, 1662, 7042]])}


In [13]:
device = "cpu"

metapath = [
    ('Protein', 'Protein-Protein-Physical', 'Protein'),
    ('Protein', 'Protein-Protein-STP', 'Protein')
]

model = MetaPath2Vec(edge_index_dict,
                     embedding_dim=128,
                     metapath=metapath,
                     walk_length=5,
                     context_size=3,
                     walks_per_node=3,
                     num_negative_samples=1,
                     sparse=True
                    ).to(device)

loader = model.loader(batch_size=1, shuffle=True, num_workers=6)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

for pos_rw, neg_rw in loader:
    print(pos_rw.shape)

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 193, in _sample
    return self._pos_sample(batch), self._neg_sample(batch)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 151, in _pos_sample
    batch = sample(
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 245, in sample
    rand *= rowcount[subset].to(rand.dtype).view(-1, 1)
IndexError: index 7392 is out of bounds for dimension 0 with size 7392


In [None]:


def train(epoch, log_steps=100, eval_steps=2000):
    model.train()

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))


@torch.no_grad()
def test(train_ratio=0.1):
    model.eval()

    z = model('author', batch=data['author'].y_index.to(device))
    y = data['author'].y

    perm = torch.randperm(z.size(0))
    train_perm = perm[:int(z.size(0) * train_ratio)]
    test_perm = perm[int(z.size(0) * train_ratio):]

    return model.test(z[train_perm], y[train_perm], z[test_perm], y[test_perm],
                      max_iter=150)



In [None]:

for epoch in range(1, 6):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

IndexError: Caught IndexError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\_utils\fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 193, in _sample
    return self._pos_sample(batch), self._neg_sample(batch)
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 151, in _pos_sample
    batch = sample(
  File "C:\Users\George\AppData\Local\Programs\Python\Python310\lib\site-packages\torch_geometric\nn\models\metapath2vec.py", line 245, in sample
    rand *= rowcount[subset].to(rand.dtype).view(-1, 1)
IndexError: index 12719 is out of bounds for dimension 0 with size 12719
