In [None]:
from ogb.graphproppred import GraphPropPredDataset
import random
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
from multiprocessing import Pool
import os

In [None]:
# Number of random walks generated for every graph.
N = 100
# Upper bound on the number of steps taken in a single random walk.
MAX_WALK_LENGTH = 100
# Degree of parallelism shared by the parallel steps of this notebook.
# os.sched_getaffinity is only available on some Unix platforms, so fall back
# to the plain CPU count (or 1 if that is unknown) everywhere else.
if hasattr(os, 'sched_getaffinity'):
    WORKERS = len(os.sched_getaffinity(0))
else:
    WORKERS = os.cpu_count() or 1

In [None]:
dataset_name = 'ogbg-molhiv'
dataset = GraphPropPredDataset(name=dataset_name)
split_idx = dataset.get_idx_split()

# Iterate over the dataset a single time and split the (graph, label) pairs
# afterwards, instead of walking the whole dataset once per list.
samples = list(tqdm(dataset))
graph_dicts = [graph_dict for graph_dict, _ in samples]
labels = [label for _, label in samples]

In [None]:
def get_neighborhood(graph_dict, node):
    """Collect the edges leaving ``node`` and the vertices they point to.

    ``graph_dict['edge_index']`` is read as two parallel sequences: entry 0
    holds the source vertex of every edge, entry 1 the target vertex.

    Returns a dict with two parallel lists: ``'nodes'`` (the target vertices)
    and ``'edges'`` (the positions of the matching edges in ``edge_index``).
    """
    sources = graph_dict['edge_index'][0]
    targets = graph_dict['edge_index'][1]

    # Positions of all edges whose source is the requested vertex.
    outgoing = [edge for edge, source in enumerate(sources) if source == node]

    return {
        'nodes': [targets[edge] for edge in outgoing],
        'edges': outgoing,
    }

def get_node_attributes(graph_dict, node):
    """Encode the feature row of ``node`` as tokens ``v<column>_<value>``."""
    tokens = []
    for column, value in enumerate(graph_dict['node_feat'][node]):
        tokens.append(f'v{column}_{value}')
    return tokens

def get_edge_attributes(graph_dict, edge):
    """Encode the feature row of ``edge`` as tokens ``e<column>_<value>``."""
    tokens = []
    for column, value in enumerate(graph_dict['edge_feat'][edge]):
        tokens.append(f'e{column}_{value}')
    return tokens

In [None]:
def random_walk_evenly_distributed(graph_dict, how_often_visited):
    """Run one non-backtracking random walk starting at a least-visited vertex.

    Parameters
    ----------
    graph_dict : mapping
        Graph description. Only ``graph_dict['edge_index']`` is read, both
        directly and through ``get_neighborhood``.
    how_often_visited : list of int
        Visit counter per vertex, indexed by vertex id. The list is updated
        IN PLACE for every vertex the walk touches.

    Returns
    -------
    walk : dict
        ``'vertices'`` lists the visited vertices in order and ``'edges'`` the
        edge positions used between consecutive vertices (one fewer entry).
    how_often_visited : list of int
        The same list object that was passed in, after the update.

    Raises
    ------
    IndexError
        If ``how_often_visited`` is empty, i.e. there is no vertex to start at.
    """
    if len(how_often_visited) == 0:
        raise IndexError('cannot start a random walk without any vertices')

    walk = {
        'vertices': [],
        'edges': []
    }

    # Pick the start uniformly among the vertices visited least often so far.
    # The minimum is computed once instead of once per vertex.
    fewest_visits = min(how_often_visited)
    least_visited_vertices = [
        candidate
        for candidate, visits in enumerate(how_often_visited)
        if visits == fewest_visits
    ]
    vertex = random.choice(least_visited_vertices)
    how_often_visited[vertex] += 1
    walk['vertices'].append(vertex)

    # -1 is not a vertex id, so nothing is filtered out on the first step.
    previous_vertex = -1

    for _ in range(MAX_WALK_LENGTH):
        neighborhood = get_neighborhood(graph_dict, vertex)

        # Do not step straight back to where we came from: drop the first
        # neighborhood entry that leads to the previous vertex.
        if previous_vertex in neighborhood['nodes']:
            index_to_remove = neighborhood['nodes'].index(previous_vertex)
            neighborhood['nodes'].pop(index_to_remove)
            neighborhood['edges'].pop(index_to_remove)
        previous_vertex = vertex

        # Dead end: the walk finishes early.
        if len(neighborhood['nodes']) == 0:
            break

        # Choose the next edge uniformly and follow it to its target vertex.
        edge = random.choice(neighborhood['edges'])
        vertex = graph_dict['edge_index'][1][edge]
        how_often_visited[vertex] += 1

        walk['edges'].append(edge)
        walk['vertices'].append(vertex)

    return walk, how_often_visited

In [None]:
def walk_to_list_of_attributes(graph_dict, walk):
    """Flatten a walk into alternating vertex and edge attribute tokens.

    For every edge of the walk the tokens of the vertex it leaves are emitted,
    followed by the tokens of the edge itself; the tokens of the final vertex
    close the sequence. An empty walk (no vertices and no edges) yields an
    empty list.

    Parameters
    ----------
    graph_dict : mapping
        Graph description handed on to ``get_node_attributes`` and
        ``get_edge_attributes``.
    walk : dict
        Has the keys ``'vertices'`` and ``'edges'``; the vertex list is
        expected to be one entry longer than the edge list.

    Returns
    -------
    list
        The concatenated attribute tokens in walk order.
    """
    # Named ``attributes`` rather than ``list`` so the builtin is not shadowed.
    attributes = []

    for position, edge in enumerate(walk['edges']):
        attributes += get_node_attributes(graph_dict, walk['vertices'][position])
        attributes += get_edge_attributes(graph_dict, edge)

    # The last vertex has no outgoing edge in the walk; skip it only when the
    # walk contains no vertex at all.
    if walk['vertices']:
        attributes += get_node_attributes(graph_dict, walk['vertices'][-1])

    return attributes

In [None]:
def walk_to_words(graph_dict, walk):
    """Return the attribute tokens of ``walk`` as a new list of words."""
    # Unpacking produces a fresh list, so the caller never aliases the list
    # returned by walk_to_list_of_attributes.
    return [*walk_to_list_of_attributes(graph_dict, walk)]

In [41]:
# Collected walk sentences, one entry (a list of N walks) per graph.
words = []


def generate_words(graph_dict):
    """Produce N random-walk sentences for a single graph.

    The visit counters start at zero for each of the graph's
    ``graph_dict['num_nodes']`` vertices and are carried from one walk to the
    next, so later walks start at vertices that were visited least so far.

    Returns a list with N entries, each the word list of one walk.
    """
    visit_counts = [0] * graph_dict['num_nodes']
    sentences = []

    for _ in range(N):
        walk, visit_counts = random_walk_evenly_distributed(graph_dict, visit_counts)
        sentence = walk_to_words(graph_dict, walk)
        sentences.append(sentence)

    return sentences

# Generate the walks of all graphs in parallel. The pool is sized with WORKERS,
# the same setting the other parallel steps of this notebook use, instead of
# letting Pool() pick its own default size.
# imap keeps the input order, so words[k] belongs to graph_dicts[k].
with Pool(processes=WORKERS) as pool:
    for this_graphs_words in tqdm(pool.imap(generate_words, graph_dicts, chunksize=10), total=len(graph_dicts)):
        words.append(this_graphs_words)

 40%|███▉      | 16249/41127 [08:32<13:04, 31.72it/s] 


In [None]:
# Write the corpus: one walk per line, tokens separated by single spaces.
# Graphs are written in order, each contributing its walks consecutively.
with open('words.cor', 'w') as corpus_file:
    corpus_file.writelines(
        ' '.join(walk_words) + '\n'
        for graph_walks in words
        for walk_words in graph_walks
    )

In [None]:
from gensim.test.utils import common_texts, common_dictionary, common_corpus


In [None]:
# Train Doc2Vec on the corpus file, where each line holds the tokens of one
# random walk.
# NOTE(review): the later averaging step looks vectors up as model.dv[line_no],
# i.e. it presumes that corpus_file mode tags every line with its 0-based line
# number - confirm against the gensim documentation.
model = Doc2Vec(corpus_file='words.cor', workers=WORKERS, window=15)

In [None]:
def _mean_walk_vector(graph_position):
    """Average the N document vectors that belong to one graph.

    The vectors of graph ``graph_position`` are looked up at the indices
    ``graph_position * N`` up to ``graph_position * N + N - 1``.
    """
    first_index = graph_position * N
    total = 0
    for walk_offset in range(N):
        # Divide before adding, term by term from left to right.
        total = total + model.dv[walk_offset + first_index] / N
    return total


# One row per graph: the mean of that graph's walk embeddings.
X = np.array([_mean_walk_vector(graph_position) for graph_position in range(len(dataset))])
X.shape

In [None]:
import numpy as np

def _select_split(indices):
    """Gather the feature rows and the first label entry for ``indices``."""
    features = np.array([X[i] for i in indices])
    targets = np.array([labels[i][0] for i in indices])
    return features, targets


# Materialise the train and test partitions given by split_idx.
X_train, y_train = _select_split(split_idx['train'])

X_test, y_test = _select_split(split_idx['test'])

In [None]:
# Sanity check: show how many test labels were selected.
y_test.shape

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversample with a fixed seed (random_state=0) so the resampling is repeatable.
ros = RandomOverSampler(random_state=0)

# Only the training split is resampled; the test split is left untouched.
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from ogb.graphproppred import Evaluator

In [None]:
# Seed the forest (random_state=0, matching the oversampler) so that fitting on
# the same features always yields the same model.
# NOTE(review): the random walks and Doc2Vec are not seeded in this notebook,
# so the features themselves still vary between runs.
clf = RandomForestClassifier(n_jobs=WORKERS, random_state=0)
clf.fit(X_resampled, y_resampled.ravel())

In [None]:
# Hard class predictions (clf.predict) for the test split.
y_test_predicted = clf.predict(X_test)

In [None]:
# Sanity check: labels and predictions should cover the same number of samples.
y_test.shape, y_test_predicted.shape

In [None]:
def _positions_equal_to(values, target):
    """Return the positions in ``values`` whose entry equals ``target``."""
    return [position for position, value in enumerate(values) if value == target]


# Test-set positions predicted as positive (1) and as negative (0).
pred_pos_idx = _positions_equal_to(y_test_predicted, 1)
pred_neg_idx = _positions_equal_to(y_test_predicted, 0)

len(pred_pos_idx), len(pred_neg_idx)

In [None]:
from sklearn.metrics import roc_auc_score
# ROC-AUC ranks samples by score, so feed it the predicted probability of the
# positive class rather than hard 0/1 predictions, which collapse the ranking
# to a single threshold.
positive_column = list(clf.classes_).index(1)
y_test_scores = clf.predict_proba(X_test)[:, positive_column]
roc_auc_score(y_test, y_test_scores)

In [None]:
evaluator = Evaluator(name=dataset_name)

# Pass the predicted probability of the positive class as y_pred: an ROC-AUC
# evaluation needs scores, and hard 0/1 labels collapse the ranking to a single
# threshold.
# NOTE(review): presumes the evaluator for this dataset reports ROC-AUC -
# confirm against the OGB documentation.
positive_column = list(clf.classes_).index(1)
y_test_scores = clf.predict_proba(X_test)[:, positive_column]

# Both arrays are reshaped to (num_samples, 1), i.e. a single task column.
input_dict = {"y_true": y_test.reshape(-1, 1), "y_pred": y_test_scores.reshape(-1, 1)}
result_dict = evaluator.eval(input_dict)
result_dict