In [1]:
from ogb.graphproppred import GraphPropPredDataset
import random
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np

In [2]:
dataset_name = 'ogbg-molhiv'
dataset = GraphPropPredDataset(name=dataset_name)
split_idx = dataset.get_idx_split()

In [3]:
def get_neighborhood(graph_dict, node):
    neighborhood = {
        'nodes': [],
        'edges': []
    }

    from_nodes = graph_dict['edge_index'][0]
    to_nodes = graph_dict['edge_index'][1]
    
    for index, from_node in enumerate(from_nodes):
        if from_nodes[index] == node:
            neighborhood['nodes'].append(to_nodes[index])
            neighborhood['edges'].append(index)

    return neighborhood

def get_node_attributes(graph_dict, node):
    return graph_dict['node_feat'][node]

def get_edge_attributes(graph_dict, edge):
    return graph_dict['edge_feat'][edge]

In [4]:
def random_walk_evenly_distributed(graph_dict, how_often_visited):
    walk = {
        'vertices': [],
        'edges': []
    }

    # initialize with invalid value to make sure that the first vertex is not removed
    previous_vertex = -1

    # choose random start vertex
    least_visited_vertices = []
    for i, x in enumerate(how_often_visited):
        if x == min(how_often_visited):
            least_visited_vertices.append(i)
    vertex = random.choice(least_visited_vertices)
    how_often_visited[vertex] += 1

    # append the first vertex to the walk
    walk['vertices'].append(vertex)

    for _ in range(3):
        neighborhood = get_neighborhood(graph_dict, vertex)
        
        # remove previous vertex from neighborhood
        if previous_vertex in neighborhood['nodes']:
            index_to_remove = neighborhood['nodes'].index(previous_vertex)
            neighborhood['nodes'].pop(index_to_remove)
            neighborhood['edges'].pop(index_to_remove)
        previous_vertex = vertex

        # if there are no more neighbors, stop
        if len(neighborhood['nodes']) == 0:
            break

        # choose the next vertex to visit
        edge = random.choice(neighborhood['edges'])
        vertex = graph_dict['edge_index'][1][edge]
        how_often_visited[vertex] += 1

        # append the next vertex and edge to the walk
        walk['edges'].append(edge)
        walk['vertices'].append(vertex)

    return walk, how_often_visited

In [5]:
def walk_to_list_of_attributes(graph_dict, walk):
    list = []

    for i in range(len(walk['edges'])):
        list.append(get_node_attributes(graph_dict, walk['vertices'][i]))
        list.append(get_edge_attributes(graph_dict, walk['edges'][i]))
    
    list.append(get_node_attributes(graph_dict, walk['vertices'][-1]))

    return list

In [6]:
def walk_to_words(graph_dict, walk):
    list_of_attributes = walk_to_list_of_attributes(graph_dict, walk)
    words = [','.join([str(v) for v in attributes]) for attributes in list_of_attributes]
    return words

In [7]:
f_words = open("words.cor", "w")
f_graph_id = open("graph_id.cor", "w")

Tagged_documents = []
labels = []

n = 100

for i, (graph_dict, label) in enumerate(tqdm(dataset)):
    how_often_visited = [0] * graph_dict['num_nodes']

    for x in range(n):
        walk, how_often_visited = random_walk_evenly_distributed(graph_dict, how_often_visited)
        words = walk_to_words(graph_dict, walk)
        f_words.write(f"{' '.join((words))}\n")
        f_graph_id.write(f'{str(i)}\n')
        
    labels.append(label)

f_words.close()
f_graph_id.close()

100%|██████████| 41127/41127 [06:56<00:00, 98.77it/s] 


In [8]:
model = Doc2Vec(corpus_file='words.cor', workers=4, window=10)

In [9]:
X = np.array([sum([model.dv[i+(j*n)]/n for i in range(n)]) for j in range(len(dataset))])
X.shape

(41127, 100)

In [10]:
import numpy as np

X_train = np.array([X[i] for i in split_idx['train']])
y_train = np.array([labels[i] for i in split_idx['train']])

X_test = np.array([X[i] for i in split_idx['test']])
y_test = np.array([labels[i] for i in split_idx['test']])

In [11]:
X_train.shape, y_train.shape

((32901, 100), (32901, 1))

In [26]:
pos_idx = [i for i, v in enumerate(y_train) if v == 1]
neg_idx = [i for i, v in enumerate(y_train) if v == 0]

len(pos_idx), len(neg_idx)

(1232, 31669)

In [13]:
len(neg_idx) / len(pos_idx)

25.705357142857142

In [24]:
X_train_oversampled = []
y_train_oversampled = []

for i in tqdm(range(len(X_train))):
    current_X = X_train[i]
    current_y = y_train[i]

    if current_y == 1:
        for _ in range(25):
            X_train_oversampled.append(current_X)
            y_train_oversampled.append(current_y)

    X_train_oversampled.append(current_X)
    y_train_oversampled.append(current_y)

X_train_oversampled = np.array(X_train_oversampled)
y_train_oversampled = np.array(y_train_oversampled)

100%|██████████| 32901/32901 [00:00<00:00, 356309.26it/s]


In [48]:
X_train_noise = []
y_train_noise = []
for X, y in tqdm(zip(X_train_oversampled, y_train_oversampled)):
    for _ in range(25):
        X_train_noise.append(X + ((X/100) * random.uniform(-5, 5)))
        y_train_noise.append(y)

X_train_noise = np.array(X_train_noise)
y_train_noise = np.array(y_train_noise)

63701it [00:08, 7444.47it/s]


In [49]:
pos_idx = [i for i, v in enumerate(y_train_noise) if v == 1]
neg_idx = [i for i, v in enumerate(y_train_noise) if v == 0]

len(pos_idx), len(neg_idx)

(800800, 791725)

In [50]:
from sklearn.ensemble import RandomForestClassifier
from ogb.graphproppred import Evaluator

In [51]:
clf = RandomForestClassifier()
clf.fit(X_train_noise, y_train_noise.ravel())

In [None]:
y_test_predicted = clf.predict(X_test)

In [None]:
y_test.shape, y_test_predicted.reshape(-1, 1).shape

((4113, 1), (4113, 1))

In [None]:
evaluator = Evaluator(name = dataset_name)
input_dict = {"y_true": y_test, "y_pred": y_test_predicted.reshape(-1, 1)}
result_dict = evaluator.eval(input_dict)
result_dict

{'rocauc': 0.4984935977906101}