-
Notifications
You must be signed in to change notification settings - Fork 239
/
prepare_graph_data.py
330 lines (261 loc) · 10.7 KB
/
prepare_graph_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
"""Prepare and process graph-structured data.
Lead author: Hadi Zaatiti.
"""
import logging
import random
import geomstats.backend as gs
from geomstats.geometry.poincare_ball import PoincareBall
class Graph:
    """Class for generating a graph object from a dataset.

    Prepare Graph object from a dataset file.

    Parameters
    ----------
    graph_matrix_path : string
        Path to graph adjacency matrix.
    labels_path : string
        Path to labels of the nodes of the graph.

    Attributes
    ----------
    edges : dict
        Dictionary with node number as key
        and edge connected node numbers as values.
    n_nodes : int
        Number of nodes in the graph.
    labels : dict
        Dictionary with node number as key and the true label number as values.
    """

    def __init__(self, graph_matrix_path, labels_path):
        self.edges = {}
        with open(graph_matrix_path, "r") as edges_file:
            for i, line in enumerate(edges_file):
                # Row i of the 0/1 adjacency matrix: keep the column
                # indices holding a 1, i.e. the neighbors of node i.
                lsp = line.split()
                self.edges[i] = [k for k, value in enumerate(lsp) if int(value) == 1]

        self.n_nodes = len(self.edges)

        if labels_path is not None:
            self.labels = {}
            with open(labels_path, "r") as labels_file:
                for i, line in enumerate(labels_file):
                    # One integer label per line, one line per node.
                    self.labels[i] = [int(line)]

    def random_walk(self, walk_length=5, n_walks_per_node=1):
        """Compute a set of random walks on a graph.

        For each node of the graph, generates a number of
        random walks of a specified length.
        Two consecutive nodes in the random walk are necessarily
        related with an edge. The walks capture the structure of the graph.

        Parameters
        ----------
        walk_length : int
            Length of a random walk in terms of number of edges.
        n_walks_per_node : int
            Number of generated walks starting from each node of the graph.

        Returns
        -------
        walks : array-like,
            shape=[n_walks_per_node * self.n_nodes, walk_length + 1]
            Array containing random walks; each row starts at its
            source node and lists walk_length + 1 visited nodes.
        """
        paths = [
            [0] * (walk_length + 1) for _ in range(self.n_nodes * n_walks_per_node)
        ]
        for index in range(len(self.edges)):
            for i in range(n_walks_per_node):
                paths[index * n_walks_per_node + i] = self._walk(index, walk_length)
        return gs.array(paths)

    def _walk(self, index, walk_length):
        """Generate a single random walk starting at node ``index``."""
        count_index = index
        path = [index]
        for _ in range(walk_length):
            # Move to a uniformly random neighbor of the current node.
            count_index = random.choice(self.edges[count_index])
            path.append(count_index)
        return gs.array(path, dtype=gs.int32)
class HyperbolicEmbedding:
    """Class for learning embeddings of graphs on hyperbolic space.

    Parameters
    ----------
    dim : int
        Dimension of the used hyperbolic space.
    max_epochs : int
        Maximum number of iterations for embedding.
    lr : float
        Learning rate for embedding.
    n_context : int
        Number of nodes to consider from a neighborhood
        of nodes around a particular node.
    n_negative : int
        Number of nodes to consider when searching for
        a set of nodes that are far from a particular node.
    """

    def __init__(self, dim=2, max_epochs=100, lr=0.05, n_context=1, n_negative=2):
        self.manifold = PoincareBall(dim)
        self.max_epochs = max_epochs
        self.lr = lr
        self.n_context = n_context
        self.n_negative = n_negative

    @staticmethod
    def log_sigmoid(vector):
        """Logsigmoid function.

        Apply log sigmoid function.

        Parameters
        ----------
        vector : array-like, shape=[n_samples, dim]

        Returns
        -------
        result : array-like, shape=[n_samples, dim]
        """
        return gs.log((1 / (1 + gs.exp(-vector))))

    @staticmethod
    def grad_log_sigmoid(vector):
        """Gradient of log sigmoid function.

        d/dx log(sigmoid(x)) = sigmoid(-x) = 1 / (1 + exp(x)).

        Parameters
        ----------
        vector : array-like, shape=[n_samples, dim]

        Returns
        -------
        gradient : array-like, shape=[n_samples, dim]
        """
        return 1 / (1 + gs.exp(vector))

    def grad_squared_distance(self, point_a, point_b):
        """Gradient of squared hyperbolic distance.

        Gradient of the squared distance based on the
        Ball representation according to point_a.

        Parameters
        ----------
        point_a : array-like, shape=[n_samples, dim]
            First point in hyperbolic space.
        point_b : array-like, shape=[n_samples, dim]
            Second point in hyperbolic space.

        Returns
        -------
        dist : array-like, shape=[n_samples, 1]
            Gradient w.r.t. point_a of the geodesic squared distance
            between the two points.
        """
        hyperbolic_metric = self.manifold.metric
        # grad_a d(a, b)^2 = -2 * log_a(b) (Riemannian log map at point_a).
        log_map = hyperbolic_metric.log(point_b, point_a)
        return -2 * log_map

    def loss(self, example_embedding, context_embedding, negative_embedding):
        """Compute loss and grad.

        Compute loss and grad given embedding of the current example,
        embedding of the context and negative sampling embedding.

        Parameters
        ----------
        example_embedding : array-like, shape=[dim]
            Current data sample embedding.
        context_embedding : array-like, shape=[dim]
            Current context embedding.
        negative_embedding : array-like, shape=[n_negative, dim]
            Current negative sample embeddings.

        Returns
        -------
        total_loss : float
            The current value of the loss function.
        example_grad : array-like, shape=[dim]
            The gradient of the loss function at the embedding
            of the current data sample.
        """
        n_edges, dim = negative_embedding.shape[0], example_embedding.shape[-1]
        example_embedding = gs.expand_dims(example_embedding, 0)
        context_embedding = gs.expand_dims(context_embedding, 0)

        # Attract the example to its context: -log sigmoid(-d^2(example, context)).
        positive_distance = self.manifold.metric.squared_dist(
            example_embedding, context_embedding
        )
        positive_loss = self.log_sigmoid(-positive_distance)

        # Repel the example from each negative sample:
        # -log sigmoid(+d^2(example, negative)).
        reshaped_example_embedding = gs.repeat(example_embedding, n_edges, axis=0)
        negative_distance = self.manifold.metric.squared_dist(
            reshaped_example_embedding, negative_embedding
        )
        negative_loss = self.log_sigmoid(negative_distance)

        total_loss = -(positive_loss + gs.sum(negative_loss))

        # Chain rule: d loss / d example = d loss / d dist * d dist / d example.
        positive_log_sigmoid_grad = -self.grad_log_sigmoid(-positive_distance)
        positive_distance_grad = self.grad_squared_distance(
            example_embedding, context_embedding
        )
        positive_grad = (
            gs.repeat(positive_log_sigmoid_grad, dim, axis=-1) * positive_distance_grad
        )

        negative_distance_grad = self.grad_squared_distance(
            reshaped_example_embedding, negative_embedding
        )
        # Broadcast the scalar per-sample gradient over the dim axis.
        negative_distance = gs.to_ndarray(negative_distance, to_ndim=2, axis=-1)
        negative_log_sigmoid_grad = self.grad_log_sigmoid(negative_distance)
        negative_grad = negative_log_sigmoid_grad * negative_distance_grad

        example_grad = -(positive_grad + gs.sum(negative_grad, axis=0))

        return total_loss, example_grad

    def embed(self, graph):
        """Compute embedding.

        Optimize a loss function to obtain a representable embedding.

        Parameters
        ----------
        graph : object
            An instance of the Graph class.

        Returns
        -------
        embeddings : array-like, shape=[n_samples, dim]
            Return the embedding of the data. Each data sample
            is represented as a point belonging to the manifold.
        """
        nb_vertices_by_edges = [len(e_2) for _, e_2 in graph.edges.items()]
        logging.info("Number of edges: %s", len(graph.edges))
        logging.info(
            "Mean vertices by edges: %s",
            (sum(nb_vertices_by_edges, 0) / len(graph.edges)),
        )

        # Unigram negative-sampling table: node i appears proportionally
        # to degree(i)^(3/4), as in word2vec.
        negative_table_parameter = 5
        negative_sampling_table = []
        for i, nb_v in enumerate(nb_vertices_by_edges):
            negative_sampling_table += (
                [i] * int((nb_v ** (3.0 / 4.0))) * negative_table_parameter
            )
        negative_sampling_table = gs.array(negative_sampling_table)

        random_walks = graph.random_walk()
        # Small random initialization keeps points near the center of the ball.
        embeddings = gs.random.normal(size=(graph.n_nodes, self.manifold.dim))
        embeddings = embeddings * 0.2

        for epoch in range(self.max_epochs):
            total_loss = []
            for path in random_walks:
                for example_index, one_path in enumerate(path):
                    # Window of nodes around the current example in the walk.
                    context_index = path[
                        max(0, example_index - self.n_context) : min(
                            example_index + self.n_context, len(path)
                        )
                    ]
                    negative_index = gs.random.randint(
                        negative_sampling_table.shape[0],
                        size=(len(context_index), self.n_negative),
                    )
                    negative_index = gs.expand_dims(gs.flatten(negative_index), axis=-1)
                    negative_index = gs.get_slice(
                        negative_sampling_table, negative_index
                    )

                    example_embedding = embeddings[gs.cast(one_path, dtype=gs.int64)]

                    for one_context_i, one_negative_i in zip(
                        context_index, negative_index
                    ):
                        context_embedding = embeddings[one_context_i]
                        negative_embedding = gs.get_slice(
                            embeddings,
                            gs.squeeze(gs.cast(one_negative_i, dtype=gs.int64)),
                        )
                        loss_value, example_grad = self.loss(
                            example_embedding, context_embedding, negative_embedding
                        )
                        total_loss.append(loss_value)

                        # Riemannian gradient step: move along the geodesic
                        # from the current point in the -grad direction.
                        example_to_update = embeddings[one_path]
                        updated_embedding = self.manifold.metric.exp(
                            -self.lr * example_grad, example_to_update
                        )
                        embeddings = gs.assignment(
                            embeddings,
                            updated_embedding,
                            gs.to_ndarray(one_path, to_ndim=1),
                            axis=1,
                        )

            logging.info(
                "iteration %d loss_value %f",
                epoch,
                sum(total_loss, 0) / len(total_loss),
            )
        return embeddings