Permalink
Browse files

[#1114] fixed bug in random vertex neighborhood sampling (#1115)

fixes #1114
  • Loading branch information...
galpha authored and ChrizZz110 committed Dec 6, 2018
1 parent 8de9b6d commit 961a45118afc58d15ce76158d1aa566d5df8aab0
@@ -16,21 +16,18 @@
package org.gradoop.flink.model.impl.operators.sampling;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.gradoop.common.model.impl.id.GradoopId;
import org.gradoop.common.model.impl.pojo.Edge;
import org.gradoop.common.model.impl.pojo.Vertex;
import org.gradoop.flink.model.impl.epgm.LogicalGraph;
import org.gradoop.flink.model.impl.functions.epgm.Id;
import org.gradoop.flink.model.impl.functions.epgm.SourceId;
import org.gradoop.flink.model.impl.functions.tuple.Value0Of3;
import org.gradoop.flink.model.impl.operators.sampling.functions.EdgeWithSourceTarget;
import org.gradoop.flink.model.impl.operators.sampling.functions.Neighborhood;
import org.gradoop.flink.model.impl.operators.sampling.functions.VertexRandomMarkedMap;
import org.gradoop.flink.model.impl.operators.sampling.functions.VertexWithId;
import org.gradoop.flink.model.impl.operators.sampling.functions.EdgeSourceVertexJoin;
import org.gradoop.flink.model.impl.operators.sampling.functions.EdgeTargetVertexJoin;
import org.gradoop.flink.model.impl.operators.sampling.functions.EdgesWithSampledVerticesFilter;
import org.gradoop.flink.model.impl.operators.sampling.functions.FilterVerticesWithDegreeOtherThanGiven;
import org.gradoop.flink.model.impl.operators.sampling.functions.Neighborhood;
import org.gradoop.flink.model.impl.operators.sampling.functions.VertexRandomMarkedMap;
/**
* Computes a vertex sampling of the graph. Retains randomly chosen vertices of a given relative
@@ -108,24 +105,22 @@ public RandomVertexNeighborhoodSampling(float sampleSize,
*/
@Override
public LogicalGraph sample(LogicalGraph graph) {
DataSet<Tuple2<Vertex, GradoopId>> sampledVerticesWithId = graph.getVertices()
.map(new VertexRandomMarkedMap<>(sampleSize, randomSeed, PROPERTY_KEY_SAMPLED))
.map(new VertexWithId());
DataSet<Tuple3<Edge, GradoopId, GradoopId>> edgeSourceIdTargetId = graph.getEdges()
.map(new EdgeWithSourceTarget());
DataSet<Edge> newEdges = edgeSourceIdTargetId
.join(sampledVerticesWithId)
.where(1).equalTo(1)
.with(new EdgeSourceVertexJoin())
.join(sampledVerticesWithId)
.where(2).equalTo(1)
.with(new EdgeTargetVertexJoin())
.filter(new EdgesWithSampledVerticesFilter(PROPERTY_KEY_SAMPLED, neighborType))
DataSet<Vertex> sampledVertices = graph.getVertices()
.map(new VertexRandomMarkedMap(sampleSize, randomSeed, PROPERTY_KEY_SAMPLED));
DataSet<Edge> newEdges = graph.getEdges()
.join(sampledVertices)
.where(new SourceId<>()).equalTo(new Id<>())
.with(new EdgeSourceVertexJoin(PROPERTY_KEY_SAMPLED))
.join(sampledVertices)
.where(1).equalTo(new Id<>())
.with(new EdgeTargetVertexJoin(PROPERTY_KEY_SAMPLED))
.filter(new EdgesWithSampledVerticesFilter(neighborType))
.map(new Value0Of3<>());
graph = graph.getConfig().getLogicalGraphFactory().fromDataSets(graph.getVertices(), newEdges);
graph = graph.getFactory().fromDataSets(graph.getVertices(), newEdges);
graph = new FilterVerticesWithDegreeOtherThanGiven(0L).execute(graph);
return graph;
@@ -16,39 +16,48 @@
package org.gradoop.flink.model.impl.operators.sampling.functions;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.functions.FunctionAnnotation;
import org.apache.flink.api.java.tuple.Tuple3;
import org.gradoop.common.model.impl.id.GradoopId;
import org.gradoop.common.model.impl.pojo.Edge;
import org.gradoop.common.model.impl.pojo.Vertex;
/**
* Joins to get the edge source
* Joins to get the edge source:
* (edge),(vertex) -> (edge,edge.targetId,(bool)vertex[propertyKey])
*/
public class EdgeSourceVertexJoin implements JoinFunction<Tuple3<Edge, GradoopId, GradoopId>,
Tuple2<Vertex, GradoopId>, Tuple3<Edge, Vertex, GradoopId>> {
@FunctionAnnotation.ForwardedFieldsFirst({"*->f0", "id->f1"})
@FunctionAnnotation.ReadFieldsSecond("properties")
public class EdgeSourceVertexJoin
implements JoinFunction<Edge, Vertex, Tuple3<Edge, GradoopId, Boolean>> {
/**
* Reduce object instantiations
* Reduce object instantiations
*/
private Tuple3<Edge, Vertex, GradoopId> reuse;
private Tuple3<Edge, GradoopId, Boolean> reuse;
/**
* Constructor
* Property key of marked value
*/
public EdgeSourceVertexJoin() {
reuse = new Tuple3<>();
private String propertyKey;
/**
* Creates an instance of this join function
*
* @param propertyKey vertex property key
*/
public EdgeSourceVertexJoin(String propertyKey) {
this.reuse = new Tuple3<>();
this.propertyKey = propertyKey;
}
/**
* {@inheritDoc}
*/
@Override
public Tuple3<Edge, Vertex, GradoopId> join(
Tuple3<Edge, GradoopId, GradoopId> edgeWithItsVerticesIds,
Tuple2<Vertex, GradoopId> vertexWithItsId) {
reuse.f0 = edgeWithItsVerticesIds.f0;
reuse.f1 = vertexWithItsId.f0;
reuse.f2 = edgeWithItsVerticesIds.f2;
public Tuple3<Edge, GradoopId, Boolean> join(Edge edge, Vertex vertex) throws Exception {
reuse.f0 = edge;
reuse.f1 = edge.getTargetId();
reuse.f2 = vertex.getPropertyValue(propertyKey).getBoolean();
return reuse;
}
}
@@ -16,38 +16,50 @@
package org.gradoop.flink.model.impl.operators.sampling.functions;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.functions.FunctionAnnotation;
import org.apache.flink.api.java.tuple.Tuple3;
import org.gradoop.common.model.impl.id.GradoopId;
import org.gradoop.common.model.impl.pojo.Edge;
import org.gradoop.common.model.impl.pojo.Vertex;
/**
* Joins to get the edge target
* Joins to get the edge target:
* (edge,edge.targetId,bool-source),(target) -> (edge,bool-source,(bool)target[propertyKey])
*/
public class EdgeTargetVertexJoin implements JoinFunction<Tuple3<Edge, Vertex, GradoopId>,
Tuple2<Vertex, GradoopId>, Tuple3<Edge, Vertex, Vertex>> {
@FunctionAnnotation.ForwardedFieldsFirst({"f0->f0", "f2->f1"})
@FunctionAnnotation.ReadFieldsSecond("properties")
public class EdgeTargetVertexJoin implements
JoinFunction<Tuple3<Edge, GradoopId, Boolean>, Vertex, Tuple3<Edge, Boolean, Boolean>> {
/**
* Reduce object instantiations
*/
private Tuple3<Edge, Vertex, Vertex> reuse;
private Tuple3<Edge, Boolean, Boolean> reuse;
/**
* Property key of vertex value
*/
private final String propertyKey;
/**
* Constructor
* Creates an instance of this join function
*
* @param propertyKey property key of marked vertex value
*/
public EdgeTargetVertexJoin() {
reuse = new Tuple3<>();
public EdgeTargetVertexJoin(String propertyKey) {
this.reuse = new Tuple3<>();
this.propertyKey = propertyKey;
}
/**
* {@inheritDoc}
*/
@Override
public Tuple3<Edge, Vertex, Vertex> join(Tuple3<Edge, Vertex, GradoopId> edgeWithItsVerticesIds,
Tuple2<Vertex, GradoopId> vertexWithItsId) {
reuse.f0 = edgeWithItsVerticesIds.f0;
reuse.f1 = edgeWithItsVerticesIds.f1;
reuse.f2 = vertexWithItsId.f0;
public Tuple3<Edge, Boolean, Boolean> join(Tuple3<Edge, GradoopId, Boolean> interim,
Vertex vertex) {
reuse.f0 = interim.f0;
reuse.f1 = interim.f2;
reuse.f2 = vertex.getPropertyValue(propertyKey).getBoolean();
return reuse;
}
}
@@ -17,58 +17,51 @@
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.gradoop.common.exceptions.UnsupportedTypeException;
import org.gradoop.common.model.impl.pojo.Edge;
import org.gradoop.common.model.impl.pojo.Vertex;
/**
* Filters the edges with sampled vertices. If a vertex of the edge does not have the related
* sampling property, that vertex is considered not sampled.
*/
public class EdgesWithSampledVerticesFilter
implements FilterFunction<Tuple3<Edge, Vertex, Vertex>> {
/**
* Property name which shows if a vertex is sampled
*/
private String propertyNameForSampled;
implements FilterFunction<Tuple3<Edge, Boolean, Boolean>> {
/**
* type of neighborhood
* Type of neighborhood
*/
private Neighborhood neighborType;
/**
* Constructor
*
* @param propertyNameForSampled property name which shows if a vertex is sampled
* @param neighborType type of neighborhood
*/
public EdgesWithSampledVerticesFilter(String propertyNameForSampled, Neighborhood neighborType) {
this.propertyNameForSampled = propertyNameForSampled;
public EdgesWithSampledVerticesFilter(Neighborhood neighborType) {
this.neighborType = neighborType;
}
/**
* {@inheritDoc}
*/
@Override
public boolean filter(Tuple3<Edge, Vertex, Vertex> t3) {
boolean isSourceVertexMarked = false;
boolean isTargetVertexMarked = false;
if (t3.f1.hasProperty(propertyNameForSampled)) {
isSourceVertexMarked = Boolean.getBoolean(
t3.f1.getPropertyValue(propertyNameForSampled).toString());
}
if (t3.f2.hasProperty(propertyNameForSampled)) {
isTargetVertexMarked = Boolean.getBoolean(
t3.f2.getPropertyValue(propertyNameForSampled).toString());
}
boolean ret = false;
if (neighborType.equals(Neighborhood.BOTH)) {
ret = isSourceVertexMarked || isTargetVertexMarked;
} else if (neighborType.equals(Neighborhood.IN)) {
ret = isTargetVertexMarked;
} else if (neighborType.equals(Neighborhood.OUT)) {
ret = isSourceVertexMarked;
public boolean filter(Tuple3<Edge, Boolean, Boolean> tuple) {
boolean isSourceVertexMarked = tuple.f1;
boolean isTargetVertexMarked = tuple.f2;
boolean filter;
switch (neighborType) {
case BOTH: filter = isSourceVertexMarked || isTargetVertexMarked;
break;
case IN: filter = isTargetVertexMarked;
break;
case OUT: filter = isSourceVertexMarked;
break;
default: throw new UnsupportedTypeException("NeighborType needs to be BOTH, IN or OUT");
}
return ret;
return filter;
}
}
@@ -47,6 +47,7 @@ public FilterVerticesWithDegreeOtherThanGiven(long degree) {
*/
@Override
public LogicalGraph execute(LogicalGraph graph) {
DistinctVertexDegrees distinctVertexDegrees = new DistinctVertexDegrees(
SamplingAlgorithm.DEGREE_PROPERTY_KEY,
SamplingAlgorithm.IN_DEGREE_PROPERTY_KEY,
@@ -23,10 +23,8 @@
/**
* Creates a random value for each vertex and marks those that are below a
* given threshold.
*
* @param <V> EPGM vertex type
*/
public class VertexRandomMarkedMap<V extends Vertex> implements MapFunction<V, V> {
public class VertexRandomMarkedMap implements MapFunction<Vertex, Vertex> {
/**
* Threshold to decide if a vertex needs to be filtered.
*/
@@ -57,7 +55,7 @@ public VertexRandomMarkedMap(float sampleSize, long randomSeed, String mark) {
* {@inheritDoc}
*/
@Override
public V map(V vertex) throws Exception {
public Vertex map(Vertex vertex) throws Exception {
if (randomGenerator.nextFloat() <= sampleSize) {
vertex.setProperty(mark, true);
} else {
@@ -15,15 +15,12 @@
*/
package org.gradoop.flink.model.impl.operators.sampling;
import org.gradoop.common.model.impl.pojo.Edge;
import org.gradoop.flink.model.impl.epgm.LogicalGraph;
import org.gradoop.flink.model.impl.operators.sampling.functions.Neighborhood;
import org.junit.runners.Parameterized;
import java.util.Arrays;
import static org.junit.Assert.assertFalse;
public class RandomVertexNeighborhoodSamplingTest extends ParameterizedTestForGraphSampling {
/**
@@ -35,7 +32,7 @@
* @param neighborType The vertex neighborhood type, e.g. Neighborhood.BOTH
*/
public RandomVertexNeighborhoodSamplingTest(String testName, String seed, String sampleSize,
String neighborType) {
String neighborType) {
super(testName, Long.parseLong(seed), Float.parseFloat(sampleSize),
Neighborhood.valueOf(neighborType));
}
@@ -53,15 +50,9 @@ public SamplingAlgorithm getSamplingOperator() {
*/
@Override
public void validateSpecific(LogicalGraph input, LogicalGraph output) {
dbEdges.removeAll(newEdges);
for (Edge edge : dbEdges) {
assertFalse("edge from original graph was not sampled but source and target were",
newVertexIDs.contains(edge.getSourceId()) &&
newVertexIDs.contains(edge.getTargetId()));
}
}
/**
* Parameters used when running the test
*

0 comments on commit 961a451

Please sign in to comment.