Consider document routing when deleting and overwriting data in SparkSQL
fixes #1030
jbaiera committed Feb 16, 2018
1 parent 81a0893 commit 2e4c4b0
Showing 5 changed files with 157 additions and 6 deletions.
mr/src/main/java/org/elasticsearch/hadoop/rest/RestRepository.java (25 changes: 21 additions & 4 deletions)
@@ -384,18 +384,35 @@ public void delete() {
             sb.append("&search_type=scan");
         }
         String scanQuery = sb.toString();
-        ScrollReader scrollReader = new ScrollReader(new ScrollReaderConfig(new JdkValueReader()));
+        ScrollReaderConfig readerConf = new ScrollReaderConfig(true, new JdkValueReader());
+        ScrollReader scrollReader = new ScrollReader(readerConf);

         // start iterating
         ScrollQuery sq = scanAll(scanQuery, null, scrollReader);
         try {
             BytesArray entry = new BytesArray(0);

-            // delete each retrieved batch
-            String format = "{\"delete\":{\"_id\":\"%s\"}}\n";
+            // delete each retrieved batch, keep routing in mind:
+            String baseFormat = "{\"delete\":{\"_id\":\"%s\"}}\n";
+            String routedFormat;
+            if (client.internalVersion.onOrAfter(EsMajorVersion.V_7_X)) {
+                routedFormat = "{\"delete\":{\"_id\":\"%s\", \"routing\":\"%s\"}}\n";
+            } else {
+                routedFormat = "{\"delete\":{\"_id\":\"%s\", \"_routing\":\"%s\"}}\n";
+            }
             while (sq.hasNext()) {
                 entry.reset();
-                entry.add(StringUtils.toUTF(String.format(format, sq.next()[0])));
+                Object[] kv = sq.next();
+                @SuppressWarnings("unchecked")
+                Map<String, Object> value = (Map<String, Object>) kv[1];
+                @SuppressWarnings("unchecked")
+                Map<String, Object> metadata = (Map<String, Object>) value.get("_metadata");
+                String routing = (String) metadata.get("_routing");
+                if (StringUtils.hasText(routing)) {
+                    entry.add(StringUtils.toUTF(String.format(routedFormat, kv[0], routing)));
+                } else {
+                    entry.add(StringUtils.toUTF(String.format(baseFormat, kv[0])));
+                }
                 writeProcessedToIndex(entry);
             }

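For reference, the loop above writes one bulk action line per scroll hit. Using the ids and routing values that appear in the tests below, the emitted NDJSON takes one of three shapes (a sketch, not captured from a live run):

    {"delete":{"_id":"10", "routing":"1"}}    (7.x and later, hit carries routing)
    {"delete":{"_id":"10", "_routing":"1"}}   (before 7.x, hit carries routing)
    {"delete":{"_id":"10"}}                   (no routing on the hit)

Because the routing is replayed on the delete action, the delete resolves to the same shard that holds the routed document (for join-field children, the parent's shard).
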
@@ -190,7 +190,11 @@ public ScrollReaderConfig(ValueReader reader, Mapping resolvedMapping, boolean r
     }

     public ScrollReaderConfig(ValueReader reader) {
-        this(reader, null, false, "_metadata", false, false, Collections.<String> emptyList(), Collections.<String> emptyList(), Collections.<String> emptyList());
+        this(false, reader);
     }

+    public ScrollReaderConfig(boolean readMetadata, ValueReader reader) {
+        this(reader, null, readMetadata, "_metadata", false, false, Collections.<String> emptyList(), Collections.<String> emptyList(), Collections.<String> emptyList());
+    }
+
     public ScrollReaderConfig(ValueReader reader, Mapping resolvedMapping, Settings cfg) {
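The new two-argument constructor exists so callers like RestRepository.delete() can enable metadata reading without spelling out the full argument list. With readMetadata set to true and the default "_metadata" key used here, each scroll hit's value map carries its metadata inline. A hedged sketch of the [id, value] pair the delete loop consumes, using the test data below:

    kv[0] = "10"
    kv[1] = {
      "name": "kimchy",
      "joiner": {"name": "employee", "parent": "1"},
      "_metadata": {"_routing": "1"}
    }

The exact keys inside "_metadata" depend on what the scroll hit returns; the delete path only looks for "_routing".
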
@@ -1204,6 +1204,71 @@ class AbstractScalaEsScalaSparkSQL(prefix: String, readMetadata: jl.Boolean, pus
     assertEquals(1, df.count)
   }

+  @Test
+  def testEsDataFrame52OverwriteExistingDataSourceWithJoinField() {
+    // Join added in 6.0.
+    EsAssume.versionOnOrAfter(EsMajorVersion.V_6_X, "Join added in 6.0.")
+
+    // using long-form joiner values
+    val schema = StructType(Seq(
+      StructField("id", StringType, nullable = false),
+      StructField("company", StringType, nullable = true),
+      StructField("name", StringType, nullable = true),
+      StructField("joiner", StructType(Seq(
+        StructField("name", StringType, nullable = false),
+        StructField("parent", StringType, nullable = true)
+      )))
+    ))
+
+    val parents = Seq(
+      Row("1", "Elastic", null, Row("company", null)),
+      Row("2", "Fringe Cafe", null, Row("company", null)),
+      Row("3", "WATIcorp", null, Row("company", null))
+    )
+
+    val firstChildren = Seq(
+      Row("10", null, "kimchy", Row("employee", "1")),
+      Row("20", null, "April Ryan", Row("employee", "2")),
+      Row("21", null, "Charlie", Row("employee", "2")),
+      Row("30", null, "Alvin Peats", Row("employee", "3"))
+    )
+
+    val index = wrapIndex("sparksql-test-scala-overwrite-join")
+    val typename = "join"
+    val target = s"$index/$typename"
+    RestUtils.delete(index)
+    RestUtils.touch(index)
+    RestUtils.putMapping(index, typename, "data/join/mapping.json")
+
+    sqc.createDataFrame(sc.makeRDD(parents ++ firstChildren), schema)
+      .write
+      .format("es")
+      .options(Map(ES_MAPPING_ID -> "id", ES_MAPPING_JOIN -> "joiner"))
+      .save(target)
+
+    assertThat(RestUtils.get(target + "/10?routing=1"), containsString("kimchy"))
+    assertThat(RestUtils.get(target + "/10?routing=1"), containsString(""""_routing":"1""""))
+
+    // Overwrite the data using a new dataset:
+    val newChildren = Seq(
+      Row("110", null, "costinl", Row("employee", "1")),
+      Row("111", null, "jbaiera", Row("employee", "1")),
+      Row("121", null, "Charlie", Row("employee", "2")),
+      Row("130", null, "Damien", Row("employee", "3"))
+    )
+
+    sqc.createDataFrame(sc.makeRDD(parents ++ newChildren), schema)
+      .write
+      .format("es")
+      .options(cfg ++ Map(ES_MAPPING_ID -> "id", ES_MAPPING_JOIN -> "joiner"))
+      .mode(SaveMode.Overwrite)
+      .save(target)
+
+    assertFalse(RestUtils.exists(target + "/10?routing=1"))
+    assertThat(RestUtils.get(target + "/110?routing=1"), containsString("costinl"))
+    assertThat(RestUtils.get(target + "/110?routing=1"), containsString(""""_routing":"1""""))
+  }
+
   @Test
   def testEsDataFrame53OverwriteExistingDataSourceFromAnotherDataSource() {
     // to keep the select static
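The mapping file loaded above, data/join/mapping.json, is not part of this diff. For context, a 6.x join-field mapping matching the test's company/employee relation would look roughly like this (a sketch; the actual file may differ):

    {
      "join": {
        "properties": {
          "id":      { "type": "keyword" },
          "company": { "type": "keyword" },
          "name":    { "type": "keyword" },
          "joiner": {
            "type": "join",
            "relations": { "company": "employee" }
          }
        }
      }
    }

Join-field children must be indexed and fetched with routing (typically the parent id), which is why the writes set ES_MAPPING_JOIN -> "joiner" and the assertions pass ?routing=1.
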
(The same test is added to a second copy of AbstractScalaEsScalaSparkSQL; the Spark SQL test suite is duplicated across the supported Spark versions.)

@@ -1222,6 +1222,71 @@ class AbstractScalaEsScalaSparkSQL(prefix: String, readMetadata: jl.Boolean, pus
     assertEquals(1, df.count)
   }

+  @Test
+  def testEsDataFrame52OverwriteExistingDataSourceWithJoinField() {
+    // Join added in 6.0.
+    EsAssume.versionOnOrAfter(EsMajorVersion.V_6_X, "Join added in 6.0.")
+
+    // using long-form joiner values
+    val schema = StructType(Seq(
+      StructField("id", StringType, nullable = false),
+      StructField("company", StringType, nullable = true),
+      StructField("name", StringType, nullable = true),
+      StructField("joiner", StructType(Seq(
+        StructField("name", StringType, nullable = false),
+        StructField("parent", StringType, nullable = true)
+      )))
+    ))
+
+    val parents = Seq(
+      Row("1", "Elastic", null, Row("company", null)),
+      Row("2", "Fringe Cafe", null, Row("company", null)),
+      Row("3", "WATIcorp", null, Row("company", null))
+    )
+
+    val firstChildren = Seq(
+      Row("10", null, "kimchy", Row("employee", "1")),
+      Row("20", null, "April Ryan", Row("employee", "2")),
+      Row("21", null, "Charlie", Row("employee", "2")),
+      Row("30", null, "Alvin Peats", Row("employee", "3"))
+    )
+
+    val index = wrapIndex("sparksql-test-scala-overwrite-join")
+    val typename = "join"
+    val target = s"$index/$typename"
+    RestUtils.delete(index)
+    RestUtils.touch(index)
+    RestUtils.putMapping(index, typename, "data/join/mapping.json")
+
+    sqc.createDataFrame(sc.makeRDD(parents ++ firstChildren), schema)
+      .write
+      .format("es")
+      .options(Map(ES_MAPPING_ID -> "id", ES_MAPPING_JOIN -> "joiner"))
+      .save(target)
+
+    assertThat(RestUtils.get(target + "/10?routing=1"), containsString("kimchy"))
+    assertThat(RestUtils.get(target + "/10?routing=1"), containsString(""""_routing":"1""""))
+
+    // Overwrite the data using a new dataset:
+    val newChildren = Seq(
+      Row("110", null, "costinl", Row("employee", "1")),
+      Row("111", null, "jbaiera", Row("employee", "1")),
+      Row("121", null, "Charlie", Row("employee", "2")),
+      Row("130", null, "Damien", Row("employee", "3"))
+    )
+
+    sqc.createDataFrame(sc.makeRDD(parents ++ newChildren), schema)
+      .write
+      .format("es")
+      .options(cfg ++ Map(ES_MAPPING_ID -> "id", ES_MAPPING_JOIN -> "joiner"))
+      .mode(SaveMode.Overwrite)
+      .save(target)
+
+    assertFalse(RestUtils.exists(target + "/10?routing=1"))
+    assertThat(RestUtils.get(target + "/110?routing=1"), containsString("costinl"))
+    assertThat(RestUtils.get(target + "/110?routing=1"), containsString(""""_routing":"1""""))
+  }
+
   @Test
   def testEsDataFrame53OverwriteExistingDataSourceFromAnotherDataSource() {
     // to keep the select static
@@ -47,7 +47,7 @@ class DataFrameFieldExtractor extends ScalaMapFieldExtractor {

    // Return the value or unpack the value if it's a row-schema tuple
    obj match {
-      case (row: Row, struct: StructType) => row
+      case (row: Row, _: StructType) => row
      case any => any
    }
  }
