Description
The elasticsearch-hadoop library appears to corrupt JSON schema inference: the inferred column names no longer line up with the values in each row. Reading the same JSON source with SQLContext.jsonFile succeeds.
Reproduction Steps
- Have an Elasticsearch instance running, configure its address in the tests below, and seed it with a single document:
curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
- Run SparkSQLElasticsearchTest (Java source below).
- Run SparkSQLJsonFileTest (Java source below).
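Optionally, since the POM already pulls in the elasticsearch 1.3.2 client, a small pre-flight check (a hypothetical helper, not part of the original report; host and class name are illustrative) can confirm the seed document is indexed before running the Spark tests:

import org.elasticsearch.action.count.CountResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.transport.InetSocketTransportAddress;

public class SeedCheck {
    public static void main(String[] args) {
        // Hypothetical pre-flight check: counts documents in the test index before
        // running the Spark tests. Uses the transport port (9300), not the HTTP
        // port (9200) that es-hadoop talks to.
        TransportClient client = new TransportClient()
                .addTransportAddress(new InetSocketTransportAddress("192.168.50.2", 9300));
        try {
            CountResponse resp = client.prepareCount("events2-salsnap1-2013-12")
                    .setTypes("event")
                    .execute().actionGet();
            System.out.println("indexed documents: " + resp.getCount()); // expect 1
        } finally {
            client.close();
        }
    }
}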
Expected Output (from SparkSQLJsonFileTest)
ActorEntity AffectedEntity1 EventId EventTime EventType ScopedArchiveId ScopedCompanyId SessionId Trigger
[148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive 148 gsk***** _null_
Actual Output (from SparkSQLElasticsearchTest)
ActorEntity AffectedEntity1 EventId EventTime EventType ScopedArchiveId ScopedCompanyId SessionId Trigger
MessageExportRequ... 1387710245000 gsk***** _null_ 148 anArchive dbnbudzu4wge Map(CompanyId -> ... Map(EntityId -> 5...
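The values are all present but paired with the wrong column names. A minimal diagnostic (a sketch; the class name is illustrative, and it assumes the baseDF built in SparkSQLElasticsearchTest below) makes the misalignment explicit by printing each inferred field name next to the value at the same position in the first row:

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;

public class SchemaAlignmentCheck {
    // Prints "fieldName -> value" for the first row; a shifted column ordering
    // shows up immediately as mismatched pairs.
    public static void dump(DataFrame df) {
        Row first = df.first();
        String[] names = df.schema().fieldNames();
        for (int i = 0; i < names.length; i++) {
            System.out.println(names[i] + " -> " + first.get(i));
        }
    }
}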
Files
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<groupId>com.globalrelay</groupId>
<artifactId>globalrelay-parent</artifactId>
<version>3.2</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<groupId>com.globalrelay.herald</groupId>
<artifactId>spark-es-test</artifactId>
<version>0.0.0.1-SNAPSHOT</version>
<name>Spark elasticsearch Test</name>
<!-- Repository for snapshot version of elasticsearch-hadoop. Required until they release. -->
<repositories>
<repository>
<id>sonatype-oss</id>
<url>http://oss.sonatype.org/content/repositories/snapshots</url>
<snapshots><enabled>true</enabled></snapshots>
</repository>
</repositories>
<properties>
<spark.version>1.3.1</spark.version>
<elasticsearch-hadoop.version>2.1.0.BUILD-SNAPSHOT</elasticsearch-hadoop.version>
<es.version>1.3.2</es.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>${elasticsearch-hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.pig</groupId>
<artifactId>pig</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-streaming</artifactId>
</exclusion>
<exclusion>
<groupId>cascading</groupId>
<artifactId>cascading-local</artifactId>
</exclusion>
<exclusion>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
</exclusion>
<!-- <exclusion>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
</exclusion> -->
<exclusion>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-tools</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-service</artifactId>
</exclusion>
<exclusion>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
</exclusion>
<exclusion>
<groupId>cascading</groupId>
<artifactId>cascading-hadoop</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.8.8</version>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>${es.version}</version>
</dependency>
</dependencies>
</project>
SparkSQLElasticsearchTest.java
import static org.elasticsearch.index.query.FilterBuilders.typeFilter;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.filteredQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.elasticsearch.index.query.MatchQueryBuilder.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL;
public class SparkSQLElasticsearchTest {
/**
* Before running this test:
* 1. Have an elasticsearch instance and configure it below.
* 2. curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
* 3. curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
*/
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ElasticsearchTest");
// Note that the elasticsearch node needs to have its publish_host set to be reachable.
conf.set("spark.es.nodes", "192.168.50.2:9200"); // elasticsearch port
conf.set("spark.es.resource", "customer/external");
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
QueryBuilder qb = filteredQuery(
boolQuery().
must(matchQuery("EventType", "MessageExportRequested").operator(Operator.OR))/*.
must(rangeQuery("EventTime").from(startDate.getTime()).to(endDate.getTime()))*/,
typeFilter("event"));
String queryString = "{\"query\":" + qb.toString() + "}";
DataFrame baseDF = JavaEsSparkSQL.esDF(sqlContext, "events2-salsnap1-2013-12/event", queryString);
System.out.println(baseDF.schema());
baseDF.show();
// Actual output
// ActorEntity AffectedEntity1 EventId EventTime EventType ScopedArchiveId ScopedCompanyId SessionId Trigger
// MessageExportRequ... 1387710245000 gsk***** _null_ 148 anArchive dbnbudzu4wge Map(CompanyId -> ... Map(EntityId -> 5...
// Expected output (as for jsonfile)
// ActorEntity AffectedEntity1 EventId EventTime EventType ScopedArchiveId ScopedCompanyId SessionId Trigger
// [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive 148 gsk***** _null_
}
}
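One possible workaround (a sketch only, under the assumption that it is the DataFrame construction that misorders the columns, not the underlying document fetch): read the raw _source documents with JavaEsSpark.esJsonRDD and hand them to Spark's own JSON schema inference, the same code path SQLContext.jsonFile uses:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;

public class SparkSQLEsJsonWorkaround {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("EsJsonWorkaround");
        conf.set("spark.es.nodes", "192.168.50.2:9200");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);
        // esJsonRDD returns (documentId, rawJsonSource) pairs; the values are the
        // unparsed _source strings, so Spark's inference sees the same JSON that
        // jsonFile does.
        JavaRDD<String> json = JavaEsSpark
                .esJsonRDD(jsc, "events2-salsnap1-2013-12/event")
                .values();
        DataFrame baseDF = sqlContext.jsonRDD(json);
        System.out.println(baseDF.schema());
        baseDF.show();
    }
}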
SparkSQLJsonFileTest.java
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
public class SparkSQLJsonFileTest {
public static void main(String[] args) {
SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JsonFileTest");
// No Elasticsearch configuration is needed here; this test reads the same JSON from a local file.
JavaSparkContext jsc = new JavaSparkContext(conf);
SQLContext sqlContext = new SQLContext(jsc);
DataFrame baseDF = sqlContext.jsonFile("src/main/resources/message-export-events.json");
System.out.println(baseDF.schema());
baseDF.show();
// Expected output:
// ActorEntity AffectedEntity1 EventId EventTime EventType ScopedArchiveId ScopedCompanyId SessionId Trigger
// [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive 148 gsk***** _null_
}
}
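For a direct side-by-side comparison of the two code paths, a small helper (hypothetical; not part of the original report) can render each inferred schema as a tree so the es-hadoop and jsonFile results can be diffed line by line:

import org.apache.spark.sql.DataFrame;

public class SchemaDump {
    // Prints the inferred schema as an indented tree (StructType.treeString()),
    // making differences such as map-typed vs. struct-typed nested fields easy to spot.
    public static void dump(String label, DataFrame df) {
        System.out.println("=== " + label + " ===");
        System.out.println(df.schema().treeString());
    }
}

Calling dump("elasticsearch", baseDF) in SparkSQLElasticsearchTest and dump("jsonFile", baseDF) in SparkSQLJsonFileTest should show exactly where the two inferred schemas diverge.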
src/main/resources/message-export-events.json
{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}