Skip to content

JSON schema inference corrupt with Elasticsearch Spark #441

@ejsarge-gr

Description

@ejsarge-gr

The elasticsearch-hadoop library appears to misalign field values against the inferred JSON schema: the schema column names are correct, but the row values appear under the wrong columns. The same JSON source read using the SQLContext.jsonFile method succeeds.

Reproduction Steps

  1. Have an elasticsearch instance and configure it below.
  2. curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
  3. curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
  4. Use Java to run SparkSQLElasticsearchTest
  5. Use Java to run SparkSQLJsonFileTest

Expected Output

ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
[148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_

Actual Output (from SparkSQLElasticsearchTest)

ActorEntity          AffectedEntity1 EventId  EventTime EventType ScopedArchiveId ScopedCompanyId SessionId            Trigger
MessageExportRequ... 1387710245000   gsk***** _null_    148       anArchive       dbnbudzu4wge    Map(CompanyId -> ... Map(EntityId -> 5...

Files

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <groupId>com.globalrelay</groupId>
        <artifactId>globalrelay-parent</artifactId>
        <version>3.2</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.globalrelay.herald</groupId>
    <artifactId>spark-es-test</artifactId>
    <version>0.0.0.1-SNAPSHOT</version>
    <name>Spark elasticsearch Test</name>

    <!-- Repository for snapshot version of elasticsearch-hadoop. Required until they release.
         Note: Sonatype OSS requires HTTPS; plain http is rejected by recent Maven versions. -->
    <repositories>
      <repository>
        <id>sonatype-oss</id>
        <url>https://oss.sonatype.org/content/repositories/snapshots</url>
        <snapshots><enabled>true</enabled></snapshots>
      </repository>
    </repositories>

    <properties>
      <spark.version>1.3.1</spark.version>
      <elasticsearch-hadoop.version>2.1.0.BUILD-SNAPSHOT</elasticsearch-hadoop.version>
      <es.version>1.3.2</es.version>
    </properties>

    <dependencies>
      <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>${spark.version}</version>
      </dependency>
      <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>${spark.version}</version>
      </dependency>
      <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-hadoop</artifactId>
        <version>${elasticsearch-hadoop.version}</version>
        <!-- Exclude the transitive integrations (Pig, Storm, Cascading, Hive, Hadoop
             tooling) this test does not use, plus Spark/Scala artifacts already
             provided by the explicit spark-core/spark-sql dependencies above.
             Each exclusion is listed exactly once. -->
        <exclusions>
          <exclusion>
            <groupId>org.apache.pig</groupId>
            <artifactId>pig</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-tools</artifactId>
          </exclusion>
          <exclusion>
            <groupId>cascading</groupId>
            <artifactId>cascading-local</artifactId>
          </exclusion>
          <exclusion>
            <groupId>cascading</groupId>
            <artifactId>cascading-hadoop</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
          </exclusion>
          <!-- <exclusion>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
          </exclusion> -->
          <exclusion>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
          </exclusion>
          <exclusion>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
          </exclusion>
          <exclusion>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
      <dependency>
        <groupId>org.codehaus.jackson</groupId>
        <artifactId>jackson-mapper-asl</artifactId>
        <version>1.8.8</version>
      </dependency>
      <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>${es.version}</version>
      </dependency>
    </dependencies>

</project>

SparkSQLElasticsearchTest.java

import static org.elasticsearch.index.query.FilterBuilders.typeFilter;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.filteredQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.elasticsearch.index.query.MatchQueryBuilder.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL;

public class SparkSQLElasticsearchTest {

    /**
     * Reproduces the schema/value misalignment seen when a document with nested
     * objects is read through elasticsearch-hadoop's Spark SQL integration:
     * the inferred schema's column names are correct, but row values come back
     * under the wrong columns (compare the actual vs. expected output below).
     *
     * Before running this test:
     * 1. Have an elasticsearch instance and configure it below.
     * 2. curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
     * 3. curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ElasticsearchTest");
        // Note that the elasticsearch node needs to have its publish_host set to be reachable.
        conf.set("spark.es.nodes", "192.168.50.2:9200"); // elasticsearch host:port
        // NOTE(review): this default resource is ignored here — the explicit
        // "events2-salsnap1-2013-12/event" resource passed to esDF() below wins.
        conf.set("spark.es.resource", "customer/external");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(jsc);

            // Match only "MessageExportRequested" documents of mapping type "event".
            QueryBuilder qb = filteredQuery(
                    boolQuery()
                            .must(matchQuery("EventType", "MessageExportRequested").operator(Operator.OR)),
                    typeFilter("event"));

            String queryString = "{\"query\":" + qb.toString() + "}";

            DataFrame baseDF = JavaEsSparkSQL.esDF(sqlContext, "events2-salsnap1-2013-12/event", queryString);

            System.out.println(baseDF.schema());
            baseDF.show();

            // Actual output
            // ActorEntity          AffectedEntity1 EventId  EventTime EventType ScopedArchiveId ScopedCompanyId SessionId            Trigger
            // MessageExportRequ... 1387710245000   gsk***** _null_    148       anArchive       dbnbudzu4wge    Map(CompanyId -> ... Map(EntityId -> 5...

            // Expected output (as for jsonfile)
            // ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
            // [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_
        } finally {
            // Release the local Spark context even if the query throws.
            jsc.stop();
        }
    }
}

SparkSQLJsonFileTest.java

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class SparkSQLJsonFileTest {

    /**
     * Control case for the elasticsearch-hadoop issue: reads the identical JSON
     * document via SQLContext.jsonFile, which infers the schema correctly
     * (every value lines up with its column name). No elasticsearch instance
     * is involved in this test.
     */
    public static void main(String[] args) {
        // The copy-pasted spark.es.* settings from SparkSQLElasticsearchTest were
        // removed: this test reads a local file and never contacts elasticsearch.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JsonFileTest");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(jsc);

            DataFrame baseDF = sqlContext.jsonFile("src/main/resources/message-export-events.json");

            System.out.println(baseDF.schema());
            baseDF.show();
            // Expected output:
            //        ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
            //        [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_
        } finally {
            // Release the local Spark context even if the read throws.
            jsc.stop();
        }
    }
}

src/main/resources/message-export-events.json

{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions