Skip to content

JSON schema inference corrupt with Elasticsearch Spark #441

@ejsarge-gr

Description

@ejsarge-gr

The elasticsearch-hadoop library appears to misalign field values against the inferred JSON schema: the schema column names are correct, but the row values appear under the wrong columns. The same JSON source read using the SQLContext.jsonFile method succeeds.

Reproduction Steps

  1. Have an elasticsearch instance and configure it below.
  2. curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
  3. curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
  4. Use Java to run SparkSQLElasticsearchTest
  5. Use Java to run SparkSQLJsonFileTest

Expected Output

ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
[148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_

Actual Output (from SparkSQLElasticsearchTest)

ActorEntity          AffectedEntity1 EventId  EventTime EventType ScopedArchiveId ScopedCompanyId SessionId            Trigger
MessageExportRequ... 1387710245000   gsk***** _null_    148       anArchive       dbnbudzu4wge    Map(CompanyId -> ... Map(EntityId -> 5...

Files

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <parent>
        <groupId>com.globalrelay</groupId>
        <artifactId>globalrelay-parent</artifactId>
        <version>3.2</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.globalrelay.herald</groupId>
    <artifactId>spark-es-test</artifactId>
    <version>0.0.0.1-SNAPSHOT</version>
    <name>Spark elasticsearch Test</name>

    <!-- Repository for snapshot version of elasticsearch-hadoop. Required until they release.
         Note: Sonatype OSS requires HTTPS; plain http is rejected by recent Maven versions. -->
    <repositories>
      <repository>
        <id>sonatype-oss</id>
        <url>https://oss.sonatype.org/content/repositories/snapshots</url>
        <snapshots><enabled>true</enabled></snapshots>
      </repository>
    </repositories>

    <properties>
      <spark.version>1.3.1</spark.version>
      <elasticsearch-hadoop.version>2.1.0.BUILD-SNAPSHOT</elasticsearch-hadoop.version>
      <es.version>1.3.2</es.version>
    </properties>

    <dependencies>
      <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.10</artifactId>
        <version>${spark.version}</version>
      </dependency>
      <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>${spark.version}</version>
      </dependency>
      <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-hadoop</artifactId>
        <version>${elasticsearch-hadoop.version}</version>
        <!-- Exclude the transitive integrations (Pig, Storm, Cascading, Hive, Hadoop
             tooling) this test does not use, plus Spark/Scala artifacts already
             provided by the explicit spark-core/spark-sql dependencies above.
             Each exclusion is listed exactly once. -->
        <exclusions>
          <exclusion>
            <groupId>org.apache.pig</groupId>
            <artifactId>pig</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-streaming</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-tools</artifactId>
          </exclusion>
          <exclusion>
            <groupId>cascading</groupId>
            <artifactId>cascading-local</artifactId>
          </exclusion>
          <exclusion>
            <groupId>cascading</groupId>
            <artifactId>cascading-hadoop</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
          </exclusion>
          <!-- <exclusion>
            <groupId>org.codehaus.jackson</groupId>
            <artifactId>jackson-mapper-asl</artifactId>
          </exclusion> -->
          <exclusion>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
          </exclusion>
          <exclusion>
            <groupId>javax.servlet</groupId>
            <artifactId>javax.servlet-api</artifactId>
          </exclusion>
          <exclusion>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
          </exclusion>
          <exclusion>
            <groupId>joda-time</groupId>
            <artifactId>joda-time</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
      <dependency>
        <groupId>org.codehaus.jackson</groupId>
        <artifactId>jackson-mapper-asl</artifactId>
        <version>1.8.8</version>
      </dependency>
      <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch</artifactId>
        <version>${es.version}</version>
      </dependency>
    </dependencies>

</project>

SparkSQLElasticsearchTest.java

import static org.elasticsearch.index.query.FilterBuilders.typeFilter;
import static org.elasticsearch.index.query.QueryBuilders.boolQuery;
import static org.elasticsearch.index.query.QueryBuilders.filteredQuery;
import static org.elasticsearch.index.query.QueryBuilders.matchQuery;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.elasticsearch.index.query.MatchQueryBuilder.Operator;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL;

public class SparkSQLElasticsearchTest {

    /**
     * Reproduces the schema/value misalignment seen when a document with nested
     * objects is read through elasticsearch-hadoop's Spark SQL integration:
     * the inferred schema's column names are correct, but row values come back
     * under the wrong columns (compare the actual vs. expected output below).
     *
     * Before running this test:
     * 1. Have an elasticsearch instance and configure it below.
     * 2. curl -XDELETE localhost:9200/events2-salsnap1-2013-12/
     * 3. curl -XPUT localhost:9200/events2-salsnap1-2013-12/event/76e3773d-8a19-485a-a75c-225070e2cbc6 -d '{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}'
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("ElasticsearchTest");
        // Note that the elasticsearch node needs to have its publish_host set to be reachable.
        conf.set("spark.es.nodes", "192.168.50.2:9200"); // elasticsearch host:port
        // NOTE(review): this default resource is ignored here — the explicit
        // "events2-salsnap1-2013-12/event" resource passed to esDF() below wins.
        conf.set("spark.es.resource", "customer/external");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(jsc);

            // Match only "MessageExportRequested" documents of mapping type "event".
            QueryBuilder qb = filteredQuery(
                    boolQuery()
                            .must(matchQuery("EventType", "MessageExportRequested").operator(Operator.OR)),
                    typeFilter("event"));

            String queryString = "{\"query\":" + qb.toString() + "}";

            DataFrame baseDF = JavaEsSparkSQL.esDF(sqlContext, "events2-salsnap1-2013-12/event", queryString);

            System.out.println(baseDF.schema());
            baseDF.show();

            // Actual output
            // ActorEntity          AffectedEntity1 EventId  EventTime EventType ScopedArchiveId ScopedCompanyId SessionId            Trigger
            // MessageExportRequ... 1387710245000   gsk***** _null_    148       anArchive       dbnbudzu4wge    Map(CompanyId -> ... Map(EntityId -> 5...

            // Expected output (as for jsonfile)
            // ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
            // [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_
        } finally {
            // Release the local Spark context even if the query throws.
            jsc.stop();
        }
    }
}

SparkSQLJsonFileTest.java

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class SparkSQLJsonFileTest {

    /**
     * Control case for the elasticsearch-hadoop issue: reads the identical JSON
     * document via SQLContext.jsonFile, which infers the schema correctly
     * (every value lines up with its column name). No elasticsearch instance
     * is involved in this test.
     */
    public static void main(String[] args) {
        // The copy-pasted spark.es.* settings from SparkSQLElasticsearchTest were
        // removed: this test reads a local file and never contacts elasticsearch.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("JsonFileTest");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(jsc);

            DataFrame baseDF = sqlContext.jsonFile("src/main/resources/message-export-events.json");

            System.out.println(baseDF.schema());
            baseDF.show();
            // Expected output:
            //        ActorEntity          AffectedEntity1      EventId      EventTime     EventType            ScopedArchiveId ScopedCompanyId SessionId Trigger
            //        [148,602,first-1 ... [Sarg598,R v Sarg... dbnbudzu4wge 1387710245000 MessageExportRequ... anArchive       148             gsk*****  _null_
        } finally {
            // Release the local Spark context even if the read throws.
            jsc.stop();
        }
    }
}

src/main/resources/message-export-events.json

{"EventType":"MessageExportRequested","EventTime":1387710245000,"SessionId":"gsk*****","Trigger":"_null_","ScopedCompanyId":148,"ScopedArchiveId":"anArchive","EventId":"dbnbudzu4wge","ActorEntity":{"CompanyId":148,"IpAddress":"127.0.0.1","EntityId":"602","EntityName":"first-1 last-1","EntityType":"CompanyUser"},"AffectedEntity1":{"EntityId":"5678","EntityName":"5678","EntityType":"MessageExport","ExportPurpose":"FinraAudit","CaseName":"R v Sargisson","NumberMessages":534,"CaseId":"Sarg598","PriceCurrency":"CAD","DeliveryOptions":"DirectDownload","NewestMessageDate":1419112760000,"SpecialRequest":"_null_","ExportName":"Some Export","ExportFormat":"EML","SizeMessagesInBytes":1234789,"ExportDescription":"If the NSA can do it then so can I","ExportOption":"IncludeHiddenRecipientData","Price":500.12,"OldestMessageDate":1387576760000}}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions