In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, LongType

import os

In [3]:
spark = (
    SparkSession
    .builder
    .appName('Schema handling')
    .enableHiveSupport()
    .getOrCreate()
)

There are two json files with incompatible Schema, the score column is Long in the first file and Double in the second:

#### Content of file-1.json:
{"user_id": 100, "score": 50} <br>
{"user_id": 200, "score": 51}

#### Content of file-2.json:
{"user_id": 300, "score": 45.0} <br>
{"user_id": 400, "score": 46.0}

In [4]:
spark.version

'2.4.5'

In [11]:
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-1]) 

data_input_path = os.path.join(project_path, 'data/incompatible-json-schema')

In [21]:
schema = StructType([
    StructField('score', LongType()),
    StructField('user_id', LongType())
])

In [22]:
# Read with default mode value (mode='PERMISSIVE')

df = spark.read.schema(schema).json(data_input_path)

In [23]:
df.printSchema()

root
 |-- score: long (nullable = true)
 |-- user_id: long (nullable = true)



In [24]:
df.show()

+-----+-------+
|score|user_id|
+-----+-------+
| null|   null|
| null|   null|
|   50|    100|
|   51|    200|
+-----+-------+



In [25]:
df.count()

4

In [26]:
df.filter(col('score').isNotNull()).count()

2

In [27]:
df.filter(col('user_id').isNotNull()).count()

4

In [28]:
df.collect()

[Row(score=None, user_id=None),
 Row(score=None, user_id=None),
 Row(score=50, user_id=100),
 Row(score=51, user_id=200)]

In [29]:
df.select('user_id').collect()

[Row(user_id=300), Row(user_id=400), Row(user_id=100), Row(user_id=200)]

In [30]:
df.select('score').collect()

[Row(score=None), Row(score=None), Row(score=50), Row(score=51)]

In [31]:
# Read with mode value DROPMALFORMED

df = (
    spark.read
    .schema(schema)
    .option('mode', 'DROPMALFORMED')
    .json(data_input_path)
)

In [32]:
df.printSchema()

root
 |-- score: long (nullable = true)
 |-- user_id: long (nullable = true)



In [33]:
df.show()

+-----+-------+
|score|user_id|
+-----+-------+
|   50|    100|
|   51|    200|
+-----+-------+



In [34]:
df.count()

4

In [37]:
df.filter(col('score').isNotNull()).count()

2

In [38]:
df.filter(col('user_id').isNotNull()).count()

4

In [35]:
df.collect()

[Row(score=50, user_id=100), Row(score=51, user_id=200)]

In [36]:
df.select('user_id').collect()

[Row(user_id=300), Row(user_id=400), Row(user_id=100), Row(user_id=200)]

In [39]:
df.select('score').collect()

[Row(score=50), Row(score=51)]

In [40]:
df.select('user_id').show()

+-------+
|user_id|
+-------+
|    300|
|    400|
|    100|
|    200|
+-------+



The DROPMALFORMED mode leads to confusing situations, since the total count returns 4 however the collected list has only two records. Similarly the show funtions displays only 2 records, however when we select only user_id it displays for records.

In [41]:
# Read with mode value DROPMALFORMED

df = (
    spark.read
    .schema(schema)
    .option('mode', 'FAILFAST')
    .json(data_input_path)
)

In [42]:
df.count()

4

In [43]:
df.printSchema()

root
 |-- score: long (nullable = true)
 |-- user_id: long (nullable = true)



In [44]:
df.show()

Py4JJavaError: An error occurred while calling o161.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 30.0 failed 1 times, most recent failure: Lost task 0.0 in stage 30.0 (TID 45, localhost, executor driver): org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST.
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: Failed to parse a value for data type bigint (current token: VALUE_NUMBER_FLOAT).
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:327)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:318)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:76)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:399)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:394)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:394)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:62)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST.
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.RuntimeException: Failed to parse a value for data type bigint (current token: VALUE_NUMBER_FLOAT).
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:327)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:318)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:76)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:399)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:394)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:394)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:62)
	... 29 more


In [45]:
df.select('user_id').show()

+-------+
|user_id|
+-------+
|    300|
|    400|
|    100|
|    200|
+-------+



In [47]:
df.collect()

Py4JJavaError: An error occurred while calling o161.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 34.0 failed 1 times, most recent failure: Lost task 0.0 in stage 34.0 (TID 49, localhost, executor driver): org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST.
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: Failed to parse a value for data type bigint (current token: VALUE_NUMBER_FLOAT).
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:327)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:318)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:76)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:399)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:394)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:394)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:62)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:299)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3263)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3260)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3260)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Malformed records are detected in record parsing. Parse Mode: FAILFAST.
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:70)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$readFile$2.apply(JsonDataSource.scala:143)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:441)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: java.lang.RuntimeException: Failed to parse a value for data type bigint (current token: VALUE_NUMBER_FLOAT).
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:327)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$failedConversion$1.applyOrElse(JacksonParser.scala:318)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5$$anonfun$apply$9.applyOrElse(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeConverter$5.apply(JacksonParser.scala:165)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$convertObject(JacksonParser.scala:343)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:76)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1$$anonfun$apply$2.applyOrElse(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser.org$apache$spark$sql$catalyst$json$JacksonParser$$parseJsonToken(JacksonParser.scala:308)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$makeStructRootConverter$1.apply(JacksonParser.scala:75)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:399)
	at org.apache.spark.sql.catalyst.json.JacksonParser$$anonfun$parse$2.apply(JacksonParser.scala:394)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2543)
	at org.apache.spark.sql.catalyst.json.JacksonParser.parse(JacksonParser.scala:394)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$$anonfun$6.apply(JsonDataSource.scala:139)
	at org.apache.spark.sql.execution.datasources.FailureSafeParser.parse(FailureSafeParser.scala:62)
	... 26 more
