# Notebook for Defensive Pipeline Design Chapter 
You will need to install the Spark depdendencies as detailed in the README in the repo root

In [8]:
import json
from jsonschema import validate
from pyspark.sql import types as T, Row, functions as f, SparkSession
from pyspark.sql.types import StructType, LongType, StringType, ArrayType, StructField
spark = (SparkSession
         .builder
         .appName('oreilly-book')
         .getOrCreate())

## Schema validation

### Taking a look at some sample bird survey data to see different ways of working with schemas for data validation

In [2]:
df = (spark
        .read
        .option("inferSchema", True)
        .json('initial_source_data.json'))

# converting to pandas for more readable output
df.toPandas()

                                                                                

Unnamed: 0,count,description,img_files,location,user
0,5,Several lesser goldfinches in the yard today,[],"[45.2341, 121.2351]",pc@cats.xyz
1,1,"Breezy morning, overcast. Saw a black-crowned ...",[s3://bird-2345/34541.jpeg],"[27.9659, 82.8001]",sylvia@srlp.org
2,3,Walked over to the heron rookery this afternoo...,"[s3://bird-1243/09731.jpeg, s3://bird-1243/481...","[45.4348, 123.9460]",birdlover124@email.com


In [3]:
df.schema

StructType(List(StructField(count,LongType,true),StructField(description,StringType,true),StructField(img_files,ArrayType(StringType,true),true),StructField(location,ArrayType(StringType,true),true),StructField(user,StringType,true)))

In [4]:
source_schema = df.schema
df.printSchema()

root
 |-- count: long (nullable = true)
 |-- description: string (nullable = true)
 |-- img_files: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- user: string (nullable = true)



In [5]:
(df
 .withColumn("latitude", f.element_at(df.location, 1))
 .withColumn("longitude", f.element_at(df.location, 2))
).toPandas()

Unnamed: 0,count,description,img_files,location,user,latitude,longitude
0,5,Several lesser goldfinches in the yard today,[],"[45.2341, 121.2351]",pc@cats.xyz,45.2341,121.2351
1,1,"Breezy morning, overcast. Saw a black-crowned ...",[s3://bird-2345/34541.jpeg],"[27.9659, 82.8001]",sylvia@srlp.org,27.9659,82.8001
2,3,Walked over to the heron rookery this afternoo...,"[s3://bird-1243/09731.jpeg, s3://bird-1243/481...","[45.4348, 123.9460]",birdlover124@email.com,45.4348,123.946


#### A new column, app_version, is added to the survey data. 

In [6]:
df = (spark
        .read
        .option("inferSchema", True)
        .json('source_data_plus_column.json'))

# converting to pandas for more readable output
df.toPandas()

Unnamed: 0,app_version,count,description,img_files,location,user
0,v1.02.5,1,great blue heron,[s3://bucket-name/rock/today/bring.bmp],"[(26.91756, -82.07842)]",lucy@cats.xyz


What happens if you try to apply the schema from the original data?

In [7]:
df_new = (spark
            .read
            .schema(source_schema)
            .json('source_data_plus_column.json'))
df_new.toPandas()

Unnamed: 0,count,description,img_files,location,user
0,1,great blue heron,[s3://bucket-name/rock/today/bring.bmp],"[(26.91756, -82.07842)]",lucy@cats.xyz


Notice that `app_version` is not in the dataframe. There were no errors on read to alert you that data is missing. This was part of the nearly "multi million dollar mistake" I mention in the text

### Working with JSON schemas

#### Working with a generated schema - notice the `location` definition

In [30]:
initial_json_schema = {
  "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "array",
    "items": [{
        "type": "object",
        "properties": {
        "user": {"type": "string"},
        "location": {
            "type": "array",
            "items": [
                {"type": "string"},
                {"type": "string"}
            ]
        },
        "img_files": {
            "type": "array",
            "items": [{"type": "string"}]
        },
        "description": {
            "type": "string"
        },
        "count": {
            "type": "integer"
        }
        },
        "required": [
        "user",
        "location"
        ]
    }]
}

In [31]:
ok_location = [{"user":"pc@cats.xyz", "location":["26.91756","82.07842"],"img_files":[],"description":"Several lesser goldfinches in the yard today","count":5}]
short_location = [{"user":"pc@cats.xyz", "location":["26.91756"],"img_files":[],"description":"Several lesser goldfinches in the yard today","count":5}]


In [32]:
validate(ok_location, initial_json_schema)

You want the `short_location` example to fail validation, but it does not. Despite the generated schema appearing to have the right definition for `location` you need to make some modifications to use the schema for validation

In [34]:
validate(short_location, initial_json_schema)

In [35]:
updated_schema = initial_json_schema = {
  "$schema": "http://json-schema.org/draft-04/schema#",
    "type": "array",
    "items": [{
        "type": "object",
        "properties": {
        "user": {"type": "string"},
        "location": {
            "type": "array",
            "minItems":2,
            "items": [
                {"type": "string"}
            ]
        },
        "img_files": {
            "type": "array",
            "items": [{"type": "string"}]
        },
        "description": {
            "type": "string"
        },
        "count": {
            "type": "integer"
        }
        },
        "required": [
        "user",
        "location"
        ]
    }]
}

In [36]:
validate(ok_location, updated_schema)

The new definition for `location` gives the desired validation, failing for the following 3 cases:
* Not enough elements
* Location is not an array
* Elements are not strings

In [38]:
validate(short_location, updated_schema)

ValidationError: ['26.91756'] is too short

Failed validating 'minItems' in schema['items'][0]['properties']['location']:
    {'items': [{'type': 'string'}], 'minItems': 2, 'type': 'array'}

On instance[0]['location']:
    ['26.91756']

In [41]:
validate([{"user":"someone", "location":"26.91756,82.07842"}], updated_schema)

ValidationError: '26.91756,82.07842' is not of type 'array'

Failed validating 'type' in schema['items'][0]['properties']['location']:
    {'items': [{'type': 'string'}], 'minItems': 2, 'type': 'array'}

On instance[0]['location']:
    '26.91756,82.07842'

In [42]:
validate([{"user":"pc@cats.xyz", "location":[26.91756,82.07842], updated_schema)

SyntaxError: closing parenthesis ')' does not match opening parenthesis '{' (3767170636.py, line 1)

### Comparing inferred vs explicit schemas


#### Data Type change
In the earlier schema, the `location` field was an Array of String. If you're depending on this structure in your pipeline, such as code that grabs the lat and long from locations in the `location` list, you would want to know if this format changed.

In [100]:
df = (spark
            .read
            .option("inferSchema", True)
            .json('string_location.json'))
df.show()

+-----+-------------+--------------------+------------------+
|count|        email|           img_files|          location|
+-----+-------------+--------------------+------------------+
|    1|lucy@cats.xyz|[s3://bucket-name...|26.91756,-82.07842|
+-----+-------------+--------------------+------------------+



Attempting to process this data sample will throw an exception

In [101]:
(df
 .withColumn("latitude", f.element_at(df.location, 1))
 .withColumn("longitude", f.element_at(df.location, 2))
).toPandas()

AnalysisException: cannot resolve 'element_at(location, 1)' due to data type mismatch: The first argument to function element_at should have been array or map type, but its string type.;
'Project [count#1560L, email#1561, img_files#1562, location#1563, element_at(location#1563, 1, false) AS latitude#1589]
+- Relation [count#1560L,email#1561,img_files#1562,location#1563] json


To catch errors like this compare explicit schemas with inferred schemas

In [104]:
inferred_schema = df.schema
inferred_schema == source_schema

False

In [105]:
mismatched_fields = [field for field in inferred_schema.fields if field not in source_schema.fields]
mismatched_fields

[StructField(location,StringType,true)]

22/10/28 13:49:56 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2270549 ms exceeds timeout 120000 ms
22/10/28 13:49:56 WARN SparkContext: Killing executors is not supported by current scheduler.


In [88]:
for mfield in mismatched_fields:
    source_field = [field for field in source_schema.fields if field.name == mfield.name]
    print(f"Field mismatch for {mfield.name}. Source schema: {source_field}, Inferred schema: {mfield}")

Field mismatch for location. Source schema: [StructField(location,ArrayType(StringType,true),true)], Inferred schema: StructField(location,StringType,true)


In [60]:
schema_w_corrupt = StructType(
    [StructField("count", LongType(),True), 
    StructField("description", StringType(),True),
    StructField("user", StringType(),True),
    StructField("img_files", ArrayType(StringType(),True),True),
    StructField("location", ArrayType(StringType(),True),True),
    StructField("_corrupt_record", StringType(), True)]
)

In [61]:
df_new = (spark
            .read
            .schema(schema_w_corrupt)
            .json('no_description.json', mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record"))


In [62]:
df_new.show(10, False)

+-----+-----------+----+---------------------------------------+---------------------+---------------+
|count|description|user|img_files                              |location             |_corrupt_record|
+-----+-----------+----+---------------------------------------+---------------------+---------------+
|1    |null       |null|[s3://bucket-name/rock/today/bring.bmp]|[26.91756, -82.07842]|null           |
+-----+-----------+----+---------------------------------------+---------------------+---------------+



Try changing the Type of the "location" field to simulate a change in source data schema. Now what happens?

In [77]:
df = (spark
        .read
        .option("inferSchema", True)
        .json('no_description.json'))


df.show(10, False)

+-----+-------------+---------------------------------------+---------------------+
|count|email        |img_files                              |location             |
+-----+-------------+---------------------------------------+---------------------+
|1    |lucy@cats.xyz|[s3://bucket-name/rock/today/bring.bmp]|[26.91756, -82.07842]|
+-----+-------------+---------------------------------------+---------------------+



In [78]:
df.schema

StructType(List(StructField(count,LongType,true),StructField(email,StringType,true),StructField(img_files,ArrayType(StringType,true),true),StructField(location,ArrayType(StringType,true),true)))

### Identifying and acting on malformed data

In [112]:
sc = spark.sparkContext

In [106]:
bad_data = [
    "{'user': 'pc@cats.xyz', 'location': [26.91756, -82.07842]}",
    "{'user': 'lucy@cats.xyz', 'location': [45.2341, 121.2351]}",
    "{'user': 'scout@cats.xyz', 'location': [45.2341,}"
]

Processing with basic python - check for and handle exceptions arising from bad data

In [110]:
json.loads(bad_data)

TypeError: the JSON object must be str, bytes or bytearray, not list

Different approaches to handling corrupt data in pyspark

In [116]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="PERMISSIVE", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.toPandas()

Unnamed: 0,_corrupt_record,location,user
0,,"[26.91756, -82.07842]",pc@cats.xyz
1,,"[45.2341, 121.2351]",lucy@cats.xyz
2,"{'user': 'scout@cats.xyz', 'location': [45.2341,}",,


In [114]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="DROPMALFORMED", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

+--------------------+-------------+
|            location|         user|
+--------------------+-------------+
|[26.91756, -82.07...|  pc@cats.xyz|
| [45.2341, 121.2351]|lucy@cats.xyz|
+--------------------+-------------+



In [115]:
corrupt_df = spark.read.json(sc.parallelize(bad_data), mode="FAILFAST", columnNameOfCorruptRecord="_corrupt_record")
corrupt_df.show()

22/11/02 17:28:41 ERROR Executor: Exception in task 3.0 in stage 77.0 (TID 95)
org.apache.spark.SparkException: Malformed records are detected in schema inference. Parse Mode: FAILFAST.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInSchemaInferenceError(QueryExecutionErrors.scala:1144)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$2(JsonInferSchema.scala:77)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator.isEmpty(Iterator.scala:387)
	at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceLeftOption(TraversableOnce.scala:249)
	at scala.collection.TraversableOnce.reduceLeftOption$(TraversableOnce.scala:248)
	at scala.collection.AbstractIterator.reduceLeftOption(Iterator.scala:1431)
	at scala.collection.Trav

Py4JJavaError: An error occurred while calling o653.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 77.0 failed 1 times, most recent failure: Lost task 3.0 in stage 77.0 (TID 95) (192.168.0.199 executor driver): org.apache.spark.SparkException: Malformed records are detected in schema inference. Parse Mode: FAILFAST.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInSchemaInferenceError(QueryExecutionErrors.scala:1144)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$2(JsonInferSchema.scala:77)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator.isEmpty(Iterator.scala:387)
	at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceLeftOption(TraversableOnce.scala:249)
	at scala.collection.TraversableOnce.reduceLeftOption$(TraversableOnce.scala:248)
	at scala.collection.AbstractIterator.reduceLeftOption(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceOption(TraversableOnce.scala:256)
	at scala.collection.TraversableOnce.reduceOption$(TraversableOnce.scala:256)
	at scala.collection.AbstractIterator.reduceOption(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$1(JsonInferSchema.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: com.fasterxml.jackson.core.JsonParseException: Unexpected character ('}' (code 125)): expected a value
 at [Source: (byte[])"{'user': 'scout@cats.xyz', 'location': [45.2341,}"; line: 1, column: 50]
	at com.fasterxml.jackson.core.JsonParser._constructError(JsonParser.java:2337)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportError(ParserMinimalBase.java:710)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportUnexpectedChar(ParserMinimalBase.java:635)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._handleUnexpectedValue(UTF8StreamJsonParser.java:2658)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._nextTokenNotInObject(UTF8StreamJsonParser.java:870)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser.nextToken(UTF8StreamJsonParser.java:762)
	at org.apache.spark.sql.catalyst.json.JacksonUtils$.nextUntil(JacksonUtils.scala:30)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:155)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:111)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:142)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$4(JsonInferSchema.scala:68)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2713)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$2(JsonInferSchema.scala:66)
	... 25 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2309)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.infer(JsonInferSchema.scala:93)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.$anonfun$inferFromDataset$5(JsonDataSource.scala:110)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource$.inferFromDataset(JsonDataSource.scala:110)
	at org.apache.spark.sql.DataFrameReader.$anonfun$json$1(DataFrameReader.scala:453)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:453)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:433)
	at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:419)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Malformed records are detected in schema inference. Parse Mode: FAILFAST.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.malformedRecordsDetectedInSchemaInferenceError(QueryExecutionErrors.scala:1144)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$2(JsonInferSchema.scala:77)
	at scala.collection.Iterator$$anon$11.nextCur(Iterator.scala:486)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:492)
	at scala.collection.Iterator.isEmpty(Iterator.scala:387)
	at scala.collection.Iterator.isEmpty$(Iterator.scala:387)
	at scala.collection.AbstractIterator.isEmpty(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceLeftOption(TraversableOnce.scala:249)
	at scala.collection.TraversableOnce.reduceLeftOption$(TraversableOnce.scala:248)
	at scala.collection.AbstractIterator.reduceLeftOption(Iterator.scala:1431)
	at scala.collection.TraversableOnce.reduceOption(TraversableOnce.scala:256)
	at scala.collection.TraversableOnce.reduceOption$(TraversableOnce.scala:256)
	at scala.collection.AbstractIterator.reduceOption(Iterator.scala:1431)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$1(JsonInferSchema.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: com.fasterxml.jackson.core.JsonParseException: Unexpected character ('}' (code 125)): expected a value
 at [Source: UNKNOWN; line: 1, column: 50]
	at com.fasterxml.jackson.core.JsonParser._constructError(JsonParser.java:2337)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportError(ParserMinimalBase.java:710)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportUnexpectedChar(ParserMinimalBase.java:635)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._handleUnexpectedValue(UTF8StreamJsonParser.java:2658)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._nextTokenNotInObject(UTF8StreamJsonParser.java:870)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser.nextToken(UTF8StreamJsonParser.java:762)
	at org.apache.spark.sql.catalyst.json.JacksonUtils$.nextUntil(JacksonUtils.scala:30)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:155)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:111)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.inferField(JsonInferSchema.scala:142)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$4(JsonInferSchema.scala:68)
	at org.apache.spark.util.Utils$.tryWithResource(Utils.scala:2713)
	at org.apache.spark.sql.catalyst.json.JsonInferSchema.$anonfun$infer$2(JsonInferSchema.scala:66)
	... 25 more


unner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: com.fasterxml.jackson.core.JsonParseException: Unexpected character ('}' (code 125)): expected a value
 at [Source: (byte[])"{'user': 'scout@cats.xyz', 'location': [45.2341,}"; line: 1, column: 50]
	at com.fasterxml.jackson.core.JsonParser._constructError(JsonParser.java:2337)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportError(ParserMinimalBase.java:710)
	at com.fasterxml.jackson.core.base.ParserMinimalBase._reportUnexpectedChar(ParserMinimalBase.java:635)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._handleUnexpectedValue(UTF8StreamJsonParser.java:2658)
	at com.fasterxml.jackson.core.json.UTF8StreamJsonParser._nextTokenNotInObject(UTF8StreamJsonParser.java:870)
	at com.fasterxml.jackson.core.json.UTF8StreamJson