In [1]:
from pyspark.sql import SparkSession

# Start (or attach to an already running) local-mode Spark session for this notebook
spark = SparkSession.builder.master('local').appName('read-parquet-attempts').getOrCreate()

# Keep a handle on the session's underlying SparkContext
sc = spark.sparkContext

# Generate test data

In [3]:
import datetime

# Set up parameters
# Root directory; each generator call below adds one `date=<YYYY-MM-DD>` partition under it.
data_path = "/home/jovyan/work/spark-data/raw/test_data_parquet"
# Number of rows written into every partition.
num_rows = 10

# Each step loads a helper notebook with %run and then calls the generator that
# notebook defines, so the order of the %run / call pairs must be kept.

# 2020-01-01: flat baseline schema (identifier, first_name, last_name, occupation, age).
%run ./modules/01_create_simple_schema.ipynb 
gen_data_simple_schema(data_path, datetime.date(2020,1,1), num_rows)

# 2020-02-01: same columns plus a nested `address` struct.
%run ./modules/02_add_nested_structure_to_schema.ipynb
gen_data_add_nested_struct(data_path, datetime.date(2020,2,1), num_rows)

# 2020-03-01: adds the top-level `title` column.
%run ./modules/03_add_column_to_schema.ipynb
gen_data_add_columns(data_path, datetime.date(2020,3,1), num_rows)

# 2020-04-01: `address.postal_code` becomes an integer and a nested
# `address.address_details` struct appears (this is the partition the later
# read attempts fail on).
%run ./modules/04_change_datatype_add_struct.ipynb
gen_data_change_datatype_add_struct(data_path, datetime.date(2020,4,1), num_rows)

# 2020-05-01: renames a column.
# NOTE(review): presumably title -> title_name, judging by the explicit read
# schema used in attempt 3 -- confirm in the module.
%run ./modules/05_change_column_name.ipynb
gen_data_change_column_name(data_path, datetime.date(2020,5,1), num_rows)

# 2020-06-01: removes a column (which one is not visible from this notebook).
%run ./modules/06_remove_column.ipynb
gen_data_remove_column(data_path, datetime.date(2020,6,1), num_rows)


Partition created: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-01-01
# Rows: 10
Schema:
root
 |-- identifier: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- date: date (nullable = true)



Partition created: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-02-01
# Rows: 10
Schema:
root
 |-- identifier: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: string (nullable = true)
 |-- date: date (nullable = true)



Partition created: /home/jovyan/work/spark

# Attempt 1 - Try to read all partitions without schema merge option

In [4]:
# Default read, no mergeSchema option: a single schema is applied to all partitions
# (the schema printed below only contains the columns of the date=2020-01-01 partition).
df_at1 = spark.read.parquet(data_path)

In [5]:
# Inspect which columns survived the default (non-merged) schema inference
df_at1.printSchema()

root
 |-- identifier: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- date: date (nullable = true)



In [6]:
from pyspark.sql.functions import col

# Row count per partition date, in chronological order, to confirm that rows
# from every partition were read
date_col = col("date")
df_at1.groupBy(date_col).count().orderBy(date_col).show()

+----------+-----+
|      date|count|
+----------+-----+
|2020-01-01|   10|
|2020-02-01|   10|
|2020-03-01|   10|
|2020-04-01|   10|
|2020-05-01|   10|
|2020-06-01|   10|
+----------+-----+



In [7]:
# show() prints the first 20 rows by default; note that rows from later partitions
# (e.g. date=2020-04-01) are displayed with the baseline columns only
df_at1.show()

+----------+----------+---------+--------------------+---+----------+
|identifier|first_name|last_name|          occupation|age|      date|
+----------+----------+---------+--------------------+---+----------+
|  19-14/52|      Shea|  Barrera|       Horse Breeder| 21|2020-04-01|
|  88-34/89|       Nam|  Vazquez|              Tailor| 39|2020-04-01|
|  52-90/42|    Rickey|  Cabrera|           Economist| 55|2020-04-01|
|  04-61/33|        Ai|    Sears|       Floor Manager| 52|2020-04-01|
|  19-29/75|      Echo|    Mayer|  Marine Electrician| 37|2020-04-01|
|  68-17/13|  Terrance|    Wolfe|Fork Lift Truck D...| 39|2020-04-01|
|  62-31/47|      Elmo|     Wise|        Resin Caster| 54|2020-04-01|
|  90-93/88| Anjanette|   Weaver|    Building Control| 36|2020-04-01|
|  60-59/36|     Garry|    Keith|     Assembly Worker| 26|2020-04-01|
|  37-55/88| Lashandra|    Poole|          Lumberjack| 64|2020-04-01|
|  60-92/25|  Shirleen|     Knox|    Hospital Orderly| 37|2020-05-01|
|  44-80/27|       W

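**Conclusion:** Without the `mergeSchema` option, Spark read the rows of every partition (all six dates are present) but exposed only the columns from the schema of the first partition (`date=2020-01-01`). Columns added in later partitions were silently dropped.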

# Attempt 2 - Try to read all partitions with schema merge option

In [8]:
# Expected to fail: schema merging cannot reconcile address.postal_code, which is a
# string in the earlier partitions but an integer in date=2020-04-01.
df_at2 = spark.read.option("mergeSchema", "true").parquet(data_path)

Py4JJavaError: An error occurred while calling o1992.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 40.0 failed 1 times, most recent failure: Lost task 0.0 in stage 40.0 (TID 341, localhost, executor driver): org.apache.spark.SparkException: Failed merging schema of file file:/home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01/part-00000-7509b13c-f2bf-4104-ac10-43a927e3c41a.c000.snappy.parquet:
root
 |-- identifier: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- address_details: struct (nullable = true)
 |    |    |-- street: struct (nullable = true)
 |    |    |    |-- street_name: string (nullable = true)
 |    |    |    |-- latitude: float (nullable = true)
 |    |    |    |-- longitude: float (nullable = true)
 |    |    |-- number: integer (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: integer (nullable = true)
 |-- title: string (nullable = true)

	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:627)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:622)
	at scala.collection.immutable.Stream.foreach(Stream.scala:594)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:622)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:603)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed to merge fields 'address' and 'address'. Failed to merge fields 'postal_code' and 'postal_code'. Failed to merge incompatible data types string and int
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1$$anonfun$apply$3.apply(StructType.scala:502)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1$$anonfun$apply$3.apply(StructType.scala:495)
	at scala.Option.map(Option.scala:146)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1.apply(StructType.scala:495)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1.apply(StructType.scala:492)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at org.apache.spark.sql.types.StructType$.merge(StructType.scala:492)
	at org.apache.spark.sql.types.StructType.merge(StructType.scala:402)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:625)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:990)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:385)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:989)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.mergeSchemasInParallel(ParquetFileFormat.scala:633)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat.inferSchema(ParquetFileFormat.scala:241)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:180)
	at org.apache.spark.sql.execution.datasources.DataSource$$anonfun$6.apply(DataSource.scala:180)
	at scala.Option.orElse(Option.scala:289)
	at org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:179)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:373)
	at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
	at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
	at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:645)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Failed merging schema of file file:/home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01/part-00000-7509b13c-f2bf-4104-ac10-43a927e3c41a.c000.snappy.parquet:
root
 |-- identifier: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- address_details: struct (nullable = true)
 |    |    |-- street: struct (nullable = true)
 |    |    |    |-- street_name: string (nullable = true)
 |    |    |    |-- latitude: float (nullable = true)
 |    |    |    |-- longitude: float (nullable = true)
 |    |    |-- number: integer (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- postal_code: integer (nullable = true)
 |-- title: string (nullable = true)

	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:627)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:622)
	at scala.collection.immutable.Stream.foreach(Stream.scala:594)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:622)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9.apply(ParquetFileFormat.scala:603)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:823)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.SparkException: Failed to merge fields 'address' and 'address'. Failed to merge fields 'postal_code' and 'postal_code'. Failed to merge incompatible data types string and int
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1$$anonfun$apply$3.apply(StructType.scala:502)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1$$anonfun$apply$3.apply(StructType.scala:495)
	at scala.Option.map(Option.scala:146)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1.apply(StructType.scala:495)
	at org.apache.spark.sql.types.StructType$$anonfun$merge$1.apply(StructType.scala:492)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
	at org.apache.spark.sql.types.StructType$.merge(StructType.scala:492)
	at org.apache.spark.sql.types.StructType.merge(StructType.scala:402)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$$anonfun$9$$anonfun$apply$14.apply(ParquetFileFormat.scala:625)
	... 17 more


**Conclusion:** Error when merging the partition 2020-04-01 because the field `address.postal_code` has a different datatype there (`integer`) than in the earlier partitions (`string`).

*Caused by: org.apache.spark.SparkException: Failed to merge fields 'address' and 'address'. Failed to merge fields 'postal_code' and 'postal_code'. Failed to merge incompatible data types string and int*

# Attempt 3 - Try to read forcing a schema on read

In [9]:
from pyspark.sql.types import StructType
import json

# Superset schema in Spark's JSON schema format: it lists every field name used by
# any partition variant (including both spellings lat/latitude, long/longitude and
# title/title_name) and declares every leaf field as a nullable string.
schema_json = '{"fields":[{"metadata":{},"name":"address","nullable":true,"type":{"fields":[{"metadata":{},"name":"address","nullable":true,"type":"string"},{"metadata":{},"name":"address_details","nullable":true,"type":{"fields":[{"metadata":{},"name":"number","nullable":true,"type":"string"},{"metadata":{},"name":"street","nullable":true,"type":{"fields":[{"metadata":{},"name":"lat","nullable":true,"type":"string"},{"metadata":{},"name":"latitude","nullable":true,"type":"string"},{"metadata":{},"name":"long","nullable":true,"type":"string"},{"metadata":{},"name":"longitude","nullable":true,"type":"string"},{"metadata":{},"name":"street_name","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"city","nullable":true,"type":"string"},{"metadata":{},"name":"country","nullable":true,"type":"string"},{"metadata":{},"name":"country_code","nullable":true,"type":"string"},{"metadata":{},"name":"postal_code","nullable":true,"type":"string"},{"metadata":{},"name":"state","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"age","nullable":true,"type":"string"},{"metadata":{},"name":"date","nullable":true,"type":"string"},{"metadata":{},"name":"first_name","nullable":true,"type":"string"},{"metadata":{},"name":"identifier","nullable":true,"type":"string"},{"metadata":{},"name":"last_name","nullable":true,"type":"string"},{"metadata":{},"name":"occupation","nullable":true,"type":"string"},{"metadata":{},"name":"title","nullable":true,"type":"string"},{"metadata":{},"name":"title_name","nullable":true,"type":"string"}],"type":"struct"}'
# Parse the JSON text into a dict and build the StructType handed to spark.read.schema()
schema = StructType.fromJson(json.loads(schema_json))

In [10]:
# Echo the parsed StructType to check that the JSON was interpreted as intended
schema

StructType(List(StructField(address,StructType(List(StructField(address,StringType,true),StructField(address_details,StructType(List(StructField(number,StringType,true),StructField(street,StructType(List(StructField(lat,StringType,true),StructField(latitude,StringType,true),StructField(long,StringType,true),StructField(longitude,StringType,true),StructField(street_name,StringType,true))),true))),true),StructField(city,StringType,true),StructField(country,StringType,true),StructField(country_code,StringType,true),StructField(postal_code,StringType,true),StructField(state,StringType,true))),true),StructField(age,StringType,true),StructField(date,StringType,true),StructField(first_name,StringType,true),StructField(identifier,StringType,true),StructField(last_name,StringType,true),StructField(occupation,StringType,true),StructField(title,StringType,true),StructField(title_name,StringType,true)))

In [11]:
# Supply the all-string schema explicitly. This call succeeds because no data is
# scanned yet; the failure only surfaces when an action runs (see df_at3.show()).
# NOTE(review): with a user-supplied schema Spark presumably skips schema inference,
# so the mergeSchema option likely has no effect here -- confirm.
df_at3 = spark.read.schema(schema).option("mergeSchema", "true").parquet(data_path)

In [12]:
# Confirm the DataFrame carries the explicitly supplied schema
df_at3.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- address_details: struct (nullable = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- street: struct (nullable = true)
 |    |    |    |-- lat: string (nullable = true)
 |    |    |    |-- latitude: string (nullable = true)
 |    |    |    |-- long: string (nullable = true)
 |    |    |    |-- longitude: string (nullable = true)
 |    |    |    |-- street_name: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string (nullable = true)
 |    |-- postal_code: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- age: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- title: string (nullable = true)
 |-- title_name: string (nullab

In [13]:
# Expected to fail: show() triggers the actual Parquet scan, and the file in
# date=2020-04-01 holds integer values where the supplied schema declares strings
# (ClassCastException: MutableAny cannot be cast to MutableInt).
df_at3.show()

Py4JJavaError: An error occurred while calling o2054.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 41.0 failed 1 times, most recent failure: Lost task 0.0 in stage 41.0 (TID 342, localhost, executor driver): org.apache.spark.sql.execution.QueryExecutionException: Encounter error while reading parquet files. One possible cause: Parquet column cannot be converted in the corresponding files. Details: 
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:193)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 1 in block 0 in file file:/home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01/part-00000-7509b13c-f2bf-4104-ac10-43a927e3c41a.c000.snappy.parquet
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:251)
	at org.apache.parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:207)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	... 22 more
Caused by: java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.MutableAny cannot be cast to org.apache.spark.sql.catalyst.expressions.MutableInt
	at org.apache.spark.sql.catalyst.expressions.SpecificInternalRow.setInt(SpecificInternalRow.scala:233)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetRowConverter$RowUpdater.setInt(ParquetRowConverter.scala:168)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetPrimitiveConverter.addInt(ParquetRowConverter.scala:86)
	at org.apache.parquet.column.impl.ColumnReaderImpl$2$3.writeValue(ColumnReaderImpl.java:248)
	at org.apache.parquet.column.impl.ColumnReaderImpl.writeCurrentValueToConverter(ColumnReaderImpl.java:367)
	at org.apache.parquet.io.RecordReaderImplementation.read(RecordReaderImplementation.java:406)
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:226)
	... 27 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1891)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1879)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1878)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1878)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:927)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:927)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2112)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2061)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2050)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:738)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:365)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3389)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3370)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:80)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:127)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:75)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3369)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2550)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2764)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:254)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:291)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.sql.execution.QueryExecutionException: Encounter error while reading parquet files. One possible cause: Parquet column cannot be converted in the corresponding files. Details: 
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:193)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:346)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:310)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 1 in block 0 in file file:/home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01/part-00000-7509b13c-f2bf-4104-ac10-43a927e3c41a.c000.snappy.parquet
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:251)
	at org.apache.parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:207)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:101)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:181)
	... 22 more
Caused by: java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.MutableAny cannot be cast to org.apache.spark.sql.catalyst.expressions.MutableInt
	at org.apache.spark.sql.catalyst.expressions.SpecificInternalRow.setInt(SpecificInternalRow.scala:233)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetRowConverter$RowUpdater.setInt(ParquetRowConverter.scala:168)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetPrimitiveConverter.addInt(ParquetRowConverter.scala:86)
	at org.apache.parquet.column.impl.ColumnReaderImpl$2$3.writeValue(ColumnReaderImpl.java:248)
	at org.apache.parquet.column.impl.ColumnReaderImpl.writeCurrentValueToConverter(ColumnReaderImpl.java:367)
	at org.apache.parquet.io.RecordReaderImplementation.read(RecordReaderImplementation.java:406)
	at org.apache.parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:226)
	... 27 more


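**Conclusion:** Forcing the schema was accepted, but Spark could not read the data of partition 2020-04-01. This partition has significant changes in the address struct, which can be the reason why Spark could not read it properly: the `ClassCastException` at the bottom of the trace (`MutableAny cannot be cast to MutableInt`) indicates that an integer value stored in the file is being read into a field the forced schema declares as `string`.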

*Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 1 in block 0 in file file:/home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01/part-00000-7509b13c-f2bf-4104-ac10-43a927e3c41a.c000.snappy.parquet*

# Attempt 4 - Try to read one partition at a time and union the dataframes

In [14]:
import os
from pyspark.sql.functions import lit

# Attempt 4: read every partition folder on its own, add the partition column back
# as a literal, and stack the per-partition dataframes with a positional union.
# NOTE: this attempt is expected to fail. DataFrame.union requires both sides to have
# the same number of columns, so it raises AnalysisException as soon as two
# partitions with different schemas meet.

# Partition folders are the entries named "<column>=<value>" (e.g. "date=2020-02-01");
# entries without "=" in their name are skipped. os.listdir returns them in
# arbitrary order, so the first partition read is not necessarily the oldest one.
partition_dirs = [entry for entry in os.listdir(data_path) if "=" in entry]

# The loop variable is `partition_dir` rather than `dir`, so the builtin dir() is not
# shadowed in the kernel namespace for the cells that follow.
for position, partition_dir in enumerate(partition_dirs):
    name_parts = partition_dir.split("=")
    partition_column, partition_value = name_parts[0], name_parts[1]

    # The partition value is taken from the folder name and attached to every row
    # as a string literal column.
    df_partition = (
        spark.read.parquet(data_path + "/" + partition_dir)
        .withColumn(partition_column, lit(partition_value))
    )

    # The first partition seeds the result; every later one is appended by position.
    if position == 0:
        df_at4 = df_partition
    else:
        df_at4 = df_at4.union(df_partition)

AnalysisException: "Union can only be performed on tables with the same number of columns, but the first table has 7 columns and the second table has 8 columns;;\n'Union\n:- Project [identifier#2893, first_name#2894, last_name#2895, occupation#2896, age#2897, address#2898, 2020-02-01 AS date#2905]\n:  +- Relation[identifier#2893,first_name#2894,last_name#2895,occupation#2896,age#2897,address#2898] parquet\n+- Project [identifier#2913, first_name#2914, last_name#2915, occupation#2916, age#2917, address#2918, title#2919, 2020-03-01 AS date#2927]\n   +- Relation[identifier#2913,first_name#2914,last_name#2915,occupation#2916,age#2917,address#2918,title#2919] parquet\n"

# Final solution - without incremental control
For the complete final solution with incremental control, see the `MergeSchemasParquet.ipynb` notebook (when running Jupyter locally: http://127.0.0.1:8888/notebooks/work/notebooks/MergeSchemasParquet.ipynb).

In [15]:
# Load functions
# Each %run executes a helper notebook so that the names it defines can be used below.
# NOTE(review): merge_schemas_full and flatten are not defined in this notebook --
# presumably they come from these two helper notebooks; confirm there.
%run ./modules/mergeSchemaParquet-functions.ipynb
%run ./modules/helpers.ipynb

# Read all partitions and merge the schemas
# Whatever merge_schemas_full returns for data_path is handed to spark.read.json, so it
# is presumably JSON data (or a location of JSON data) covering every partition --
# TODO confirm against the helper's definition.
df = spark.read.json(merge_schemas_full(data_path))


Processing files:
idx: 0 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-02-01
idx: 1 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-03-01
idx: 2 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-06-01
idx: 3 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-04-01
idx: 4 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-01-01
idx: 5 | path: /home/jovyan/work/spark-data/raw/test_data_parquet/date=2020-05-01


In [16]:
# Check schema
# Prints the merged dataframe's schema as a tree, so the nested address struct and
# the data type of every column can be inspected.
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- address_details: struct (nullable = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- street: struct (nullable = true)
 |    |    |    |-- lat: string (nullable = true)
 |    |    |    |-- latitude: string (nullable = true)
 |    |    |    |-- long: string (nullable = true)
 |    |    |    |-- longitude: string (nullable = true)
 |    |    |    |-- street_name: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- country_code: string (nullable = true)
 |    |-- postal_code: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- age: string (nullable = true)
 |-- date: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- identifier: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- title: string (nullable = t

In [17]:
from pyspark.sql.functions import col

# Rows per partition: keep only the partition column, count the rows for each
# date value, and return the result ordered by date as a pandas dataframe.
(
    df
    .select("date")
    .groupBy("date")
    .count()
    .orderBy("date")
    .toPandas()
)

Unnamed: 0,date,count
0,2020-01-01,10
1,2020-02-01,10
2,2020-03-01,10
3,2020-04-01,10
4,2020-05-01,10
5,2020-06-01,10


In [18]:
# Preview the merged dataframe as a text table (show() prints only the first rows
# and truncates long cell values, such as the address struct).
df.show()

+--------------------+---+----------+----------+----------+---------+--------------------+----------+----------+
|             address|age|      date|first_name|identifier|last_name|          occupation|     title|title_name|
+--------------------+---+----------+----------+----------+---------+--------------------+----------+----------+
|[339 Legion Run,,...| 20|2020-02-01|   Zackary|  07-28/51|    Haney| Security Consultant|      null|      null|
|[496 Lowell Priva...| 28|2020-02-01|      Jame|  52-81/02|     Cobb|        Photographer|      null|      null|
|[363 Chabot Line,...| 66|2020-02-01|   Babette|  16-14/80|  Harrell|       Tanker Driver|      null|      null|
|[595 Bergen Squar...| 33|2020-02-01|       Flo|  46-42/47|    Joyce|              Tanner|      null|      null|
|[473 Coldspring G...| 66|2020-02-01|   Jeanett|  05-76/67|  Fuentes|    Vehicle Assessor|      null|      null|
|[219 Chattanooga ...| 65|2020-02-01|    Antwan|  12-84/88|   Zamora| Playgroup Assistant|      

In [19]:
# Build a dataframe from the expressions that flatten() derives from the merged schema.
# NOTE(review): flatten is not defined in this notebook -- presumably it comes from
# ./modules/helpers.ipynb and returns the SQL expressions that turn each nested field
# into a top-level column; confirm there.
df_flat = df.selectExpr(flatten(df.schema))

In [20]:
from pyspark.sql.functions import col, when

# Null count per partition for every flattened column.
# Each non-partition column is replaced by a flag (1 when the value is null, 0 otherwise)
# that keeps the column's name, so summing the flags per date gives the number of nulls.
null_flags = [
    when(col(name).isNull(), 1).otherwise(0).alias(name)
    for name in df_flat.columns
    if name != "date"
]

(
    df_flat
    .select("date", *null_flags)
    .groupBy("date")
    .sum()
    .orderBy("date")
    .toPandas()
)

Unnamed: 0,date,sum(address_address),sum(address_address_details_number),sum(address_address_details_street_lat),sum(address_address_details_street_latitude),sum(address_address_details_street_long),sum(address_address_details_street_longitude),sum(address_address_details_street_street_name),sum(address_city),sum(address_country),sum(address_country_code),sum(address_postal_code),sum(address_state),sum(age),sum(first_name),sum(identifier),sum(last_name),sum(occupation),sum(title),sum(title_name)
0,2020-01-01,10,10,10,10,10,10,10,10,10,10,10,10,0,0,0,0,0,10,10
1,2020-02-01,0,10,10,10,10,10,10,0,0,10,0,0,0,0,0,0,0,10,10
2,2020-03-01,0,10,10,10,10,10,10,0,0,0,0,0,0,0,0,0,0,0,10
3,2020-04-01,10,0,10,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,10
4,2020-05-01,10,0,0,10,0,10,0,0,0,0,0,0,0,0,0,0,0,10,0
5,2020-06-01,10,0,10,10,10,10,0,0,0,10,0,0,0,0,0,0,0,10,10
