In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t
import math

In [2]:
spark = SparkSession \
    .builder \
    .appName("Data with Nikk the Greek Spark Session") \
    .master("local[4]") \
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5") \
    .config("spark.jars.packages", "org.apache.spark:spark-avro_2.12:3.5.0") \
    .getOrCreate()

"""
Reference gresearch:
- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/
- GitHub Spark extension: https://github.com/G-Research/spark-extension
- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet
"""

'\nReference gresearch:\n- Parquet files analysis: https://www.gresearch.com/blog/article/parquet-files-know-your-scaling-limits/\n- GitHub Spark extension: https://github.com/G-Research/spark-extension\n- Parquet methods: https://github.com/G-Research/spark-extension/tree/master/python/gresearch/spark/parquet\n'

In [3]:
sc = spark.sparkContext

In [4]:
#Turning off AQE as it generates more jobs which might be confusing for this scenario here. 
spark.conf.set("spark.sql.adaptive.enabled", "false")
#to not cache dataframes... this may not create repeatable results
spark.conf.set("spark.databricks.io.cache.enabled", "false")

In [5]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    return (
        spark.range(num_rows, numPartitions=num_partitions)
        .withColumn("date", f.current_date())
        .withColumn("timestamp",f.current_timestamp())
        .withColumn("idstring", f.col("id").cast("string"))
        .withColumn("idfirst", f.col("idstring").substr(0,1))
        .withColumn("idlast", f.col("idstring").substr(-1,1))
        )

In [6]:
sdf = sdf_generator(10000000, 8)
sdf.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-23|2024-03-23 08:27:...|       0|      0|     0|
|  1|2024-03-23|2024-03-23 08:27:...|       1|      1|     1|
|  2|2024-03-23|2024-03-23 08:27:...|       2|      2|     2|
|  3|2024-03-23|2024-03-23 08:27:...|       3|      3|     3|
|  4|2024-03-23|2024-03-23 08:27:...|       4|      4|     4|
|  5|2024-03-23|2024-03-23 08:27:...|       5|      5|     5|
|  6|2024-03-23|2024-03-23 08:27:...|       6|      6|     6|
|  7|2024-03-23|2024-03-23 08:27:...|       7|      7|     7|
|  8|2024-03-23|2024-03-23 08:27:...|       8|      8|     8|
|  9|2024-03-23|2024-03-23 08:27:...|       9|      9|     9|
| 10|2024-03-23|2024-03-23 08:27:...|      10|      1|     0|
| 11|2024-03-23|2024-03-23 08:27:...|      11|      1|     1|
| 12|2024-03-23|2024-03-23 08:27:...|      12|      1|     2|
| 13|202

In [7]:
sdf

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [8]:
sdf.schema

StructType([StructField('id', LongType(), False), StructField('date', DateType(), False), StructField('timestamp', TimestampType(), False), StructField('idstring', StringType(), False), StructField('idfirst', StringType(), False), StructField('idlast', StringType(), False)])

In [9]:

#https://vincent.doba.fr/posts/20211004_spark_data_description_language_for_defining_spark_schema/
ddl_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"

spark_schema = t.StructType(
    [
        t.StructField('id', t.LongType(), True), 
        t.StructField('date', t.DateType(), True), 
        t.StructField('timestamp', t.TimestampType(), True), 
        t.StructField('idstring', t.StringType(), True), 
        t.StructField('idfirst', t.StringType(), True), 
        t.StructField('idlast', t.StringType(), True)
    ]
)

In [10]:
t._parse_datatype_string(ddl_schema)

StructType([StructField('id', LongType(), True), StructField('date', DateType(), True), StructField('timestamp', TimestampType(), True), StructField('idstring', StringType(), True), StructField('idfirst', StringType(), True), StructField('idlast', StringType(), True)])

# 1 - JSON

## 1-1 Initial state JSON
- Writing and reading json we default settings

Results:
- Write time: 8.4 s 
- Load time: 11.7 s
- Data size: 1208 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L323
https://spark.apache.org/docs/latest/sql-data-sources-json.html

In [10]:
sc.setJobDescription("Save Json")
path_json = "D:/Spark/Data/format_json.json"
sdf.write.format("json").mode("overwrite").save(path_json)

In [11]:
#set maxPartitions MB for a fair comparison of 8 partitions also during read 
maxPartitionsMB = 160
maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsBytes)+"b")

In [12]:
sc.setJobDescription("Load Json without schema")
sdf_json = spark.read.format("json").load(path_json)
sdf_json.write.format("noop").mode("overwrite").save()

In [13]:
sdf_json

DataFrame[date: string, id: bigint, idfirst: string, idlast: string, idstring: string, timestamp: string]

## 1-2 Load with Auto Schema
- Use the schema of the loaded JSON Dataframe for loading the data and move to correct order

Results:
- Load time: 7 s

In [14]:
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with auto schema")
sdf_json = spark.read.format("json").schema(schema).load(path_json)
sdf_json.write.format("noop").mode("overwrite").save()

In [15]:
sdf_json

DataFrame[id: bigint, date: string, timestamp: string, idstring: string, idfirst: string, idlast: string]

## 1-3 Cast output dataframe
- When loading JSON the schema is not as expected as initially defined in the saved SDF
- Let's additionally cast our dataframes with and without schema

Results:
- With manual casting without schema: 17.5 s
- With manual casting with schema: 14 s


In [16]:
sc.setJobDescription("Load Json without schema and manual casting")
sdf_json = spark.read.format("json").load(path_json)
sdf_json = sdf_json.withColumn("date", f.col("date").cast("date"))
sdf_json = sdf_json.withColumn("timestamp", f.col("timestamp").cast("timestamp"))
sdf_json = sdf_json.select("id", "date", "timestamp",  "idstring", "idfirst", "idlast")
sdf_json.write.format("noop").mode("overwrite").save()

In [17]:
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [18]:
schema = "id bigint, date string, timestamp string, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with auto schema and manual casting")
sdf_json = spark.read.format("json").schema(schema).load(path_json)
sdf_json = sdf_json.withColumn("date", f.col("date").cast("date"))
sdf_json = sdf_json.withColumn("timestamp", f.col("timestamp").cast("timestamp"))
sdf_json = sdf_json.select("id", "date", "timestamp",  "idstring", "idfirst", "idlast")
sdf_json.write.format("noop").mode("overwrite").save()

In [19]:
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 1-4 Define expected Schema

Results:
- Load data: 13.1 s

In [20]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Json with DataFrame schema")
sdf_json = spark.read.format("json").schema(sdf_schema).load(path_json)
sdf_json.write.format("noop").mode("overwrite").save()

In [21]:
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

# 2 - CSV with schema interference

## 2-1 Initial state CSV

- Writing and reading CSV we default settings

Results:
- Write time: 6.2 s
- Load time: 25.2 s
- Data size: 593 MB

How it works: https://github.com/jerryshao/apache-spark/blob/master/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala#L470
https://spark.apache.org/docs/latest/sql-data-sources-csv.html

In [22]:
sc.setJobDescription("Save CSV")
path_csv = "D:/Spark/Data/format_csv.csv"
sdf.write.format("csv").mode("overwrite").option("header", "True").save(path_csv)

In [23]:
#set maxPartitions MB for a fair comparison of 8 partitions also during read 
maxPartitionsMB = 80
maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsBytes)+"b")

In [24]:
sc.setJobDescription("Load CSV without schema")
sdf_csv = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv)
sdf_csv.write.format("noop").mode("overwrite").save()

In [25]:
sdf_csv

DataFrame[id: int, date: date, timestamp: timestamp, idstring: int, idfirst: int, idlast: int]

## 2-2 Load with Auto Schema
- Use the schema of the loaded JSON Dataframe for loading the data and move to correct order

Results:
- Load time: 11.5 s

In [26]:
schema = "id int, date date, timestamp timestamp, idstring int, idfirst int, idlast int"
sc.setJobDescription("Load CSV with auto schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv)
sdf_csv.write.format("noop").mode("overwrite").save()

In [27]:
sdf_json

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 2-3 Cast output dataframe
- When loading JSON the schema is not as expected as initially defined in the saved SDF
- Let's additionally cast our dataframes with and without schema

Results:
- With manual casting without schema: 23.2 s
- With manual casting with schema: 12 s


In [28]:
sc.setJobDescription("Load CSV without schema and manual casting")
sdf_csv = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv)
sdf_csv = sdf_csv.withColumn("id", f.col("id").cast("long"))
sdf_csv = sdf_csv.withColumn("idstring", f.col("idstring").cast("string"))
sdf_csv = sdf_csv.withColumn("idfirst", f.col("idfirst").cast("string"))
sdf_csv = sdf_csv.withColumn("idlast", f.col("idlast").cast("string"))
sdf_csv = sdf_csv.select("id", "date", "timestamp",  "idstring", "idfirst", "idlast")
sdf_csv.write.format("noop").mode("overwrite").save()

In [29]:
sdf_csv

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [30]:
schema = "id int, date date, timestamp timestamp, idstring int, idfirst int, idlast int"
sc.setJobDescription("Load CSV with auto schema and manual casting")
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv)
sdf_csv = sdf_csv.withColumn("id", f.col("id").cast("long"))
sdf_csv = sdf_csv.withColumn("idstring", f.col("idstring").cast("string"))
sdf_csv = sdf_csv.withColumn("idfirst", f.col("idfirst").cast("string"))
sdf_csv = sdf_csv.withColumn("idlast", f.col("idlast").cast("string"))
sdf_csv = sdf_csv.select("id", "date", "timestamp",  "idstring", "idfirst", "idlast")
sdf_csv.write.format("noop").mode("overwrite").save()

In [31]:
sdf_csv

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 2-4 Define expected Schema

Results:
- Load data: 11.6 s

In [32]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load CSV with DataFrame schema")
sdf_csv = spark.read.format("csv").options(header=True).schema(sdf_schema).load(path_csv)
sdf_csv.write.format("noop").mode("overwrite").save()

In [33]:
sdf_csv

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

# 3 - Parquet

## 3-1 Initial state Parquet
- Write time: 3.6 s
- Load time: 1 s
- Data size: 81.5 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-parquet.html

In [11]:
sc.setJobDescription("Save Parquet")
path_parquet = "D:/Spark/Data/format_parquet.parquet"
sdf.write.format("parquet").mode("overwrite").save(path_parquet)

In [15]:
#set maxPartitions MB for a fair comparison of 8 partitions also during read 
maxPartitionsMB = 15
maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsBytes)+"b")

In [16]:
sc.setJobDescription("Load Parquet without schema")
sdf_parquet = spark.read.format("parquet").load(path_parquet)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [17]:
sdf_parquet

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 3-2 Define Schema

Results:
- Load data: 0.6 s

In [18]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Parquet with schema")
sdf_parquet = spark.read.format("parquet").schema(sdf_schema).load(path_parquet)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [19]:
sdf_parquet

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

## 3-3 Catalog

In [21]:
sc.setJobDescription("CrEATE CATALOG TABLE PARQUTE")
spark.sql("CREATE EXTERNAL TABLE test USING parquet LOCATION 'D:/Spark/Data/format_parquet.parquet'")



DataFrame[]

In [22]:
sc.setJobDescription("Load Parquet with Catalog")
sdf_parquet = spark.read.format("parquet").load(path_parquet)
sdf_parquet.write.format("noop").mode("overwrite").save()

In [23]:
spark.sql("ANALYZE TABLE test COMPUTE STATISTICS FOR ALL COLUMNS")

DataFrame[]

In [24]:
spark.sql("DESC EXTENDED test").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                  id|              bigint|   NULL|
|                date|                date|   NULL|
|           timestamp|           timestamp|   NULL|
|            idstring|              string|   NULL|
|             idfirst|              string|   NULL|
|              idlast|              string|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|             default|       |
|               Table|                test|       |
|        Created Time|Sat Mar 23 08:29:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.5.1|       |
|                Type|            EXTERNAL|       |
|            Provider|             parquet|       |
|          S

In [25]:
spark.sql("DESC EXTENDED test id").show()

+--------------+----------+
|     info_name|info_value|
+--------------+----------+
|      col_name|        id|
|     data_type|    bigint|
|       comment|      NULL|
|           min|         0|
|           max|   9999999|
|     num_nulls|         0|
|distinct_count|   9386681|
|   avg_col_len|         8|
|   max_col_len|         8|
|     histogram|      NULL|
+--------------+----------+



In [26]:
sc.setJobDescription("Count Parquet with Catalog")
sdf_parquet = spark.read.format("parquet").load(path_parquet)
sdf_parquet.count()

10000000

# 4 - Avro

## 4-1 AVRO
- Write time: 1.9 s
- Load time: 1.6 s
- Data size: 69.2 MB

How it works: https://spark.apache.org/docs/latest/sql-data-sources-avro.html

In [40]:
sc.setJobDescription("Save Avro")
path_avro = "D:/Spark/Data/format_avro.avro"
sdf.write.format("avro").mode("overwrite").save(path_avro)

In [41]:
#set maxPartitions MB for a fair comparison of 8 partitions also during read 
maxPartitionsMB = 10
maxPartitionsBytes = math.ceil(maxPartitionsMB*1024*1024)
spark.conf.set("spark.sql.files.maxPartitionBytes", str(maxPartitionsBytes)+"b")

In [42]:
sc.setJobDescription("Load Avro")
sdf_avro = spark.read.format("avro").load(path_avro)
sdf_avro.write.format("noop").mode("overwrite").save()

## 4-2 Define Schema

Results:
- Load data: 1.6 s

In [43]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sc.setJobDescription("Load Avro with Schema")
sdf_avro = spark.read.format("avro").schema(sdf_schema).load(path_avro)
sdf_avro.write.format("noop").mode("overwrite").save()

In [44]:
sdf_avro

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

# 5 - Schema hints

In [45]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
spark.read.format("parquet").schema(sdf_schema).load(path_parquet).show()


+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:38:...|       0|      0|     0|
|  1|2024-03-11|2024-03-11 20:38:...|       1|      1|     1|
|  2|2024-03-11|2024-03-11 20:38:...|       2|      2|     2|
|  3|2024-03-11|2024-03-11 20:38:...|       3|      3|     3|
|  4|2024-03-11|2024-03-11 20:38:...|       4|      4|     4|
|  5|2024-03-11|2024-03-11 20:38:...|       5|      5|     5|
|  6|2024-03-11|2024-03-11 20:38:...|       6|      6|     6|
|  7|2024-03-11|2024-03-11 20:38:...|       7|      7|     7|
|  8|2024-03-11|2024-03-11 20:38:...|       8|      8|     8|
|  9|2024-03-11|2024-03-11 20:38:...|       9|      9|     9|
| 10|2024-03-11|2024-03-11 20:38:...|      10|      1|     0|
| 11|2024-03-11|2024-03-11 20:38:...|      11|      1|     1|
| 12|2024-03-11|2024-03-11 20:38:...|      12|      1|     2|
| 13|202

## Use schema to ignore columns. 

Take care that you always define all needed columns otherwise you loose data

In [46]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string"
spark.read.format("parquet").schema(sdf_schema).load(path_parquet).show()

+---+----------+--------------------+--------+
| id|      date|           timestamp|idstring|
+---+----------+--------------------+--------+
|  0|2024-03-11|2024-03-11 20:38:...|       0|
|  1|2024-03-11|2024-03-11 20:38:...|       1|
|  2|2024-03-11|2024-03-11 20:38:...|       2|
|  3|2024-03-11|2024-03-11 20:38:...|       3|
|  4|2024-03-11|2024-03-11 20:38:...|       4|
|  5|2024-03-11|2024-03-11 20:38:...|       5|
|  6|2024-03-11|2024-03-11 20:38:...|       6|
|  7|2024-03-11|2024-03-11 20:38:...|       7|
|  8|2024-03-11|2024-03-11 20:38:...|       8|
|  9|2024-03-11|2024-03-11 20:38:...|       9|
| 10|2024-03-11|2024-03-11 20:38:...|      10|
| 11|2024-03-11|2024-03-11 20:38:...|      11|
| 12|2024-03-11|2024-03-11 20:38:...|      12|
| 13|2024-03-11|2024-03-11 20:38:...|      13|
| 14|2024-03-11|2024-03-11 20:38:...|      14|
| 15|2024-03-11|2024-03-11 20:38:...|      15|
| 16|2024-03-11|2024-03-11 20:38:...|      16|
| 17|2024-03-11|2024-03-11 20:38:...|      17|
| 18|2024-03-

## Use schema to add Null columns
Adding a column not existing will add an Null column. Use-full if some files have a different schema and the value is not always there. Then you don't need to handle it yourself.

In [47]:
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string, empty string"
spark.read.format("parquet").schema(sdf_schema).load(path_parquet).show()

+---+----------+--------------------+--------+-------+------+-----+
| id|      date|           timestamp|idstring|idfirst|idlast|empty|
+---+----------+--------------------+--------+-------+------+-----+
|  0|2024-03-11|2024-03-11 20:38:...|       0|      0|     0| NULL|
|  1|2024-03-11|2024-03-11 20:38:...|       1|      1|     1| NULL|
|  2|2024-03-11|2024-03-11 20:38:...|       2|      2|     2| NULL|
|  3|2024-03-11|2024-03-11 20:38:...|       3|      3|     3| NULL|
|  4|2024-03-11|2024-03-11 20:38:...|       4|      4|     4| NULL|
|  5|2024-03-11|2024-03-11 20:38:...|       5|      5|     5| NULL|
|  6|2024-03-11|2024-03-11 20:38:...|       6|      6|     6| NULL|
|  7|2024-03-11|2024-03-11 20:38:...|       7|      7|     7| NULL|
|  8|2024-03-11|2024-03-11 20:38:...|       8|      8|     8| NULL|
|  9|2024-03-11|2024-03-11 20:38:...|       9|      9|     9| NULL|
| 10|2024-03-11|2024-03-11 20:38:...|      10|      1|     0| NULL|
| 11|2024-03-11|2024-03-11 20:38:...|      11|  

## Schema must be as in the format enforced

Avro and Parquet enforce the schema. A wrong schema will give an error

Here: int instead of bigint for id

In [48]:
sdf_schema = "id int, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
spark.read.format("parquet").schema(sdf_schema).load(path_parquet).show()


Py4JJavaError: An error occurred while calling o375.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 29.0 failed 1 times, most recent failure: Lost task 0.0 in stage 29.0 (TID 183) (DESKTOP-PNH8CDK executor driver): org.apache.spark.SparkException: Parquet column cannot be converted in file file:///D:/Spark/Data/format_parquet.parquet/part-00000-764a057c-4b8e-4cc4-b494-3b9b7ee1d350-c000.snappy.parquet. Column: [id], Expected: int, Found: INT64.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:854)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:287)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException: column: [id], physicalType: INT64, logicalType: int
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1127)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:189)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:342)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:233)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
	... 23 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Parquet column cannot be converted in file file:///D:/Spark/Data/format_parquet.parquet/part-00000-764a057c-4b8e-4cc4-b494-3b9b7ee1d350-c000.snappy.parquet. Column: [id], Expected: int, Found: INT64.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:854)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:287)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:593)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.columnartorow_nextBatch_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.sql.execution.datasources.SchemaColumnConvertNotSupportedException: column: [id], physicalType: INT64, logicalType: int
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.constructConvertNotSupportedException(ParquetVectorUpdaterFactory.java:1127)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetVectorUpdaterFactory.getUpdater(ParquetVectorUpdaterFactory.java:189)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedColumnReader.readBatch(VectorizedColumnReader.java:175)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextBatch(VectorizedParquetRecordReader.java:342)
	at org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader.nextKeyValue(VectorizedParquetRecordReader.java:233)
	at org.apache.spark.sql.execution.datasources.RecordReaderIterator.hasNext(RecordReaderIterator.scala:39)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:283)
	... 23 more


In [49]:
sdf_schema = "id int, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
spark.read.format("avro").schema(sdf_schema).load(path_avro).show()

Py4JJavaError: An error occurred while calling o387.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 30.0 failed 1 times, most recent failure: Lost task 0.0 in stage 30.0 (TID 184) (DESKTOP-PNH8CDK executor driver): org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro type {"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":"long"},{"name":"date","type":{"type":"int","logicalType":"date"}},{"name":"timestamp","type":{"type":"long","logicalType":"timestamp-micros"}},{"name":"idstring","type":"string"},{"name":"idfirst","type":"string"},{"name":"idlast","type":"string"}]} to SQL type STRUCT<id: INT, date: DATE, timestamp: TIMESTAMP, idstring: STRING, idfirst: STRING, idlast: STRING>.
	at org.apache.spark.sql.avro.AvroDeserializer.liftedTree1$1(AvroDeserializer.scala:101)
	at org.apache.spark.sql.avro.AvroDeserializer.<init>(AvroDeserializer.scala:73)
	at org.apache.spark.sql.avro.AvroFileFormat$$anon$1.<init>(AvroFileFormat.scala:144)
	at org.apache.spark.sql.avro.AvroFileFormat.$anonfun$buildReader$1(AvroFileFormat.scala:137)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:155)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:140)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:217)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro field 'id' to SQL field 'id' because schema is incompatible (avroType = "long", sqlType = INT)
	at org.apache.spark.sql.avro.AvroDeserializer.newWriter(AvroDeserializer.scala:348)
	at org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$1(AvroDeserializer.scala:379)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.avro.AvroDeserializer.getRecordWriter(AvroDeserializer.scala:376)
	at org.apache.spark.sql.avro.AvroDeserializer.liftedTree1$1(AvroDeserializer.scala:83)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4344)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4334)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4332)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3326)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3549)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro type {"type":"record","name":"topLevelRecord","fields":[{"name":"id","type":"long"},{"name":"date","type":{"type":"int","logicalType":"date"}},{"name":"timestamp","type":{"type":"long","logicalType":"timestamp-micros"}},{"name":"idstring","type":"string"},{"name":"idfirst","type":"string"},{"name":"idlast","type":"string"}]} to SQL type STRUCT<id: INT, date: DATE, timestamp: TIMESTAMP, idstring: STRING, idfirst: STRING, idlast: STRING>.
	at org.apache.spark.sql.avro.AvroDeserializer.liftedTree1$1(AvroDeserializer.scala:101)
	at org.apache.spark.sql.avro.AvroDeserializer.<init>(AvroDeserializer.scala:73)
	at org.apache.spark.sql.avro.AvroFileFormat$$anon$1.<init>(AvroFileFormat.scala:144)
	at org.apache.spark.sql.avro.AvroFileFormat.$anonfun$buildReader$1(AvroFileFormat.scala:137)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:155)
	at org.apache.spark.sql.execution.datasources.FileFormat$$anon$1.apply(FileFormat.scala:140)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:217)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:279)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:129)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
Caused by: org.apache.spark.sql.avro.IncompatibleSchemaException: Cannot convert Avro field 'id' to SQL field 'id' because schema is incompatible (avroType = "long", sqlType = INT)
	at org.apache.spark.sql.avro.AvroDeserializer.newWriter(AvroDeserializer.scala:348)
	at org.apache.spark.sql.avro.AvroDeserializer.$anonfun$getRecordWriter$1(AvroDeserializer.scala:379)
	at scala.collection.immutable.List.map(List.scala:293)
	at org.apache.spark.sql.avro.AvroDeserializer.getRecordWriter(AvroDeserializer.scala:376)
	at org.apache.spark.sql.avro.AvroDeserializer.liftedTree1$1(AvroDeserializer.scala:83)
	... 29 more


## Use schema for casting partly in JSON and CSV

In JSON and CSV where the schema is not enforced you can cast the columns. In some cases it might fail but can really save you a lot of code at the end.

E.g. here for json defining the string as integer returns a Null value. Carefull with this

In [50]:
sdf_csv = spark.read.format("csv").options(inferSchema=True, header=True).load(path_csv)
sdf_csv

DataFrame[id: int, date: date, timestamp: timestamp, idstring: int, idfirst: int, idlast: int]

In [51]:
sdf_csv.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:36:...|       0|      0|     0|
|  1|2024-03-11|2024-03-11 20:36:...|       1|      1|     1|
|  2|2024-03-11|2024-03-11 20:36:...|       2|      2|     2|
|  3|2024-03-11|2024-03-11 20:36:...|       3|      3|     3|
|  4|2024-03-11|2024-03-11 20:36:...|       4|      4|     4|
|  5|2024-03-11|2024-03-11 20:36:...|       5|      5|     5|
|  6|2024-03-11|2024-03-11 20:36:...|       6|      6|     6|
|  7|2024-03-11|2024-03-11 20:36:...|       7|      7|     7|
|  8|2024-03-11|2024-03-11 20:36:...|       8|      8|     8|
|  9|2024-03-11|2024-03-11 20:36:...|       9|      9|     9|
| 10|2024-03-11|2024-03-11 20:36:...|      10|      1|     0|
| 11|2024-03-11|2024-03-11 20:36:...|      11|      1|     1|
| 12|2024-03-11|2024-03-11 20:36:...|      12|      1|     2|
| 13|202

In [52]:
schema = "id bigint, date date, timestamp timestamp, idstring int, idfirst string, idlast string"
sdf_csv = spark.read.format("csv").options(header=True).schema(schema).load(path_csv)
sdf_csv

DataFrame[id: bigint, date: date, timestamp: timestamp, idstring: int, idfirst: string, idlast: string]

In [53]:
sdf_csv.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:36:...|       0|      0|     0|
|  1|2024-03-11|2024-03-11 20:36:...|       1|      1|     1|
|  2|2024-03-11|2024-03-11 20:36:...|       2|      2|     2|
|  3|2024-03-11|2024-03-11 20:36:...|       3|      3|     3|
|  4|2024-03-11|2024-03-11 20:36:...|       4|      4|     4|
|  5|2024-03-11|2024-03-11 20:36:...|       5|      5|     5|
|  6|2024-03-11|2024-03-11 20:36:...|       6|      6|     6|
|  7|2024-03-11|2024-03-11 20:36:...|       7|      7|     7|
|  8|2024-03-11|2024-03-11 20:36:...|       8|      8|     8|
|  9|2024-03-11|2024-03-11 20:36:...|       9|      9|     9|
| 10|2024-03-11|2024-03-11 20:36:...|      10|      1|     0|
| 11|2024-03-11|2024-03-11 20:36:...|      11|      1|     1|
| 12|2024-03-11|2024-03-11 20:36:...|      12|      1|     2|
| 13|202

In [54]:
sdf_json = spark.read.format("json").load(path_json)
sdf_json

DataFrame[date: string, id: bigint, idfirst: string, idlast: string, idstring: string, timestamp: string]

In [55]:
sdf_json.show()

+----------+---+-------+------+--------+--------------------+
|      date| id|idfirst|idlast|idstring|           timestamp|
+----------+---+-------+------+--------+--------------------+
|2024-03-11|  0|      0|     0|       0|2024-03-11T20:35:...|
|2024-03-11|  1|      1|     1|       1|2024-03-11T20:35:...|
|2024-03-11|  2|      2|     2|       2|2024-03-11T20:35:...|
|2024-03-11|  3|      3|     3|       3|2024-03-11T20:35:...|
|2024-03-11|  4|      4|     4|       4|2024-03-11T20:35:...|
|2024-03-11|  5|      5|     5|       5|2024-03-11T20:35:...|
|2024-03-11|  6|      6|     6|       6|2024-03-11T20:35:...|
|2024-03-11|  7|      7|     7|       7|2024-03-11T20:35:...|
|2024-03-11|  8|      8|     8|       8|2024-03-11T20:35:...|
|2024-03-11|  9|      9|     9|       9|2024-03-11T20:35:...|
|2024-03-11| 10|      1|     0|      10|2024-03-11T20:35:...|
|2024-03-11| 11|      1|     1|      11|2024-03-11T20:35:...|
|2024-03-11| 12|      1|     2|      12|2024-03-11T20:35:...|
|2024-03

In [56]:
schema = "id int, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sdf_json = spark.read.format("json").schema(schema).load(path_json)
sdf_json

DataFrame[id: int, date: date, timestamp: timestamp, idstring: string, idfirst: string, idlast: string]

In [57]:
sdf_json.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:35:...|       0|      0|     0|
|  1|2024-03-11|2024-03-11 20:35:...|       1|      1|     1|
|  2|2024-03-11|2024-03-11 20:35:...|       2|      2|     2|
|  3|2024-03-11|2024-03-11 20:35:...|       3|      3|     3|
|  4|2024-03-11|2024-03-11 20:35:...|       4|      4|     4|
|  5|2024-03-11|2024-03-11 20:35:...|       5|      5|     5|
|  6|2024-03-11|2024-03-11 20:35:...|       6|      6|     6|
|  7|2024-03-11|2024-03-11 20:35:...|       7|      7|     7|
|  8|2024-03-11|2024-03-11 20:35:...|       8|      8|     8|
|  9|2024-03-11|2024-03-11 20:35:...|       9|      9|     9|
| 10|2024-03-11|2024-03-11 20:35:...|      10|      1|     0|
| 11|2024-03-11|2024-03-11 20:35:...|      11|      1|     1|
| 12|2024-03-11|2024-03-11 20:35:...|      12|      1|     2|
| 13|202

In [58]:
schema = "id int, date date, timestamp timestamp, idstring int, idfirst string, idlast string"
sdf_json = spark.read.format("json").schema(schema).load(path_json)
sdf_json

DataFrame[id: int, date: date, timestamp: timestamp, idstring: int, idfirst: string, idlast: string]

In [59]:
sdf_json.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:35:...|    NULL|      0|     0|
|  1|2024-03-11|2024-03-11 20:35:...|    NULL|      1|     1|
|  2|2024-03-11|2024-03-11 20:35:...|    NULL|      2|     2|
|  3|2024-03-11|2024-03-11 20:35:...|    NULL|      3|     3|
|  4|2024-03-11|2024-03-11 20:35:...|    NULL|      4|     4|
|  5|2024-03-11|2024-03-11 20:35:...|    NULL|      5|     5|
|  6|2024-03-11|2024-03-11 20:35:...|    NULL|      6|     6|
|  7|2024-03-11|2024-03-11 20:35:...|    NULL|      7|     7|
|  8|2024-03-11|2024-03-11 20:35:...|    NULL|      8|     8|
|  9|2024-03-11|2024-03-11 20:35:...|    NULL|      9|     9|
| 10|2024-03-11|2024-03-11 20:35:...|    NULL|      1|     0|
| 11|2024-03-11|2024-03-11 20:35:...|    NULL|      1|     1|
| 12|2024-03-11|2024-03-11 20:35:...|    NULL|      1|     2|
| 13|202

In [60]:
schema = "id int, date date, timestamp timestamp, idstring string, idfirst string, idlast string"
sdf_json = spark.read.format("json").schema(schema).load(path_json)
sdf_json = sdf_json.withColumn("idstring", f.col("idstring").cast("int"))
sdf_json

DataFrame[id: int, date: date, timestamp: timestamp, idstring: int, idfirst: string, idlast: string]

In [61]:
sdf_json.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-11|2024-03-11 20:35:...|       0|      0|     0|
|  1|2024-03-11|2024-03-11 20:35:...|       1|      1|     1|
|  2|2024-03-11|2024-03-11 20:35:...|       2|      2|     2|
|  3|2024-03-11|2024-03-11 20:35:...|       3|      3|     3|
|  4|2024-03-11|2024-03-11 20:35:...|       4|      4|     4|
|  5|2024-03-11|2024-03-11 20:35:...|       5|      5|     5|
|  6|2024-03-11|2024-03-11 20:35:...|       6|      6|     6|
|  7|2024-03-11|2024-03-11 20:35:...|       7|      7|     7|
|  8|2024-03-11|2024-03-11 20:35:...|       8|      8|     8|
|  9|2024-03-11|2024-03-11 20:35:...|       9|      9|     9|
| 10|2024-03-11|2024-03-11 20:35:...|      10|      1|     0|
| 11|2024-03-11|2024-03-11 20:35:...|      11|      1|     1|
| 12|2024-03-11|2024-03-11 20:35:...|      12|      1|     2|
| 13|202