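# Convert CSV to Delta

This notebook reads a folder of student CSV files with PySpark, writes them out as a Delta Lake table, cleans the `student_name` column into separate first and last name columns, and then compares what happens when data with a mismatched schema is appended to the CSV folder versus the Delta table.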

In [1]:
import delta
import pyspark
from delta import configure_spark_with_delta_pip
from pyspark.sql.utils import AnalysisException

In [2]:
# Start from a plain SparkSession builder, then register Delta Lake's SQL
# extension and catalog implementation on it, one setting at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Let the delta-spark helper add the Delta Lake package to the builder (the
# io.delta#delta-core dependency is resolved in the log output below), then
# start the session or reuse one that is already running.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-330-delta-210/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-cd133e32-face-4ff2-8dd6-a8f6dfcbb4c4;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.0 in central
	found io.delta#delta-storage;2.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 115ms :: artifacts dl 4ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.0 from central in [default]
	io.delta#delta-storage;2.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| 

23/03/18 13:45:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/18 13:45:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/03/18 13:45:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Convert CSV to Delta

In [4]:
# Load every CSV file in the students folder, taking column names from the
# header row. No schema or inferSchema option is set, so all columns are read
# as strings (graduation_year is a string in the resulting Delta table schema).
df = spark.read.format("csv").option("header", True).load("../../data/students/*.csv")

In [5]:
# Preview the raw rows; note that student_name holds "firstXXlast" in one field.
df.show()

+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
| chrisXXborg|           2025|    bio|
|davidXXcross|           2026|physics|
|sophiaXXraul|           2022|    bio|
|    fredXXli|           2025|physics|
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
+------------+---------------+-------+



In [6]:
# Write the raw students data as a Delta table.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises once tmp/students_delta exists, which breaks "Restart & Run All".
# Each re-run adds a new table version, so the folder will accumulate files.
df.write.format("delta").mode("overwrite").save("tmp/students_delta")

                                                                                

In [7]:
# List the files behind the table: a _delta_log folder holding the JSON commit
# file plus the Parquet part files. Requires the `tree` command-line tool.
!tree tmp/students_delta

[01;34mtmp/students_delta[0m
├── [01;34m_delta_log[0m
│   └── [00m00000000000000000000.json[0m
├── [00mpart-00000-f88eff9c-a087-4354-8c04-4eb3da62a10e-c000.snappy.parquet[0m
├── [00mpart-00001-0bc69839-449f-4a64-87b6-91e28bfb7423-c000.snappy.parquet[0m
└── [00mpart-00002-a093e875-4eb5-4372-8b4f-def98ec208a6-c000.snappy.parquet[0m

1 directory, 4 files


In [8]:
# Read the Delta table back to confirm it holds the same rows as the CSV source.
spark.read.format("delta").load("tmp/students_delta").show()

+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
| chrisXXborg|           2025|    bio|
|davidXXcross|           2026|physics|
|sophiaXXraul|           2022|    bio|
|    fredXXli|           2025|physics|
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
+------------+---------------+-------+



## Clean data before creating Delta table

In [20]:
from pyspark.sql.functions import col, split

In [9]:
# Show the raw frame again as the "before" picture for the cleaning step below.
df.show()

+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
| chrisXXborg|           2025|    bio|
|davidXXcross|           2026|physics|
|sophiaXXraul|           2022|    bio|
|    fredXXli|           2025|physics|
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
+------------+---------------+-------+



In [23]:
# student_name is stored as "firstXXlast". Build the split expression once and
# reuse it for both derived columns, then drop the combined column.
name_parts = split(col("student_name"), "XX")
clean_df = (
    df.withColumn("student_first_name", name_parts.getItem(0))
    .withColumn("student_last_name", name_parts.getItem(1))
    .drop("student_name")
)

In [24]:
# Check the result: first and last names are now separate columns.
clean_df.show()

+---------------+-------+------------------+-----------------+
|graduation_year|  major|student_first_name|student_last_name|
+---------------+-------+------------------+-----------------+
|           2025|    bio|             chris|             borg|
|           2026|physics|             david|            cross|
|           2022|    bio|            sophia|             raul|
|           2025|physics|              fred|               li|
|           2023|   math|              some|           person|
|           2025|physics|                li|              yao|
+---------------+-------+------------------+-----------------+



In [25]:
# Persist the cleaned data as its own Delta table.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises once tmp/clean_students_delta exists.
clean_df.write.format("delta").mode("overwrite").save("tmp/clean_students_delta")

In [26]:
# Read the cleaned Delta table back to verify what was written.
spark.read.format("delta").load("tmp/clean_students_delta").show()

+---------------+-------+------------------+-----------------+
|graduation_year|  major|student_first_name|student_last_name|
+---------------+-------+------------------+-----------------+
|           2025|    bio|             chris|             borg|
|           2026|physics|             david|            cross|
|           2022|    bio|            sophia|             raul|
|           2025|physics|              fred|               li|
|           2023|   math|              some|           person|
|           2025|physics|                li|              yao|
+---------------+-------+------------------+-----------------+



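## Schema enforcement with CSV is bad

CSV files have no schema enforcement: appending a DataFrame with a completely different schema succeeds, and reading the folder afterwards only logs a warning while the mismatched rows come back padded with `null`.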

In [10]:
# A frame with a single long column named `id` (values 0, 1, 2); its schema has
# nothing in common with the three string columns of the students data.
mismatched_df = spark.range(0, 3)

In [11]:
# Display the mismatched frame that the next cells try to append.
mismatched_df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
+---+



In [13]:
# Append the mismatched frame to the CSV folder; the write is accepted without
# any error.
# NOTE(review): "../../data/students" is the same folder the students CSVs are
# loaded from, so this permanently adds a file to the source data and every
# later run of the load cell will pick it up. Consider writing to a scratch
# copy of the folder instead.
mismatched_df.repartition(1).write.mode("append").format("csv").option(
    "header", True
).save("../../data/students")

In [14]:
# Re-read the folder: Spark only logs a CSVHeaderChecker warning for the
# one-column file, and the appended ids show up under student_name with null in
# the other columns.
spark.read.format("csv").option("header", True).load("../../data/students/*.csv").show()

22/12/24 16:40:46 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 1, schema size: 3
CSV file: file:///Users/matthew.powers/Documents/code/my_apps/delta-examples/data/students/part-00000-988a286d-a024-4612-8b6e-89cce5f2556e-c000.csv
+------------+---------------+-------+
|student_name|graduation_year|  major|
+------------+---------------+-------+
| chrisXXborg|           2025|    bio|
|davidXXcross|           2026|physics|
|sophiaXXraul|           2022|    bio|
|    fredXXli|           2025|physics|
|someXXperson|           2023|   math|
|     liXXyao|           2025|physics|
|           0|           null|   null|
|           1|           null|   null|
|           2|           null|   null|
+------------+---------------+-------+



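## Schema enforcement with Delta Lake is good

Delta Lake rejects the same mismatched append: the write fails with an `AnalysisException` that describes the schema mismatch instead of adding the bad rows to the table.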

In [16]:
# Appending a frame whose schema does not match the Delta table is expected to
# fail. Catch the error and print it, so the message is still shown while the
# cell (and a full "Run All") finishes without an unhandled exception.
try:
    mismatched_df.repartition(1).write.mode("append").format("delta").save(
        "tmp/students_delta"
    )
except AnalysisException as err:
    print(err)

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 740d4bb1-d539-4d56-911e-18a616a37940).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- student_name: string (nullable = true)
-- graduation_year: string (nullable = true)
-- major: string (nullable = true)


Data schema:
root
-- id: long (nullable = true)

         