In [1]:
import subprocess
import sys

# Install the pinned PySpark / Delta Lake versions into the interpreter that
# backs this kernel (sys.executable), so the packages land in the same
# environment the notebook imports from.
#
# A `!` shell escape does not raise when pip exits with a non-zero status, so
# the success message would be printed even after a failed install. Running
# pip through subprocess lets the cell check the exit status and stop.
# pip's output is captured and only shown when the install fails.
pip_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "pyspark==4.0.2", "delta-spark==4.1.0"],
    capture_output=True,
    text=True,
)
if pip_result.returncode != 0:
    # Surface pip's own diagnostics before failing the cell.
    print(pip_result.stdout)
    print(pip_result.stderr, file=sys.stderr)
    raise RuntimeError(f"pip install failed with exit code {pip_result.returncode}")

print("pyspark and delta-spark installed successfully.")

pyspark and delta-spark installed successfully.


In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Create a SparkSession with Delta Lake support.
# The named builder is first passed through configure_spark_with_delta_pip,
# then the Delta SQL extension and the Delta catalog are set explicitly
# before the session is created (or an existing one is reused).
delta_builder = configure_spark_with_delta_pip(
    SparkSession.builder.appName("DeltaLakeQuickstart")
)
spark = (
    delta_builder
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

print(f"SparkSession with Delta Lake support initialized successfully. Spark version: {spark.version}")

SparkSession with Delta Lake support initialized successfully. Spark version: 4.0.2


In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Schema: three nullable columns -- integer id, string name, integer age.
schema = (
    StructType()
    .add("id", IntegerType(), nullable=True)
    .add("name", StringType(), nullable=True)
    .add("age", IntegerType(), nullable=True)
)

# Sample rows as (id, name, age) tuples, in the same order as the schema columns.
data = [(1, "Alice", 30), (2, "Bob", 24), (3, "Charlie", 35), (4, "David", 29)]

# Build the DataFrame from the rows using the explicit schema above.
sample_df = spark.createDataFrame(data=data, schema=schema)

# Display the rows, then the schema.
sample_df.show()
sample_df.printSchema()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 30|
|  2|    Bob| 24|
|  3|Charlie| 35|
|  4|  David| 29|
+---+-------+---+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [4]:
# Location of the Delta table written by this cell and read back later.
delta_table_path = "/tmp/delta/sample_delta_table"

# Persist sample_df in Delta format; "overwrite" mode replaces any table
# already stored at this path, so the cell can be re-run.
sample_df.write.save(delta_table_path, format="delta", mode="overwrite")

print(f"DataFrame successfully saved as Delta Table at: {delta_table_path}")

DataFrame successfully saved as Delta Table at: /tmp/delta/sample_delta_table


In [5]:
# Load the Delta table back from the path it was written to and display it.
read_df = spark.read.load(delta_table_path, format="delta")

print("Data read from Delta Table:")
read_df.show()

Data read from Delta Table:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  3|Charlie| 35|
|  4|  David| 29|
|  1|  Alice| 30|
|  2|    Bob| 24|
+---+-------+---+

