In [3]:
import pyspark

from delta import *
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import *

In [4]:
# Creating spark application with delta lake configuration
builder = pyspark.sql.SparkSession.builder.appName("pyspark-delta-lake") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

In [5]:
# Creating delta/spark context
spark = configure_spark_with_delta_pip(builder).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

24/04/24 20:01:50 WARN Utils: Your hostname, galaxia-vostro-3520 resolves to a loopback address: 127.0.1.1; using 10.32.9.180 instead (on interface wlp0s20f3)
24/04/24 20:01:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/eduardo.freitas/dev/projects/college/delta-lake-iceberg-demo/venv/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/eduardo.freitas/.ivy2/cache
The jars for the packages stored in: /home/eduardo.freitas/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f3758c27-4dd1-48fd-84f7-4980fd7e26ee;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in central
	found io.delta#delta-storage;3.1.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 538ms :: artifacts dl 21ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from central in [default]
	io.delta#delta-storage;3.1.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default

In [6]:
# Creating the delta table
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("url", StringType(), True),
    StructField("title", StringType(), True),
    StructField("upc", StringType(), True),
    StructField("product_type", StringType(), True),
    StructField("price_excl_tax", IntegerType(), True),
    StructField("price_incl_tax", IntegerType(), True),
    StructField("tax", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("availability", IntegerType(), True),
    StructField("num_reviews", IntegerType(), True),
    StructField("stars", IntegerType(), True),
    StructField("category", StringType(), True),
    StructField("description", StringType(), True)
])

books_df = spark.read.json('books.json', schema)
books_df.write.mode(saveMode="overwrite").format("delta").save("data/delta/books")

                                                                                

In [7]:
# Metadata for books_df delta table
# Schema
print('Schema for books_df')
books_df.printSchema()

# Number of rows
print('Number of rows: ', books_df.count())

Schema for books_df
root
 |-- id: integer (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- upc: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- price_excl_tax: integer (nullable = true)
 |-- price_incl_tax: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- availability: integer (nullable = true)
 |-- num_reviews: integer (nullable = true)
 |-- stars: integer (nullable = true)
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)

Number of rows:  998


In [8]:
# Reading the books as a delta table
books = spark.read.format("delta").load("data/delta/books")

books.show()

                                                                                

+----+--------------------+--------------------+----------------+------------+--------------+--------------+----+-----+------------+-----------+-----+------------------+--------------------+
|  id|                 url|               title|             upc|product_type|price_excl_tax|price_incl_tax| tax|price|availability|num_reviews|stars|          category|         description|
+----+--------------------+--------------------+----------------+------------+--------------+--------------+----+-----+------------+-----------+-----+------------------+--------------------+
|NULL|                NULL|                NULL|            NULL|        NULL|          NULL|          NULL|NULL| NULL|        NULL|       NULL| NULL|              NULL|                NULL|
|   2|https://books.tos...|Scott Pilgrim's l...|3b1c02bac2a429e6|       books|          NULL|          NULL|NULL| NULL|          19|          0|    5|    sequential art|('Scott Pilgrim\'...|
|   3|https://books.tos...|Aaron Ledbetter’s.

In [9]:
# Creating a temporary view for querying results out of books delta table
books_df.createOrReplaceTempView("books")

In [10]:
# Querying 5 stars books
five_stars_books_df = spark.sql("SELECT title, stars, category FROM books WHERE stars > 4 ORDER BY stars DESC")

five_stars_books_df.show()

+--------------------+-----+---------------+
|               title|stars|       category|
+--------------------+-----+---------------+
|Scott Pilgrim's l...|    5| sequential art|
|Aaron Ledbetter’s...|    5|    young adult|
|From a renowned h...|    5|        history|
|Punk's raw power ...|    5|          music|
|No matter how bus...|    5|        romance|
|A Michelin two-st...|    5|        romance|
|Anti-apartheid ac...|    5|     nonfiction|
|A page-turning no...|    5|     philosophy|
|Mark Fallon is an...|    5|       thriller|
|In The Four Agree...|    5|   spirituality|
|Paris is burning-...|    5|        fiction|
|There is a cosmic...|    5|     nonfiction|
|On a searing summ...|    5|        fiction|
|The Freeman famil...|    5|        fiction|
|Slay Procrastinat...|    5|        default|
|Change and anger ...|    5|   spirituality|
|Just as Annie and...|    5|        fantasy|
|THE LONG-AWAITED ...|    5| sequential art|
|What if you could...|    5|science fiction|
|Tired of 