In [1]:
from pyspark.sql import SparkSession

# Attach to the already-running Spark session if there is one; otherwise
# start a new session registered under the application name "Jupyter".
spark = (
    SparkSession.builder
    .appName("Jupyter")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary as cell output.
spark

25/10/15 12:59:28 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


## 1. Create namespace

In [2]:
%%sql
-- List the catalogs available in this Spark session.
SHOW CATALOGS;

catalog
demo
spark_catalog


In [3]:
%%sql
-- List the namespaces (databases) in the session's current catalog.
SHOW DATABASES;

namespace
db


In [4]:
%%sql
-- Create the `db` namespace in the `demo` catalog; does nothing if it already exists.
CREATE NAMESPACE IF NOT EXISTS demo.db;

## 2. Read dataset

In [5]:
# Confirm the dataset file is present before reading it.
!ls dataset

insurance.csv


In [6]:
# Load the insurance CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = spark.read.csv(
    "./dataset/insurance.csv",
    header=True,
    inferSchema=True,
)

# Preview the first five rows.
df.show(5)

+---+------+------+--------+------+---------+-----------+
|age|   sex|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
+---+------+------+--------+------+---------+-----------+
only showing top 5 rows



In [7]:
# Expose the DataFrame to the SQL cells as the temporary view `insurance_csv`
# (replaces any existing view of that name).
df.createOrReplaceTempView("insurance_csv")

In [8]:
%%sql
-- Spot-check the temp view: rows whose BMI equals 39.82.
SELECT *
FROM insurance_csv
WHERE bmi = 39.82;

age,sex,bmi,children,smoker,region,charges
56,female,39.82,0,no,southeast,11090.7178
33,female,39.82,1,no,southeast,4795.6568
18,female,39.82,0,no,southeast,1633.9618


## 3. Partition

In [9]:
%%sql
-- Materialize the CSV view as an Iceberg table partitioned by `region`,
-- replacing the table definition if it already exists.
CREATE OR REPLACE TABLE demo.db.insurance
USING iceberg
PARTITIONED BY (region)
AS
SELECT *
FROM insurance_csv;

                                                                                

In [10]:
%%sql
-- Preview a few rows of the newly created Iceberg table.
SELECT *
FROM demo.db.insurance
LIMIT 5;

age,sex,bmi,children,smoker,region,charges
33,male,22.705,0,no,northwest,21984.47061
32,male,28.88,0,no,northwest,3866.8552
37,female,27.74,3,no,northwest,7281.5056
60,female,25.84,0,no,northwest,28923.13692
37,male,28.025,2,no,northwest,6203.90175


### 3.1 Show partitions in the table

### Show the table definition

In [11]:
%%sql
-- Show the table's columns together with its partition information.
DESCRIBE EXTENDED demo.db.insurance;

col_name,data_type,comment
age,int,
sex,string,
bmi,double,
children,int,
smoker,string,
region,string,
charges,double,
# Partition Information,,
# col_name,data_type,comment
region,string,


### Iceberg metadata tables

In [12]:
%%sql
-- Query Iceberg's `partitions` metadata table: one row per partition value,
-- with record and file counts.
SELECT * FROM demo.db.insurance.partitions;

partition,spec_id,record_count,file_count,total_data_file_size_in_bytes,position_delete_record_count,position_delete_file_count,equality_delete_record_count,equality_delete_file_count,last_updated_at,last_updated_snapshot_id
Row(region='northwest'),0,325,1,6366,0,0,0,0,2025-10-15 12:59:35.275000,7724012699579993439
Row(region='southwest'),0,325,1,5744,0,0,0,0,2025-10-15 12:59:35.275000,7724012699579993439
Row(region='southeast'),0,364,1,6687,0,0,0,0,2025-10-15 12:59:35.275000,7724012699579993439
Row(region='northeast'),0,324,1,6423,0,0,0,0,2025-10-15 12:59:35.275000,7724012699579993439


### Run a partition‑pruning query

In [13]:
%%sql
-- Show the physical plan for a query filtered on the partition column;
-- the scan's `filters` entry should list the `region` predicate.
EXPLAIN SELECT * FROM demo.db.insurance WHERE region = 'southeast';

plan
"== Physical Plan == *(1) ColumnarToRow +- BatchScan demo.db.insurance[age#207, sex#208, bmi#209, children#210, smoker#211, region#212, charges#213] demo.db.insurance (branch=null) [filters=region IS NOT NULL, region = 'southeast', groupedBy=] RuntimeFilters: []"


## 4. Schema evolution

### 4.1 Rename column

In [14]:
%%sql
-- Schema evolution: rename the `sex` column to `gender`.
ALTER TABLE demo.db.insurance RENAME COLUMN sex TO gender;

In [15]:
%%sql
-- Re-read the table to confirm the renamed column shows up as `gender`.
SELECT *
FROM demo.db.insurance
LIMIT 5;

age,gender,bmi,children,smoker,region,charges
33,male,22.705,0,no,northwest,21984.47061
32,male,28.88,0,no,northwest,3866.8552
37,female,27.74,3,no,northwest,7281.5056
60,female,25.84,0,no,northwest,28923.13692
37,male,28.025,2,no,northwest,6203.90175


### 4.2 Verify schema history

### Snapshot history (`history` metadata table)

In [16]:
%%sql
-- List the table's history entries (one per snapshot made current), newest first.
SELECT *
FROM demo.db.insurance.history
ORDER BY made_current_at DESC;

made_current_at,snapshot_id,parent_id,is_current_ancestor
2025-10-15 12:59:35.275000,7724012699579993439,,True
2025-10-15 12:56:47.478000,9187748239727257317,,False
2025-10-15 12:52:22.885000,3262457274102260786,2.465268696510764e+18,False
2025-10-15 12:45:00.125000,2465268696510764264,,False


### Inspect metadata

In [18]:
%%sql
-- List the table's snapshots with their operation and summary, newest first.
SELECT *
FROM demo.db.insurance.snapshots
ORDER BY committed_at DESC;

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
2025-10-15 12:59:35.275000,7724012699579993439,,append,s3://warehouse/db/insurance/metadata/snap-7724012699579993439-1-8e62b31e-5bc5-4eb7-8b85-1cf2bdf69442.avro,"{'engine-version': '3.5.5', 'added-data-files': '4', 'total-equality-deletes': '0', 'app-id': 'local-1760533167736', 'added-records': '1338', 'total-records': '1338', 'spark.app.id': 'local-1760533167736', 'changed-partition-count': '4', 'engine-name': 'spark', 'total-position-deletes': '0', 'added-files-size': '25220', 'total-delete-files': '0', 'iceberg-version': 'Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)', 'total-files-size': '25220', 'total-data-files': '4'}"
2025-10-15 12:56:47.478000,9187748239727257317,,append,s3://warehouse/db/insurance/metadata/snap-9187748239727257317-1-e2489c37-84fe-448a-9f56-b8b3c1de5869.avro,"{'engine-version': '3.5.5', 'added-data-files': '4', 'total-equality-deletes': '0', 'app-id': 'local-1760532996222', 'added-records': '1338', 'total-records': '1338', 'spark.app.id': 'local-1760532996222', 'changed-partition-count': '4', 'engine-name': 'spark', 'total-position-deletes': '0', 'added-files-size': '25220', 'total-delete-files': '0', 'iceberg-version': 'Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)', 'total-files-size': '25220', 'total-data-files': '4'}"
2025-10-15 12:52:22.885000,3262457274102260786,2.465268696510764e+18,append,s3://warehouse/db/insurance/metadata/snap-3262457274102260786-1-be43f190-9278-412b-904e-20f985a341fe.avro,"{'engine-version': '3.5.5', 'added-data-files': '1', 'total-equality-deletes': '0', 'app-id': 'local-1760532269410', 'added-records': '1', 'total-records': '1339', 'spark.app.id': 'local-1760532269410', 'changed-partition-count': '1', 'engine-name': 'spark', 'total-position-deletes': '0', 'added-files-size': '1974', 'total-delete-files': '0', 'iceberg-version': 'Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)', 'total-files-size': '27194', 'total-data-files': '5'}"
2025-10-15 12:45:00.125000,2465268696510764264,,append,s3://warehouse/db/insurance/metadata/snap-2465268696510764264-1-ff33b53c-4adc-4f98-a660-2a37033ce03c.avro,"{'engine-version': '3.5.5', 'added-data-files': '4', 'total-equality-deletes': '0', 'app-id': 'local-1760532269410', 'added-records': '1338', 'total-records': '1338', 'spark.app.id': 'local-1760532269410', 'changed-partition-count': '4', 'engine-name': 'spark', 'total-position-deletes': '0', 'added-files-size': '25220', 'total-delete-files': '0', 'iceberg-version': 'Apache Iceberg 1.8.1 (commit 9ce0fcf0af7becf25ad9fc996c3bad2afdcfd33d)', 'total-files-size': '25220', 'total-data-files': '4'}"


## 5. Time Travel
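Reference: [Iceberg time travel in Spark](https://iceberg.apache.org/docs/latest/spark-queries/#time-travel). For the Delta Lake equivalent, see https://docs.databricks.com/gcp/en/delta/history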

In [26]:
%%sql
-- Current table state for 19-year-olds.
-- NOTE(review): the saved output contains a row with children=33 / smoker=0,
-- presumably from an INSERT that is no longer in this notebook; re-run to refresh.
SELECT *
FROM demo.db.insurance
WHERE age = 19;

age,gender,bmi,children,smoker,region,charges
19,male,28.0,33,0,northwest,4500.0
19,male,20.425,0,no,northwest,1625.43375
19,male,25.555,0,no,northwest,1632.56445
19,female,31.825,1,no,northwest,2719.27975
19,male,30.59,0,no,northwest,1639.5631
19,female,32.11,0,no,northwest,2130.6759
19,male,29.07,0,yes,northwest,17352.6803
19,male,31.92,0,yes,northwest,33750.2918
19,male,36.955,0,yes,northwest,36219.40545
19,male,20.615,2,no,northwest,2803.69785


### 5.1 Query by snapshot ID

In [22]:
%%sql
-- Time travel to a specific snapshot using the standard `VERSION AS OF` clause,
-- matching the `TIMESTAMP AS OF` form used in the timestamp query below.
-- The ID is hard-coded from the history/snapshots output shown earlier; update it
-- if the table has been recreated since that output was captured.
SELECT *
FROM demo.db.insurance VERSION AS OF 7724012699579993439
LIMIT 5;

age,sex,bmi,children,smoker,region,charges
33,male,22.705,0,no,northwest,21984.47061
32,male,28.88,0,no,northwest,3866.8552
37,female,27.74,3,no,northwest,7281.5056
60,female,25.84,0,no,northwest,28923.13692
37,male,28.025,2,no,northwest,6203.90175


### 5.2 Query by timestamp

In [23]:
%%sql
-- Time travel by timestamp: read the table as it was at the given time.
-- NOTE(review): the timestamp is hard-coded from an earlier snapshot listing;
-- update it after re-running the notebook.
SELECT *
FROM demo.db.insurance TIMESTAMP AS OF '2025-10-15 12:59:35.275000'
LIMIT 5;

age,sex,bmi,children,smoker,region,charges
33,male,22.705,0,no,northwest,21984.47061
32,male,28.88,0,no,northwest,3866.8552
37,female,27.74,3,no,northwest,7281.5056
60,female,25.84,0,no,northwest,28923.13692
37,male,28.025,2,no,northwest,6203.90175
