### План-минимум
- [ ] Посмотреть таблицу снэпшотов 
- [ ] Сделать изменения и потом откатиться к предыдущему снэпшоту
- [ ] Изменить названия колонок
- [ ] Вставить новые данные
- [ ] Изменить партицию
- [ ] Посмотреть таблицу `files`
- [ ] Посмотреть таблицу `history`
- [ ] Посмотреть таблицу `partitions`
- [ ] Сделать аудит прямым способом
- [ ] Сделать аудит с помощью веток

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format, col

from pathlib import Path

Создадим сессию

In [2]:
spark = SparkSession.builder.appName("Jupyter") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.cores", "2") \
    .config("spark.memory.fraction", "0.8") \
    .getOrCreate()

spark

24/11/19 12:59:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Создадим базу данных

In [3]:
%%sql

CREATE DATABASE IF NOT EXISTS nyc

In [4]:
%%sql

DROP TABLE IF EXISTS nyc.taxis

Выгрузим все данные по желтым такси

In [5]:
schema = spark.read.parquet("/home/iceberg/data/yellow_tripdata_2021-04.parquet").schema
schema

                                                                                

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', DoubleType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', DoubleType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('airport_fee', DoubleType(), True)])

In [6]:
df = spark.createDataFrame([], schema)
df.writeTo("nyc.taxis").create()

                                                                                

In [7]:
path = Path('/home/iceberg/data')

for file in path.iterdir():
    if file.is_file():
        name = str(file).rsplit('/', 1)[-1]
        if 'yellow' in name:
            df = spark.read.parquet(str(file))
            df.writeTo("nyc.taxis").append()

                                                                                

Проверим, что данные выгрузились за все нужные месяца и года

In [8]:
year_month_fact = []

for file in path.iterdir():
    if file.is_file():
        name = str(file).rsplit('/', 1)[-1]
        if 'yellow' in name:
            year_month_cur = (
                int(name.rsplit('_', 1)[-1].rsplit('.')[0].split('-')[0]), 
                int(name.rsplit('_', 1)[-1].rsplit('.')[0].split('-')[1])
            )
            year_month_fact.append(year_month_cur)

year_month_fact

[(2022, 3),
 (2021, 7),
 (2021, 12),
 (2021, 5),
 (2021, 8),
 (2021, 4),
 (2022, 2),
 (2021, 9),
 (2021, 6),
 (2021, 11),
 (2021, 10),
 (2022, 4),
 (2022, 1)]

In [9]:
df_year_month = spark.sql("""
select year(tpep_pickup_datetime) as year, month(tpep_pickup_datetime) as month
from nyc.taxis
group by year(tpep_pickup_datetime), month(tpep_pickup_datetime)
order by year, month
""")

In [10]:
rows_year_month = df_year_month.select(df_year_month.year, df_year_month.month).collect()

                                                                                

In [11]:
rows_year_month

[Row(year=2002, month=12),
 Row(year=2003, month=1),
 Row(year=2004, month=4),
 Row(year=2008, month=12),
 Row(year=2009, month=1),
 Row(year=2011, month=1),
 Row(year=2011, month=2),
 Row(year=2012, month=2),
 Row(year=2021, month=3),
 Row(year=2021, month=4),
 Row(year=2021, month=5),
 Row(year=2021, month=6),
 Row(year=2021, month=7),
 Row(year=2021, month=8),
 Row(year=2021, month=9),
 Row(year=2021, month=10),
 Row(year=2021, month=11),
 Row(year=2021, month=12),
 Row(year=2022, month=1),
 Row(year=2022, month=2),
 Row(year=2022, month=3),
 Row(year=2022, month=4),
 Row(year=2022, month=5),
 Row(year=2028, month=12),
 Row(year=2029, month=5),
 Row(year=2070, month=8),
 Row(year=2098, month=9)]

In [12]:
year_month_table = [(row.year, row.month) for row in rows_year_month]
year_month_table

[(2002, 12),
 (2003, 1),
 (2004, 4),
 (2008, 12),
 (2009, 1),
 (2011, 1),
 (2011, 2),
 (2012, 2),
 (2021, 3),
 (2021, 4),
 (2021, 5),
 (2021, 6),
 (2021, 7),
 (2021, 8),
 (2021, 9),
 (2021, 10),
 (2021, 11),
 (2021, 12),
 (2022, 1),
 (2022, 2),
 (2022, 3),
 (2022, 4),
 (2022, 5),
 (2028, 12),
 (2029, 5),
 (2070, 8),
 (2098, 9)]

In [13]:
assert set(year_month_fact).issubset(set(year_month_table))

Значит мы верно прочитали все года и месяца

Заметим, что некоторые года и месяца выбиваются из рамок. Удалим эти выбросы

In [14]:
df = spark.sql("""
    select *
    from nyc.taxis
""")

In [15]:
filter_condition = col("year_month").isin([f"{year}-{month:02d}" for year, month in year_month_fact])

tmp_df = df \
    .withColumn("year_month", date_format(col("tpep_pickup_datetime"), "yyyy-MM")) \
    .filter(filter_condition)

In [16]:
tmp_df.registerTempTable('tmp')

spark.sql("""
   select *
   from tmp
""").show(5)

                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|year_month|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+----------+
|       1| 2022-03-01 00:13:08|  2022-03-01 00:24:35|            1.0|          2.4|       1.0|                 N|          90|         209|           2|  

In [17]:
tmp_df = tmp_df.drop("year_month")

In [18]:
tmp_df.registerTempTable('tmp')

In [19]:
%%sql 

select count(*)
from tmp

                                                                                

count(1)
38908423


Возьму только миллион записей, чтобы ноутбук осилил

In [20]:
tmp_df_limit = tmp_df.limit(1_000_000_000)

tmp_df_limit.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2022-03-01 00:13:08|  2022-03-01 00:24:35|            1.0|          2.4|       1.0|                 N|          90|         209|           2|       10.0|  3.0|    0.5|       0.

При попытке перезаписать таблицу используя sql replace, repartitioning ноутбук падает с ошибкой памяти. Поэтому удаляю старую таблицу и записываю новую. К сожалению, так теряются снэпшоты исходной таблицы

In [21]:
%%sql

DROP TABLE IF EXISTS nyc.taxis

In [22]:
tmp_df_limit.writeTo("nyc.taxis").create()

                                                                                

Проверка, что данные записались корректно

In [23]:
%%sql

select year(tpep_pickup_datetime) as year, month(tpep_pickup_datetime) as month
from nyc.taxis
group by year(tpep_pickup_datetime), month(tpep_pickup_datetime)
order by year, month

                                                                                

year,month
2021,4
2021,5
2021,6
2021,7
2021,8
2021,9
2021,10
2021,11
2021,12
2022,1


Данные перезаписались корректно

### Посмотрим таблицу снэпшотов

In [25]:
%%sql
    
SELECT snapshot_id, manifest_list
FROM nyc.taxis.snapshots

snapshot_id,manifest_list
8867405477361775537,s3://warehouse/nyc/taxis/metadata/snap-8867405477361775537-1-efe121d5-6281-4a5f-8ed0-08a716d1dade.avro


Как и ожидалось, в таблице только один снэпшот, так как она была перезаписана

### Переименуем столбцы таблицы

In [26]:
%%sql

DESCRIBE nyc.taxis

col_name,data_type,comment
VendorID,bigint,
tpep_pickup_datetime,timestamp_ntz,
tpep_dropoff_datetime,timestamp_ntz,
passenger_count,double,
trip_distance,double,
RatecodeID,double,
store_and_fwd_flag,string,
PULocationID,bigint,
DOLocationID,bigint,
payment_type,bigint,


In [29]:
%%sql

ALTER TABLE nyc.taxis RENAME COLUMN tpep_pickup_datetime TO pickup_datetime

In [30]:
%%sql

ALTER TABLE nyc.taxis RENAME COLUMN tpep_dropoff_datetime TO dropoff_datetime

In [31]:
%%sql

DESCRIBE nyc.taxis

col_name,data_type,comment
VendorID,bigint,
pickup_datetime,timestamp_ntz,
dropoff_datetime,timestamp_ntz,
passenger_count,double,
trip_distance,double,
RatecodeID,double,
store_and_fwd_flag,string,
PULocationID,bigint,
DOLocationID,bigint,
payment_type,bigint,


### Удалим столбцы и вставим новые данные, затем откатим изменения

In [32]:
%%sql

SELECT snapshot_id
FROM nyc.taxis.snapshots

                                                                                

snapshot_id
8867405477361775537


In [33]:
current_snapshot = 8867405477361775537

In [34]:
%%sql

ALTER TABLE nyc.taxis
DROP COLUMN airport_fee, congestion_surcharge, improvement_surcharge

In [35]:
%%sql

ALTER TABLE nyc.taxis
DROP COLUMN tolls_amount, mta_tax, extra, fare_amount, store_and_fwd_flag, RatecodeID

In [36]:
%%sql 

DESCRIBE nyc.taxis

col_name,data_type,comment
VendorID,bigint,
pickup_datetime,timestamp_ntz,
dropoff_datetime,timestamp_ntz,
passenger_count,double,
trip_distance,double,
PULocationID,bigint,
DOLocationID,bigint,
payment_type,bigint,
tip_amount,double,
total_amount,double,


In [37]:
%%sql 

SELECT *
FROM nyc.taxis

                                                                                

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,tip_amount,total_amount
1,2022-03-01 00:13:08,2022-03-01 00:24:35,1.0,2.4,90,209,2,0.0,13.8
1,2022-03-01 00:47:52,2022-03-01 01:00:08,1.0,2.2,148,234,2,0.0,14.3
2,2022-03-01 00:02:46,2022-03-01 00:46:43,1.0,19.78,132,249,1,11.06,67.61
2,2022-03-01 00:52:43,2022-03-01 01:03:40,2.0,2.94,211,66,1,4.44,19.24
2,2022-03-01 00:15:35,2022-03-01 00:34:13,1.0,8.57,138,197,1,5.51,33.06
1,2022-03-01 00:11:57,2022-03-01 00:53:05,2.0,14.0,132,33,1,9.2,55.25
2,2022-03-01 00:05:11,2022-03-01 00:08:22,1.0,0.61,166,151,1,1.0,6.8
2,2022-03-01 00:30:56,2022-03-01 00:46:21,1.0,2.83,74,238,1,3.7,18.0
2,2022-03-01 00:30:28,2022-03-01 00:30:36,1.0,0.1,145,145,3,0.0,-3.8
2,2022-03-01 00:30:28,2022-03-01 00:30:36,1.0,0.1,145,145,2,0.0,3.8


In [43]:
%%sql

INSERT INTO nyc.taxis
VALUES (1, timestamp('2999-01-01 00:00:00'), timestamp('3000-01-01 00:00:00'), 1, 1, 1, 1, 1, 0, 0);

Проверим, что данные вставились

In [44]:
%%sql

SELECT *
FROM nyc.taxis
WHERE year(pickup_datetime) == 2999

                                                                                

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,tip_amount,total_amount
1,2999-01-01 00:00:00,3000-01-01 00:00:00,1.0,1.0,1,1,1,0.0,0.0
1,2999-01-01 00:00:00,,1.0,1.0,1,1,1,0.0,0.0
1,2999-01-01 00:00:00,,1.0,1.0,1,1,1,0.0,0.0


Данные вставились корректно

In [47]:
%%sql

SELECT *
FROM nyc.taxis.snapshots

committed_at,snapshot_id,parent_id,operation,manifest_list,summary
2024-11-19 13:07:51.509000,8867405477361775537,,append,s3://warehouse/nyc/taxis/metadata/snap-8867405477361775537-1-efe121d5-6281-4a5f-8ed0-08a716d1dade.avro,"{'spark.app.id': 'local-1732021182220', 'changed-partition-count': '1', 'added-data-files': '2', 'total-equality-deletes': '0', 'added-records': '38908423', 'total-position-deletes': '0', 'added-files-size': '600144900', 'total-delete-files': '0', 'total-files-size': '600144900', 'total-records': '38908423', 'total-data-files': '2'}"
2024-11-19 13:26:48.341000,6185280295707655119,8.867405477361775e+18,append,s3://warehouse/nyc/taxis/metadata/snap-6185280295707655119-1-3fd1d21b-c4c0-4a0b-8bc3-73832a5bd280.avro,"{'spark.app.id': 'local-1732021182220', 'changed-partition-count': '1', 'added-data-files': '1', 'total-equality-deletes': '0', 'added-records': '1', 'total-position-deletes': '0', 'added-files-size': '2870', 'total-delete-files': '0', 'total-files-size': '600147770', 'total-records': '38908424', 'total-data-files': '3'}"
2024-11-19 13:28:22.704000,56325367639566605,6.185280295707655e+18,append,s3://warehouse/nyc/taxis/metadata/snap-56325367639566605-1-d1640faa-5d2a-43f9-b43e-1959c090a27c.avro,"{'spark.app.id': 'local-1732021182220', 'changed-partition-count': '1', 'added-data-files': '1', 'total-equality-deletes': '0', 'added-records': '1', 'total-position-deletes': '0', 'added-files-size': '2870', 'total-delete-files': '0', 'total-files-size': '600150640', 'total-records': '38908425', 'total-data-files': '4'}"
2024-11-19 13:28:51.894000,1638052002034642056,5.632536763956661e+16,append,s3://warehouse/nyc/taxis/metadata/snap-1638052002034642056-1-1d622acb-665a-4209-b1f9-c7320138b0a3.avro,"{'spark.app.id': 'local-1732021182220', 'changed-partition-count': '1', 'added-data-files': '1', 'total-equality-deletes': '0', 'added-records': '1', 'total-position-deletes': '0', 'added-files-size': '2934', 'total-delete-files': '0', 'total-files-size': '600153574', 'total-records': '38908426', 'total-data-files': '5'}"


Теперь у нас несколько снэпшотов

Посмотрим на таблицу `history`

In [48]:
%%sql

SELECT *
FROM nyc.taxis.history

made_current_at,snapshot_id,parent_id,is_current_ancestor
2024-11-19 13:07:51.509000,8867405477361775537,,True
2024-11-19 13:26:48.341000,6185280295707655119,8.867405477361775e+18,True
2024-11-19 13:28:22.704000,56325367639566605,6.185280295707655e+18,True
2024-11-19 13:28:51.894000,1638052002034642056,5.632536763956661e+16,True


Откатим изменения

In [49]:
spark.sql(f"CALL system.rollback_to_snapshot('nyc.taxis', {current_snapshot})")

DataFrame[previous_snapshot_id: bigint, current_snapshot_id: bigint]

Проверим

In [51]:
%%sql

SELECT *
FROM nyc.taxis
WHERE year(pickup_datetime) == 2999

                                                                                

VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,PULocationID,DOLocationID,payment_type,tip_amount,total_amount


### Таблицы `files`, `partitions`, `history`

In [52]:
%%sql

SELECT *
FROM nyc.taxis.files

content,file_path,file_format,spec_id,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,readable_metrics
0,s3://warehouse/nyc/taxis/data/00000-171-24e86526-5b86-4e6e-94a5-568aebe6463d-0-00001.parquet,PARQUET,0,34839000,537836059,"{1: 4477105, 2: 124921006, 3: 128238855, 4: 7142554, 5: 50313194, 6: 2346836, 7: 1072171, 8: 30900499, 9: 34818710, 10: 5835024, 11: 38041573, 12: 8621736, 13: 1106014, 14: 37553838, 15: 4321021, 16: 609971, 17: 50377877, 18: 3336738, 19: 2653707}","{1: 34839000, 2: 34839000, 3: 34839000, 4: 34839000, 5: 34839000, 6: 34839000, 7: 34839000, 8: 34839000, 9: 34839000, 10: 34839000, 11: 34839000, 12: 34839000, 13: 34839000, 14: 34839000, 15: 34839000, 16: 34839000, 17: 34839000, 18: 34839000, 19: 34839000}","{1: 0, 2: 0, 3: 0, 4: 1293232, 5: 0, 6: 1293232, 7: 1293232, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 1293232, 19: 1293280}","{16: 0, 17: 0, 18: 0, 19: 0, 4: 0, 5: 0, 6: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}","{1: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 2: bytearray(b'@\x03\x06\x844\xc1\x05\x00'), 3: bytearray(b'@U\xc2\x934\xc1\x05\x00'), 4: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 5: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 6: bytearray(b'\x00\x00\x00\x00\x00\x00\xf0?'), 7: bytearray(b'N'), 8: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 9: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 10: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 11: bytearray(b'\x00\x00\x00\x00\x00\x08\xa4\xc0'), 12: bytearray(b'\x00\x00\x00\x00\x00\x00\x16\xc0'), 13: bytearray(b'\x9a\x99\x99\x99\x99\x99\xe1\xbf'), 14: bytearray(b'\x00\x00\x00\x00\x00\xa0y\xc0'), 15: bytearray(b'\x00\x00\x00\x00\x000V\xc0'), 16: bytearray(b'333333\xd3\xbf'), 17: bytearray(b'\x9a\x99\x99\x99\x99\x0f\xa4\xc0'), 18: bytearray(b'\x00\x00\x00\x00\x00\x00\x04\xc0'), 19: bytearray(b'\x00\x00\x00\x00\x00\x00\xf4\xbf')}","{1: bytearray(b'\x06\x00\x00\x00\x00\x00\x00\x00'), 2: bytearray(b'\xc0\x1d\xdf\xf4\xe7\xdd\x05\x00'), 3: bytearray(b'\x00\xa6\x9c\xe2\xfb\xdd\x05\x00'), 4: bytearray(b'\x00\x00\x00\x00\x00\x00\\@'), 5: bytearray(b'\n\xd7\xa3p\xf5u\x15A'), 6: bytearray(b'\x00\x00\x00\x00\x00\xc0X@'), 7: bytearray(b'Y'), 8: bytearray(b'\t\x01\x00\x00\x00\x00\x00\x00'), 9: bytearray(b'\t\x01\x00\x00\x00\x00\x00\x00'), 10: bytearray(b'\x05\x00\x00\x00\x00\x00\x00\x00'), 11: bytearray(b'\x14\xaeG\xe1\xd6\xf8(A'), 12: bytearray(b'\xa4p=\n\xd7\x83V@'), 13: bytearray(b'ffffffC@'), 14: bytearray(b'R\xb8\x1e\x85\xeb?\x8f@'), 15: bytearray(b'fffff\xe4\x8d@'), 16: bytearray(b'333333\xd3?'), 17: bytearray(b'\xaeG\xe1z\xdd\xf8(A'), 18: bytearray(b'\x00\x00\x00\x00\x00\x00\x06@'), 19: bytearray(b'\x00\x00\x00\x00\x00\x00\xf4?')}",,"[4, 135022197, 270113232, 405249937]",,0,"Row(DOLocationID=Row(column_size=34818710, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=265), PULocationID=Row(column_size=30900499, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=265), VendorID=Row(column_size=4477105, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=6), dropoff_datetime=Row(column_size=128238855, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 4, 30, 18, 14, 5), upper_bound=datetime.datetime(2022, 5, 1, 23, 46, 32)), passenger_count=Row(column_size=7142554, value_count=34839000, null_value_count=1293232, nan_value_count=0, lower_bound=0.0, upper_bound=112.0), payment_type=Row(column_size=5835024, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=5), pickup_datetime=Row(column_size=124921006, value_count=34839000, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 4, 30, 18, 9, 41), upper_bound=datetime.datetime(2022, 4, 30, 23, 59, 59)), tip_amount=Row(column_size=37553838, value_count=34839000, null_value_count=0, nan_value_count=0, lower_bound=-410.0, upper_bound=999.99), total_amount=Row(column_size=50377877, value_count=34839000, null_value_count=0, nan_value_count=0, lower_bound=-2567.8, upper_bound=818286.74), trip_distance=Row(column_size=50313194, value_count=34839000, null_value_count=0, nan_value_count=0, lower_bound=0.0, upper_bound=351613.36))"
0,s3://warehouse/nyc/taxis/data/00000-171-24e86526-5b86-4e6e-94a5-568aebe6463d-0-00002.parquet,PARQUET,0,4069423,62308841,"{1: 515952, 2: 14980649, 3: 15328049, 4: 819560, 5: 5746861, 6: 199845, 7: 108911, 8: 3447653, 9: 3818065, 10: 634132, 11: 4385101, 12: 974817, 13: 105600, 14: 4147042, 15: 431999, 16: 66383, 17: 5850063, 18: 388043, 19: 226608}","{1: 4069423, 2: 4069423, 3: 4069423, 4: 4069423, 5: 4069423, 6: 4069423, 7: 4069423, 8: 4069423, 9: 4069423, 10: 4069423, 11: 4069423, 12: 4069423, 13: 4069423, 14: 4069423, 15: 4069423, 16: 4069423, 17: 4069423, 18: 4069423, 19: 4069423}","{1: 0, 2: 0, 3: 0, 4: 270874, 5: 0, 6: 270874, 7: 270874, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 0, 18: 270874, 19: 270950}","{16: 0, 17: 0, 18: 0, 19: 0, 4: 0, 5: 0, 6: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0}","{1: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 2: bytearray(b'\xc0\x06\xcb\xe9\xdd\xbe\x05\x00'), 3: bytearray(b'\x008\xf7\xf6\xdd\xbe\x05\x00'), 4: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 5: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 6: bytearray(b'\x00\x00\x00\x00\x00\x00\xf0?'), 7: bytearray(b'N'), 8: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 9: bytearray(b'\x01\x00\x00\x00\x00\x00\x00\x00'), 10: bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00'), 11: bytearray(b'\x00\x00\x00\x00\x00\xf0y\xc0'), 12: bytearray(b'\x00\x00\x00\x00\x00\x00\x16\xc0'), 13: bytearray(b'\x00\x00\x00\x00\x00\x00\xe0\xbf'), 14: bytearray(b'\x85\xebQ\xb8\x1e\xd5t\xc0'), 15: bytearray(b'\x00\x00\x00\x00\x00\xc0?\xc0'), 16: bytearray(b'333333\xd3\xbf'), 17: bytearray(b'\xcd\xcc\xcc\xcc\xcc\xf4y\xc0'), 18: bytearray(b'\x00\x00\x00\x00\x00\x00\x04\xc0'), 19: bytearray(b'\x00\x00\x00\x00\x00\x00\xf4\xbf')}","{1: bytearray(b'\x06\x00\x00\x00\x00\x00\x00\x00'), 2: bytearray(b'\x80\xc6\xccDM\xcd\x05\x00'), 3: bytearray(b'\xc0\xfc\xc3NS\xcd\x05\x00'), 4: bytearray(b'\x00\x00\x00\x00\x00\x00""@'), 5: bytearray(b'\xcd\xcc\xcc\xcc\xf9[\x13A'), 6: bytearray(b'\x00\x00\x00\x00\x00\xc0X@'), 7: bytearray(b'Y'), 8: bytearray(b'\t\x01\x00\x00\x00\x00\x00\x00'), 9: bytearray(b'\t\x01\x00\x00\x00\x00\x00\x00'), 10: bytearray(b'\x04\x00\x00\x00\x00\x00\x00\x00'), 11: bytearray(b')\\\x8f\xc29)\x18A'), 12: bytearray(b'\x9a\x99\x99\x99\x99\x19#@'), 13: bytearray(b'ffffff\n@'), 14: bytearray(b'\x00\x00\x00\x00\x00\xc0w@'), 15: bytearray(b'fffffz\x8c@'), 16: bytearray(b'333333\xd3?'), 17: bytearray(b'\\\x8f\xc2\xf5:)\x18A'), 18: bytearray(b'\x00\x00\x00\x00\x00\x00\x06@'), 19: bytearray(b'\x00\x00\x00\x00\x00\x00\xf4?')}",,[4],,0,"Row(DOLocationID=Row(column_size=3818065, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=265), PULocationID=Row(column_size=3447653, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=265), VendorID=Row(column_size=515952, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=1, upper_bound=6), dropoff_datetime=Row(column_size=15328049, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 4, 1, 0, 3, 44), upper_bound=datetime.datetime(2021, 10, 1, 23, 56, 11)), passenger_count=Row(column_size=819560, value_count=4069423, null_value_count=270874, nan_value_count=0, lower_bound=0.0, upper_bound=9.0), payment_type=Row(column_size=634132, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=4), pickup_datetime=Row(column_size=14980649, value_count=4069423, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2021, 4, 1, 0, 0, 3), upper_bound=datetime.datetime(2021, 10, 1, 16, 43, 54)), tip_amount=Row(column_size=4147042, value_count=4069423, null_value_count=0, nan_value_count=0, lower_bound=-333.32, upper_bound=380.0), total_amount=Row(column_size=5850063, value_count=4069423, null_value_count=0, nan_value_count=0, lower_bound=-415.3, upper_bound=395854.74), trip_distance=Row(column_size=5746861, value_count=4069423, null_value_count=0, nan_value_count=0, lower_bound=0.0, upper_bound=317182.45))"


In [53]:
%%sql

SELECT *
FROM nyc.taxis.partitions

record_count,file_count,total_data_file_size_in_bytes,position_delete_record_count,position_delete_file_count,equality_delete_record_count,equality_delete_file_count,last_updated_at,last_updated_snapshot_id
38908423,2,600144900,0,0,0,0,2024-11-19 13:07:51.509000,8867405477361775537


In [54]:
%%sql

SELECT *
FROM nyc.taxis.history

made_current_at,snapshot_id,parent_id,is_current_ancestor
2024-11-19 13:07:51.509000,8867405477361775537,,True
2024-11-19 13:26:48.341000,6185280295707655119,8.867405477361775e+18,False
2024-11-19 13:28:22.704000,56325367639566605,6.185280295707655e+18,False
2024-11-19 13:28:51.894000,1638052002034642056,5.632536763956661e+16,False
2024-11-19 13:38:45.911000,8867405477361775537,,True


В таблице `history` отразилось, что мы вернулись к предыдущему снэпшоту

### Изменим партицию

In [59]:
%%sql

ALTER TABLE nyc.taxis ADD PARTITION FIELD year(pickup_datetime)

In [61]:
%%sql

ALTER TABLE nyc.taxis ADD PARTITION FIELD month(pickup_datetime)

In [63]:
%%sql

CALL system.rewrite_data_files('nyc.taxis')

rewritten_data_files_count,added_data_files_count,rewritten_bytes_count,failed_data_files_count
0,0,0,0


In [66]:
%%sql 

SELECT *
FROM nyc.taxis.partitions

partition,spec_id,record_count,file_count,total_data_file_size_in_bytes,position_delete_record_count,position_delete_file_count,equality_delete_record_count,equality_delete_file_count,last_updated_at,last_updated_snapshot_id
"Row(pickup_datetime_year=None, pickup_datetime_month=None)",0,38908423,2,600144900,0,0,0,0,2024-11-19 13:07:51.509000,8867405477361775537


### Аудит прямым способом

In [67]:
import uuid
ia_session_id = uuid.uuid4().hex

In [68]:
%%sql

ALTER TABLE nyc.taxis
SET TBLPROPERTIES (
    'write.wap.enabled'='true'
)

In [69]:
spark.conf.set('spark.wap.id', ia_session_id)

In [71]:
%%sql

DELETE FROM nyc.taxis
WHERE payment_type == 1

                                                                                

In [72]:
query = f"""
SELECT snapshot_id
FROM nyc.taxis.snapshots
WHERE summary['wap.id'] = '{ia_session_id}'
"""

ia_session_snapshot = spark.sql(query).head().snapshot_id

In [73]:
spark.read \
    .option("snapshot-id", ia_session_snapshot) \
    .format("iceberg") \
    .load("nyc.taxis") \
    .groupBy("payment_type") \
    .count() \
    .show()



+------------+-------+
|payment_type|  count|
+------------+-------+
|           0|1564106|
|           5|      7|
|           3| 180916|
|           2|8068370|
|           4| 156542|
+------------+-------+



                                                                                

In [74]:
publish_query = f"CALL system.cherrypick_snapshot('nyc.taxis', {ia_session_snapshot})"

%sql $publish_query

source_snapshot_id,current_snapshot_id
7508518728360165294,7508518728360165294


In [75]:
%%sql

SELECT payment_type
FROM nyc.taxis
WHERE payment_type == 1

payment_type


### Аудит с использованием веток

In [77]:
%%sql

ALTER TABLE nyc.taxis
SET TBLPROPERTIES (
    'write.wap.enabled'='true'
)

In [83]:
%%sql

ALTER TABLE nyc.taxis
CREATE BRANCH new_branch

IllegalArgumentException: Ref new_branch already exists

In [84]:
spark.conf.set('spark.wap.branch', 'new_branch')

In [86]:
spark.conf.unset('spark.wap.id')

In [87]:
%%sql

DELETE FROM nyc.taxis
WHERE payment_type == 0

                                                                                

In [93]:
%%sql

SELECT DISTINCT payment_type
FROM nyc.taxis

                                                                                

payment_type
3
2
4
5


Посмотрим на ветку main

In [91]:
%%sql

SELECT DISTINCT payment_type
FROM nyc.taxis VERSION AS OF 'main'

                                                                                

payment_type
0
5
3
2
4


In [94]:
query = f"""
SELECT snapshot_id
FROM nyc.taxis.refs
WHERE name = 'new_branch'
"""

wap_snapshot_id = spark.sql(query).head().snapshot_id

In [95]:
publish_query = f"CALL system.cherrypick_snapshot('nyc.taxis', {wap_snapshot_id})"

%sql $publish_query

source_snapshot_id,current_snapshot_id
8206386991873546081,8206386991873546081


In [96]:
%%sql

SELECT DISTINCT payment_type
FROM nyc.taxis VERSION AS OF 'main'

                                                                                

payment_type
3
2
4
5


In [98]:
%%sql

ALTER TABLE nyc.taxis
DROP BRANCH new_branch

In [99]:
%%sql

SELECT *
FROM nyc.taxis.refs

name,type,snapshot_id,max_reference_age_in_ms,min_snapshots_to_keep,max_snapshot_age_in_ms
main,BRANCH,8206386991873546081,,,
